# Code for Elog scraping
I am using the web site of the elogbook to convert its data into a pandas DataFrame.

In [1]:
# Show which Python executable is found first on the shell PATH (i.e. the software stack in use)
!which python

/cvmfs/sft.cern.ch/lcg/views/LCG_95apython3/x86_64-centos7-gcc7-opt/bin/python


In [2]:
# Show the current working directory, i.e. the path on EOS from which this notebook runs
pwd

'/eos/user/s/sterbini/MD_ANALYSIS/bblumi/docs/how-tos/ElogScraping'

In [3]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def getSoup(date='20180612', shift=1, elog=60, timeout=60):
    ''' This method will download a specific shift of the elogbook and parse it in a BeautifulSoup object
    - date is in yyyymmdd format
    - shift is an integer (1= 'morning', 2='afternoon', 3='night')
    - elog is the elog code (e.g., 60='LHC_OP')
    - timeout is the maximum time, in seconds, to wait for the server
      (None waits forever, which was the previous behaviour)

    It returns a dictionary with the parsed page ('soup') together with the
    parameters of the request ('elog', 'date', 'shift').
    It raises requests.HTTPError if the server answers with an error status.
    '''
    address = 'http://elogbook.cern.ch/eLogbook/eLogbook.jsp'
    # requests builds (and escapes) the query string from the parameters
    response = requests.get(
        address,
        params={'lgbk': elog, 'date': date, 'shift': shift},
        timeout=timeout,
    )
    # fail here, with an explicit HTTP error, instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return {'soup': soup, 'elog': elog, 'date': date, 'shift': shift}

def getDictionary(soup):
    ''' This method will convert a BeautifulSoup object from elogbook to a pandas DF
    - soup is the dictionary coming from the getSoup method
      (the keys used here are 'soup', 'date' and 'shift')

    It returns a DataFrame with one row per event: one column per field of the
    table header (except 'Time') plus a 'link' column with the URL of the event.
    The index contains the time of the events, localized as 'CET' and then
    converted to UTC.
    '''
    # Table
    myTable = soup['soup'].find('table', {'id': 'events_table'})

    # Table header and column description (the field names are in the <b> tags)
    tableHead = myTable.find('thead')
    myFields = [i.getText() for i in tableHead.find_all('b')]

    # Table body
    tableBody = myTable.find('tbody', {'id': 'body_events'})

    # From the table body to the events: only every second child is an event
    # (the other children are presumably the whitespace between tags -- TODO confirm)
    myEvents = list(tableBody.children)[1::2]
    dictionaryList = []
    for i in myEvents:
        # same pattern for the cells of an event
        myEvent = list(i.children)[1::2]
        myDictionary = {}
        for j, z in zip(myEvent, myFields):
            myDictionary[z] = list(j.children)[1].getText()
        # the link of the event is the first <a href=...> of the second cell
        myLink = myEvent[1].find_all('a', href=True)
        myDictionary['link'] = 'http://elogbook.cern.ch/eLogbook/' + myLink[0]['href']
        dictionaryList.append(myDictionary)

    # 'columns' fixes the column order and keeps the columns also for a shift without events
    myDF = pd.DataFrame(dictionaryList, columns=myFields + ['link'])

    # raw string: '\-' is an invalid escape sequence in a normal string literal
    timeCleaner = re.compile(r'[^A-Za-z0-9:./;\-,_]+')
    myDF['Time'] = myDF['Time'].apply(lambda x: timeCleaner.sub('', x))
    myDF['Time'] = soup['date'] + ' ' + myDF['Time']
    # pd.to_datetime guarantees a datetime column (hence a DatetimeIndex) even with no events
    myDF['Time'] = pd.to_datetime(myDF['Time'].apply(pd.Timestamp))
    myDF = myDF.set_index('Time')

    # this is useful for the change of the day
    if soup['shift'] == 3:
        # Events from 21:00 up to (but excluding) midnight stay on the starting day.
        # between_time('21:00','00:00') cannot be used here: it also includes
        # 00:00 sharp, so a midnight event would appear twice (once per day).
        aux1 = myDF[myDF.index.hour >= 21]
        # events from midnight to 07:00 belong to the following day
        aux2 = myDF.between_time('00:00', '07:00')
        aux2.index = aux2.index + pd.Timedelta('1d')
        myDF = pd.concat([aux1, aux2])
    myDF.index = myDF.index.tz_localize('CET').tz_convert('UTC')
    myDF.index.name = None
    return myDF

# An example

In [5]:
# Step 1: download and parse the page (a bit slow, comparable to the loading time of an elog page)
soup = getSoup(
    date='20180702',
    shift=3,
    elog=60,
)

In [6]:
# Step 2: convert the parsed page into a pandas DataFrame and display it
aux = getDictionary(soup)
aux

Unnamed: 0,#,PROTONPHY,Comment,link
2018-07-02 21:00:00+00:00,1,\n\n\n\nNB\n\n\n\n,\n\n\n\n\nGuy and Michaela \n\n\n\ncreated by ...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 21:52:00+00:00,2,\n\n\n\nNB\n\n\n\n,\n\n\n\n\nStart precycle of EIS and RD1.LR5 \n...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 21:52:00+00:00,3,\n\n\n\n 1 \n\n\n\n,"\n\n\n\n\nRestore RSS.A56B1, which had tripped...",http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:08:00+00:00,4,\n\n\n\nSUP\n\n\n\n,\n\n\n\n\n\n\n\nBEAM MODE > SETUP \n\n\n\n\n\...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:15:00+00:00,5,\n\n\n\nSUP\n\n\n\n,\n\n\n\n\nPrecycle of RD1.LR5 complete \n\n\n\...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:19:00+00:00,6,\n\n\n\nSUP\n\n\n\n,\n\n\n\n\nLHC SEQ: QPS configuration cross che...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:30:00+00:00,7,\n\n\n\nBI\n\n\n\n,\n\n\n\n\n\n\n\nBEAM MODE > INJECTION PROBE BE...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:31:00+00:00,8,\n\n\n\nBI\n\n\n\n,"\n\n\n\n\nDry dump, XPOC interlock B2, MKD ris...",http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:38:00+00:00,9,\n\n\n\nBI\n\n\n\n,\n\n\n\n\nRD1 replaced. SIS needs to be update...,http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-02 22:39:00+00:00,10,\n\n\n\nBI\n\n\n\n,\n\n\n\n\nAnother dry dump to test the XPOC is...,http://elogbook.cern.ch/eLogbook/event_viewer....


In [7]:
# Memory usage of the DataFrame in MB (bytes divided by 1024*1024)
aux.memory_usage().sum() / 1024**2

0.003204345703125

In [8]:
# Some naive filtering: the links of all the events logged at a given time
# (rows and column are selected in a single .loc call instead of chained indexing)
aux.loc['2018-07-03 04:56:00+00:00', 'link']

2018-07-03 04:56:00+00:00    http://elogbook.cern.ch/eLogbook/event_viewer....
2018-07-03 04:56:00+00:00    http://elogbook.cern.ch/eLogbook/event_viewer....
Name: link, dtype: object