In [36]:
from io import StringIO

import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
In [2]:
# Request one listing page and print the name and value of every cookie
# carried by the response.
url = ('http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-cegek/'
       'arbevetel/also-haromszek-2014-2017')
r = requests.get(url)

for cookie in r.cookies:
    print(cookie.name, cookie.value)
__cfduid d24635946898bf1ede7a8ba5f006187701544919044
PHPSESSID hjck9hapmkg9b9rt5vd3nrf5p5
In [129]:
# Scrape the "alkalmazott" listing page of every region into one DataFrame per
# region (collected in `dfs`), then visit each link of the listing to pull
# three more fields out of the linked page: coordinates, keywords and address.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

dfs = []
regions = ['also-haromszek', 'felso-haromszek', 'csikszek', 'udvarhelyszek', 'marosszek', 'gyergyoszek']
for region in regions:
    url = 'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-cegek/alkalmazott/' + \
        region + '-2014-2017/'
    with session() as c:
        response = c.get(url, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of parsing an error page as if it were the listing.
    response.raise_for_status()

    # Take the second table that pandas finds on the page. The text is wrapped
    # in StringIO because newer pandas deprecates literal HTML strings here.
    df = pd.read_html(StringIO(response.text))[1]
    # Column names: 0 for the first column, then the values of row 0 shifted
    # right by one position (the last value of row 0 is dropped).
    df.columns = [0] + list(df.loc[0])[:-1]
    # Drop the rows labelled 0 and 1 and index the rest by the first column.
    df = df.loc[2:].set_index(0)
    # Drop the first and the last of the remaining columns.
    df = df[df.columns[1:-1]]
    # Drop the last row.
    df = df.loc[list(df.index)[:-1]]
    df['region'] = region
    df['nr'] = df.index

    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # confirm the table index 3 before pinning a specific parser.
    soup = BeautifulSoup(response.content)
    links = soup.find_all('table')[3].find_all('a')
    # The column assignments after the loop need exactly one value per row.
    # Check that now rather than after fetching every linked page.
    if len(links) != len(df):
        raise ValueError('{}: {} detail links but {} table rows'.format(
            region, len(links), len(df)))

    coords = []
    kws = []
    cms = []
    for i, link in enumerate(links):
        print(i)
        r = requests.get(link['href'], timeout=REQUEST_TIMEOUT)
        # NOTE(review): repr() of the raw bytes keeps non-ASCII bytes as \x..
        # escapes and line breaks as literal backslash sequences, and those end
        # up in the extracted values. Kept so that values scraped from
        # well-formed pages stay exactly as before; consider decoding instead.
        g = repr(r.content)

        # str.find returns -1 when a marker is missing. Each field is guarded so
        # that such a page yields an empty value instead of a slice of unrelated
        # markup.
        coord_start = g.find('GLatLng')
        coord_end = g.find(')', coord_start) if coord_start != -1 else -1
        coord = (g[coord_start + len('GLatLng') + 1:coord_end].split(',')
                 if coord_end != -1 else [])

        kw_start = g.find('<meta name="keywords')
        kw_end = g.find('/>', kw_start) if kw_start != -1 else -1
        kw = (g[kw_start + len('<meta name="keywords" content="'):kw_end].split(',')
              if kw_end != -1 else [])
        kw = [k.strip() for k in kw]

        cm_start = g.find('<b>C&iacute;m:</b>')
        cm_end = g.find('<br>', cm_start) if cm_start != -1 else -1
        cm = (g[cm_start + len('<b>C&iacute;m:</b>'):cm_end].strip()
              if cm_end != -1 else '')

        coords.append(coord)
        kws.append(kw)
        cms.append(cm)

    df['coords'] = coords
    df['kws'] = kws
    df['cms'] = cms
    dfs.append(df)
    print(region)
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
also-haromszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
felso-haromszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
csikszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
udvarhelyszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
marosszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
gyergyoszek
In [128]:
# Scrape the "arbevetel" listing page of every region into one DataFrame per
# region (collected in `dfs2`), then visit each link of the listing to pull
# three more fields out of the linked page: coordinates, keywords and address.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

dfs2 = []
regions = ['also-haromszek', 'felso-haromszek', 'csikszek', 'udvarhelyszek', 'marosszek', 'gyergyoszek']
for region in regions:
    url = 'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-cegek/arbevetel/' + \
        region + '-2014-2017/'
    with session() as c:
        response = c.get(url, timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of parsing an error page as if it were the listing.
    response.raise_for_status()

    # Take the second table that pandas finds on the page. The text is wrapped
    # in StringIO because newer pandas deprecates literal HTML strings here.
    df = pd.read_html(StringIO(response.text))[1]
    # Column names: 0 for the first column, then the values of row 0 shifted
    # right by one position (the last value of row 0 is dropped).
    df.columns = [0] + list(df.loc[0])[:-1]
    # Drop the rows labelled 0 and 1 and index the rest by the first column.
    df = df.loc[2:].set_index(0)
    # Drop the first and the last of the remaining columns.
    df = df[df.columns[1:-1]]
    # Drop the last row.
    df = df.loc[list(df.index)[:-1]]
    df['region'] = region
    df['nr'] = df.index

    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # confirm the table index 3 before pinning a specific parser.
    soup = BeautifulSoup(response.content)
    links = soup.find_all('table')[3].find_all('a')
    # The column assignments after the loop need exactly one value per row.
    # Check that now rather than after fetching every linked page.
    if len(links) != len(df):
        raise ValueError('{}: {} detail links but {} table rows'.format(
            region, len(links), len(df)))

    coords = []
    kws = []
    cms = []
    for i, link in enumerate(links):
        print(i)
        r = requests.get(link['href'], timeout=REQUEST_TIMEOUT)
        # NOTE(review): repr() of the raw bytes keeps non-ASCII bytes as \x..
        # escapes and line breaks as literal backslash sequences, and those end
        # up in the extracted values. Kept so that values scraped from
        # well-formed pages stay exactly as before; consider decoding instead.
        g = repr(r.content)

        # str.find returns -1 when a marker is missing. Each field is guarded so
        # that such a page yields an empty value instead of a slice of unrelated
        # markup.
        coord_start = g.find('GLatLng')
        coord_end = g.find(')', coord_start) if coord_start != -1 else -1
        coord = (g[coord_start + len('GLatLng') + 1:coord_end].split(',')
                 if coord_end != -1 else [])

        kw_start = g.find('<meta name="keywords')
        kw_end = g.find('/>', kw_start) if kw_start != -1 else -1
        kw = (g[kw_start + len('<meta name="keywords" content="'):kw_end].split(',')
              if kw_end != -1 else [])
        kw = [k.strip() for k in kw]

        cm_start = g.find('<b>C&iacute;m:</b>')
        cm_end = g.find('<br>', cm_start) if cm_start != -1 else -1
        cm = (g[cm_start + len('<b>C&iacute;m:</b>'):cm_end].strip()
              if cm_end != -1 else '')

        coords.append(coord)
        kws.append(kw)
        cms.append(cm)

    df['coords'] = coords
    df['kws'] = kws
    df['cms'] = cms
    dfs2.append(df)
    print(region)
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
also-haromszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
felso-haromszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
csikszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
udvarhelyszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
marosszek
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
gyergyoszek
In [155]:
# Relabel the columns of the sixth frame (index 5, i.e. the last entry of
# `regions`) with the labels of the first frame. pd.concat aligns frames by
# column label, so differing labels would end up as separate columns.
# NOTE(review): presumably that region's page uses different header text;
# the hard-coded index 5 depends on the order of `regions` - confirm.
dfs[5].columns=dfs[0].columns
In [156]:
# Stack the frames collected in `dfs` row-wise into one table and move the
# index back into an ordinary column (both defaults spelled out).
dfsi = pd.concat(dfs, axis=0).reset_index(drop=False)
In [157]:
# Stack the frames collected in `dfs2` row-wise into one table and move the
# index back into an ordinary column (both defaults spelled out).
dfsi2 = pd.concat(dfs2, axis=0).reset_index(drop=False)
In [158]:
# Write the combined `dfsi` table to a semicolon-separated file in the
# current working directory.
dfsi.to_csv(path_or_buf='dfsi.csv', sep=';')
In [159]:
# Write the combined `dfsi2` table to a semicolon-separated file in the
# current working directory.
dfsi2.to_csv(path_or_buf='dfsi2.csv', sep=';')