from io import StringIO

import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
# Probe request: fetch one top-list page and print every cookie the server
# sets on the response.
url = 'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/arbevetel/also-haromszek-2015-2018#oldal'
# An explicit timeout keeps the script from hanging forever when the server
# does not answer (requests applies no timeout by default).
r = requests.get(url, timeout=30)
for cookie in r.cookies:
    print(cookie.name, cookie.value)
__cfduid dc1e6566f055b30ff537c92b686dc9db61577353250 PHPSESSID i1sgviskhsoe31m2vdhp3r1e66
# Notebook-style echo of `url`; as a plain script statement this bare
# expression has no effect.
url
'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/arbevetel/also-haromszek-2015-2018#oldal'
# Scrape the "alkalmazott" top list of every region into `dfs` (one DataFrame
# per region) and enrich each listed company with the coordinates, keywords
# and address found on its own detail page.
# NOTE(review): the "arbevetel" loop in this file duplicates this logic;
# consider extracting a shared helper.
dfs = []
regions = ['also-haromszek', 'felso-haromszek', 'csikszek',
           'udvarhelyszek', 'marosszek', 'gyergyoszek']
for region in regions:
    url = ('http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/alkalmazott/'
           + region + '-2015-2018#oldal')
    with session() as c:
        # Explicit timeout: requests waits indefinitely by default.
        response = c.get(url, timeout=30)
        # Fail with a clear HTTP error instead of an obscure parsing error.
        response.raise_for_status()

        # The second table on the page is used as the ranking. Wrapping the
        # markup in StringIO avoids the pandas deprecation of literal HTML.
        df = pd.read_html(StringIO(response.text))[1]
        # Row 0 supplies the column names; column 0 becomes the index.
        df.columns = [0] + list(df.loc[0])[:-1]
        df = df.loc[2:].set_index(0)
        # Drop the first and last columns, then the last row (positionally,
        # so duplicate index labels cannot duplicate rows).
        df = df[df.columns[1:-1]]
        df = df.iloc[:-1].copy()
        df['region'] = region
        df['nr'] = df.index

        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; pin one once the intended parser is confirmed.
        soup = BeautifulSoup(response.content)
        links = soup.find_all('table')[3].find_all('a')

        # Rows and links are paired by position (presumably one link per
        # ranked company -- TODO confirm). Trim both to the common length up
        # front so a mismatch cannot fail after all pages were downloaded.
        n_rows = min(len(df), len(links))
        df = df.iloc[:n_rows].copy()
        links = links[:n_rows]

        coords = []
        kws = []
        cms = []
        for i, link in enumerate(links):
            print(i)
            # Reuse the session so the connection is kept alive between the
            # detail-page requests.
            detail = c.get(link['href'], timeout=30)
            # Decode to text: repr() of the raw bytes escapes non-ASCII
            # characters, so the 'Cím' marker below could never match.
            # Assumes the pages are UTF-8 encoded -- TODO confirm.
            page = detail.content.decode('utf-8', errors='replace')

            # Coordinates: the comma-separated arguments of "GLatLng(...)".
            _, found, tail = page.partition('GLatLng')
            inner, closed, _ = tail.partition(')')
            # inner[1:] skips the character right after the marker (the
            # opening parenthesis).
            coord = inner[1:].split(',') if found and closed else []

            # Keywords: the content attribute of the keywords meta tag, read
            # up to its closing quote so no stray '"' ends up in the last item.
            _, found, tail = page.partition('<meta name="keywords" content="')
            inner, closed, _ = tail.partition('"')
            kw = [k.strip() for k in inner.split(',')] if found and closed else []

            # Address: the text between the "Cím:" label and the next <br>.
            _, found, tail = page.partition('<b>Cím:</b>')
            inner, closed, _ = tail.partition('<br>')
            cm = inner.strip() if found and closed else ''

            coords.append(coord)
            kws.append(kw)
            cms.append(cm)

    df['coords'] = coords
    df['kws'] = kws
    df['cms'] = cms
    dfs.append(df)
    print(region)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 also-haromszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 felso-haromszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 csikszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 udvarhelyszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 marosszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 gyergyoszek
# Scrape the "arbevetel" top list of every region into `dfs2` (one DataFrame
# per region) and enrich each listed company with the coordinates, keywords
# and address found on its own detail page.
# NOTE(review): the "alkalmazott" loop in this file duplicates this logic;
# consider extracting a shared helper.
dfs2 = []
regions = ['also-haromszek', 'felso-haromszek', 'csikszek',
           'udvarhelyszek', 'marosszek', 'gyergyoszek']
for region in regions:
    url = ('http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/arbevetel/'
           + region + '-2015-2018#oldal')
    with session() as c:
        # Explicit timeout: requests waits indefinitely by default.
        response = c.get(url, timeout=30)
        # Fail with a clear HTTP error instead of an obscure parsing error.
        response.raise_for_status()

        # The second table on the page is used as the ranking. Wrapping the
        # markup in StringIO avoids the pandas deprecation of literal HTML.
        df = pd.read_html(StringIO(response.text))[1]
        # Row 0 supplies the column names; column 0 becomes the index.
        df.columns = [0] + list(df.loc[0])[:-1]
        df = df.loc[2:].set_index(0)
        # Drop the first and last columns, then the last row (positionally,
        # so duplicate index labels cannot duplicate rows).
        df = df[df.columns[1:-1]]
        df = df.iloc[:-1].copy()
        df['region'] = region
        df['nr'] = df.index

        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # is installed; pin one once the intended parser is confirmed.
        soup = BeautifulSoup(response.content)
        links = soup.find_all('table')[3].find_all('a')

        # Rows and links are paired by position (presumably one link per
        # ranked company -- TODO confirm). Trim both to the common length up
        # front so a mismatch cannot fail after all pages were downloaded.
        n_rows = min(len(df), len(links))
        df = df.iloc[:n_rows].copy()
        links = links[:n_rows]

        coords = []
        kws = []
        cms = []
        for i, link in enumerate(links):
            print(i)
            # Reuse the session so the connection is kept alive between the
            # detail-page requests.
            detail = c.get(link['href'], timeout=30)
            # Decode to text: repr() of the raw bytes escapes non-ASCII
            # characters, so the 'Cím' marker below could never match.
            # Assumes the pages are UTF-8 encoded -- TODO confirm.
            page = detail.content.decode('utf-8', errors='replace')

            # Coordinates: the comma-separated arguments of "GLatLng(...)".
            _, found, tail = page.partition('GLatLng')
            inner, closed, _ = tail.partition(')')
            # inner[1:] skips the character right after the marker (the
            # opening parenthesis).
            coord = inner[1:].split(',') if found and closed else []

            # Keywords: the content attribute of the keywords meta tag, read
            # up to its closing quote so no stray '"' ends up in the last item.
            _, found, tail = page.partition('<meta name="keywords" content="')
            inner, closed, _ = tail.partition('"')
            kw = [k.strip() for k in inner.split(',')] if found and closed else []

            # Address: the text between the "Cím:" label and the next <br>.
            _, found, tail = page.partition('<b>Cím:</b>')
            inner, closed, _ = tail.partition('<br>')
            cm = inner.strip() if found and closed else ''

            coords.append(coord)
            kws.append(kw)
            cms.append(cm)

    df['coords'] = coords
    df['kws'] = kws
    df['cms'] = cms
    dfs2.append(df)
    print(region)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 also-haromszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 felso-haromszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 csikszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 udvarhelyszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 marosszek 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 gyergyoszek
# Give the sixth per-region frame the same column labels as the first one
# before stacking (presumably its header row differs -- verify against the
# scraped page).
dfs[5].columns = dfs[0].columns

# Stack the per-region frames of each list; reset_index turns the old index
# into a regular column.
dfsi = pd.concat(dfs).reset_index()
dfsi2 = pd.concat(dfs2).reset_index()

# Write both combined tables as semicolon-separated CSV files.
for frame, path in ((dfsi, 'dfsi.csv'), (dfsi2, 'dfsi2.csv')):
    frame.to_csv(path, sep=';')