#!/usr/bin/env python # coding: utf-8 # In[1]: import requests, pandas as pd, numpy as np from requests import session from bs4 import BeautifulSoup # In[2]: dfsi=pd.read_csv('dfsi.csv',sep=';') # In[3]: dfsi2=pd.read_csv('dfsi2.csv',sep=';') # In[4]: dfsi['nr_alkalmazottak']=dfsi['nr'] dfsi2['nr_arbevetel']=dfsi2['nr'] # In[5]: data=dfsi.set_index('Cégnév').join(dfsi2.set_index('Cégnév'),how='outer',lsuffix='_left', rsuffix='_right') # In[6]: data['Cím']=data[['cms_right','cms_left']].T.ffill().bfill().T['cms_right'] data['Koord']=data[['coords_right','coords_left']].T.ffill().bfill().T['coords_right'] data['Kw']=data[['kws_right','kws_left']].T.ffill().bfill().T['kws_right'] data['Régió']=data[['region_right','region_left']].T.ffill().bfill().T['region_right'] # In[7]: data=data.drop(['0_left', 'region_left','nr_left', 'coords_left', 'kws_left', 'cms_left', '0_right', 'region_right', 'nr_right', 'coords_right', 'kws_right', 'cms_right', 'Unnamed: 0_left','Unnamed: 0_right'],axis=1) # In[8]: lat=[] lon=[] for i in range(len(data.index)): k=data.loc[data.index[i]]['Koord'].replace("'",'').replace('[','')\ .replace(']','').replace(' ','').split(',') lon.append(k[0]) lat.append(k[1]) # In[9]: data['Latitude']=lat data['Longitude']=lon # In[10]: data=data.drop('Koord',axis=1) # In[11]: data # In[42]: manual=data[data['Longitude']==''] data=data[data['Longitude']!=''] # In[123]: data.to_excel('data.xlsx') manual.to_excel('manual.xlsx') # In[111]: sectors=pd.read_excel('sectors.xlsx') sectormap={} repl={'á':'a','é':'e','í':'i','ó':'o','ú':'u','ü':'u','ű':'u','ő':'o','ö':'o'} for s in sectors.columns: for k in sectors[s].values: for j in str(k).replace(' ','').split(','): sectormap[j]=s for c in repl: j=j.replace(c,repl[c]) sectormap[j]=s # In[120]: valid=[] kws=[] cont=True ki=-1 for i in range(len(data.index)): ks=data.loc[data.index[i]]['Kw'].replace("'",'').replace('[','')\ .replace(']','').replace(' ','').lower().split(',')[:-2] for k in ks: if k in sectormap: kws.append(sectormap[k]) break k=k.replace('-','') if k in sectormap: kws.append(sectormap[k]) break if len(kws)