import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
manual=pd.read_excel('manual_manual.xlsx').set_index('Cégnév')
data=pd.read_excel('data.xlsx').set_index('Cégnév')
sectors=pd.read_excel('valid_manual.xlsx')
data['Sector']=sectors.set_index(0)[1]
data=pd.concat([data,manual])
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
repl={'udvarhelyszek':'Udvarhelyszék',
'csikszek':'Csíkszék',
'gyergyoszek':'Gyergyószék',
'marosszek':'Marosszék',
'also-haromszek':'Alsó-háromszék',
'felso-haromszek':'Felső-háromszék'}
for r in repl:
data=data.replace(r,repl[r])
data.to_excel('export1.xlsx')
d1=data[['Régió', 'Latitude', 'Longitude', 'Sector']]
d1.columns=['Régió', 'Hosszúság', 'Szélesség', 'Iparág']
d1.to_excel('d1.xlsx')
data.columns
Index(['Alkalmazottak száma 2014', 'Alkalmazottak száma 2015', 'Alkalmazottak száma 2016', 'Alkalmazottak száma 2017', 'Cím', 'Kw', 'Latitude', 'Longitude', 'Régió', 'Sector', 'nr_alkalmazottak', 'nr_arbevetel', 'Árbevétel 2014 (RON)', 'Árbevétel 2015 (RON)', 'Árbevétel 2016 (RON)', 'Árbevétel 2017 (RON)'], dtype='object')
ds=[]
ds1=[]
ds2=[]
for i in range(2014,2018):
g='Alkalmazottak száma'
c=g+' '+str(i)
print(c)
df=data[[c]]
df.columns=[g]
df[g]=df[g].fillna('').str.replace(u'\xa0','').str.replace('-','')
df['Év']=i
ds1.append(df)
ds.append(d1.join(df))
g='Árbevétel'
c=g+' '+str(i)+' (RON)'
print(c)
df=data[[c]]
df.columns=[g]
df[g]=df[g].fillna('').str.replace(u'\xa0','').str.replace('-','')
df['Év']=i
ds2.append(df)
Alkalmazottak száma 2014 Árbevétel 2014 (RON) Alkalmazottak száma 2015
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # Remove the CWD from sys.path while we load stuff. C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # This is added back by InteractiveShellApp.init_path() C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Árbevétel 2015 (RON) Alkalmazottak száma 2016 Árbevétel 2016 (RON) Alkalmazottak száma 2017 Árbevétel 2017 (RON)
df=pd.concat(ds,sort=False)
df1=pd.concat(ds1,sort=False)
df2=pd.concat(ds2,sort=False)
df1.to_excel('df1.xlsx')
df2.to_excel('df2.xlsx')
df1=df1.set_index('Év',append=True)
df2=df2.set_index('Év',append=True)
df=df1.join(df2).replace('',np.nan).astype(float)
df['Bevétel']=np.round(df['Árbevétel']/1000000.0,3)
df['Bevétel/Alkalmazott']=np.round(df['Bevétel']/df['Alkalmazottak száma'],3)
data2=df.reset_index().join(d1,on='Cégnév').set_index('Cégnév')
data2.to_excel('export2.xlsx')
data3=df.unstack()
data3.columns=data3.columns.get_level_values(1).astype(str)
data3=data3.reset_index().join(d1,on='Cégnév').set_index('Cégnév')
data3.index.name=None
data3.to_excel('export3.xlsx')
data4=data3.groupby(['Iparág','Régió']).sum()
data4b=data3.groupby(['Régió']).sum()
data4=data4.unstack().T
data4b=data4b.unstack().T
data4['Mind']=data4b
data4=data4.unstack().T
data4.reset_index().to_excel('export4.xlsx')
data4=data3.groupby(['Régió','Iparág']).sum()
data4b=data3.groupby(['Iparág']).sum()
data4=data4.unstack().T
data4b=data4b.unstack().T
data4['Mind']=data4b
data4=data4.unstack().T
data4.reset_index().to_excel('export4b.xlsx')