In [1]:
import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
In [2]:
manual=pd.read_excel('manual_manual.xlsx').set_index('Cégnév')
data=pd.read_excel('data.xlsx').set_index('Cégnév')
sectors=pd.read_excel('valid_manual.xlsx')
In [3]:
data['Sector']=sectors.set_index(0)[1]
In [4]:
data=pd.concat([data,manual])
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  """Entry point for launching an IPython kernel.
In [5]:
repl={'udvarhelyszek':'Udvarhelyszék',
'csikszek':'Csíkszék',
'gyergyoszek':'Gyergyószék',
'marosszek':'Marosszék',
'also-haromszek':'Alsó-háromszék',
'felso-haromszek':'Felső-háromszék'}
for r in repl:
    data=data.replace(r,repl[r])
In [6]:
data.to_excel('export1.xlsx')
In [7]:
d1=data[['Régió', 'Latitude', 'Longitude', 'Sector']]
d1.columns=['Régió', 'Hosszúság', 'Szélesség', 'Iparág']
In [8]:
d1.to_excel('d1.xlsx')
In [9]:
data.columns
Out[9]:
Index(['Alkalmazottak száma 2014', 'Alkalmazottak száma 2015',
       'Alkalmazottak száma 2016', 'Alkalmazottak száma 2017', 'Cím', 'Kw',
       'Latitude', 'Longitude', 'Régió', 'Sector', 'nr_alkalmazottak',
       'nr_arbevetel', 'Árbevétel 2014 (RON)', 'Árbevétel 2015 (RON)',
       'Árbevétel 2016 (RON)', 'Árbevétel 2017 (RON)'],
      dtype='object')
In [11]:
ds=[]
ds1=[]
ds2=[]
for i in range(2014,2018):
    g='Alkalmazottak száma'
    c=g+' '+str(i)
    print(c)
    df=data[[c]]
    df.columns=[g]
    df[g]=df[g].fillna('').str.replace(u'\xa0','').str.replace('-','')
    df['Év']=i
    ds1.append(df)
    ds.append(d1.join(df))
    g='Árbevétel'
    c=g+' '+str(i)+' (RON)'
    print(c)
    df=data[[c]]
    df.columns=[g]
    df[g]=df[g].fillna('').str.replace(u'\xa0','').str.replace('-','')
    df['Év']=i
    ds2.append(df)
Alkalmazottak száma 2014
Árbevétel 2014 (RON)
Alkalmazottak száma 2015
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Árbevétel 2015 (RON)
Alkalmazottak száma 2016
Árbevétel 2016 (RON)
Alkalmazottak száma 2017
Árbevétel 2017 (RON)
In [12]:
df=pd.concat(ds,sort=False)
In [13]:
df1=pd.concat(ds1,sort=False)
df2=pd.concat(ds2,sort=False)
In [14]:
df1.to_excel('df1.xlsx')
df2.to_excel('df2.xlsx')
In [15]:
df1=df1.set_index('Év',append=True)
df2=df2.set_index('Év',append=True)
In [16]:
df=df1.join(df2).replace('',np.nan).astype(float)
In [17]:
df['Bevétel']=np.round(df['Árbevétel']/1000000.0,3)
In [18]:
df['Bevétel/Alkalmazott']=np.round(df['Bevétel']/df['Alkalmazottak száma'],3)
In [19]:
data2=df.reset_index().join(d1,on='Cégnév').set_index('Cégnév')
data2.to_excel('export2.xlsx')
In [20]:
data3=df.unstack()
data3.columns=data3.columns.get_level_values(1).astype(str)
data3=data3.reset_index().join(d1,on='Cégnév').set_index('Cégnév')
data3.index.name=None
data3.to_excel('export3.xlsx')
In [124]:
data4=data3.groupby(['Iparág','Régió']).sum()
data4b=data3.groupby(['Régió']).sum()
data4=data4.unstack().T
data4b=data4b.unstack().T
data4['Mind']=data4b
data4=data4.unstack().T
data4.reset_index().to_excel('export4.xlsx')
In [123]:
data4=data3.groupby(['Régió','Iparág']).sum()
data4b=data3.groupby(['Iparág']).sum()
data4=data4.unstack().T
data4b=data4b.unstack().T
data4['Mind']=data4b
data4=data4.unstack().T
data4.reset_index().to_excel('export4b.xlsx')