#!/usr/bin/env python # coding: utf-8 # In[52]: import pandas as pd, numpy as np import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: #load CPI cpi=pd.read_html('IPC_8_5_2015.xls',header=0)[0] cpi.columns=['Year']+range(5) cpi=cpi.drop(range(1,5),axis=1)[2:] cpi=cpi.set_index('Year') cpi.head() # In[3]: #load first part of labor data df=pd.read_csv('exportPivot_FOM103A.csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) df.index=df.index.swaplevel(0,1) df=df.unstack() df.columns = df.columns.droplevel() df=df.drop(u' Ani',axis=1) df.columns=range(2001,2009) df.head() # In[4]: #load 2nd part of labor data dg=pd.read_csv('exportPivot_FOM103A(1).csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) dg.index=dg.index.swaplevel(0,1) dg=dg.unstack() dg.columns = dg.columns.droplevel() dg=dg.drop(u' Ani',axis=1) dg.columns=range(1992,2001) dg.head() # In[5]: #merge dataframes dg[range(2001,2009)]=df[range(2001,2009)] dg.head() # In[6]: #load first part of salary data dh=pd.read_csv('exportPivot_FOM106A.csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) dh.index=dh.index.swaplevel(0,1) dh=dh.unstack() dh.columns = dh.columns.droplevel() dh=dh.drop(u' Ani',axis=1)[1:] dh.columns=range(1990,2009) dh.head() # In[7]: #load second part of labor data CAEN2 di=pd.read_csv('exportPivot_FOM103D.csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) di.index=di.index.swaplevel(0,1) di=di.unstack() di.columns = di.columns.droplevel() di=di.drop(u' Ani',axis=1) di.columns=range(2009,2015) di.head() # In[8]: #load second part of salary data CAEN2 dj=pd.read_csv('exportPivot_FOM106E.csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) dj.index=dj.index.swaplevel(0,1) dj=dj.unstack() dj.columns = dj.columns.droplevel() dj=dj.drop(u' Ani',axis=1) dj.columns=range(2009,2015) dj.head() # In[9]: #load third part of salary data CAEN2, for global normalization dk=pd.read_csv('exportPivot_FOM106F.csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) dk.index=dk.index.swaplevel(0,1) dk.index=dk.index.swaplevel(1,2) dk=dk.unstack() dk.columns = dk.columns.droplevel() dk=dk.drop(u' Ani',axis=1) dk.columns=range(2000,2014) dk.head() # In[10]: #renormalize CAENs # In[11]: cc={#CAEN 2 'B INDUSTRIA EXTRACTIVA':'C Industria extractiva', 'N ACTIVITATI DE SERVICII ADMINISTRATIVE SI ACTIVITATI DE SERVICII SUPORT':'L Administratie publica si aparare', 'M ACTIVITATI PROFESIONALE STIINTIFICE SI TEHNICE':'M Invatamant', 'J INFORMATII SI COMUNICATII':'Posta si telecomunicatii', 'S ALTE ACTIVITATI DE SERVICII':'Celelalte activitati ale economiei nationale', 'R ACTIVITATI DE SPECTACOLE CULTURALE SI RECREATIVE':'Celelalte activitati ale economiei nationale', 'INDUSTRIE':'Industrie', 'A AGRICULTURA SILVICULTURA SI PESCUIT': 'A Agricultura vanatoare si silvicultura', 'O ADMINISTRATIE PUBLICA SI APARARE; ASIGURARI SOCIALE DIN SISTEMUL PUBLIC':'L Administratie publica si aparare', 'P INVATAMANT':'M Invatamant', 'F CONSTRUCTII':'F Constructii', 'K INTERMEDIERI FINANCIARE SI ASIGURARI':'J Intermedieri financiare', 'C INDUSTRIA PRELUCRATOARE':'D Industria prelucratoare', 'TOTAL':'Total', 'D PRODUCTIA SI FURNIZAREA DE ENERGIE ELECTRICA SI TERMICA GAZE APA CALDA SI AER CONDITIONAT':'E Energie electrica si termica gaze si apa', 'Q SANATATE SI ASISTENTA SOCIALA':'N Sanatate si asistenta sociala', 'L TRANZACTII IMOBILIARE':'K Tranzactii imobiliare si alte servicii', 'G COMERT CU RIDICATA SI CU AMANUNTUL; REPARAREA AUTOVEHICULELOR SI MOTOCICLETELOR':'G Comert', 'CAEN Rev.2 (activitati ale economiei nationale)':'CAEN Rev.1 (activitati ale economiei nationale - sectiuni)', 'E DISTRIBUTIA APEI; SALUBRITATE GESTIONAREA DESEURILOR ACTIVITATI DE DECONTAMINARE':'E Energie electrica si termica gaze si apa', 'I HOTELURI SI RESTAURANTE':'H Hoteluri si restaurante', 'H TRANSPORT SI DEPOZITARE':'Transport si depozitare', ' B INDUSTRIA EXTRACTIVA':'C Industria extractiva', ' N ACTIVITATI DE SERVICII ADMINISTRATIVE SI ACTIVITATI DE SERVICII SUPORT':'L Administratie publica si aparare', ' M ACTIVITATI PROFESIONALE STIINTIFICE SI TEHNICE':'M Invatamant', ' J INFORMATII SI COMUNICATII':'Posta si telecomunicatii', ' S ALTE ACTIVITATI DE SERVICII':'Celelalte activitati ale economiei nationale', ' R ACTIVITATI DE SPECTACOLE CULTURALE SI RECREATIVE':'Celelalte activitati ale economiei nationale', ' INDUSTRIE':'Industrie', ' A AGRICULTURA SILVICULTURA SI PESCUIT': 'A Agricultura vanatoare si silvicultura', ' O ADMINISTRATIE PUBLICA SI APARARE; ASIGURARI SOCIALE DIN SISTEMUL PUBLIC':'L Administratie publica si aparare', ' P INVATAMANT':'M Invatamant', ' F CONSTRUCTII':'F Constructii', ' K INTERMEDIERI FINANCIARE SI ASIGURARI':'J Intermedieri financiare', ' C INDUSTRIA PRELUCRATOARE':'D Industria prelucratoare', ' TOTAL':'Total', ' D PRODUCTIA SI FURNIZAREA DE ENERGIE ELECTRICA SI TERMICA GAZE APA CALDA SI AER CONDITIONAT':'E Energie electrica si termica gaze si apa', ' Q SANATATE SI ASISTENTA SOCIALA':'N Sanatate si asistenta sociala', ' L TRANZACTII IMOBILIARE':'K Tranzactii imobiliare si alte servicii', ' G COMERT CU RIDICATA SI CU AMANUNTUL; REPARAREA AUTOVEHICULELOR SI MOTOCICLETELOR':'G Comert', ' CAEN Rev.2 (activitati ale economiei nationale)':'CAEN Rev.1 (activitati ale economiei nationale - sectiuni)', ' E DISTRIBUTIA APEI; SALUBRITATE GESTIONAREA DESEURILOR ACTIVITATI DE DECONTAMINARE':'E Energie electrica si termica gaze si apa', ' I HOTELURI SI RESTAURANTE':'H Hoteluri si restaurante', ' H TRANSPORT SI DEPOZITARE':'Transport si depozitare', 'Posta si telecomunicatii':'Posta si telecomunicatii', 'L Administratie publica si aparare':'L Administratie publica si aparare', 'Silvicultura exploatarea forestiera':'A Agricultura vanatoare si silvicultura', 'Industrie':'Industrie', 'I Transport depozitare si comunicatii':'Transport si depozitare', 'Transport si depozitare':'Transport si depozitare', 'B Pescuit si piscicultura':'A Agricultura vanatoare si silvicultura', 'C Industria extractiva':'C Industria extractiva', 'Celelalte activitati ale economiei nationale':'Celelalte activitati ale economiei nationale', 'A Agricultura vanatoare si silvicultura':'A Agricultura vanatoare si silvicultura', 'F Constructii':'F Constructii', 'H Hoteluri si restaurante':'H Hoteluri si restaurante', 'K Tranzactii imobiliare si alte servicii':'K Tranzactii imobiliare si alte servicii', 'N Sanatate si asistenta sociala':'N Sanatate si asistenta sociala', 'G Comert':'G Comert', 'Agricultura vanatoare':'A Agricultura vanatoare si silvicultura', 'CAEN Rev.1 (activitati ale economiei nationale - sectiuni)':'CAEN Rev.1 (activitati ale economiei nationale - sectiuni)', 'D Industria prelucratoare':'D Industria prelucratoare', 'M Invatamant':'M Invatamant', 'Total':'Total', 'J Intermedieri financiare':'J Intermedieri financiare', 'E Energie electrica si termica gaze si apa':'E Energie electrica si termica gaze si apa'} # In[12]: rr={u' Harghita':'HR', u' Covasna':'CV', u' Mures':'MS', u' TOTAL':'RO', u'Total':'RO', u' Regiunea CENTRU':'TR', u' Regiunea VEST':'TR', u' Regiunea NORD-VEST':'TR'} # In[13]: def interpolate(d,years,gfit=1,depth=2,polyorder=1,override=True): #depth * length of interpolation substrings will be taken to the left and right #for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990 #for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5 #for the section 1974-1990 (15 elements) all values (4 elements) will be taken to extrapolate if (gfit>2): print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]' return mydict={} missing_points=[[]] for year in years: if year not in d.keys(): missing_points[-1].append(year) else: missing_points.append([]) for m in missing_points: if m: fit=gfit if ((m[-1]np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only if not override: fit=0 if fit==0: #take average y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))} for i in range(len(m)): mydict[m[i]]=np.mean(y.values()) elif fit==1: #intersector y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))} #print y w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters if (polyorder==1): intersector=w[0]*np.array(m)+w[1] else: intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2] for i in range(len(m)): mydict[m[i]]=max(0,intersector[i]) else: #intersector y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))} #print y w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting) intersector=np.exp(w[1])*np.exp(w[0]*np.array(m)) for i in range(len(m)): mydict[m[i]]=max(0,intersector[i]) #return interpolated points return mydict # In[16]: years=range(1990,2015) # In[17]: #Labor data #CAEN 1 data={} for i in dg.T.iteritems(): if i[0][2] in {u' Harghita',u' Covasna',u' Mures',u' TOTAL',u' Regiunea CENTRU',u' Regiunea VEST',u' Regiunea NORD-VEST'}: if i[0][0] not in data: data[i[0][0]]={} if rr[i[0][2]] not in data[i[0][0]]: data[i[0][0]][rr[i[0][2]]]={} if i[0][1] not in {'Agricultura vanatoare', 'Industrie', 'INDUSTRIE', ' INDUSTRIE'}: #eliminate doublecounting if cc[i[0][1]] not in data[i[0][0]][rr[i[0][2]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]=i[1].astype(float) else: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]=data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]].add(i[1].astype(float),fill_value=0) mydata={} for i in data: if i not in mydata: mydata[i]={} for j in data[i]: if j not in mydata[i]: mydata[i][j]={} for k in data[i][j]: if k not in mydata[i][j]: mydata[i][j][k]={} for y in data[i][j][k].index: if not np.isnan(data[i][j][k].loc[y] ): mydata[i][j][k][y]=data[i][j][k].loc[y] mydata[i][j][k].update(interpolate(mydata[i][j][k],years)) # In[18]: #CAEN 2 data={} for i in di.T.iteritems(): if i[0][2] in {u' Harghita',u' Covasna',u' Mures',u' TOTAL',u' Regiunea CENTRU',u' Regiunea VEST',u' Regiunea NORD-VEST'}: if i[0][0] not in data: data[i[0][0]]={} if rr[i[0][2]] not in data[i[0][0]]: data[i[0][0]][rr[i[0][2]]]={} if i[0][1] not in {'Agricultura vanatoare','Industrie', 'INDUSTRIE', ' INDUSTRIE'}: #eliminate doublecounting if cc[i[0][1]] not in data[i[0][0]][rr[i[0][2]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]=i[1].astype(float) else: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]=data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]].add(i[1].astype(float),fill_value=0) for i in data: if i not in mydata: mydata[i]={} for j in data[i]: if j not in mydata[i]: mydata[i][j]={} for k in data[i][j]: if k not in mydata[i][j]: mydata[i][j][k]={} for y in data[i][j][k].index: if not np.isnan(data[i][j][k].loc[y] ): mydata[i][j][k][y]=data[i][j][k].loc[y] if ((k=='K Tranzactii imobiliare si alte servicii') and (y>2008)): mydata[i][j][k].pop(y) mydata[i][j][k].update(interpolate(mydata[i][j][k],years)) #calculate male for i in {' Feminin'}: mydata[' Masculin']={} for j in mydata[i]: mydata[' Masculin'][j]={} for k in mydata[i][j]: mydata[' Masculin'][j][k]={} for y in mydata[i][j][k]: mydata[' Masculin'][j][k][y]=mydata[' Total'][j][k][y]-mydata[' Feminin'][j][k][y] # In[19]: #Salary data #CAEN 1 data={} for i in dh.T.iteritems(): if i[0][2] in {u' Harghita',u' Covasna',u' Mures',u' TOTAL',u' Regiunea CENTRU',u' Regiunea VEST',u' Regiunea NORD-VEST'}: if i[0][0] not in data: data[i[0][0]]={} if rr[i[0][2]] not in data[i[0][0]]: data[i[0][0]][rr[i[0][2]]]={} if i[0][1] not in {'Agricultura vanatoare','Industrie', 'INDUSTRIE', ' INDUSTRIE'}: #eliminate doublecounting if cc[i[0][1]] not in data[i[0][0]][rr[i[0][2]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]={} for y in i[1].index: if y not in data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y]=[] data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y].append((float)(i[1].loc[y])) mydata2={} for i in data: if i not in mydata2: mydata2[i]={} for j in data[i]: if j not in mydata2[i]: mydata2[i][j]={} for k in data[i][j]: if k not in mydata2[i][j]: mydata2[i][j][k]={} for y in data[i][j][k]: if not np.isnan(np.nanmean(data[i][j][k][y])): mydata2[i][j][k][y]=np.nanmean(data[i][j][k][y]) #mydata2[i][j][k].update(interpolate(mydata2[i][j][k],years)) #CAEN 2 data={} for i in dj.T.iteritems(): if i[0][2] in {u' Harghita',u' Covasna',u' Mures',u' TOTAL',u' Regiunea CENTRU',u' Regiunea VEST',u' Regiunea NORD-VEST'}: if i[0][0] not in data: data[i[0][0]]={} if rr[i[0][2]] not in data[i[0][0]]: data[i[0][0]][rr[i[0][2]]]={} if i[0][1] not in {'Agricultura vanatoare'}: #eliminate doublecounting if cc[i[0][1]] not in data[i[0][0]][rr[i[0][2]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]={} for y in i[1].index: if y not in data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y]=[] data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y].append((float)(i[1].loc[y])) for i in data: if i not in mydata2: mydata2[i]={} for j in data[i]: if j not in mydata2[i]: mydata2[i][j]={} for k in data[i][j]: if k not in mydata2[i][j]: mydata2[i][j][k]={} for y in data[i][j][k]: if not np.isnan(np.nanmean(data[i][j][k][y])): mydata2[i][j][k][y]=np.nanmean(data[i][j][k][y]) #mydata2[i][j][k].update(interpolate(mydata2[i][j][k],years)) # In[20]: #National salary data #CAEN 2 data={} for i in dk.T.iteritems(): if i[0][2] in {u'Total'}: if i[0][0] not in data: data[i[0][0]]={} if rr[i[0][2]] not in data[i[0][0]]: data[i[0][0]][rr[i[0][2]]]={} if i[0][1] not in {'Agricultura vanatoare','Industrie', 'INDUSTRIE', ' INDUSTRIE'}: #eliminate doublecounting if cc[i[0][1]] not in data[i[0][0]][rr[i[0][2]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]={} for y in i[1].index: if y not in data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]]: data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y]=[] data[i[0][0]][rr[i[0][2]]][cc[i[0][1]]][y].append((float)(i[1].loc[y])) mydata3={} for i in data: if i not in mydata3: mydata3[i]={} for j in data[i]: if j not in mydata3[i]: mydata3[i][j]={} for k in data[i][j]: if k not in mydata3[i][j]: mydata3[i][j][k]={} for y in data[i][j][k]: if not np.isnan(np.nanmean(data[i][j][k][y])): mydata3[i][j][k][y]=np.nanmean(data[i][j][k][y]) #mydata2[i][j][k].update(interpolate(mydata2[i][j][k],years)) # In[21]: mydata3[' Total']['RO']['A Agricultura vanatoare si silvicultura'] # In[22]: #calculate gender inequality gii={" Masculin":{}," Feminin":{}} for i in mydata3[' Masculin']['RO']: gii[" Masculin"][i]={} gii[" Feminin"][i]={} for j in mydata3[' Masculin']['RO'][i]: gii[" Masculin"][i][j]=mydata3[' Masculin']['RO'][i][j]/mydata3[' Total']['RO'][i][j] gii[" Feminin"][i][j]=mydata3[' Feminin']['RO'][i][j]/mydata3[' Total']['RO'][i][j] gii[" Masculin"][i].update(interpolate(gii[" Masculin"][i],years)) gii[" Feminin"][i].update(interpolate(gii[" Feminin"][i],years)) #readjust to country-level earnings GII={} for i in gii: #sex if i not in GII: GII[i]={} for j in mydata2[i]: #judet if j not in GII[i]: GII[i][j]={} for k in gii[i]: #indicator if k not in GII[i][j]: GII[i][j][k]={} for y in gii[i][k]: #year try: GII[i][j][k][y]=gii[i][k][y]*mydata2[' Total'][j][k][y] if np.isnan(GII[i][j][k][y]): GII[i][j][k].pop(y) except: pass GII[i][j][k].update(interpolate(GII[i][j][k],years,2,1.5)) # In[23]: #create cpi dict cp={2013:100} for i in cpi.T.iteritems(): cp[(int)(i[0][5:])]=(int)(i[1])/100.0 cp.update(interpolate(cp,years)) # In[24]: cp # In[25]: #create final data DATA={} for i in GII: if i not in DATA: DATA[i]={} for j in GII[i]: if j not in DATA[i]: DATA[i][j]={} for k in GII[i][j]: if k not in DATA[i][j]: DATA[i][j][k]={} for y in GII[i][j][k]: yr=repr(y) DATA[i][j][k][yr]={} DATA[i][j][k][yr]['munka']=mydata[i][j][k][y] salary=GII[i][j][k][y] DATA[i][j][k][yr]['fizu']=salary if y<2005: salary/=10000.0 DATA[i][j][k][yr]['realfizu']=salary*cp[y]/100.0 if k in {'Celelalte activitati ale economiei nationale','N Sanatate si asistenta sociala'}: DATA[i][j][k][yr]['realfizu']=DATA[i][j][k][yr]['fizu'] # In[26]: ff={'Total':u'Összesen', 'H Hoteluri si restaurante':u'Vendéglátóipar', 'C Industria extractiva':u'Asványkincsek, bányászat', 'Posta si telecomunicatii':u'Távközlés', 'L Administratie publica si aparare':u'Közügy, hatóság', 'Celelalte activitati ale economiei nationale':u'Más', 'K Tranzactii imobiliare si alte servicii':u'Ingatlan', 'N Sanatate si asistenta sociala':u'Egészségügy', 'F Constructii':u'Építőipar', 'D Industria prelucratoare':u'Feldolgozóipar', 'M Invatamant':u'Oktatás', 'Industrie':u'Összes ipar', 'G Comert':u'Kereskedelem', 'J Intermedieri financiare':u'Pénzügy', 'E Energie electrica si termica gaze si apa':u'Energiaipar', 'A Agricultura vanatoare si silvicultura':u'Mezőgasdaság', 'Transport si depozitare':u'Szállítás'} # In[27]: #save data import json for county in DATA[' Feminin']: DATA2=[] for i in DATA[' Feminin'][county]: helper={} helper["name"]=ff[i] helper["region"]=ff[i] helper["income"]=[] helper["lifeExpectancy"]=[] helper["population"]=[] helper["avgwage"]=[] for syear in range(1990,2015): year=repr(syear) helper["income"].append([syear,DATA[' Feminin'][county][i][year]['munka']/\ (DATA[' Feminin'][county][i][year]['munka']+DATA[' Masculin'][county][i][year]['munka'])]) helper["lifeExpectancy"].append([syear,DATA[' Feminin'][county][i][year]['realfizu']/\ (DATA[' Masculin'][county][i][year]['realfizu'])]) helper["avgwage"].append([syear,DATA[' Masculin'][county][i][year]['realfizu']]) helper["population"].append([syear,(DATA[' Feminin'][county][i][year]['munka']+\ DATA[' Masculin'][county][i][year]['munka'])*1000]) DATA2.append(helper) file(county+'nations.json','w').write(json.dumps(DATA2)) # In[28]: #szekelyfold DATA2=[] for i in DATA[' Feminin']['HR']: helper={} helper["name"]=ff[i] helper["region"]=ff[i] helper["income"]=[] helper["lifeExpectancy"]=[] helper["population"]=[] helper["avgwage"]=[] for syear in range(1990,2015): year=repr(syear) helper["income"].append([syear,(DATA[' Feminin']['HR'][i][year]['munka']+\ DATA[' Feminin']['CV'][i][year]['munka']+\ DATA[' Feminin']['MS'][i][year]['munka'])/\ (DATA[' Feminin']['HR'][i][year]['munka']+DATA[' Masculin']['HR'][i][year]['munka']+\ DATA[' Feminin']['CV'][i][year]['munka']+DATA[' Masculin']['CV'][i][year]['munka']+\ DATA[' Feminin']['MS'][i][year]['munka']+DATA[' Masculin']['MS'][i][year]['munka'])]) helper["lifeExpectancy"].append([syear,(DATA[' Feminin']['HR'][i][year]['realfizu']+\ DATA[' Feminin']['CV'][i][year]['realfizu']+\ DATA[' Feminin']['MS'][i][year]['realfizu'])/\ (DATA[' Masculin']['HR'][i][year]['realfizu']+\ DATA[' Masculin']['CV'][i][year]['realfizu']+\ DATA[' Masculin']['MS'][i][year]['realfizu'])]) helper["avgwage"].append([syear,(DATA[' Masculin']['HR'][i][year]['realfizu']+\ DATA[' Masculin']['CV'][i][year]['realfizu']+\ DATA[' Masculin']['MS'][i][year]['realfizu'])/3.0]) helper["population"].append([syear,((DATA[' Feminin']['HR'][i][year]['munka']+\ DATA[' Feminin']['CV'][i][year]['munka']+\ DATA[' Feminin']['MS'][i][year]['munka'])+\ (DATA[' Masculin']['HR'][i][year]['munka']+\ DATA[' Masculin']['CV'][i][year]['munka']+\ DATA[' Masculin']['MS'][i][year]['munka']))*1000]) DATA2.append(helper) file('SZFnations.json','w').write(json.dumps(DATA2)) # In[31]: #load first part of labor data df=pd.read_csv('exportPivot_FOM106F(1).csv').reset_index(level=5).reset_index(level=4).drop('level_4',axis=1) df.index=df.index.swaplevel(0,1) df=df.unstack() df.columns = df.columns.droplevel() df=df.drop(u' Ani',axis=1) df.columns=range(2000,2014) df=df.drop(range(2000,2003),axis=1) df=df.astype(float) df.head() # In[74]: dm=df.loc[' Feminin'].loc[u'Total']/df.loc[' Masculin'].loc[u'Total']*100 dm=dm.astype(int) dm.index.name=u'Agazat' dm.to_csv('data.csv') # In[72]: dm.head()