#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd, copy # In[2]: #UNDATA dbd 1975-2013 #http://data.un.org/Data.aspx?d=UNHCR&f=indID%3AType-Ref #doanload data in batches, query limited to 50.000 df1=pd.read_csv('1985.csv') #years 1975-1984 df2=pd.read_csv('1995.csv') #years 1985-1994 df3=pd.read_csv('2005.csv') #years 1995-2004 df4=pd.read_csv('2015.csv') #years 2005-2013 # In[ ]: #UNHCR db 2001-2014 #http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671 - select time series df=pd.read_csv('all_data.csv',skiprows=3) # In[30]: #UNHCR db 1951-2014 #http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671 - person of concern df=pd.read_csv('all_data2.csv',skiprows=3) # In[31]: df # In[32]: df.columns=['year','target','source','a','b','c','d','e','f','g','value'] df=df.drop(['a','b','c','d','e','f','g',],axis=1).dropna()#.set_index(['year','target','source'])[:80440] df.head(50) # In[6]: df.set_index(['year','target','source']).loc[2014].loc['Pakistan'] # palestinian data, from world bank # In[7]: import numpy as np def interpolate(d,years,gfit=2,depth=1,polyorder=1,override=True): #depth * length of interpolation substrings will be taken to the left and right #for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990 #for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5 #for the section 1974-1990 (15 elements) all values (4 elements) will be taken to extrapolate if (gfit>2): print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]' return mydict={} missing_points=[[]] for year in years: if year not in d.keys(): missing_points[-1].append(year) else: missing_points.append([]) for m in missing_points: if m: fit=gfit if ((m[-1]np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only if not override: fit=0 if fit==0: #take average y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))} for i in range(len(m)): mydict[m[i]]=np.mean(y.values()) elif fit==1: #intersector y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))} #print y w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters if (polyorder==1): intersector=w[0]*np.array(m)+w[1] else: intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2] for i in range(len(m)): mydict[m[i]]=max(0,intersector[i]) else: #intersector y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))} #print y w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting) intersector=np.exp(w[1])*np.exp(w[0]*np.array(m)) for i in range(len(m)): mydict[m[i]]=max(0,intersector[i]) #return interpolated points return mydict # In[19]: #http://data.worldbank.org/indicator/SM.POP.REFG/countries?display=default dz=pd.read_csv('pal.csv',skiprows=2) dz.columns=['country','cc','a','b']+range(1960,2016) dz=dz.set_index('country').drop(['cc','a','b']+range(1960,1975)+range(2014,2016),axis=1) # In[20]: ccc={'Syrian Arab Republic':'Syria','West Bank and Gaza':'Palestine','Jordan':'Jordan','Lebanon':'Lebanon'} ccr={'Syrian Arab Republic':'Syrian Arab Rep.','West Bank and Gaza':'State of Palestine','Jordan':'Jordan','Lebanon':'Lebanon'} # In[21]: dx=pd.concat([pd.DataFrame(dz.loc['Jordan']).T,pd.DataFrame(dz.loc['Lebanon']).T\ ,pd.DataFrame(dz.loc['Syrian Arab Republic']).T,pd.DataFrame(dz.loc['West Bank and Gaza']).T]) # In[33]: #reset df, without setting index to run this parts palref=['Jordan','Lebanon','Syrian Arab Republic','West Bank and Gaza'] dc=pd.DataFrame(columns=['year','target','source','value']) for k in range(4): hp={} for i in dx.loc[palref[k]].iteritems(): if ~np.isnan(i[1]): hp[i[0]]=i[1] hp.update(interpolate(hp,range(1951,2015))) for y in hp: val=hp[y]-df[((df['target']==ccr[palref[k]])&(df['year']==y))].sum()[3] dc.loc[y+k*1000]=[y,ccc[palref[k]],'Palestine',val] # append plaestine data to main dataframe # In[34]: df=pd.concat([df,dc]) # save data # In[37]: def cc(country): if country in cc2: return cc2[country] else: return country cc2={ 'Bolivia (Plurinational State of)':'Bolivia', 'Micronesia (Federated States of)':'Micronesia', 'Serbia (and Kosovo: S/RES/1244 (1999))':'Serbia & Kosovo', 'The former Yugoslav Rep. of Macedonia':'FYROM', 'Venezuela (Bolivarian Republic of)':'Venezuela', 'Dem. Rep. of the Congo':'DRC', 'Central African Rep.':'CAR', "Dem. People's Rep. of Korea":"North Korea", 'Islamic Rep. of Iran':'Iran', "Lao People's Dem. Rep.":'Lao PDR', 'Papua New Guinea':'PNG', 'Syrian Arab Rep.':'Syria', 'United Rep. of Tanzania':'Tanzania', 'United Arab Emirates':'UAE', 'Antigua and Barbuda':'Antig. & Barb', 'Bosnia and Herzegovina':'Bosnia & Herz.', 'British Virgin Islands':'UK Virgin', 'Brunei Darussalam':'Brunei', 'Hong Kong SAR, China':'Hong Kong', 'Macao SAR, China':'Macao', 'Russian Federation':'Russia', 'Saint Kitts and Nevis':'St. Kitts & Nev.', 'Saint Vincent and the Grenadines':'St. Vinc. & Gren.', 'Sint Maarten (Dutch part)':'St. Maarten', 'State of Palestine':'Palestine', 'Trinidad and Tobago':'Trinid. & Tob.', 'Turks and Caicos Islands':'Turks & Caicos' } # In[38]: data={} countries=set() for i in df.T.iteritems(): year=int(i[1][0]) target=cc(i[1][1]) source=cc(i[1][2]) if target=="Various": target="Other" if source=="Various": source="Other" value=i[1][3] countries.add(target) countries.add(source) if year not in data:data[year]={} if ((source in data[year]) and (target in data[year][source])): data[year][source][target][1]=value else: if target not in data[year]:data[year][target]={} if source not in data[year][target]:data[year][target][source]=[0,0] data[year][target][source][0]=value # In[39]: dk=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2']) c=0 for year in data: for target in data[year]: for source in data[year][target]: dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]] c+=1 print year,'0' # In[40]: dk=dk.set_index(['year','importer1','importer2']) dk.to_csv('datab.csv') # other direction flow # In[ ]: data={} countries=set() for i in df.T.iteritems(): year=int(i[1][2]) target=cc(i[1][0]) source=cc(i[1][1]) if target=="Various": target="Other" if source=="Various": source="Other" value=i[1][3] countries.add(target) countries.add(source) if year not in data:data[year]={} if ((source in data[year]) and (target in data[year][source])): data[year][source][target][1]=value else: if target not in data[year]:data[year][target]={} if source not in data[year][target]:data[year][target][source]=[0,0] data[year][target][source][0]=value dk=pd.DataFrame(columns=['importer2','importer1','year','flow1','flow2']) c=0 for year in data: for target in data[year]: for source in data[year][target]: dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]] c+=1 print year,'0' dk=dk.set_index(['year','importer2','importer1']) dk.to_csv('data.csv') # extra experimentations # In[225]: data2={} countries2=set() for i in df.T.iteritems(): year=int(i[1][2]) target=cc(i[1][0]) source=cc(i[1][1]) if target=="Various": target="Other" if source=="Various": source="Other" value=i[1][3] countries2.add(target) countries2.add(source) if value<10000: target="Other" if year not in data2:data2[year]={} if ((source in data2[year]) and (target in data2[year][source])): data2[year][source][target][1]+=value else: target=cc(i[1][0]) source=cc(i[1][1]) if target=="Various": target="Other" if source=="Various": source="Other" if value<10000: source="Other" if target not in data2[year]:data2[year][target]={} if source not in data2[year][target]:data2[year][target][source]=[0,0] data2[year][target][source][0]+=value # In[226]: dk2=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2']) c=0 for year in data2: for target in data2[year]: for source in data2[year][target]: dk2.loc[c]=[target,source,year,data2[year][target][source][0],data2[year][target][source][1]] c+=1 print year,'1' # In[227]: dk2=dk2.set_index(['year','importer1','importer2']) dk2.to_csv('data2.csv') # In[262]: data3={} for i in df.T.iteritems(): year=int(i[1][2]) target=cc(i[1][0]) source=cc(i[1][1]) if target=="Various": target="Other" if source=="Various": source="Other" value=i[1][3] if value<0: target="Other" if year not in data3:data3[year]={} if ((source in data3[year]) and (target in data3[year][source])): data3[year][source][target][1]+=value else: target=cc(i[1][0]) source=cc(i[1][1]) if target=="Various": target="Other" if source=="Various": source="Other" if value<0: source="Other" if target not in data3[year]:data3[year][target]={} if source not in data3[year][target]:data3[year][target][source]=[0,0] data3[year][target][source][0]+=value # In[327]: #for th in {1000,5000,10000,50000,100000}: for th in {10000,100000}: dk3=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2']) c=0 for year in data3: p=0 dk3.loc[c]=['Other','Other',year,0,0] r=copy.deepcopy(c) c+=1 for target in data3[year]: for source in data3[year][target]: if ((source!='Other') and (target!='Other')): if ((data3[year][target][source][0]>th)or(data3[year][target][source][1]>th)): dk3.loc[c]=[target,source,year,data3[year][target][source][0],data3[year][target][source][1]] else: p+=data3[year][target][source][0]+data3[year][target][source][1] else: dk3.loc[r]['flow1']+=data3[year][target][source][0]+data3[year][target][source][1] c+=1 dk3[(dk3['year']==year)&(dk3['importer2']=='Other')&(dk3['importer1']=='Other')]['flow1']+=p dk3=dk3.set_index(['year','importer1','importer2']) dk3.to_csv(repr(th)+'data3.csv') dk3b=dk3.copy() dk3b.index.names=[u'year', u'importer2', u'importer1'] dk3b.to_csv(repr(th)+'data3b.csv') # In[326]: dk3.loc[1976]