In [1]:
import pandas as pd, copy
In [2]:
#http://data.un.org/Data.aspx?d=UNHCR&f=indID%3AType-Ref
#doanload data in batches, query limited to 50.000
df1=pd.read_csv('1985.csv') #years 1975-1984
df2=pd.read_csv('1995.csv') #years 1985-1994
df3=pd.read_csv('2005.csv') #years 1995-2004
df4=pd.read_csv('2015.csv') #years 2005-2013
In [3]:
df=pd.concat([df4,df3,df2,df1])
df.columns=['target','source','year','value','a','b','c']
df=df.drop(['a','b','c'],axis=1).dropna()#.set_index(['year','target','source'])[:80440]
df.head(50)
Out[3]:
target source year value
0 Afghanistan Iraq 2013 1
1 Afghanistan Islamic Rep. of Iran 2013 36
2 Afghanistan Pakistan 2013 34
3 Afghanistan State of Palestine 2013 1
4 Albania Algeria 2013 0
5 Albania China 2013 12
6 Albania Dem. Rep. of the Congo 2013 5
7 Albania Egypt 2013 3
8 Albania Iraq 2013 5
9 Albania Montenegro 2013 2
10 Albania Peru 2013 1
11 Albania Serbia (and Kosovo: S/RES/1244 (1999)) 2013 54
12 Albania State of Palestine 2013 4
13 Albania Tunisia 2013 0
14 Albania Turkey 2013 4
15 Albania Uzbekistan 2013 3
16 Algeria Afghanistan 2013 2
17 Algeria Angola 2013 4
18 Algeria Burundi 2013 1
19 Algeria Cameroon 2013 10
20 Algeria Central African Rep. 2013 1
21 Algeria Chad 2013 2
22 Algeria Congo 2013 7
23 Algeria Côte d'Ivoire 2013 30
24 Algeria Dem. Rep. of the Congo 2013 43
25 Algeria Eritrea 2013 1
26 Algeria Guinea 2013 1
27 Algeria Iraq 2013 15
28 Algeria Liberia 2013 2
29 Algeria Mali 2013 0
30 Algeria Mauritania 2013 1
31 Algeria Nigeria 2013 2
32 Algeria Somalia 2013 4
33 Algeria State of Palestine 2013 4017
34 Algeria Togo 2013 3
35 Algeria Western Sahara 2013 90000
36 Algeria Yemen 2013 4
37 Angola Algeria 2013 1
38 Angola Burundi 2013 18
39 Angola Central African Rep. 2013 13
40 Angola Chad 2013 195
41 Angola Congo 2013 65
42 Angola Côte d'Ivoire 2013 407
43 Angola Cuba 2013 2
44 Angola Dem. Rep. of the Congo 2013 21104
45 Angola Eritrea 2013 5
46 Angola Guinea 2013 164
47 Angola Guinea-Bissau 2013 5
48 Angola Haiti 2013 2
49 Angola Iraq 2013 2
In [4]:
df.set_index(['year','target','source']).loc[1986].loc['Pakistan']
Out[4]:
value
source
Afghanistan 2878100
Islamic Rep. of Iran 4000

palestinian data, from world bank

In [77]:
import numpy as np
def interpolate(d,years,gfit=2,depth=1,polyorder=1,override=True):
    #depth * length of interpolation substrings will be taken to the left and right
    #for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990
    #for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5
    #for the section 1974-1990 (15 elements) all values  (4 elements) will be taken to extrapolate
    if (gfit>2): 
        print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]'
        return
    mydict={}
    missing_points=[[]]
    for year in years:
        if year not in d.keys():
            missing_points[-1].append(year)
        else:
            missing_points.append([])
    for m in missing_points:
        if m:
            fit=gfit
            if ((m[-1]<np.sort(d.keys())[0])|(m[0]>np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only
                if not override: fit=0
            
            if fit==0: #take average
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))}
                for i in range(len(m)):
                    mydict[m[i]]=np.mean(y.values())
            elif fit==1:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters
                if (polyorder==1):
                    intersector=w[0]*np.array(m)+w[1]
                else:
                    intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2]
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
            else:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting)
                intersector=np.exp(w[1])*np.exp(w[0]*np.array(m))
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
    
    #return interpolated points
    return mydict
In [33]:
#http://data.worldbank.org/indicator/SM.POP.REFG/countries?display=default
dz=pd.read_csv('pal.csv',skiprows=2)
dz.columns=['country','cc','a','b']+range(1960,2016)
dz=dz.set_index('country').drop(['cc','a','b']+range(1960,1975)+range(2014,2016),axis=1)
In [110]:
ccc={'Syrian Arab Republic':'Syria','West Bank and Gaza':'Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
ccr={'Syrian Arab Republic':'Syrian Arab Rep.','West Bank and Gaza':'State of Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
In [48]:
dx=pd.concat([pd.DataFrame(dz.loc['Jordan']).T,pd.DataFrame(dz.loc['Lebanon']).T\
              ,pd.DataFrame(dz.loc['Syrian Arab Republic']).T,pd.DataFrame(dz.loc['West Bank and Gaza']).T])
In [112]:
palref=['Jordan','Lebanon','Syrian Arab Republic','West Bank and Gaza']
dc=pd.DataFrame(columns=['target','source','year','value'])
    
for k in range(4):
    hp={}
    for i in dx.loc[palref[k]].iteritems():
        if ~np.isnan(i[1]):
            hp[i[0]]=i[1]
    hp.update(interpolate(hp,range(1975,2014)))
    for y in hp:
        val=hp[y]-df[((df['target']==ccr[palref[k]])&(df['year']==y))].sum()[3]
        dc.loc[y+k*1000]=[ccc[palref[k]],'Palestine',y,val]

append plaestine data to main dataframe

In [115]:
df=pd.concat([df,dc])

save data

In [116]:
def cc(country):
    if country in cc2:
        return cc2[country]
    else: return country
cc2={
'Bolivia (Plurinational State of)':'Bolivia',
'Micronesia (Federated States of)':'Micronesia',
'Serbia (and Kosovo: S/RES/1244 (1999))':'Serbia & Kosovo',
'The former Yugoslav Rep. of Macedonia':'FYROM',
'Venezuela (Bolivarian Republic of)':'Venezuela',
'Dem. Rep. of the Congo':'DRC',
'Central African Rep.':'CAR',
"Dem. People's Rep. of Korea":"North Korea",
'Islamic Rep. of Iran':'Iran',
"Lao People's Dem. Rep.":'Lao PDR',
'Papua New Guinea':'PNG',
'Syrian Arab Rep.':'Syria',
'United Rep. of Tanzania':'Tanzania',
'United Arab Emirates':'UAE',
'Antigua and Barbuda':'Antig. & Barb',
'Bosnia and Herzegovina':'Bosnia & Herz.',
'British Virgin Islands':'UK Virgin',
'Brunei Darussalam':'Brunei',
'Hong Kong SAR, China':'Hong Kong',
'Macao SAR, China':'Macao',
'Russian Federation':'Russia',
'Saint Kitts and Nevis':'St. Kitts & Nev.',
'Saint Vincent and the Grenadines':'St. Vinc. & Gren.',
'Sint Maarten (Dutch part)':'St. Maarten',
'State of Palestine':'Palestine',
'Trinidad and Tobago':'Trinid. & Tob.',
'Turks and Caicos Islands':'Turks & Caicos'
}
In [117]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value  
In [118]:
dk=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'
1975 0
1976 0
1977 0
1978 0
1979 0
1980 0
1981 0
1982 0
1983 0
1984 0
1985 0
1986 0
1987 0
1988 0
1989 0
1990 0
1991 0
1992 0
1993 0
1994 0
1995 0
1996 0
1997 0
1998 0
1999 0
2000 0
2001 0
2002 0
2003 0
2004 0
2005 0
2006 0
2007 0
2008 0
2009 0
2010 0
2011 0
2012 0
2013 0
In [119]:
dk=dk.set_index(['year','importer1','importer2'])
dk.to_csv('data.csv')

other direction flow

In [ ]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value  
dk=pd.DataFrame(columns=['importer2','importer1','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'
dk=dk.set_index(['year','importer2','importer1'])
dk.to_csv('data.csv')

extra experimentations

In [225]:
data2={}
countries2=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries2.add(target)
    countries2.add(source)
    if value<10000:
        target="Other"
    if year not in data2:data2[year]={}
    if ((source in data2[year]) and (target in data2[year][source])):
        data2[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<10000:
            source="Other"
        if target not in data2[year]:data2[year][target]={}
        if source not in data2[year][target]:data2[year][target][source]=[0,0]
        
        data2[year][target][source][0]+=value  
In [226]:
dk2=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data2:
    for target in data2[year]:
        for source in data2[year][target]:
            dk2.loc[c]=[target,source,year,data2[year][target][source][0],data2[year][target][source][1]]
            c+=1
    print year,'1'
1975 1
1976 1
1977 1
1978 1
1979 1
1980 1
1981 1
1982 1
1983 1
1984 1
1985 1
1986 1
1987 1
1988 1
1989 1
1990 1
1991 1
1992 1
1993 1
1994 1
1995 1
1996 1
1997 1
1998 1
1999 1
2000 1
2001 1
2002 1
2003 1
2004 1
2005 1
2006 1
2007 1
2008 1
2009 1
2010 1
2011 1
2012 1
2013 1
In [227]:
dk2=dk2.set_index(['year','importer1','importer2'])
dk2.to_csv('data2.csv')
In [262]:
data3={}
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    if value<0:
        target="Other"
    if year not in data3:data3[year]={}
    if ((source in data3[year]) and (target in data3[year][source])):
        data3[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<0:
            source="Other"
        if target not in data3[year]:data3[year][target]={}
        if source not in data3[year][target]:data3[year][target][source]=[0,0]
        
        data3[year][target][source][0]+=value  
In [327]:
#for th in {1000,5000,10000,50000,100000}:
for th in {10000,100000}:    
    dk3=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
    c=0    
    for year in data3:
        p=0
        dk3.loc[c]=['Other','Other',year,0,0]
        r=copy.deepcopy(c)
        c+=1
        for target in data3[year]:
            for source in data3[year][target]:
                if ((source!='Other') and (target!='Other')):
                    if ((data3[year][target][source][0]>th)or(data3[year][target][source][1]>th)):
                        dk3.loc[c]=[target,source,year,data3[year][target][source][0],data3[year][target][source][1]]
                    else: p+=data3[year][target][source][0]+data3[year][target][source][1]
                else: dk3.loc[r]['flow1']+=data3[year][target][source][0]+data3[year][target][source][1]
                c+=1
        dk3[(dk3['year']==year)&(dk3['importer2']=='Other')&(dk3['importer1']=='Other')]['flow1']+=p
    dk3=dk3.set_index(['year','importer1','importer2'])
    dk3.to_csv(repr(th)+'data3.csv')
    dk3b=dk3.copy()
    dk3b.index.names=[u'year', u'importer2', u'importer1']
    dk3b.to_csv(repr(th)+'data3b.csv')
In [326]:
dk3.loc[1976]
Out[326]:
flow1 flow2
importer1 importer2
Other Other 2472611 0
Ethiopia Sudan 10600 105000
Tanzania Burundi 126600 0
Rwanda 23600 0
DRC Burundi 18220 0
Rwanda 24500 0
Cameroon Equatorial Guinea 30000 0
Algeria Western Sahara 35000 0
Zambia Angola 27730 0
Senegal Guinea-Bissau 37000 0
Thailand Lao PDR 62720 0
Cambodia 17090 0
Gabon Equatorial Guinea 60000 0
Angola DRC 5000 471340
Côte d'Ivoire Guinea 500000 0
Mozambique Zimbabwe 30000 0
Uganda DRC 34230 0
Rwanda 78480 0
Burundi Rwanda 49500 7500