In [1]:

import pandas as pd
import numpy
import json
from collections import defaultdict
import scipy.stats
import math
import pywikibot
from matplotlib.pylab import style
style.use('fivethirtyeight')

%pylab inline
java_min_int = -2147483648

VERBOSE:pywiki:Starting 1 threads...

Populating the interactive namespace from numpy and matplotlib

In [2]:

allrecs = pd.DataFrame.from_dict(json.load(open('helpers/world_cultures_shortcut.json','r')))

In [6]:

#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

retrieved = dict()

def english_label(qid):
    
    if type(qid) is float:
        if math.isnan(qid):
            return None
    #first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        try:
            page = pywikibot.ItemPage(wikidata, qid)
            data = page.get()
            lab = data['labels']['en']
            retrieved[qid] = lab
            return lab
        except (KeyError, pywikibot.exceptions.NoPage):
            retrieved[qid] = qid
            return qid
#Tranforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()

In [7]:

allrecs['citname'] = allrecs['citizenship'].apply(english_label)
allrecs['countryname'] = allrecs['country'].apply(english_label)

VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.

In [8]:

wikidatanames = set(allrecs['citname']).union(set(allrecs['countryname']))

In [9]:

def normname(name):
    name = name.replace('*','')
    try:
        return {'Iran, Islamic Rep.': 'Iran',
                        'Korea, Rep.':'South Korea',
                        'Brunei Darussalam': 'Brunei',
                        'United States':'United States of America',
                        'Slovak Republic':'Slovakia',
                        'China':"People's Republic of China",
                        'People’s Republic of China':"People's Republic of China",
                         'Kyrgyz Republic': 'Kyrgyzstan',
                         'Russian Federation': 'Russia',
                         'Macedonia, FYR': 'Republic of Macedonia',
                         'Lao PDR':'Laos',
                         'Bahamas':'The Bahamas',
                         u'C\xf4te d\u2019Ivoire':u"C\xf4te d'Ivoire",
                         'Côte d’Ivoire':u"C\xf4te d'Ivoire",
                         'Plu. St.. of Bolivia':'Bolivia',
                         'Viet Nam':'Vietnam',
                         'Myanmar':'Burma',
                         'Former Yugoslav Republic of Macedonia':'Macedonia',
                         'Lao People’s Democratic Republic':'Laos',
                         'Bolivarian Republic of Venezuela':'Venezuela',
                        'Republic of Moldova':'Moldova',
                         'Central African Rep.':'Central African Republic',
                        'Syrian Arab Republic':'Syria',
                        'Republic of Tanzania':'Tanzania',
                         'Palestine, State of':'Palestine',
                         'Moldova (Republic of)':'Moldova',
                        'Sao Tome and Principe': u'Sao Tom\xe9 and Pr\xedncipe',
                        "Lao People's Democratic Republic":'Laos',
                        'Venezuela (Bolivarian Republic of)':'Venezuela',
                        'The former Yugoslav Republic of Macedonia':'Macedonia',
                        'Iran (Islamic Republic of)':'Iran',
                        'Congo (Democratic Republic of the)': u'Democratic Republic of the Congo',
                        'Congo':u'Republic of the Congo',
                        'Tanzania (United Republic of)':'Tanzania',
                        'Hong Kong, China (SAR)':"People's Republic of China",
                        'Russian Federation':'Russia',
                        'Korea (Republic of)':'South Korea',
                        'Bolivia (Plurinational State of)':'Bolivia'}[name]
    except KeyError:
        return name

In [10]:

wef = pd.io.html.read_html('http://reports.weforum.org/global-gender-gap-report-2014/rankings/')[0]
wef['Economy'] = wef['Economy'].apply(normname)
wefnames = set(wef['Economy'])

In [11]:

geidirty = pd.io.html.read_html('http://www.socialwatch.org/node/14367')[2]
gei = geidirty.iloc[3:,6:8]
gei.columns = ['Economy', 'Score']
gei = gei.dropna()
gei["Rank"] = gei['Score'].rank(ascending=False).apply(lambda x: int(x))

In [12]:

def country_sigi_extract(text_line):
    '''put the first strings together as name and the first float as the sigi value'''
    economy = ''
    sigi_val = float()
    for w in text_line.split(' '):
        try:
            sigi_val  = float(w)
            break
        except ValueError:
            if economy:
                economy += ' '  + w
            else:
                economy = w
    return economy, sigi_val

ec_sigi = dict([country_sigi_extract(text_line) for text_line in  sigipdftext.split('\n')] )

sigi = pd.DataFrame.from_dict(ec_sigi, orient='index')
sigi['Economy'] = sigi.index
sigi['Economy'] = sigi['Economy'].apply(normname)
sigi['Score'] = 1-sigi[0]
sigi["Rank"] = sigi['Score'].rank(ascending=False).apply(lambda x: int(x))

In [13]:

gdidirty = pd.DataFrame.from_csv('helpers/foreign_indexes/Table_5__Gender-related_development_index.csv')
nar = gdidirty.iloc[1:,:3]
nar.columns = ['Economy', 'Score', 'Rank']

In [14]:

gdi = nar[(nar['Score'] != '..') & (nar['Rank'] != '—') ]

In [15]:

gdi['Score']  = gdi['Score'].apply(lambda x: float(x))
gdi['Rank']  = gdi['Rank'].apply(lambda x: int(x))
gdi.sort('Score')

WARNING: -c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[15]:

	Economy	Score	Rank
HDI ranks
169	Afghanistan	0.602	148
187	Niger	0.714	147
154	Yemen	0.738	146
146	Pakistan	0.750	145
184	Chad	0.762	144
176	Mali	0.771	143
185	Central African Republic	0.776	142
179	Guinea	0.785	141
175	Liberia	0.786	140
183	Sierra Leone	0.799	139
161	Mauritania	0.801	138
120	Iraq	0.802	137
166	Togo	0.803	136
165	Benin	0.822	134
186	Congo (Democratic Republic of the)	0.822	134
129	Morocco	0.828	132
135	India	0.828	132
152	Nigeria	0.839	131
77	Jordan	0.842	130
93	Algeria	0.843	129
75	Iran (Islamic Republic of)	0.847	128
118	Syrian Arab Republic	0.851	127
173	Ethiopia	0.853	126
110	Egypt	0.855	125
163	Senegal	0.864	124
152	Cameroon	0.872	123
128	Timor-Leste	0.875	122
148	Swaziland	0.877	121
178	Mozambique	0.879	120
138	Ghana	0.884	118
...	...	...	...
117	Philippines	0.989	17
10	Denmark	0.989	17
20	France	0.989	17
96	Jamaica	0.989	17
89	Thailand	0.990	14
114	Moldova (Republic of)	0.990	14
14	United Kingdom	0.993	13
87	Armenia	0.994	8
64	Trinidad and Tobago	0.994	8
58	Bulgaria	0.994	8
5	United States	0.995	7
1	Norway	0.997	5
43	Hungary	0.998	4
67	Venezuela (Bolivarian Republic of)	0.999	2
37	Slovakia	1.000	1
49	Argentina	1.001	2
12	Sweden	1.004	6
24	Finland	1.006	8
25	Slovenia	1.006	8
35	Poland	1.010	14
83	Ukraine	1.012	21
50	Uruguay	1.015	25
70	Kazakhstan	1.015	25
53	Belarus	1.021	32
59	Barbados	1.021	32
103	Mongolia	1.021	32
48	Latvia	1.033	52
35	Lithuania	1.036	58
57	Russian Federation	1.038	61
33	Estonia	1.042	70

148 rows × 3 columns

In [19]:

geinames = set(gei['Economy'].apply(normname))
print geinames
unknown = gdinames.difference(wikidatanames)
for uk in unknown:
    print uk

set([u'Canada', u'Turkmenistan', u'Lithuania', u'Cambodia', u'Ethiopia', u'Sri Lanka', u'Swaziland', u'Argentina', u'Bolivia', u'Cameroon', u'Burkina Faso', u'Ghana', u'Saudi Arabia', u'Japan', u'Cape Verde', u'Slovenia', u'Guatemala', u'Bosnia and Herzegovina', u'Jordan', u'Congo, Rep.', u'Spain', u'Liberia', u'Netherlands', u'Pakistan', u'Oman', u'Tanzania', "People's Republic of China", u'Gabon', u'New Zealand', u'Yemen', u'Jamaica', u'Albania', u'United Arab Emirates', u'India', u'Azerbaijan', u'Lesotho', u'Kenya', 'South Korea', u'Tajikistan', u'Turkey', u'Afghanistan', u'Czech Republic', u'Eritrea', u'Mongolia', u'France', u'Rwanda', u'Slovakia', u'Congo, DR', u'Peru', u'Malawi', u'Benin', u'Singapore', u'United States of America', u'Togo', u'Armenia', u'Dominican Republic', u'Ukraine', u'Bahrain', u'Indonesia', u'Finland', u'Mauritius', u'Sweden', u'Belarus', u'Mali', 'Russia', u'Bulgaria', u'Romania', u'Angola', u'Portugal', u'South Africa', u'Nicaragua', u'Qatar', u'Malaysia', u'Austria', 'Vietnam', u'Mozambique', u'Uganda', u'Hungary', u'Niger', u'Brazil', u'Kuwait', u'Panama', u'Costa Rica', u'Luxembourg', u'Ireland', u'Ecuador', u'Bangladesh', 'Brunei', u'Australia', u'Iran', u'Algeria', u'El Salvador', u'Chile', u'Belgium', u'Thailand', u'Haiti', u'Belize', u'Sierra Leone', u'Georgia', u'Denmark', u'Poland', u'Moldova', u'Morocco', u'Namibia', u'Guinea-Bissau', u'Switzerland', u'Chad', u'Estonia', u'Uruguay', u'Equatorial Guinea', u'Lebanon', u'Uzbekistan', u'Djibouti', u'Colombia', u'Burundi', u'Cyprus', u'Madagascar', u'Italy', u'Bhutan', u'Sudan', u'Nepal', u'Malta', u'Maldives', u'Venezuela', u'Israel', u'Iceland', u'Zambia', u'Senegal', u'Papua New Guinea', u'Zimbabwe', u'Germany', u'Gambia', u'Kazakhstan', u'Philippines', u'Mauritania', u'Kyrgyzstan', u'Trinidad and Tobago', u'Latvia', u'Guyana', u'Syria', u"C\xf4te d'Ivoire", u'Honduras', u'Mexico', u'Egypt', u'Lao, PDR', u'Cuba', u'Serbia', u'Comoros', u'United Kingdom', u'Greece', u'Paraguay', u'Croatia', u'Botswana'])

In [18]:

gdinames = set(gdi['Economy'].apply(normname))
unknown = gdinames.difference(wikidatanames)
for uk in unknown:
    print uk

In [20]:

siginames = set(sigi['Economy']) 
unknown = siginames.difference(wikidatanames)
for uk in unknown:
    print uk

In [21]:

def calibrate_rank_corr(foreign_index, short=False):
    '''takes a foreign index and finds the window for which wigi most correlates with it'''
    corr_df = pd.DataFrame(columns=['start_year', 'bios_count', 'spearman', 'spearman_p', 'mannwhitneyu', 'mannwhitneyu_p', 'ranksum', 'ranksum_p'])

    if not short:
        some_modern_history = range(1000, 1800, 100) + range(1800, 1980, 10)
    else:
        some_modern_history = [1900]
    for start_year in some_modern_history:
        modrecs = allrecs[(allrecs['dob'] >= start_year) &(allrecs['dob'] < 1990)]
        cdf = modrecs[['country','citizenship','gender']]

        def combine_economy(row):
            cit = row['citizenship']
            cunt = row['country']
            return cit if cit else cunt
        cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
        edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
        bios_count = len(edf)

        edf['Economy'] = edf['Economy_qid'].apply(english_label)


        country_perc = defaultdict(dict)
        country_groups= edf.groupby(by='Economy')

        for country, group in country_groups:
            nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
            total = group['gender'].count()
            nm_perc = nonmale / float(total)
            country_perc[country]['Economy'] = country #for later on joining
            country_perc[country]['Score'] = nm_perc #for later on joining
            country_perc[country]['total']= total

        wdf = pd.DataFrame.from_dict(country_perc, orient='index')

        wefnames = set(foreign_index['Economy'])

        wdf_matching = wdf[wdf['Economy'].apply(lambda x: x in wefnames)]
        wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(lambda x: int(x))

        rank_compare = foreign_index.join(wdf_matching, on='Economy', how='left', rsuffix='-Wikidata')[['Economy','Rank','Rank-Wikidata','Score','Score-Wikidata']]
        rank_compare['diff'] = rank_compare['Rank'] - rank_compare['Rank-Wikidata']

        spearman_results = scipy.stats.spearmanr(rank_compare[['Rank','Rank-Wikidata']])
        spearman = spearman_results[0]
        spearman_p = spearman_results[1]

        def scale_col(col):
            num = col - min(col)
            denom = max(col) - min(col)
            return num / denom

        rank_compare['Score_norm'] = scale_col(rank_compare['Score'])
        rank_compare['Score_wikidata_norm'] = scale_col(rank_compare['Score-Wikidata'])

        mannwhitneyu, mannwhitneyu_p = scipy.stats.mannwhitneyu(rank_compare['Score_norm'],rank_compare['Score_wikidata_norm'])
        ranksum, ranksum_p = scipy.stats.ranksums(rank_compare['Score_norm'],rank_compare['Score_wikidata_norm'])

        corr_df = corr_df.append(dict(start_year=start_year,
                                                                bios_count=bios_count,
                                                                spearman=spearman,
                                                                spearman_p = spearman_p,
                                                                mannwhitneyu = mannwhitneyu,
                                                                mannwhitneyu_p = mannwhitneyu_p,
                                                                ranksum = ranksum,
                                                                ranksum_p = ranksum_p), ignore_index=True)
        
    return corr_df #todo just return the max spearman

In [22]:

sigi_corr_df = calibrate_rank_corr(sigi, short=False)

WARNING: -c:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING: -c:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

WARNING:py.warnings:-c:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-22-662364be4d25> in <module>()
----> 1 sigi_corr_df = calibrate_rank_corr(sigi, short=False)

<ipython-input-21-4fa05c099ae1> in calibrate_rank_corr(foreign_index, short)
     15             cunt = row['country']
     16             return cit if cit else cunt
---> 17         cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
     18         edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
     19         bios_count = len(edf)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3594                     if reduce is None:
   3595                         reduce = True
-> 3596                     return self._apply_standard(f, axis, reduce=reduce)
   3597             else:
   3598                 return self._apply_broadcast(f, axis)

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
   3646                 labels = self._get_agg_axis(axis)
   3647                 result = lib.reduce(values, func, axis=axis, dummy=dummy,
-> 3648                                     labels=labels)
   3649                 return Series(result, index=labels)
   3650             except Exception:

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.reduce (pandas/lib.c:40234)()

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.Reducer.get_result (pandas/lib.c:30025)()

<ipython-input-21-4fa05c099ae1> in combine_economy(row)
     12 
     13         def combine_economy(row):
---> 14             cit = row['citizenship']
     15             cunt = row['country']
     16             return cit if cit else cunt

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in __getitem__(self, key)
    509             result = self.index.get_value(self, key)
    510 
--> 511             if not np.isscalar(result):
    512                 if is_list_like(result) and not isinstance(result, Series):
    513 

/usr/local/lib/python2.7/dist-packages/numpy/core/numeric.pyc in isscalar(num)
   1932 
   1933     """
-> 1934     if isinstance(num, generic):
   1935         return True
   1936     else:

KeyboardInterrupt:

In [27]:

wef_corr_df = calibrate_rank_corr(wef, short=False)

In [ ]:

gdi_corr_df = calibrate_rank_corr(gdi, short=False)

In [ ]:

gei_corr_df = calibrate_rank_corr(gei, short=False)

In [ ]:

gei_corr_df

In [ ]:

sigi_corr_df

In [ ]:

gdi_corr_df

In [29]:

wef_corr_df

Out[29]:

	start_year	bios_count	spearman	spearman_p	mannwhitneyu	mannwhitneyu_p	ranksum	ranksum_p
0	1000	887006	0.263995	0.001501	6195.0	9.762673e-09	5.616870	1.944474e-08
1	1100	886514	0.265131	0.001429	6193.0	9.600785e-09	5.619760	1.912226e-08
2	1200	885697	0.265446	0.001410	6187.0	9.130603e-09	5.628431	1.818567e-08
3	1300	884571	0.265739	0.001392	6200.0	1.017908e-08	5.609645	2.027420e-08
4	1400	883044	0.263195	0.001553	6197.0	9.927211e-09	5.613980	1.977249e-08
5	1500	879276	0.262088	0.001628	6206.0	1.070155e-08	5.600975	2.131498e-08
6	1600	870495	0.265848	0.001385	6230.0	1.306420e-08	5.566294	2.602143e-08
7	1700	857099	0.265899	0.001382	6252.0	1.566935e-08	5.534503	3.121122e-08
8	1800	815661	0.270712	0.001120	6421.0	6.130025e-08	5.290291	1.221218e-07
9	1810	805811	0.276275	0.000874	6483.0	9.965155e-08	5.200699	1.985407e-07
10	1820	794371	0.277217	0.000838	6564.0	1.858238e-07	5.083650	3.702490e-07
11	1830	781921	0.276718	0.000857	6621.0	2.858085e-07	5.001283	5.694998e-07
12	1840	768581	0.282408	0.000661	6689.0	4.735930e-07	4.903020	9.437414e-07
13	1850	753693	0.290256	0.000458	6753.0	7.553225e-07	4.810538	1.505246e-06
14	1860	736868	0.295216	0.000362	6834.0	1.347622e-06	4.693490	2.685835e-06
15	1870	716540	0.292414	0.000414	6940.0	2.818059e-06	4.540315	5.617012e-06
16	1880	691692	0.298912	0.000302	7110.0	8.776426e-06	4.294659	1.749623e-05
17	1890	660609	0.302099	0.000258	7285.0	2.660525e-05	4.041777	5.304774e-05
18	1900	623915	0.305469	0.000218	7331.0	3.525074e-05	3.975305	7.028916e-05
19	1910	579592	0.312051	0.000157	7367.0	4.380497e-05	3.923283	8.735029e-05
20	1920	534223	0.292366	0.000415	7516.5	1.050574e-04	3.707250	2.095221e-04
21	1930	472919	0.292359	0.000415	7953.0	1.049781e-03	3.076490	2.094533e-03
22	1940	410994	0.273103	0.001008	7938.0	9.759586e-04	3.098166	1.947226e-03
23	1950	331071	0.257789	0.001953	7283.0	2.627829e-05	4.044667	5.239760e-05
24	1960	248180	0.183820	0.028538	7737.0	3.521279e-04	3.388619	7.024563e-04
25	1970	165324	0.138381	0.100519	6806.0	1.104553e-06	4.733951	2.201911e-06

In [ ]:

for df in [gdi_corr_df, sigi_corr_df, wef_corr_df]:
    df.plot(x='start_year',y=['spearman', 'spearman_p'])
    plt.show()

In [87]:

wef_corr_df.to_pickle('opensym/wefdf')

In [2]:

wef_corr_df = pd.read_pickle('opensym/wefdf')

In [9]:

fig, ax = plt.subplots(1, 1, figsize=(6,4))
wef_corr_df.plot(x='start_year',y=['spearman', 'spearman_p'],ax=ax)
ax.set_ylabel('Correlation coefficient')
ax.set_xlabel('Start Year')
ax.legend((r'Spearman $\rho$','Significance $p$'),loc=3)
fig.suptitle('WIGI-GGGI Rank Correlation by Start Year', size=24)
fig.subplots_adjust(top=0.88)
fig.savefig('opensym/spearman_evolution_gggi.png')

In [ ]:

corr_df.plot(x='start_year',y=['mannwhitneyu', 'mannwhitneyu_p'], secondary_y='mannwhitneyu_p')

In [ ]:

corr_df.plot(x='start_year',y=['ranksum', 'ranksum_p'], secondary_y='ranksum_p')

In [ ]:

wdf_matching.sort('Score',ascending=False).head()

In [ ]:

modrecs = allrecs[(allrecs['dob'] >=1890) &(allrecs['dob'] < 1990)]
cdf = modrecs[['country','citizenship','gender']]

def combine_economy(row):
    cit = row['citizenship']
    cunt = row['country']
    return cit if cit else cunt
cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
bios_count = len(edf)

edf['Economy'] = edf['Economy_qid'].apply(english_label)


country_perc = defaultdict(dict)
country_groups= edf.groupby(by='Economy')

for country, group in country_groups:
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country #for later on joining
    country_perc[country]['Score'] = nm_perc #for later on joining
    country_perc[country]['total']= total

wdf = pd.DataFrame.from_dict(country_perc, orient='index')

wdf_matching = wdf[wdf['Economy'].apply(lambda x: x in wefnames)]
wdf_matching['Rank'] = wdf_matching['Score'].rank(ascending=False).apply(lambda x: int(x))

rank_compare = wef.join(wdf_matching, on='Economy', how='left', rsuffix='_wikidata')[['Economy','Rank','Rank_wikidata','Score','Score_wikidata']]
rank_compare['diff'] = rank_compare['Rank'] - rank_compare['Rank_wikidata']

In [ ]:

pd.DataFrame.to_html(formatters=)

In [ ]:

print rank_compare.columns

In [ ]:

rank_compare.columns = ['Country', 'WEF Rank', 'Wikipedia Rank','WEF  Score','Wikipedia Score','Rank Difference']

In [ ]:

rank_compare.sort('WEF Rank').head(10).to_html(index=False,formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:

rank_compare.sort('Wikipedia Rank').to_csv('helpers/foreign_indexes/WIGI_comparison.csv',encoding = 'utf-8', index=False, formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:

wdfc = wdf[wdf['total'] > 30]

In [ ]:

wdfc['Rank'] = wdfc['Score'].rank(ascending=False).apply(lambda x: int(x))

UNDP's Gender-related Development Index (GDI) and the Gender Empowerment Measure (GEM), introduced only in 1995. More recently, three new measures were developed: the Gender Equity Index (GEI) introduced by Social Watch in 2005, the Global Gender Gap Index (GGGI) developed by the World Economic Forum in 2006, and the Social Institutions and Gender Index (SIGI) of the OECD Development Centre from 2007.

In [ ]:

fiveway = wdfc[['Economy','Score','Rank']]
fiveway.index = fiveway['Economy']
for findex, ftext in zip([sigi,gdi,gei,wef], ['SIGI', 'GDI', 'GEI', 'GGGI']):
    findex.index = findex['Economy']
    fiveway = fiveway.join(findex[['Score','Rank']], how='outer', on = "Economy", rsuffix=" {}".format(ftext))

In [ ]:

fiveway.columns = fiveway.columns[:1] + ['Score WIGI','Rank WIGI'] + fiveway.columns[2:]

In [ ]:

fiveway.sort('Rank').to_csv('helpers/foreign_indexes/WIGI_comparison.csv',encoding = 'utf-8', index=False, formatters={'Wikipedia Score':lambda x: '{:0.4f}'.format(float(x))})

In [ ]:

Quite uncorrellated. That means that the data is not good, or that the world economic forum methods have little to do with the percentage of women born in those countries recorded semantically on a historic level. And /rho is high

In [ ]:

#not clean data bad pdf copy-paste BUT I think that the first string and then SIGI number have copied over 
sigipdftext = '''Belgium 0.0016 0.0038 very low 0.0316 very low 0.0824 low 0.0000 very low 0.0000 very low
France 0.0034 0.1002 low 0.0000 very low 0.0828 low 0.0000 very low 0.0000 very low
Slovenia 0.0037 0.0031 very low 0.0891 very low 0.1023 low 0.0000 very low 0.0000 very low
Spain 0.0049 0.0856 low 0.0622 very low 0.1144 low 0.0000 very low 0.0000 very low
Serbia 0.0097 0.1094 low 0.1171 very low 0.1504 medium 0.0000 very low 0.0000 very low
Argentina 0.0107 0.0809 low 0.0148 very low 0.0691 very low 0.2048 low 0.0000 very low
Italy 0.0116 0.0025 very low 0.1029 very low 0.0966 low 0.0000 very low 0.1951 low
Cuba 0.0208 0.2420 medium 0.0871 very low 0.0000 very low 0.0000 very low 0.1951 low
Trinidad and Tobago 0.0236 0.2504 medium 0.1306 very low 0.0000 very low 0.0000 very low 0.1951 low
Czech Republic 0.0283 0.0013 very low 0.0956 very low 0.0855 low 0.0000 very low 0.3539 medium
Bosnia and Herzegovina 0.0333 0.2437 medium 0.0672 very low 0.1497 medium 0.2048 low 0.1951 low
Belarus 0.0336 0.0251 very low 0.3544 medium 0.0599 very low 0.0000 very low 0.1951 low
Mongolia 0.0345 0.0226 very low 0.2584 medium 0.1582 medium 0.2048 low 0.1951 low
Dominican Republic 0.0367 0.3691 medium 0.0958 very low 0.0118 very low 0.0000 very low 0.1951 low
Panama 0.0375 0.2344 low 0.0148 very low 0.0855 low 0.0000 very low 0.3539 medium
Bolivarian Republic of Venezuela 0.0389 0.2456 medium 0.0941 very low 0.0071 very low 0.0000 very low 0.3539 medium
Ecuador 0.0422 0.1374 low 0.3737 medium 0.1037 low 0.2048 low 0.0000 very low
Lithuania 0.0424 0.0013 very low 0.2795 medium 0.0931 low 0.0000 very low 0.3539 medium
Bulgaria 0.0449 0.1504 low 0.3926 medium 0.0988 low 0.0000 very low 0.1951 low
Brazil 0.0458 0.2316 low 0.1226 very low 0.0364 very low 0.1837 low 0.3539 medium
Cambodia 0.0477 0.0684 low 0.2601 medium 0.0000 very low 0.2028 low 0.3539 medium
El Salvador 0.0490 0.1066 low 0.2675 medium 0.1049 low 0.3885 medium 0.0000 very low
Costa Rica 0.0506 0.2513 medium 0.1544 low 0.0121 very low 0.4076 medium 0.0000 very low
Latvia 0.0511 0.0044 very low 0.3466 medium 0.1008 low 0.0000 very low 0.3539 medium
Plu. St.. of Bolivia 0.0579 0.3676 medium 0.3207 medium 0.0987 low 0.2048 low 0.0000 very low
Paraguay 0.0580 0.2880 medium 0.0440 very low 0.0291 very low 0.4076 medium 0.1951 low
South Africa 0.0599 0.0213 very low 0.2164 low 0.2196 medium 0.4076 medium 0.1951 low
Republic of Moldova 0.0664 0.3418 medium 0.2189 low 0.0000 very low 0.2048 low 0.3539 medium
Romania 0.0686 0.1134 low 0.1700 low 0.0994 low 0.0000 very low 0.5399 high
Azerbaijan 0.2403 0.1301 low 0.2057 low 0.8587 very high 0.1837 low 0.6093 high
Armenia 0.2428 0.1910 low 0.1853 low 0.9880 very high 0.2048 low 0.3539 medium
Ethiopia 0.2450 0.2820 medium 0.8662 very high 0.0878 low 0.5913 high 0.1951 low
Albania 0.2476 0.1822 low 0.2596 medium 0.8767 very high 0.4076 medium 0.4505 medium
Ukraine 0.0750 0.0414 very low 0.1517 low 0.2430 high 0.0000 very low 0.5399 high 
Peru 0.0826 0.4053 medium 0.2096 low 0.0284 very low 0.4076 medium 0.1951 low 
Colombia 0.0862 0.1748 low 0.1567 low 0.0663 very low 0.0000 very low 0.6093 high United
Republic of Tanzania 0.2504 0.7166 very high 0.5415 high 0.1746 medium 0.5913 high 0.2554 low
Lesotho 0.0876 0.4266 high 0.4112 medium 0.2116 medium 0.2048 low 0.0000 very low 
Côte d’Ivoire 0.2537 0.4955 high 0.5895 high 0.1858 medium 0.5913 high 0.5399 high
Madagascar 0.1002 0.4889 high 0.3079 medium 0.0000 very low 0.2048 low 0.3539 medium
Turkey 0.1032 0.1585 low 0.1913 low 0.4036 high 0.0000 very low 0.5399 high 
Timor-Leste 0.2550 0.3882 medium 0.5421 high 0.2271 medium 0.5913 high 0.6552 high
Iraq 0.2631 0.7035 very high 0.3347 medium 0.3834 high 0.5913 high 0.4601 medium
Morocco 0.1052 0.4610 high 0.3159 medium 0.1574 medium 0.3885 medium 0.1951 low 
India 0.2650 0.6440 very high 0.3772 medium 0.5415 very high 0.5913 high 0.3539 medium
Thailand 0.1056 0.3770 medium 0.2935 medium 0.1533 medium 0.3885 medium 0.3539 medium 
Benin 0.2780 0.2763 medium 0.4432 high 0.3677 high 0.5913 high 0.7953 very high
Honduras 0.1074 0.3891 medium 0.1044 very low 0.1443 medium 0.3885 medium 0.4505 medium 
Cameroon 0.2803 0.5024 high 0.5333 high 0.2066 medium 0.7869 very high 0.4505 medium high 
Burkina Faso 0.2819 0.5419 high 0.7257 very high 0.1910 medium 0.5913 high 0.4505 medium
Lebanon 0.2897 0.6143 very high 0.2488 medium 0.1639 medium 0.5913 high 0.7953 very high
Namibia 0.1173 0.1709 low 0.3522 medium 0.0668 very low 0.5913 high 0.2812 low 
Kazakhstan 0.1196 0.0282 very low 0.2176 low 0.1126 low 0.4076 medium 0.6093 high 
Myanmar 0.2935 0.4963 high 0.4891 high 0.0000 very low 0.5913 high 0.7953 
Ghana 0.2988 0.3946 medium 0.5491 high 0.3136 high 0.8044 very high 0.5399 high
Pakistan 0.3013 0.6908 very high 0.4127 medium 0.6998 very high 0.4076 medium 0.4505 medium
People’s Republic of China 0.1310 0.2885 medium
Guatemala 0.1318 0.3953 medium
Rwanda 0.1339 0.2618 medium 0.1246 very low 0.5578 very high 0.4076 0.3213 medium 0.4082 medium 0.2566 high 0.2048 0.1392 medium 0.5913
Former Yugoslav Republic of Macedonia 0.1345 0.1803 low 0.3911 
Jamaica 0.1350 0.0031 very low 0.2046 low 
Mozambique 0.1375 0.4181 high 0.3793 medium 
Zimbabwe 0.1392 0.5700 very high 0.3435 medium 0.2951
Tajikistan 0.1393 0.3182 medium 0.4138 medium 0.5075 medium 0.5666 medium 0.2812 low low 0.5399 high high 0.2554 low very high 0.4076 medium 0.0271 very low 0.0000 0.0000 very low 0.4076 high very high 
Jordan 0.3119 0.5274 high 0.3150 medium 0.6790 very high 0.5913 high 0.6093 high
Guinea 0.3206 0.5413 high 0.9515 very high 0.2253 medium 0.3885 medium 0.4505 medium
Afghanistan 0.3224 0.7316 very high 0.5473 high 0.4644 very high 0.5913 high 0.4601 medium
Nepal 0.3229 0.1813 low 0.4083 medium 1.0000 very high 0.5913 high 0.2554 low
Central African Rep. 0.3285 0.5327 high 0.6135 high 0.0071 very low 0.5913 high 0.7953 very high
Bangladesh 0.3900 0.9730 very high 0.3323 medium 0.5831 very high 0.5913 high 0.4505 medium 0.2028 low 0.3539 medium 
Nigeria 0.3911 0.6723 very high 0.4766 high 0.2494 high 0.7626 very high 0.7953 very high
Mauritania 0.3954 0.7556 very high 0.9939 very high 0.1746 medium 0.5913 high 0.1951 low
Gabon 0.4022 0.6457 very high 0.5308 high 0.1746 medium 0.7869 very high 0.8140 very high
Syrian Arab Republic 0.4162 0.6914 very high 0.2598 medium 0.4312 high 0.5913 high 1.0000 very high
Lao People’s Democratic Republic 0.1445 0.2606 medium 0.5321 high 0.0506 very low 0.4076 medium 0.4505 medium
Haiti 0.1466 0.5613 very high 0.5010 high 0.0000 very low 0.2048 low 0.3539 medium
Uzbekistan 0.1475 0.2477 medium 0.2966 medium 0.1884 medium 0.5913 high 0.4505 medium
Indonesia 0.1532 0.5612 very high 0.2511 medium 0.3891 high 0.1837 low 0.4505 medium
Nicaragua 0.1595 0.6303 very high 0.1868 low 0.1082 low 0.3885 medium 0.4505 medium
Kyrgyzstan 0.1598 0.1879 low 0.3771 medium 0.2624 high 0.5913 high 0.4505 medium
Burundi 0.1662 0.5602 very high 0.5055 high 0.1746 medium 0.4076 medium 0.2554 low
Angola 0.1719 0.4599 high 0.5041 high 0.0791 low 0.5913 high 0.1951 low
Philippines 0.1765 0.4929 high 0.2597 medium 0.1392 medium 0.5913 high 0.4505 medium
Togo 0.1860 0.3696 medium 0.5488 high 0.1326 medium 0.5913 high 0.3539 medium
Viet Nam 0.1865 0.3374 medium 0.1857 low 0.4967 very high 0.4076 medium 0.6093 high
Sri Lanka 0.1894 0.4203 high 0.2681 medium 0.1483 medium 0.6207 high 0.5399 high
Democratic Republic of the Congo 0.4276 0.5169 high 0.5338 high 0.0691 very low 0.9582 very high 0.8140 very high
Egypt 0.4280 0.6665 very high 0.7373 very high 0.3741 high 0.5913 high 0.8140 very high
Niger 0.4415 1.0000 very high 0.4059 medium 0.1746 medium 0.5913 high 0.8140 very high
Zambia 0.4489 0.5149 high 0.5624 high 0.1746 medium 1.0000 very high 0.7953 very high
Somalia 0.4594 0.5958 very high 0.9905 very high 0.0891 low 0.7626 very high 0.6093 high
Chad 0.4665 0.9705 very high 0.8185 very high 0.0014 very low 0.5913 high 0.6093 high
Mali 0.5164 0.8309 very high 1.0000 very high 0.3048 high 0.4076 medium 0.7953 very high very high
Gambia 0.5240 0.5131 high 0.8509 very high 0.0000 very low 1.0000 very high 0.7953 
Sudan 0.5550 0.8382 very high 0.9781 very high 0.1426 medium 0.8163 very high 0.6552 high very high 0.3414 high 0.5913 high 1.0000 very high'''

In [ ]:

modrecs = allrecs[(allrecs['dob'] >= 1900) &(allrecs['dob'] < 1990)]
cdf = modrecs[['country','citizenship','gender']]

def combine_economy(row):
    cit = row['citizenship']
    cunt = row['country']
    return cit if cit else cunt
cdf['Economy_qid'] = cdf.apply(combine_economy,axis=1)
edf = cdf[cdf['Economy_qid'].apply(lambda x: x is not None)]
bios_count = len(edf)

edf['Economy'] = edf['Economy_qid'].apply(english_label)


country_perc = defaultdict(dict)
country_groups= edf.groupby(by='Economy')

for country, group in country_groups:
    nonmale = group[group['gender'] != 'Q6581097']['gender'].count()
    total = group['gender'].count()
    nm_perc = nonmale / float(total)
    country_perc[country]['Economy'] = country #for later on joining
    country_perc[country]['Score'] = nm_perc #for later on joining
    country_perc[country]['total']= total

wdf = pd.DataFrame.from_dict(country_perc, orient='index')

In [ ]:

wdf[wdf['total']>100].tail(100)

In [ ]:

wdf.ix["People's Republic of China"]

In [ ]:

#magnus' special format
nonzero = wdf[(wdf['Score'] != 0.0) & (wdf['total']> 100)]
magnusformt = zip(nonzero['Economy'],nonzero['Score'])
json.dump(magnusformt, open('Magnus Gender analysis/wigi_gender.json','w'))

In [ ]:

!less Magnus\ Gender\ analysis/wigi_gender.json

In [ ]: