import pandas as pd
import numpy
import json
from collections import defaultdict
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['full'] `%matplotlib` prevents importing * from pylab and numpy
# Load the 2014-10-13 snapshot; any cell equal to java_min_int is read as missing (NaN).
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Return the first value of a pipe-delimited cell.

    Parameters
    ----------
    q_str : str or float
        Raw cell content such as ``'Q30|Q145|'``, or NaN for a missing cell.

    Returns
    -------
    str, float or None
        The text before the first ``'|'`` for strings; the NaN itself for a
        missing cell; ``None`` for anything else (including non-NaN floats).
    """
    # isinstance (not ``type(...) is``) so numpy.float64 NaNs are recognised
    # as missing too instead of silently becoming None.
    if isinstance(q_str, float):
        return q_str if numpy.isnan(q_str) else None
    if isinstance(q_str, str):
        # the format always ends with a '|', so the first piece is the value
        return q_str.split('|')[0]
    return None
# Reduce each pipe-delimited column to its first value via split_column.
for col in ['place_of_birth','gender', 'citizenship','ethnic_group']:
allrecs[col] = allrecs[col].apply(split_column)
# Preview the first five cleaned rows.
allrecs.head(5)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | Q6581097 | NaN | Q30 | Q494413 | zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... |
1 | Q42 | 1952 | 2001 | Q6581097 | NaN | Q145 | Q350 | zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... |
2 | Q207 | 1946 | NaN | Q6581097 | NaN | Q30 | Q49145 | uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p... |
3 | Q297 | NaN | 1660 | Q6581097 | NaN | Q29 | Q8717 | zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi... |
4 | Q326 | 1942 | NaN | Q6581097 | NaN | Q298 | Q2887 | zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi... |
#todo what about mechanical maps
def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, 'r') as handle:
        return json.load(handle)


pobs_map = _load_json('helpers/aggregation_maps/pobs_map.json')
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# the deprecated DataFrame.from_csv and keeps its defaults.
country_map = pd.read_csv('helpers/aggregation_maps/country_maps.csv',
                          index_col=0, parse_dates=True)
ethnic_group_map = _load_json('helpers/aggregation_maps/mechanical_turk/ethnic_groups_map.json')
citizenship_map = _load_json('helpers/aggregation_maps/mechanical_turk/citizenship_map.json')
def map_pob(qid):
    """Map a place-of-birth QID to a culture name.

    Looks the QID up in ``pobs_map`` to get a list of countries, takes the
    first country (an assumption made by the original author) and returns
    that country's ``'culture_name'`` from ``country_map``.

    Parameters
    ----------
    qid : str or other
        Place-of-birth identifier; anything that is not a string (for
        example NaN or None) is treated as missing.

    Returns
    -------
    The culture name, or ``None`` when *qid* is missing or maps to no country.

    Raises
    ------
    KeyError
        If *qid* is absent from ``pobs_map`` or the country is absent from
        the index of ``country_map``.
    """
    if not isinstance(qid, str):
        return None
    country_list = pobs_map[qid]
    if not country_list:
        return None
    country = country_list[0]  # assumption: the first listed country is the one to use
    # .loc is the label-based replacement for the removed .ix indexer
    # (presumes country labels are not integers - TODO confirm).
    return country_map.loc[country, 'culture_name']
def map_wrapper(m):
    """Build a lookup function over mapping *m* that never raises KeyError.

    The returned callable takes a key and gives back ``m[key]``, or ``None``
    when the key is not present.
    """
    def lookup(key):
        try:
            value = m[key]
        except KeyError:
            value = None
        return value

    return lookup
# Rows whose columns map to different cultures are collected here.
mismatch = pd.DataFrame()
#order is important because it determines the preference we will use
# Materialised as a list: determine_culture iterates these pairs once per row,
# and a bare zip() is a one-shot iterator on Python 3.
col_map_fun = list(zip(
    ['ethnic_group', 'citizenship', 'place_of_birth'],
    [map_wrapper(ethnic_group_map), map_wrapper(citizenship_map), map_pob]))
def determine_culture(row):
    """Derive a lower-cased culture label for one record.

    Each ``(column, mapping function)`` pair in ``col_map_fun`` is tried in
    order; every truthy guess replaces the previous one, so the last column
    that yields a guess decides the result. Whenever a new guess disagrees
    with the culture found so far, the row is added to the module-level
    ``mismatch`` frame.

    Parameters
    ----------
    row : pandas.Series
        One record, indexable by the column names listed in ``col_map_fun``.

    Returns
    -------
    str or None
        The culture in lower case, or the falsy value itself (normally
        ``None``) when no column produced a guess.
    """
    # DataFrame.append is not in-place, so the original call discarded its
    # result and ``mismatch`` stayed empty; rebind the global instead.
    global mismatch
    culture = None
    for col, map_fun in col_map_fun:
        guess = map_fun(row[col])
        if culture is not None and guess is not None and culture != guess:
            mismatch = pd.concat([mismatch, pd.DataFrame([row])],
                                 ignore_index=True)
        if guess:
            culture = guess
    return str(culture).lower() if culture else culture  # to return None properly
%%timeit -r 1 -n 1
# Benchmark: one run of determine_culture over the first 2,500 rows.
allrecs.iloc[0:2500].apply(lambda x: determine_culture(x), axis=1)
1 loops, best of 1: 1.77 s per loop
%%timeit -r 1 -n 1
# Benchmark: same measurement over the first 25,000 rows.
allrecs.iloc[0:25000].apply(lambda x: determine_culture(x), axis=1)
1 loops, best of 1: 17 s per loop
# Label every record with its culture; the function is passed directly, no lambda wrapper needed.
allrecs['culture'] = allrecs.apply(determine_culture, axis=1)
# Call form of print works on both Python 2 and Python 3.
print(mismatch)
Empty DataFrame Columns: [] Index: []
# Write the labelled records to a JSON shortcut file and load them back from it.
allrecs.to_json('helpers/world_cultures_shortcut.json')
# Context manager so the file handle is closed once the JSON is parsed.
with open('helpers/world_cultures_shortcut.json', 'r') as shortcut_file:
    allrecs = pd.DataFrame.from_dict(json.load(shortcut_file))
import scipy.stats
# Spearman rank correlation between the 'Rank' and 'Rank_wikidata' columns.
# NOTE(review): rank_compare is not defined in the visible part of the file.
scipy.stats.spearmanr(rank_compare[['Rank','Rank_wikidata']])
(0.09637690726400637, 0.25388210576052661)
# Mann-Whitney U test on the same two rank columns.
scipy.stats.mannwhitneyu(rank_compare['Rank'],rank_compare['Rank_wikidata'])
(10078.0, 0.49798226262171613)
# Wilcoxon rank-sum test on the same two rank columns.
scipy.stats.ranksums(rank_compare['Rank'],rank_compare['Rank_wikidata'])
(0.0057801597300572065, 0.99538812547307132)
# Emit the rank comparison table as HTML.
print rank_compare.to_html()
<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>Economy</th> <th>Rank</th> <th>Rank_wikidata</th> <th>diff</th> <th>abs_diff</th> </tr> </thead> <tbody> <tr> <th>0 </th> <td> Iceland</td> <td> 1</td> <td> 73</td> <td> -72</td> <td> 72</td> </tr> <tr> <th>1 </th> <td> Finland</td> <td> 2</td> <td> 49</td> <td> -47</td> <td> 47</td> </tr> <tr> <th>2 </th> <td> Norway</td> <td> 3</td> <td> 58</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>3 </th> <td> Sweden</td> <td> 4</td> <td> 32</td> <td> -28</td> <td> 28</td> </tr> <tr> <th>4 </th> <td> Denmark</td> <td> 5</td> <td> 59</td> <td> -54</td> <td> 54</td> </tr> <tr> <th>5 </th> <td> Nicaragua</td> <td> 6</td> <td> 25</td> <td> -19</td> <td> 19</td> </tr> <tr> <th>6 </th> <td> Rwanda</td> <td> 7</td> <td> 67</td> <td> -60</td> <td> 60</td> </tr> <tr> <th>7 </th> <td> Ireland</td> <td> 8</td> <td> 77</td> <td> -69</td> <td> 69</td> </tr> <tr> <th>8 </th> <td> Philippines</td> <td> 9</td> <td> 2</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>9 </th> <td> Belgium</td> <td> 10</td> <td> 98</td> <td> -88</td> <td> 88</td> </tr> <tr> <th>10 </th> <td> Switzerland</td> <td> 11</td> <td> 116</td> <td>-105</td> <td> 105</td> </tr> <tr> <th>11 </th> <td> Germany</td> <td> 12</td> <td> 117</td> <td>-105</td> <td> 105</td> </tr> <tr> <th>12 </th> <td> New Zealand</td> <td> 13</td> <td> 38</td> <td> -25</td> <td> 25</td> </tr> <tr> <th>13 </th> <td> Netherlands</td> <td> 14</td> <td> 72</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>14 </th> <td> Latvia</td> <td> 15</td> <td> 60</td> <td> -45</td> <td> 45</td> </tr> <tr> <th>15 </th> <td> France</td> <td> 16</td> <td> 96</td> <td> -80</td> <td> 80</td> </tr> <tr> <th>16 </th> <td> Burundi</td> <td> 17</td> <td> 141</td> <td>-124</td> <td> 124</td> </tr> <tr> <th>17 </th> <td> South Africa</td> <td> 18</td> <td> 103</td> <td> -85</td> <td> 85</td> </tr> <tr> <th>18 </th> <td> Canada</td> <td> 19</td> <td> 23</td> <td> -4</td> <td> 4</td> </tr> 
<tr> <th>19 </th> <td> United States of America</td> <td> 20</td> <td> 31</td> <td> -11</td> <td> 11</td> </tr> <tr> <th>20 </th> <td> Ecuador</td> <td> 21</td> <td> 109</td> <td> -88</td> <td> 88</td> </tr> <tr> <th>21 </th> <td> Bulgaria</td> <td> 22</td> <td> 53</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>22 </th> <td> Slovenia</td> <td> 23</td> <td> 78</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>23 </th> <td> Australia</td> <td> 24</td> <td> 18</td> <td> 6</td> <td> 6</td> </tr> <tr> <th>24 </th> <td> Moldova</td> <td> 25</td> <td> 68</td> <td> -43</td> <td> 43</td> </tr> <tr> <th>25 </th> <td> United Kingdom</td> <td> 26</td> <td> 42</td> <td> -16</td> <td> 16</td> </tr> <tr> <th>26 </th> <td> Mozambique</td> <td> 27</td> <td> 61</td> <td> -34</td> <td> 34</td> </tr> <tr> <th>27 </th> <td> Luxembourg</td> <td> 28</td> <td> 107</td> <td> -79</td> <td> 79</td> </tr> <tr> <th>28 </th> <td> Spain</td> <td> 29</td> <td> 88</td> <td> -59</td> <td> 59</td> </tr> <tr> <th>29 </th> <td> Cuba</td> <td> 30</td> <td> 26</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>30 </th> <td> Argentina</td> <td> 31</td> <td> 102</td> <td> -71</td> <td> 71</td> </tr> <tr> <th>31 </th> <td> Belarus</td> <td> 32</td> <td> 70</td> <td> -38</td> <td> 38</td> </tr> <tr> <th>32 </th> <td> Barbados</td> <td> 33</td> <td> 55</td> <td> -22</td> <td> 22</td> </tr> <tr> <th>33 </th> <td> Malawi</td> <td> 34</td> <td> 92</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>34 </th> <td> The Bahamas</td> <td> 35</td> <td> 36</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>35 </th> <td> Austria</td> <td> 36</td> <td> 82</td> <td> -46</td> <td> 46</td> </tr> <tr> <th>36 </th> <td> Kenya</td> <td> 37</td> <td> 9</td> <td> 28</td> <td> 28</td> </tr> <tr> <th>37 </th> <td> Lesotho</td> <td> 38</td> <td> 43</td> <td> -5</td> <td> 5</td> </tr> <tr> <th>38 </th> <td> Portugal</td> <td> 39</td> <td> 95</td> <td> -56</td> <td> 56</td> </tr> <tr> <th>39 </th> <td> Namibia</td> <td> 40</td> <td> 112</td> <td> 
-72</td> <td> 72</td> </tr> <tr> <th>40 </th> <td> Madagascar</td> <td> 41</td> <td> 99</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>41 </th> <td> Mongolia</td> <td> 42</td> <td> 71</td> <td> -29</td> <td> 29</td> </tr> <tr> <th>42 </th> <td> Kazakhstan</td> <td> 43</td> <td> 44</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>43 </th> <td> Lithuania</td> <td> 44</td> <td> 65</td> <td> -21</td> <td> 21</td> </tr> <tr> <th>44 </th> <td> Peru</td> <td> 45</td> <td> 97</td> <td> -52</td> <td> 52</td> </tr> <tr> <th>45 </th> <td> Panama</td> <td> 46</td> <td> 39</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>46 </th> <td> Tanzania</td> <td> 47</td> <td> 16</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>47 </th> <td> Costa Rica</td> <td> 48</td> <td> 129</td> <td> -81</td> <td> 81</td> </tr> <tr> <th>48 </th> <td> Trinidad and Tobago</td> <td> 49</td> <td> 24</td> <td> 25</td> <td> 25</td> </tr> <tr> <th>49 </th> <td> Cape Verde</td> <td> 50</td> <td> 136</td> <td> -86</td> <td> 86</td> </tr> <tr> <th>50 </th> <td> Botswana</td> <td> 51</td> <td> 46</td> <td> 5</td> <td> 5</td> </tr> <tr> <th>51 </th> <td> Jamaica</td> <td> 52</td> <td> 21</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>52 </th> <td> Colombia</td> <td> 53</td> <td> 63</td> <td> -10</td> <td> 10</td> </tr> <tr> <th>53 </th> <td> Serbia</td> <td> 54</td> <td> 62</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>54 </th> <td> Croatia</td> <td> 55</td> <td> 86</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>55 </th> <td> Ukraine</td> <td> 56</td> <td> 79</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>56 </th> <td> Poland</td> <td> 57</td> <td> 84</td> <td> -27</td> <td> 27</td> </tr> <tr> <th>57 </th> <td> Bolivia</td> <td> 58</td> <td> 128</td> <td> -70</td> <td> 70</td> </tr> <tr> <th>58 </th> <td> Singapore</td> <td> 59</td> <td> 8</td> <td> 51</td> <td> 51</td> </tr> <tr> <th>59 </th> <td> Laos</td> <td> 60</td> <td> 137</td> <td> -77</td> <td> 77</td> </tr> <tr> <th>60 </th> <td> Thailand</td> <td> 61</td> <td> 
11</td> <td> 50</td> <td> 50</td> </tr> <tr> <th>61 </th> <td> Estonia</td> <td> 62</td> <td> 94</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>62 </th> <td> Zimbabwe</td> <td> 63</td> <td> 35</td> <td> 28</td> <td> 28</td> </tr> <tr> <th>63 </th> <td> Guyana</td> <td> 64</td> <td> 134</td> <td> -70</td> <td> 70</td> </tr> <tr> <th>64 </th> <td> Israel</td> <td> 65</td> <td> 34</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>65 </th> <td> Chile</td> <td> 66</td> <td> 47</td> <td> 19</td> <td> 19</td> </tr> <tr> <th>66 </th> <td> Kyrgyzstan</td> <td> 67</td> <td> 51</td> <td> 16</td> <td> 16</td> </tr> <tr> <th>67 </th> <td> Bangladesh</td> <td> 68</td> <td> 64</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>68 </th> <td> Italy</td> <td> 69</td> <td> 105</td> <td> -36</td> <td> 36</td> </tr> <tr> <th>69 </th> <td> Republic of Macedonia</td> <td> 70</td> <td> 125</td> <td> -55</td> <td> 55</td> </tr> <tr> <th>70 </th> <td> Brazil</td> <td> 71</td> <td> 57</td> <td> 14</td> <td> 14</td> </tr> <tr> <th>71 </th> <td> Romania</td> <td> 72</td> <td> 54</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>72 </th> <td> Honduras</td> <td> 73</td> <td> 131</td> <td> -58</td> <td> 58</td> </tr> <tr> <th>73 </th> <td> Montenegro</td> <td> 74</td> <td> 81</td> <td> -7</td> <td> 7</td> </tr> <tr> <th>74 </th> <td> Russia</td> <td> 75</td> <td> 52</td> <td> 23</td> <td> 23</td> </tr> <tr> <th>75 </th> <td> Vietnam</td> <td> 76</td> <td> 29</td> <td> 47</td> <td> 47</td> </tr> <tr> <th>76 </th> <td> Senegal</td> <td> 77</td> <td> 119</td> <td> -42</td> <td> 42</td> </tr> <tr> <th>77 </th> <td> Dominican Republic</td> <td> 78</td> <td> 22</td> <td> 56</td> <td> 56</td> </tr> <tr> <th>78 </th> <td> Sri Lanka</td> <td> 79</td> <td> 80</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>79 </th> <td> Mexico</td> <td> 80</td> <td> 45</td> <td> 35</td> <td> 35</td> </tr> <tr> <th>80 </th> <td> Paraguay</td> <td> 81</td> <td> 132</td> <td> -51</td> <td> 51</td> </tr> <tr> <th>81 </th> <td> Uruguay</td> <td> 
82</td> <td> 135</td> <td> -53</td> <td> 53</td> </tr> <tr> <th>82 </th> <td> Albania</td> <td> 83</td> <td> 115</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>83 </th> <td> El Salvador</td> <td> 84</td> <td> 113</td> <td> -29</td> <td> 29</td> </tr> <tr> <th>84 </th> <td> Georgia</td> <td> 85</td> <td> 91</td> <td> -6</td> <td> 6</td> </tr> <tr> <th>85 </th> <td> Venezuela</td> <td> 86</td> <td> 12</td> <td> 74</td> <td> 74</td> </tr> <tr> <th>86 </th> <td> People's Republic of China</td> <td> 87</td> <td> 13</td> <td> 74</td> <td> 74</td> </tr> <tr> <th>87 </th> <td> Uganda</td> <td> 88</td> <td> 20</td> <td> 68</td> <td> 68</td> </tr> <tr> <th>88 </th> <td> Guatemala</td> <td> 89</td> <td> 120</td> <td> -31</td> <td> 31</td> </tr> <tr> <th>89 </th> <td> Slovakia</td> <td> 90</td> <td> 56</td> <td> 34</td> <td> 34</td> </tr> <tr> <th>90 </th> <td> Greece</td> <td> 91</td> <td> 83</td> <td> 8</td> <td> 8</td> </tr> <tr> <th>91 </th> <td> Swaziland</td> <td> 92</td> <td> 14</td> <td> 78</td> <td> 78</td> </tr> <tr> <th>92 </th> <td> Hungary</td> <td> 93</td> <td> 66</td> <td> 27</td> <td> 27</td> </tr> <tr> <th>93 </th> <td> Azerbaijan</td> <td> 94</td> <td> 106</td> <td> -12</td> <td> 12</td> </tr> <tr> <th>94 </th> <td> Cyprus</td> <td> 95</td> <td> 111</td> <td> -16</td> <td> 16</td> </tr> <tr> <th>95 </th> <td> Czech Republic</td> <td> 96</td> <td> 87</td> <td> 9</td> <td> 9</td> </tr> <tr> <th>96 </th> <td> Indonesia</td> <td> 97</td> <td> 17</td> <td> 80</td> <td> 80</td> </tr> <tr> <th>97 </th> <td> Brunei</td> <td> 98</td> <td> 14</td> <td> 84</td> <td> 84</td> </tr> <tr> <th>98 </th> <td> Malta</td> <td> 99</td> <td> 122</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>99 </th> <td> Belize</td> <td> 100</td> <td> 40</td> <td> 60</td> <td> 60</td> </tr> <tr> <th>100</th> <td> Ghana</td> <td> 101</td> <td> 133</td> <td> -32</td> <td> 32</td> </tr> <tr> <th>101</th> <td> Tajikistan</td> <td> 102</td> <td> 85</td> <td> 17</td> <td> 17</td> </tr> <tr> 
<th>102</th> <td> Armenia</td> <td> 103</td> <td> 126</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>103</th> <td> Japan</td> <td> 104</td> <td> 4</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>104</th> <td> Maldives</td> <td> 105</td> <td> 101</td> <td> 4</td> <td> 4</td> </tr> <tr> <th>105</th> <td> Mauritius</td> <td> 106</td> <td> 6</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>106</th> <td> Malaysia</td> <td> 107</td> <td> 5</td> <td> 102</td> <td> 102</td> </tr> <tr> <th>107</th> <td> Cambodia</td> <td> 108</td> <td> 30</td> <td> 78</td> <td> 78</td> </tr> <tr> <th>108</th> <td> Suriname</td> <td> 109</td> <td> 74</td> <td> 35</td> <td> 35</td> </tr> <tr> <th>109</th> <td> Burkina Faso</td> <td> 110</td> <td> 138</td> <td> -28</td> <td> 28</td> </tr> <tr> <th>110</th> <td> Liberia</td> <td> 111</td> <td> 50</td> <td> 61</td> <td> 61</td> </tr> <tr> <th>111</th> <td> Nepal</td> <td> 112</td> <td> 3</td> <td> 109</td> <td> 109</td> </tr> <tr> <th>112</th> <td> Kuwait</td> <td> 113</td> <td> 114</td> <td> -1</td> <td> 1</td> </tr> <tr> <th>113</th> <td> India</td> <td> 114</td> <td> 19</td> <td> 95</td> <td> 95</td> </tr> <tr> <th>114</th> <td> United Arab Emirates</td> <td> 115</td> <td> 118</td> <td> -3</td> <td> 3</td> </tr> <tr> <th>115</th> <td> Qatar</td> <td> 116</td> <td> 139</td> <td> -23</td> <td> 23</td> </tr> <tr> <th>116</th> <td> South Korea</td> <td> 117</td> <td> 1</td> <td> 116</td> <td> 116</td> </tr> <tr> <th>117</th> <td> Nigeria</td> <td> 118</td> <td> 92</td> <td> 26</td> <td> 26</td> </tr> <tr> <th>118</th> <td> Zambia</td> <td> 119</td> <td> 127</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>119</th> <td> Bhutan</td> <td> 120</td> <td> 33</td> <td> 87</td> <td> 87</td> </tr> <tr> <th>120</th> <td> Angola</td> <td> 121</td> <td> 90</td> <td> 31</td> <td> 31</td> </tr> <tr> <th>121</th> <td> Fiji</td> <td> 122</td> <td> 104</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>122</th> <td> Tunisia</td> <td> 123</td> <td> 110</td> <td> 13</td> 
<td> 13</td> </tr> <tr> <th>123</th> <td> Bahrain</td> <td> 124</td> <td> 6</td> <td> 118</td> <td> 118</td> </tr> <tr> <th>124</th> <td> Turkey</td> <td> 125</td> <td> 41</td> <td> 84</td> <td> 84</td> </tr> <tr> <th>125</th> <td> Algeria</td> <td> 126</td> <td> 75</td> <td> 51</td> <td> 51</td> </tr> <tr> <th>126</th> <td> Ethiopia</td> <td> 127</td> <td> 10</td> <td> 117</td> <td> 117</td> </tr> <tr> <th>127</th> <td> Oman</td> <td> 128</td> <td> 28</td> <td> 100</td> <td> 100</td> </tr> <tr> <th>128</th> <td> Egypt</td> <td> 129</td> <td> 48</td> <td> 81</td> <td> 81</td> </tr> <tr> <th>129</th> <td> Saudi Arabia</td> <td> 130</td> <td> 123</td> <td> 7</td> <td> 7</td> </tr> <tr> <th>130</th> <td> Mauritania</td> <td> 131</td> <td> 142</td> <td> -11</td> <td> 11</td> </tr> <tr> <th>131</th> <td> Guinea</td> <td> 132</td> <td> 140</td> <td> -8</td> <td> 8</td> </tr> <tr> <th>132</th> <td> Morocco</td> <td> 133</td> <td> 100</td> <td> 33</td> <td> 33</td> </tr> <tr> <th>133</th> <td> Jordan</td> <td> 134</td> <td> 108</td> <td> 26</td> <td> 26</td> </tr> <tr> <th>134</th> <td> Lebanon</td> <td> 135</td> <td> 76</td> <td> 59</td> <td> 59</td> </tr> <tr> <th>135</th> <td> Côte d'Ivoire</td> <td> 136</td> <td> 130</td> <td> 6</td> <td> 6</td> </tr> <tr> <th>136</th> <td> Iran</td> <td> 137</td> <td> 69</td> <td> 68</td> <td> 68</td> </tr> <tr> <th>137</th> <td> Mali</td> <td> 138</td> <td> 124</td> <td> 14</td> <td> 14</td> </tr> <tr> <th>138</th> <td> Syria</td> <td> 139</td> <td> 121</td> <td> 18</td> <td> 18</td> </tr> <tr> <th>139</th> <td> Chad</td> <td> 140</td> <td> 37</td> <td> 103</td> <td> 103</td> </tr> <tr> <th>140</th> <td> Pakistan</td> <td> 141</td> <td> 27</td> <td> 114</td> <td> 114</td> </tr> <tr> <th>141</th> <td> Yemen</td> <td> 142</td> <td> 88</td> <td> 54</td> <td> 54</td> </tr> </tbody> </table>
Quite uncorrelated. That means either that the data is not good, or that the World Economic Forum's methods have little to do with the percentage of women born in those countries as recorded semantically at a historic level. Spearman's $\rho$ is low (about 0.10) and its p-value is high (about 0.25).
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# the deprecated DataFrame.from_csv and keeps its defaults.
country_map = pd.read_csv('helpers/aggregation_maps/country_maps.csv',
                          index_col=0, parse_dates=True)
def map_culture(qid):
    """Map a place-of-birth QID to a culture name.

    Looks the QID up in ``pobs_map`` to get a list of countries, takes the
    first country (an assumption made by the original author) and returns
    that country's ``'culture_name'`` from ``country_map``.

    Parameters
    ----------
    qid : str or other
        Place-of-birth identifier; anything that is not a string (for
        example NaN or None) is treated as missing.

    Returns
    -------
    The culture name, or ``None`` when *qid* is missing or maps to no country.

    Raises
    ------
    KeyError
        If *qid* is absent from ``pobs_map`` or the country is absent from
        the index of ``country_map``.
    """
    if not isinstance(qid, str):
        return None
    country_list = pobs_map[qid]
    if not country_list:
        return None
    country = country_list[0]  # assumption: the first listed country is the one to use
    # .loc is the label-based replacement for the removed .ix indexer
    # (presumes country labels are not integers - TODO confirm).
    return country_map.loc[country, 'culture_name']
# Derive each record's culture from its place of birth alone.
# NOTE(review): this replaces any 'culture' column already present on allrecs.
allrecs['culture'] = allrecs['place_of_birth'].apply(map_culture)
import math
import pywikibot
# Transforming QIDs into English labels.
enwp = pywikibot.Site('en','wikipedia')
wikidata = enwp.data_repository()
# Cache of labels already looked up, keyed by QID; filled by english_label.
retrieved = dict()
def english_label(qid):
    """Return the English Wikidata label for *qid*, caching every lookup.

    Parameters
    ----------
    qid : str, float or None
        Wikidata item identifier; NaN and None are treated as missing.

    Returns
    -------
    The English label, the QID itself when a KeyError occurs while fetching
    the label (for example when there is no ``'en'`` entry), or ``None`` for
    a missing *qid*. Both outcomes are stored in ``retrieved`` so each QID
    is fetched at most once.
    """
    # isinstance also covers numpy.float64 NaNs; None is guarded so it never
    # reaches pywikibot.
    if qid is None or (isinstance(qid, float) and math.isnan(qid)):
        return None
    # first see if we've done it
    try:
        return retrieved[qid]
    except KeyError:
        pass
    try:
        page = pywikibot.ItemPage(wikidata, qid)
        label = page.get()['labels']['en']
    except KeyError:
        label = qid
    retrieved[qid] = label
    return label
VERBOSE:pywiki:Starting 1 threads...
# Spot-check the lookup with a single QID.
english_label('Q6581097')
VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one.
u'male'
# Translate every gender QID into its English label (english_label caches repeated QIDs).
allrecs['gender_name'] = allrecs['gender'].apply(english_label)
# Export only the gender label and culture columns
# (presumably input for a chi-squared test, judging by the file name).
outdf = allrecs[['gender_name','culture']]
outdf.to_csv('helpers/Chi_Squared_Test_Data.csv')
How many records have gender, place of birth (pob) and date of birth (dob)?
def test(x):
    """Return True when *x* holds a value, i.e. is neither NaN nor None."""
    if isinstance(x, float):
        return not math.isnan(x)
    return x is not None


# Per-column coverage: how many records carry a value, and what share that is.
# ``test`` is defined once instead of on every loop iteration, and the boolean
# mask is summed rather than used to build a filtered copy of the frame.
total_items = float(len(allrecs))
has = defaultdict(dict)
for col in allrecs.columns:
    nonempty = int(allrecs[col].apply(test).sum())
    has[col]['Items with property'] = nonempty
    has[col]['% of total'] = nonempty / total_items
hasdf = pd.DataFrame.from_dict(has, orient='index')
# NOTE(review): DataFrame.sort is the spelling of the pandas release this
# notebook was written against; newer releases call it sort_values.
print(hasdf.sort('% of total').to_html(
    justify='right',
    formatters={'% of total': lambda x: '%.2f' % (x*100),
                'Items with property': lambda x: '{0:,}'.format(x)}))
<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>% of total</th> <th>Items with property</th> </tr> </thead> <tbody> <tr> <th>ethnic_group</th> <td> 0.30</td> <td> 7,772</td> </tr> <tr> <th>country</th> <td> 23.47</td> <td> 601,361</td> </tr> <tr> <th>place_of_birth</th> <td> 23.93</td> <td> 613,092</td> </tr> <tr> <th>dod</th> <td> 28.79</td> <td> 737,522</td> </tr> <tr> <th>citizenship</th> <td> 41.44</td> <td>1,061,634</td> </tr> <tr> <th>culture</th> <td> 45.20</td> <td>1,158,086</td> </tr> <tr> <th>dob</th> <td> 57.92</td> <td>1,484,003</td> </tr> <tr> <th>gender</th> <td> 89.40</td> <td>2,290,433</td> </tr> <tr> <th>site_links</th> <td> 99.05</td> <td>2,537,545</td> </tr> <tr> <th>qid</th> <td>100.00</td> <td>2,561,999</td> </tr> </tbody> </table>
# Keep only records with a known date of birth. notnull() is vectorised and,
# unlike math.isnan, does not raise on None.
hasdob = allrecs[allrecs['dob'].notnull()]
len(hasdob)
1484003
# Keep only records with a known gender. The previous lambda accepted every
# non-float value, so None entries passed the filter; notnull() rejects both
# NaN and None.
hasgender = hasdob[hasdob['gender'].notnull()]
len(hasgender)
1484003
# Keep only records with a culture label; notnull() rejects NaN as well as None.
hascult = hasgender[hasgender['culture'].notnull()]
len(hascult)
915101
# Group the filtered records by culture; each group feeds make_perc_series below.
culture_groups = hascult.groupby('culture')
def make_perc_series(df, male_qid='Q6581097'):
    """Compute the non-male share of records per year of birth.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with a ``'dob'`` column (year) and a ``'gender'`` column (QID).
    male_qid : str, optional
        Gender value counted as male; everything else that is non-null
        counts as non-male.

    Returns
    -------
    pandas.Series
        Indexed by year; each value is ``non-male count / total count`` over
        the non-null gender entries of that year, or NaN when the year has
        no gender entries at all.
    """
    years_per = {}
    for year, group in df.groupby('dob'):
        genders = group['gender']
        totalcount = genders.count()  # count() skips null genders
        nmcount = genders[genders != male_qid].count()
        # Guard the empty year explicitly instead of dividing by zero.
        years_per[year] = nmcount / float(totalcount) if totalcount else float('nan')
    # pd.TimeSeries was only an alias of pd.Series and has since been removed.
    return pd.Series(years_per)
# Map each culture name to its yearly non-male share series.
perc_dict = {}
for name, group in culture_groups:
    perc_series = perc_dict[name] = make_perc_series(group)
# Two side-by-side panels: `full` is paired with start year -1000 and a
# 100-year rolling mean, `modern` with start year 1800 and a 10-year one.
fig, (full, modern) = plt.subplots(1,2, figsize=(20,6))
end_year = 2000
for start_year, ra_len, ax in zip((-1000,1800), (100,10), (full, modern)):
ra_dict = dict()
# Smooth every culture's yearly series with the panel's window length.
for name, series in perc_dict.iteritems():
ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
cult_dob_per = pd.DataFrame(ra_dict)
cult_dob_per.plot(cmap='Paired', linewidth=2, ax=ax, legend=False,zorder=-ra_len)
ax.set_xlim((start_year, end_year))
ax.set_xticks(range(start_year, end_year,(end_year-start_year) / 16))
ax.set_ylim((0,0.6))
ax.set_title(u'{}—{}, with {} year Rolling Average'.format(start_year, end_year,ra_len))
# Render y tick values as whole percentages.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x )))
# NOTE(review): this assignment replaces the `legend` attribute of the `full`
# axes with the object returned by pylab's legend(), which draws on the
# current axes - confirm which panel was meant to carry the legend.
full.legend = legend(bbox_to_anchor=(0.05, 0.95), loc=2, borderaxespad=0)
# Overrides the x ticks set for `full` earlier in this cell.
full.set_xticks(range(-1000, end_year,(end_year+1000) / 15))
fig.suptitle('Female % of Biographies by Culture, over Time', fontsize=24)
fig.subplots_adjust(top=0.88)
# One figure per (start year, window) pair showing the smoothed non-male share.
# NOTE(review): the axes created on the next line are not referenced again in
# this cell; every DataFrame.plot call below is made without ax=.
fig, (full, modern) = plt.subplots(2, 1, figsize=(18, 12))
end_year = 2000
for start_year in [-1000, 1800]:
    for ra_len in [10, 100]:
        ra_dict = dict()
        # .items() works on Python 2 and 3 alike.
        for name, series in perc_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=10)
        cult_dob_per = pd.DataFrame(ra_dict)
        # DataFrame.plot returns an Axes; keep it in its own name so the
        # pyplot alias `plt` is not overwritten.
        ax = cult_dob_per.plot(figsize=(20, 6), cmap='Paired', linewidth=1.5)
        ax.set_xlim((start_year, end_year))
        # // keeps the tick step an integer on Python 3 as well.
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_ylim((0, 0.6))
        ax.set_title('Non-male percentage of Biographies by Date of Birth - %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
# Records with both a date of birth and a culture label. notnull() is
# vectorised and treats NaN and None alike, unlike the previous per-row lambdas.
dobexists = allrecs[allrecs['dob'].notnull()]
dobcultureexists = dobexists[dobexists['culture'].notnull()]
len(dobcultureexists)
915101
# Regroup by culture, keeping only the two columns make_tot_series reads.
culture_groups = dobcultureexists[['dob','culture']].groupby(by='culture')
def make_tot_series(df):
    """Count records per year of birth.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with a ``'dob'`` column (year) and a ``'culture'`` column.

    Returns
    -------
    pandas.Series
        Indexed by year; each value is the number of non-null ``'culture'``
        entries for that year.
    """
    years_tot = {}
    for year, group in df.groupby('dob'):
        years_tot[year] = group['culture'].count()
    # pd.TimeSeries was only an alias of pd.Series and has since been removed.
    return pd.Series(years_tot)
# Map each culture name to its yearly record-count series.
tot_dict = {}
for name, group in culture_groups:
    tot_dict[name] = make_tot_series(group)
# One figure per (start year, window) pair showing smoothed biography counts.
end_year = 2014
for start_year in [1500, 1800]:
    for ra_len in [2, 5, 10]:
        ra_dict = dict()
        # .items() works on Python 2 and 3 alike.
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # DataFrame.plot returns an Axes; keep it in its own name so the
        # pyplot alias `plt` is not overwritten.
        ax = cult_dob.plot(figsize=(20, 6), cmap='Set2', linewidth=1.5)
        ax.set_xlim((start_year, end_year))
        # // keeps the tick step an integer on Python 3 as well.
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
# Same count plots for two earlier periods, drawn on a logarithmic y axis.
for start_year, end_year in zip([-2000, -1000], [1000, 1500]):
    for ra_len in [1, 2, 10]:
        ra_dict = dict()
        # .items() works on Python 2 and 3 alike.
        for name, series in tot_dict.items():
            ra_dict[name] = pd.rolling_mean(series, ra_len, min_periods=1)
        cult_dob = pd.DataFrame(ra_dict)
        # DataFrame.plot returns an Axes; keep it in its own name so the
        # pyplot alias `plt` is not overwritten.
        ax = cult_dob.plot(figsize=(20, 6), cmap='Set2', linewidth=1.5)
        # NOTE(review): a lower limit of 0 cannot be shown on the log scale
        # set on the next line - confirm the intended lower bound.
        ax.set_ylim((0, 50))
        ax.set_yscale('log')
        ax.set_xlim((start_year, end_year))
        # // keeps the tick step an integer on Python 3 as well.
        ax.set_xticks(range(start_year, end_year, (end_year - start_year) // 15))
        ax.set_title('Total Biographies by Date of Birth | %s Year Rolling Average' % str(ra_len))
        ax.legend(loc=2)
/usr/local/lib/python2.7/dist-packages/numpy/ma/core.py:3895: UserWarning: Warning: converting a masked element to nan. warnings.warn("Warning: converting a masked element to nan.")