import pandas
import math
import datetime
%pylab inline
# Smallest signed 32-bit integer (-2**31); passed to read_csv below as the NA marker.
java_min_int = -(2 ** 31)
Populating the interactive namespace from numpy and matplotlib
# Load the gender-index export; any cell equal to java_min_int is read as NaN.
wikidata_df = pandas.read_csv(
    'data/gender-index-data-2014-09-17.csv',
    na_values=[java_min_int]
)
def split_column(q_str):
    """Turn one pipe-delimited cell into a list of its values.

    The result is always a list, so a column built with this function
    holds lists only.

    q_str -- the raw cell: a string such as 'Q298|Q39|', or a NaN float
             for a missing cell.

    Returns:
      * the '|'-separated pieces for a string; the empty piece produced
        by the trailing '|' is dropped, and a string without a trailing
        '|' keeps its last value instead of losing it;
      * [q_str] for NaN and for any other non-string value.
    """
    if isinstance(q_str, float) and numpy.isnan(q_str):
        return [q_str]  # keep the NaN, wrapped, so the column still contains lists
    if isinstance(q_str, str):
        qs = q_str.split('|')
        # The format normally ends with a '|', which leaves one empty trailing piece.
        if qs and qs[-1] == '':
            qs = qs[:-1]
        return qs
    # Anything else used to fall through and return None; wrap it instead.
    return [q_str]
# Add a list-valued "<column>s" companion for every pipe-delimited column.
for column in ('gender', 'ethnic_group', 'citizenship', 'place_of_birth', 'site_links'):
    column_plural = '%ss' % column
    split_values = wikidata_df[column].apply(split_column)
    wikidata_df[column_plural] = split_values
# The bare string below is never executed as code; it appears to be an
# earlier, per-column version of the loop above.
'''
df['genders'] = df['gender'].apply(split_gender)
df['langs'] = df['site_links'].apply(split_langs)
'''
wikidata_df.head(5)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | genders | ethnic_groups | citizenships | place_of_births | site_linkss | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | Q6581097| | NaN | Q30| | Q494413| | zhwiki|kywiki|euwiki|plwiki|bswiki|angwiki|uzw... | [Q6581097] | [nan] | [Q30] | [Q494413] | [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi... |
1 | Q42 | 1952 | 2001 | Q6581097| | NaN | Q145| | Q350| | zhwiki|jvwiki|euwiki|plwiki|bswiki|eswiki|tawi... | [Q6581097] | [nan] | [Q145] | [Q350] | [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik... |
2 | Q207 | 1946 | NaN | Q6581097| | NaN | Q30| | Q49145| | uzwiki|eswiki|kowikiquote|huwiki|liwikiquote|p... | [Q6581097] | [nan] | [Q30] | [Q49145] | [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu... |
3 | Q297 | NaN | 1660 | Q6581097| | NaN | Q29| | Q8717| | zhwiki|kywiki|plwiki|euwiki|bswiki|uzwiki|eswi... | [Q6581097] | [nan] | [Q29] | [Q8717] | [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik... |
4 | Q326 | 1942 | NaN | Q6581097| | NaN | Q298|Q39| | Q2887| | zhwiki|plwiki|euwiki|kowiki|frwiki|eswiki|yowi... | [Q6581097] | [nan] | [Q298, Q39] | [Q2887] | [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik... |
from collections import defaultdict
import time
def int_dict_factory():
    """Build a fresh counter mapping whose missing keys start at 0.

    Used as the default factory of an outer defaultdict, giving a
    two-level counter that can be bumped with d[a][b] += 1.
    """
    counter = defaultdict(int)
    return counter
# Two-level counters, incremented below as counter[year][value] += 1:
# one for genders, one for site links.
year_genders, year_langs = (
    defaultdict(int_dict_factory),
    defaultdict(int_dict_factory),
)
def nan_test(v):
    """Tell whether *v* is a NaN number.

    v -- any object; cells here are floats, strings or lists.

    Returns True only when math.isnan accepts *v* and reports NaN.
    Returns False for every other number and for anything math.isnan
    rejects with TypeError (strings, lists, None). The result is always
    a real bool; non-NaN numbers no longer yield None.
    """
    try:
        return math.isnan(v)
    except TypeError:
        # Not a number at all, hence not NaN.
        return False
# Get years: for every row with a usable birth year, count each gender and
# each site link under that year.
for index, row in wikidata_df.iterrows():
    pot_dob = row['dob']
    if math.isnan(pot_dob) or pot_dob > 2014:
        continue  # we are not including NaNs or birth years after 2014
    dob = int(pot_dob)
    genders = row['genders']
    # Use the list-valued column. The raw 'site_links' cell is a string or a
    # NaN float, and iterating it is what raised "'float' object is not iterable".
    langs = row['site_linkss']
    for year_dict, vrs in zip([year_genders, year_langs], [genders, langs]):
        if not isinstance(vrs, list):
            continue  # nothing iterable to count for this row
        for var in vrs:
            if nan_test(var):
                continue  # there isn't a good var here
            year_dict[dob][var] += 1
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-7-989a37b8c114> in <module>() 19 langs = row['site_links'] 20 for year_dict, vrs in zip([year_genders, year_langs], [genders, langs]): ---> 21 for var in vrs: 22 try: 23 if math.isnan(var): TypeError: 'float' object is not iterable
from collections import defaultdict
import time
def int_dict_factory():
    """Return a new defaultdict(int): an empty counter where unseen keys read as 0."""
    inner_counts = defaultdict(int)
    return inner_counts
def nan_test(v):
    """Tell whether *v* is a NaN number.

    v -- any object; cells here are floats, strings or lists.

    Returns True only when math.isnan accepts *v* and reports NaN.
    Returns False for every other number and for anything math.isnan
    rejects with TypeError (strings, lists, None). The result is always
    a real bool; non-NaN numbers no longer yield None.
    """
    try:
        return math.isnan(v)
    except TypeError:
        # Not a number at all, hence not NaN.
        return False
# Abstracted: we want year-gender, but also gender against
# ethnicity, citizenship, place of birth and site links.
params = [
    'dob', 'dod', 'genders', 'ethnic_groups',
    'citizenships', 'place_of_births', 'site_linkss',
]
# One two-level counter per param, filled below as gender_param[param][gender][value].
gender_param = dict((param, defaultdict(int_dict_factory)) for param in params)
# For every row and every param, count each of the row's values once per
# gender listed on that row.
for index, row in wikidata_df.iterrows():
    row_data = dict((s, row[s]) for s in params)
    genders = row_data['genders']
    for param in params:
        vrs = row_data[param]
        if nan_test(vrs) or nan_test(genders):
            continue  # a NaN scalar on either side: nothing to count
        gender_dict = gender_param[param]
        # A non-list cell (dob/dod are plain columns) is counted as a single value.
        values = vrs if type(vrs) is list else [vrs]
        for gender in genders:
            for var in values:
                gender_dict[gender][var] += 1
# One DataFrame per param; with orient='columns' the outer keys (genders)
# become the columns and the counted values become the index.
gender_dfs = {}
for param in params:
    gender_dfs[param] = pandas.DataFrame.from_dict(gender_param[param], orient='columns')
import pywikibot
# Transforming QIDs into English labels.
# The Wikidata repository is obtained from the English Wikipedia site object.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of qid -> label, read and filled by english_label().
retrieved = {}
def english_label(qid):
    """Return the English label for *qid*, falling back to *qid* itself.

    qid -- item id string, e.g. 'Q6581097'.

    The answer is cached in the module-level `retrieved` dict, so each id
    is looked up at most once. If the lookup fails for any reason (unknown
    entity, no 'en' label, API error), *qid* is cached and returned
    unchanged. Best-effort by design: it never raises for a bad id.
    """
    # First see if we've done it.
    if qid in retrieved:
        return retrieved[qid]
    try:
        page = pywikibot.ItemPage(wikidata, qid)
        data = page.get()
        lab = data['labels']['en']
    except Exception:
        # Deliberately broad, but not a bare except: KeyboardInterrupt and
        # SystemExit must propagate, not get cached as a failed label.
        lab = qid
    retrieved[qid] = lab
    return lab
def engify_labels(df):
    """Rename the columns of *df*, in place, to their English labels.

    Each column name is converted with str() and passed through
    english_label(). Nothing is returned.
    """
    df.columns = [english_label(str(column)) for column in df.columns]
# Relabel the columns of every per-param table, printing each param as it is processed.
# .items() and print(...) behave the same on Python 2 and 3, unlike
# .iteritems() and the print statement.
for param, gender_df in gender_dfs.items():
    print(param)
    engify_labels(gender_df)
VERBOSE:pywiki:Found 1 wikidata:wikidata processes running, including this one. VERBOSE:pywiki:API Error: query= {'action': 'wbgetentities', 'format': 'json', 'ids': 'NAN', 'maxlag': '5'} VERBOSE:pywiki: response= {u'servedby': u'mw1119', u'error': {u'messages': [{u'html': {u'*': u'Could not find such an entity'}, u'name': u'wikibase-api-no-such-entity', u'parameters': []}]}}
dob ethnic_groups dod citizenships place_of_births genders site_linkss
# Write each per-param table to data/<param>-index-2014-09-17.csv.
for param, gender_df in gender_dfs.items():
    filename = 'data/%s-index-2014-09-17.csv' % param
    # The with-block closes the file even if to_csv() or write() raises.
    with open(filename, 'w') as filepoint:
        filepoint.write(gender_df.to_csv())
# `yg` was never assigned in this file, so this statement raised NameError.
# NOTE(review): it is bound to the by-birth-year table because later code
# reads yg['male'], yg['nm_per'] and a year index from it; confirm that
# this is the table that was meant.
yg = gender_dfs['dob']
yg.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-32-f4756b0db6b1> in <module>() ----> 1 yg.head() NameError: name 'yg' is not defined
# Stacked area chart of the by-birth-year table, one band per column.
dob_by_gender = gender_dfs['dob']
dob_by_gender.plot(kind='area', stacked=True, figsize=(24, 8))
<matplotlib.axes.AxesSubplot at 0x7f0e98ec9fd0>
# Same stacked area chart, restricted to index labels from 1800 on.
# The axes are kept in `ax` so the pyplot module that %pylab binds to `plt`
# is not shadowed. .loc is the label-based replacement for the removed .ix.
ax = gender_dfs['dob'].loc[1800:].plot(kind='area', stacked=True, figsize=(24, 8))
ax.set_title('''Wikidata Biography Gender Quantities by Year
1800 onwards''', size=24)
ax.set_xlabel('Year', size=18)
ax.set_ylabel('Biographies', size=18)
ax.legend(title='Gender', loc=2)
<matplotlib.legend.Legend at 0x7f0e9f7d1550>
# Add two derived columns to the by-birth-year table:
#   'nonmale' = row total over the gender columns minus the 'male' count
#   'nm_per'  = nonmale / (nonmale + male)
# `nonmale` is the same object as gender_dfs['dob'] (no copy is made), so the
# columns are added to that table in place.
nonmale = gender_dfs['dob']
# Drop the column of rows with no gender; guarded so re-running does not raise KeyError.
if 'nan' in nonmale.columns:
    del nonmale['nan']
# Leave the derived columns out of the row total, so re-running this code
# does not add them to the count.
derived = ('nonmale', 'nm_per')
gender_columns = [c for c in nonmale.columns if c not in derived]
# NOTE(review): a NaN in 'male' propagates into both derived columns, as before.
nonmale['nonmale'] = nonmale[gender_columns].sum(axis=1) - nonmale['male']
nonmale['nm_per'] = nonmale['nonmale'] / (nonmale['nonmale'] + nonmale['male'])
# Area chart of the non-male share ('nm_per') for index labels from 1800 on.
# The axes are kept in `ax` so the pyplot module that %pylab binds to `plt`
# is not shadowed. .loc is the label-based replacement for the removed .ix.
ax = nonmale.loc[1800:, 'nm_per'].plot(kind='area', figsize=(24, 8))
ax.set_title('Evolution of Non-male Biography Percentages', size=24)
ax.set_xlabel('Year', size=18)
ax.set_ylabel('Ratio', size=18)
ax.legend(title='Nonmale percentage', loc=2)
<matplotlib.legend.Legend at 0x7f0e9f298290>
# Scatter plot of the 'lnmale' column against the 'year' column of yg.
# NOTE(review): 'year' and 'lnmale' are only assigned further down in this
# file (on yg_reg, which is yg itself), so those statements must already
# have run; confirm the intended execution order.
yg.plot(kind='scatter', x='year', y='lnmale')
# NOTE(review): this needs `plt` to be matplotlib.pyplot (as %pylab sets it up);
# any earlier `plt = <something>.plot(...)` assignment would shadow it.
plt.draw()
<matplotlib.axes.AxesSubplot at 0x7f718fd5dc10>
# Regression table. yg_reg is another name for yg (no copy), so both new
# columns also appear on yg.
yg_reg = yg
year_values = yg_reg.index
log_male = numpy.log(yg_reg['male'])
yg_reg['year'] = year_values    # the index exposed as an ordinary column
yg_reg['lnmale'] = log_male     # natural log of the 'male' column
import statsmodels.api as sm
# Ordinary least squares of the 'male' column on the 'year' column, fitted on
# the start_year:1986 slice of yg_reg with NaNs replaced by 0.
# No constant column is added to the regressor here.
# NOTE(review): start_year is not assigned anywhere above this point in the
# file; the only assignment is the loop variable of the R^2 sweep further
# down. Presumably this ran inside a loop over start years; confirm.
nonnan = yg_reg.ix[start_year:1986].fillna(value=0)
model = sm.OLS(nonnan['male'],nonnan['year'])
results = model.fit()
print(results.summary())
# Under Python 2 this prints the tuple (start_year, rsquared).
print(start_year, results.rsquared)
(None, 0.20534979805159537) (-500, 0.21745102948866801) (0, 0.21941326163168851) (500, 0.22247317415119383) (1000, 0.24992223055451823) (1500, 0.37760491504948823) (1800, 0.72773643874338734) (1900, 0.95518546745325672)
# R^2 of a linear fit of the non-male share on year (with intercept), for
# every start year from -4000 to 1900 in steps of 50; each window runs
# through index label 1986.
# Rows are collected in a list and the DataFrame is built once:
# DataFrame.append in a loop copies the whole frame on every iteration and
# no longer exists in current pandas.
rsquared_rows = []
for start_year in numpy.arange(-4000, 1950, 50):
    # .loc is the label-based replacement for the removed .ix; slice once, use twice.
    window = yg_reg.loc[start_year:1986]
    nm_model = sm.OLS(window['nm_per'], sm.add_constant(window['year']))
    nm_results = nm_model.fit()
    rsquared_rows.append({'start_year': start_year, 'rsquared': nm_results.rsquared})
rsquared_results = pandas.DataFrame(rsquared_rows, columns=['start_year', 'rsquared'])
# Line plot of R^2 against the regression's starting year, for start years after 500.
late_starts = rsquared_results[rsquared_results['start_year'] > 500]
ax = late_starts.plot(kind='line', x='start_year', y='rsquared',
                      title=r'$R^2$ value for linear regression on non-male percentage')
# Axis label typos fixed ("regresssion untilt").
ax.set_xlabel('starting year of regression until 1987')
ax.set_ylabel(r'$R^2$')
<matplotlib.text.Text at 0x7f719df8ef50>
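The fitted line is $y = -1.7495 + 0.001x$ with $R^2 = 0.885$, where $x$ is the year and $y$ is the non-male share of biographies.
Setting $y = 0.5$ $\implies$ $x = 2249.5$, i.e. the trend reaches one half around the year 2250.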