import pandas as pd
import numpy
from collections import defaultdict
from matplotlib.pylab import style
import json
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648
Populating the interactive namespace from numpy and matplotlib
import pywikibot
# Transforming QIDs into English labels.
# The English Wikipedia site object is only used to reach its data repository.
enwp = pywikibot.Site('en', 'wikipedia')
wikidata = enwp.data_repository()
# Cache of labels that have already been looked up.
retrieved = {}
def english_label(qid):
    """Return the English label for a Wikidata QID.

    Falsy values (``None``, ``''``, ``0``) are returned unchanged and a NaN
    float yields ``None``.  Labels that were fetched successfully are cached
    in the module-level ``retrieved`` dict.  If the fetched item has no
    English label, the QID itself is cached and returned.  On any other
    lookup failure the QID is returned without being cached.
    """
    if not qid:
        return qid
    # isinstance (rather than an exact type check) also covers numpy float
    # scalars; numpy is imported explicitly by this file, unlike math.
    if isinstance(qid, float) and numpy.isnan(qid):
        return None
    # First see if we've already resolved this QID.
    if qid in retrieved:
        return retrieved[qid]
    try:
        label = pywikibot.ItemPage(wikidata, qid).get()['labels']['en']
    except KeyError:
        # No English label available: remember the QID as its own label.
        label = qid
    except Exception:
        # Best-effort lookup: any other failure falls back to the raw QID
        # without caching it.  KeyboardInterrupt/SystemExit still propagate.
        return qid
    retrieved[qid] = label
    return label
# Lookup table read from the helper JSON file; lookup_lang() indexes it by
# wiki code.  The context manager makes sure the file handle is closed.
with open('helpers/wiki_code_map.json', 'r') as map_file:
    lang_map = json.load(map_file)
def lookup_lang(lang, mapping=None):
    """Return the full name for wiki code ``lang``.

    The name is looked up in ``mapping`` (the module-level ``lang_map`` when
    omitted).  If its last word is "wikipedia" (case-insensitive) that word
    is stripped, otherwise the name is returned as stored.  When the code is
    unknown, or the stored value is empty or not a string, ``lang`` itself
    is returned.
    """
    if mapping is None:
        mapping = lang_map
    try:
        full = mapping[lang]
        words = full.split()
        last_word = words[-1]
    except (KeyError, IndexError, AttributeError, TypeError):
        # Unknown code, empty name or non-string entry: fall back to the code.
        return lang
    if last_word.lower() == 'wikipedia':
        return ' '.join(words[:-1])
    return full
# Load the 2014-10-13 snapshot; cells equal to java_min_int are read as NaN.
allrecs = pd.read_csv(
    'snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',
    na_values=[java_min_int],
)
def split_column(q_str):
    """Split a ``|``-delimited cell into a list of its values.

    A NaN float is returned unchanged.  A string is split on ``|``; the
    format is expected to end with a ``|``, so the empty piece after that
    final delimiter is dropped.  A string without the trailing ``|`` keeps
    all of its values.  Any other input yields ``None``.
    """
    if isinstance(q_str, float):
        return q_str if numpy.isnan(q_str) else None
    if not isinstance(q_str, str):
        return None
    parts = q_str.split('|')
    # Only discard the last piece when it really is the empty remainder of a
    # trailing '|'; otherwise it is a genuine value.
    if parts[-1] == '':
        parts.pop()
    return parts
# Convert the pipe-delimited gender and site-link cells with split_column.
for column_name in ('gender', 'site_links'):
    allrecs[column_name] = allrecs[column_name].apply(split_column)
allrecs.head(5)
qid | dob | dod | gender | ethnic_group | citizenship | place_of_birth | site_links | |
---|---|---|---|---|---|---|---|---|
0 | Q23 | 1732 | 1799 | [Q6581097] | NaN | Q30| | Q494413| | [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi... |
1 | Q42 | 1952 | 2001 | [Q6581097] | NaN | Q145| | Q350| | [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik... |
2 | Q207 | 1946 | NaN | [Q6581097] | NaN | Q30| | Q49145| | [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu... |
3 | Q297 | NaN | 1660 | [Q6581097] | NaN | Q29| | Q8717| | [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik... |
4 | Q326 | 1942 | NaN | [Q6581097] | NaN | Q298|Q39| | Q2887| | [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik... |
# Peek at the gender cell of the first record.  The column is addressed by
# its label instead of the positional index 3.
for _, first_record in allrecs.iterrows():
    print(first_record['gender'])
    break
['Q6581097']
def makedict():
    """Return a new ``defaultdict`` whose missing keys default to ``0``."""
    counter = defaultdict(int)
    return counter
# Per-wiki gender tallies, keyed wiki name -> gender -> count.
# ``single`` counts records whose site-link list has exactly one wiki;
# ``many`` counts every wiki of records whose list has any other length.
single = defaultdict(makedict)
many = defaultdict(makedict)
# Only the gender and site_links columns are needed, so iterate those two
# columns directly instead of building a Series per row with iterrows().
for genders, wikiname_list in zip(allrecs['gender'], allrecs['site_links']):
    # Use the first listed gender; a non-list cell (NaN/None) or an empty
    # list counts as unknown (None).
    try:
        gender = genders[0]
    except (TypeError, IndexError):
        gender = None
    # Records whose site-link cell is not a list are skipped.
    if not isinstance(wikiname_list, list):
        continue
    if len(wikiname_list) == 1:
        single[wikiname_list[0]][gender] += 1
    else:
        for wikiname in wikiname_list:
            many[wikiname][gender] += 1
def makedf(indict, label_func=None):
    """Build a per-wiki gender summary frame from nested counts.

    ``indict`` maps wiki name -> gender key -> count.  Gender keys become
    columns, relabelled with ``label_func`` (``english_label`` when omitted).
    Columns whose label is null (``None``/NaN) are treated as the
    unknown-gender counts.  Missing counts are treated as 0.

    Added columns:
      human_total    -- sum of all counts in the row
      gendered_total -- human_total minus the unknown-gender counts
      nonbin_total   -- gendered_total minus 'female' and 'male'
      fem_per        -- female / gendered_total
      nonbin_per     -- nonbin_total / gendered_total

    Raises KeyError if there is no 'female' or 'male' column.
    """
    if label_func is None:
        label_func = english_label
    df = pd.DataFrame.from_dict(indict, orient='index')
    # A real list, so the assignment does not depend on lazy map() semantics.
    labels = [label_func(column) for column in df.columns]
    df.columns = labels
    # A wiki that never saw a given gender has no entry for it; that is a
    # count of zero, not a missing value, and NaN would poison the totals.
    df = df.fillna(0)
    # Locate the unknown-gender column(s) by position so the lookup does not
    # rely on pandas treating None and NaN as the same label, and so a frame
    # without such a column still works (the sum is then 0).
    unknown_positions = [pos for pos, label in enumerate(labels)
                         if pd.isnull(label)]
    unknown_total = df.iloc[:, unknown_positions].sum(axis=1)
    df['human_total'] = df.sum(axis=1)
    df['gendered_total'] = df['human_total'] - unknown_total
    df['nonbin_total'] = df['gendered_total'] - df['female'] - df['male']
    df['fem_per'] = df['female'] / df['gendered_total']
    df['nonbin_per'] = df['nonbin_total'] / df['gendered_total']
    return df
# Summarise both tallies, then keep only the wikis present in both frames,
# tagging overlapping column names with _single / _many.
sdf, mdf = makedf(single), makedf(many)
joined = sdf.join(mdf, how='inner', lsuffix='_single', rsuffix='_many')
# Replace remaining missing values with zero.
tdf = joined.fillna(0)
tdf.columns
Index([u'male_single', u'female_single', u'None_single', u'male animal_single', u'intersex_single', u'transgender female_single', u'woman', u'transgender male_single', u'genderqueer_single', u'female animal', u'human_total_single', u'gendered_total_single', u'nonbin_total_single', u'fem_per_single', u'nonbin_per_single', u'intersex_many', u'kathoey', u'transgender male_many', u'genderqueer_many', u'transgender female_many', u'None_many', u'male_many', u'female_many', u'fa'afafine', u'male animal_many', u'human_total_many', u'gendered_total_many', u'nonbin_total_many', u'fem_per_many', u'nonbin_per_many'], dtype='object')
# Difference between the female share of single-wiki and multi-wiki records.
tdf['fem_per_diff'] = tdf['fem_per_single'] - tdf['fem_per_many']
# Show wikis with more than 7310 gendered single-wiki records, largest
# difference first.  DataFrame.sort() no longer exists in pandas;
# sort_values() is its replacement.
tdf[tdf['gendered_total_single'] > 7310].sort_values('fem_per_diff', ascending=False)[['fem_per_diff', 'fem_per_single', 'female_single', 'gendered_total_single', 'fem_per_many']]
fem_per_diff | fem_per_single | female_single | gendered_total_single | fem_per_many | |
---|---|---|---|---|---|
jawiki | 0.413549 | 0.603585 | 13434 | 22257 | 0.190037 |
hewiki | 0.060952 | 0.216555 | 2386 | 11018 | 0.155602 |
dawiki | 0.036127 | 0.181383 | 2233 | 12311 | 0.145256 |
nowiki | 0.029112 | 0.227961 | 4277 | 18762 | 0.198849 |
svwiki | 0.020537 | 0.213389 | 8772 | 41108 | 0.192852 |
ukwiki | 0.001318 | 0.137714 | 2953 | 21443 | 0.136396 |
fiwiki | -0.000251 | 0.167975 | 3145 | 18723 | 0.168226 |
dewiki | -0.000691 | 0.149034 | 30335 | 203544 | 0.149725 |
plwiki | -0.008691 | 0.149597 | 10353 | 69206 | 0.158288 |
frwiki | -0.012200 | 0.142818 | 14247 | 99756 | 0.155018 |
nlwiki | -0.012330 | 0.148959 | 5482 | 36802 | 0.161290 |
enwiki | -0.015220 | 0.146669 | 80262 | 547233 | 0.161889 |
itwiki | -0.018435 | 0.129315 | 7102 | 54920 | 0.147751 |
eswiki | -0.021227 | 0.143191 | 7851 | 54829 | 0.164418 |
ptwiki | -0.021357 | 0.143572 | 4678 | 32583 | 0.164929 |
etwiki | -0.022251 | 0.127290 | 1042 | 8186 | 0.149541 |
cswiki | -0.024574 | 0.136172 | 2453 | 18014 | 0.160746 |
ruwiki | -0.044208 | 0.105331 | 8083 | 76739 | 0.149539 |
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Only wikis with more than 2500 gendered single-wiki records are plotted.
# The same subset drives the axis limits and the labels, so that no label is
# drawn for a point that has no marker and the log-scaled x axis is never
# handed a non-positive limit.
plotted = tdf[tdf['gendered_total_single'] > 2500]
plotted.plot(kind='scatter', x='gendered_total_single', y='fem_per_diff', logx=True, ax=ax, c='#74bc3a')
# y ticks as whole percentages, x ticks as thousands-separated integers.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:,.0f}'.format(x)))
if not plotted.empty:
    # Leave some room left and right of the extreme points.
    ax.set_xlim(plotted['gendered_total_single'].min() * 0.6, plotted['gendered_total_single'].max() * 3)
# Label every plotted point with its wiki name, offset slightly up and right.
for label, x, y in zip(plotted.index, plotted['gendered_total_single'], plotted['fem_per_diff']):
    plt.annotate(
        label,
        xy=(x, y), xytext=(5, 2),
        textcoords='offset points', ha='left', va='bottom')
# NOTE(review): empty annotation kept from the original; it has no text and
# no arrow, so it appears to draw nothing -- confirm and remove.
plt.annotate("", xy=(10000, 0.5), xytext=(0, 0))
plt.title('Difference in Female Ratio, of "Language-Unique" Articles vs Others \n by Language-Unique Articles', fontsize=24)
plt.xlabel('Number of Unique Biographies Recorded')
plt.ylabel('Composition of Biographies Which Are Female')
<matplotlib.text.Text at 0x7f83e125f9d0>
# Wikis with more than 730 gendered single-wiki records, ordered by the
# difference in female share.  sort_values() replaces the removed
# DataFrame.sort().
ptdf = tdf[tdf['gendered_total_single'] > 730].sort_values('fem_per_diff')
maxbio = ptdf['gendered_total_single'].max()
# Log of each wiki's count relative to the log of the largest count; used to
# shade the bars.  numpy is imported explicitly by this file, math is not.
bios_size = numpy.log(ptdf['gendered_total_single']) / numpy.log(maxbio)
my_colors = [(x / 2, x, 0.75) for x in bios_size]
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# 'color' is the supported keyword; the old 'colors' alias was deprecated
# and later removed.
ptdf['fem_per_diff'].plot(kind='bar', ax=ax, color=my_colors)
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: '{:.0%}'.format(x)))
# Replace tick labels such as 'enwiki' with the name lookup_lang returns for
# the part before 'wiki'.  Lists are built eagerly so this also works where
# map() is lazy.
wikilabels = ax.get_xticklabels()
wikinames = [tick.get_text().split('wiki')[0] for tick in wikilabels]
fullnames = [lookup_lang(name) for name in wikinames]
ax.set_xticklabels(fullnames)
ax.set_title('Difference in Female Ratio \n by language-unique and language-many articles \n by Wikipedia Language', size=24)
ax.set_ylabel('[language-unique female ratio] - [language-many female ratio]')
ax.set_xlabel('Wikipedia Language | Darker Colours indicate more absolute language-unique articles')
<matplotlib.text.Text at 0x7f83cc7db590>