country_qids = json.load(open('country_qids.json'))
ugandaqid, cdiqid = 'Q1036', 'Q1008'
print "We are comparing:", len(country_qids), 'countries.'
print "Some of their Wikidata QIDs are:", map(lambda qid: 'http://wikidata.org/wiki/' + qid, country_qids[:5])

def report_actionable_metrics(wikicode, completeness_weight=0.8, infonoise_weight=0.6, images_weight=0.3):
    completeness = completeness_weight * num_reg_links(wikicode)
    informativeness = (infonoise_weight * infonoise(wikicode)) + (images_weight * num_file_links(wikicode))
    numheadings = len(section_headings(wikicode))
    articlelength = readable_text_length(wikicode)
    referencerate = article_refs(wikicode) / readable_text_length(wikicode)
    return {'completeness': completeness,
            'informativeness': informativeness,
            'numheadings': numheadings,
            'articlelength': articlelength,
            'referencerate': referencerate}

metric_analyse_density('informativeness', 12)
metric_analyse_density('completeness', 1500)
metric_analyse_density('numheadings', 75)
metric_analyse_density('articlelength', 150000)
metric_analyse_density('referencerate', 0.01)

top_n_sections('fr', 20)
top_n_sections('en', 20)
top_n_sections('sw', 7)

categoryjson = json.load(open('ethnosets-categories-capitalized.json', 'r'))
for subject, nationdict in categoryjson.iteritems():
    print subject
    for nation, langdict in nationdict.iteritems():
        print " |---" + nation
        for lang in ['fr', 'en', 'sw']:
            try:
                print " |---" + langdict[lang]
            except KeyError:
                pass

ethnosets = !ls ethnosets/
# total number of items listed across all ethnoset files
reduce(lambda a, b: a + b, map(lambda l: len(l), map(lambda f: json.load(open('ethnosets/' + f)), ethnosets)))

make_heat_map()
display_article_counts()
make_cite_plot()
coordinate_frequencies()

# Infonoise metric of Stvilia (2005) in concept, although the implementation may differ,
# since we are not stopping and stemming words because of the multiple languages we need to handle.
def readable_text_length(wikicode):
    # could also use wikicode.filter_text()
    return float(len(wikicode.strip_code()))

def infonoise(wikicode):
    # ratio of readable (stripped) text length to raw wikitext length
    ratio = readable_text_length(wikicode) / float(len(wikicode))
    return ratio

# Helper function to mine for section headings; of course if there is a lead section it doesn't quite make sense.
def section_headings(wikicode):
    sections = wikicode.get_sections()
    sec_headings = map(lambda s: filter(lambda l: l != '=', s),
                       map(lambda a: str(a).split('\n', 1)[0], sections))
    return sec_headings
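# Quick sanity check of the text helpers above on a made-up snippet -- a minimal sketch only;
# the wikitext below is illustrative and not taken from any real article.
import mwparserfromhell as pfh
sample = pfh.parse(u"== History ==\nSome [[Kampala|linked]] text with a [[File:Flag.svg|thumb]] image.<ref>a source</ref>")
print infonoise(sample)          # share of the raw markup that survives strip_code()
print section_headings(sample)   # first line of each section, with '=' characters filtered out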
# I don't know why mwparserfromhell's .filter_tags() isn't working at the moment;
# going to hack it with a regex for now.
import re

def num_refs(wikicode):
    text = str(wikicode)
    reftags = re.findall(r'<\s*ref', text)
    return len(reftags)

def article_refs(wikicode):
    sections = wikicode.get_sections()
    return float(reduce(lambda a, b: a + b, map(num_refs, sections)))

# Predicates for file and category links in English, French and Swahili
def link_a_file(linkstr):
    fnames = [u'File:', u'Fichier:', u'Image:', u'Picha:']
    bracknames = map(lambda a: '[[' + a, fnames)
    return any(map(lambda b: linkstr.startswith(b), bracknames))

def link_a_cat(linkstr):
    cnames = [u'Category:', u'Catégorie:', u'Jamii:']
    bracknames = map(lambda a: '[[' + a, cnames)
    return any(map(lambda b: linkstr.startswith(b), bracknames))

def num_reg_links(wikicode):
    reg_links = filter(lambda a: not link_a_file(a) and not link_a_cat(a), wikicode.filter_wikilinks())
    return float(len(reg_links))

def num_file_links(wikicode):
    file_links = filter(lambda a: link_a_file(a), wikicode.filter_wikilinks())
    return float(len(file_links))

import pywikibot
import mwparserfromhell as pfh
import os
import datetime
import pandas as pd
import json
from collections import defaultdict
from ggplot import *
import operator
from IPython.display import HTML
%pylab inline

langs = ['en', 'fr', 'sw']
nations = ['usa', 'fra', 'cdi', 'uga']
wikipedias = {lang: pywikibot.Site(lang, 'wikipedia') for lang in langs}
wikidata = wikipedias['fr'].data_repository()

def enfrsw():
    return {lang: None for lang in langs}

def article_attributes():
    return {attrib: enfrsw() for attrib in ['sitelinks', 'wikitext', 'wikicode', 'metrics']}

def do_sitelinks(langs, qids, data):
    for qid in qids:
        page = pywikibot.ItemPage(wikidata, qid)
        wditem = page.get()
        for lang in langs:
            try:
                data[qid]['sitelinks'][lang] = wditem['sitelinks'][lang + 'wiki']
            except KeyError:
                pass
    return data

def get_wikitext(lang, title):
    page = pywikibot.Page(wikipedias[lang], title)
    def get_page(page):
        try:
            pagetext = page.get()
            return pagetext
        except pywikibot.exceptions.IsRedirectPage:
            redir = page.getRedirectTarget()
            return get_page(redir)
        except pywikibot.exceptions.NoPage:
            print 're-raising'
            raise
    return get_page(page)

def do_wikitext(langs, data):
    for qid, attribs in data.iteritems():
        for lang, sl in attribs['sitelinks'].iteritems():
            if sl:
                try:
                    if randint(0, 100) == 99:  # occasional progress output
                        print sl
                    data[qid]['wikitext'][lang] = get_wikitext(lang, sl)
                except:
                    print 'bad sitelink', sl
                    continue
    return data

def do_wikicode(langs, data):
    for qid, attribs in data.iteritems():
        for lang, pagetext in attribs['wikitext'].iteritems():
            if pagetext:
                data[qid]['wikicode'][lang] = pfh.parse(pagetext)
    return data

def do_metrics(data):
    for qid, attribs in data.iteritems():
        for lang, wikicode in attribs['wikicode'].iteritems():
            if wikicode:
                data[qid]['metrics'][lang] = report_actionable_metrics(wikicode)
    return data
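# Spot-check the counting helpers on a tiny made-up page -- a rough sketch; the snippet is
# illustrative only, not real article content.
toy = pfh.parse(u"[[Uganda]] and [[Fichier:Carte.png|map]] and [[Category:Stubs]].<ref>x</ref><ref>y</ref>")
print num_refs(toy)        # 2 -- two <ref> tags
print num_reg_links(toy)   # 1.0 -- the file and category links are excluded
print num_file_links(toy)  # 1.0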
# This will take a lot of network time since we are going to load about 300 pages,
# but we'll save the data off so we don't have to do it unnecessarily.
def make_data(langs, qids, savename):
    print 'getting these qids: ', qids
    data = defaultdict(article_attributes)
    print 'getting sitelinks'
    data = do_sitelinks(langs, qids, data)
    print 'getting wikitext'
    data = do_wikitext(langs, data)
    print 'converting to wikicode'
    data = do_wikicode(langs, data)
    print 'computing metrics'
    data = do_metrics(data)
    hashable_data = {qid: {'wikitext': attribdict['wikitext'],
                           'metrics': attribdict['metrics'],
                           'sitelinks': attribdict['sitelinks']}
                     for qid, attribdict in data.iteritems()}
    print 'saving now'
    # save the results, once with a timestamp and once as the 'latest' copy
    safefilename = savename + str(datetime.datetime.now()) + '.json'
    with open(safefilename, 'w') as f3:
        json.dump(hashable_data, f3)
    with open(savename + 'latest.json', 'w') as f4:
        json.dump(hashable_data, f4)
    return data

# I don't call this unless I have time to uncomment it.
# arts = make_data(langs, country_qids, 'countrydata')

# Time to get into pandas; let's throw everything into a data frame.
df = pd.DataFrame(columns=['Country', 'language', 'metric', 'val'])
arts = json.load(open('countrydata-latest.json', 'r'))
for qid, attribdict in arts.iteritems():
    for attribname, langdict in attribdict.iteritems():
        if attribname == 'metrics':
            for lang, metrics in langdict.iteritems():
                try:
                    # sometimes there wasn't an article in that language and thus no corresponding metrics
                    for metric_name, metric_val in metrics.iteritems():
                        df = df.append({'Country': qid, 'language': lang, 'metric': metric_name, 'val': float(metric_val)},
                                       ignore_index=True)
                except:
                    pass
df = df.convert_objects(convert_numeric=True)

metric_list = ['completeness', 'informativeness', 'numheadings', 'articlelength', 'referencerate']
langs_df_dict = {lang: df[df['language'] == lang] for lang in langs}
metric_df_dict = {metric: df[df['metric'] == metric] for metric in metric_list}

# for later calculation
uganda_zscores = defaultdict(list)
cdi_zscores = defaultdict(list)

def metric_analyse_density(ametric, xlimit):
    inf_df = metric_df_dict[ametric]
    zscore = lambda x: (x - x.mean()) / x.std()
    inf_piv = inf_df.pivot(index='Country', columns='language', values='val')
    inf_piv_z = inf_piv.apply(zscore)
    metric_analyse_density_plot(ametric, xlimit, inf_df)
    print 'Uganda (' + ugandaqid + "), Côte d'Ivoire (" + cdiqid + ") " + ametric + " z-scores."
    return inf_piv_z.ix[[ugandaqid, cdiqid]]

def metric_analyse_density_plot(ametric, xlimit, inf_df):
    p = ggplot(aes(x='val', colour='language', fill=True, alpha=0.3), data=inf_df) + \
        geom_density() + labs("score", "frequency") + \
        scale_x_continuous(limits=(0, xlimit)) + ggtitle(ametric + '\nall country articles\n ')
    p.rcParams["figure.figsize"] = "4, 3"
    p.draw()

def defaultint():
    return defaultdict(int)

section_count = defaultdict(defaultint)
sorted_secs = defaultdict(list)
total_articles = defaultdict(int)

articles = json.load(open('countrydata-latest.json', 'r'))
for qid, attribdict in articles.iteritems():
    for attribname, langdict in attribdict.iteritems():
        if attribname == 'wikitext':
            for lang, wikitext in langdict.iteritems():
                if wikitext:
                    total_articles[lang] += 1
                    wikicode = pfh.parse(wikitext)
                    secs = section_headings(wikicode)
                    for sec in secs:
                        sec = sec.strip()
                        section_count[lang][sec] += 1

section_df = pd.DataFrame(columns=['lang', 'secname', 'freq'])
for lang, sec_dict in section_count.iteritems():
    for secname, seccount in sec_dict.iteritems():
        freq = seccount / float(total_articles[lang])
        section_df = section_df.append({'lang': lang, 'secname': secname, 'freq': freq}, ignore_index=True)
# section_df = section_df.convert_objects(convert_numeric=True)
section_df.head()

top_secs = section_df[section_df.freq > 0.1]
sort_secs = top_secs.sort(columns='freq', ascending=False)
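# Minimal sketch of the per-language z-scoring used in metric_analyse_density, on made-up
# numbers (the QIDs are real country items, but the values are not real metric scores).
toy_piv = pd.DataFrame({'en': [10.0, 20.0, 30.0], 'fr': [5.0, 5.0, 20.0]},
                       index=['Q1036', 'Q1008', 'Q30'])
zscore = lambda x: (x - x.mean()) / x.std()
print toy_piv.apply(zscore)   # each language column is standardised separately, so rows can be compared across languages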
def top_n_sections(lang, n):
    top = sort_secs[sort_secs.lang == lang].iloc[:n].convert_objects(convert_numeric=True)
    print str(total_articles[lang]) + ' total articles inspected in ' + lang + '.'
    return HTML(top.to_html(index=False, columns=['lang', 'secname', 'freq']))

def top_sections_ethnoset(ethnoset_filename):
    def defaultint():
        return defaultdict(int)
    section_count = defaultdict(defaultint)
    sorted_secs = defaultdict(list)
    total_articles = defaultdict(int)
    articles = json.load(open(ethnoset_filename, 'r'))
    for qid, attribdict in articles.iteritems():
        for attribname, langdict in attribdict.iteritems():
            if attribname == 'wikitext':
                for lang, wikitext in langdict.iteritems():
                    if wikitext:
                        total_articles[lang] += 1
                        wikicode = pfh.parse(wikitext)
                        secs = section_headings(wikicode)
                        for sec in secs:
                            sec = sec.strip()
                            section_count[lang][sec] += 1
    section_df = pd.DataFrame(columns=['lang', 'secname', 'freq'])
    for lang, sec_dict in section_count.iteritems():
        for secname, seccount in sec_dict.iteritems():
            freq = seccount / float(total_articles[lang])
            section_df = section_df.append({'lang': lang, 'secname': secname, 'freq': freq}, ignore_index=True)
    # section_df = section_df.convert_objects(convert_numeric=True)
    section_df.head()
    top_secs = section_df[section_df.freq > 0.1]
    sort_secs = top_secs.sort(columns='freq', ascending=False)
    return sort_secs

!ls ethnosave/

def display_article_counts():
    filenames = !ls ethnosave
    art_counts = pd.DataFrame(columns=['subj', 'nation', 'lang', 'count'])
    for filename in filenames:
        spl = filename.split('-')
        subj, nation = spl[0], spl[1].split('.')[0]
        fileaddr = 'ethnosave/' + filename
        articles = json.load(open(fileaddr, 'r'))
        total_articles = defaultdict(int)
        for qid, attribdict in articles.iteritems():
            for attribname, langdict in attribdict.iteritems():
                if attribname == 'wikitext':
                    for lang, wikitext in langdict.iteritems():
                        if wikitext:
                            total_articles[lang] += 1
        for lang, count in total_articles.iteritems():
            art_counts = art_counts.append({'subj': subj, 'nation': nation, 'lang': lang, 'count': count},
                                           ignore_index=True)
    return HTML(art_counts.sort(columns='count', ascending=False).to_html(index=False))

def make_heat_map():
    subj_list = ['economy', 'history', 'geography']
    metric_list = ['completeness', 'informativeness', 'numheadings', 'articlelength', 'referencerate']
    # pivtables = {metric: {subj: None for subj in subj_list} for metric in metric_list}
    fig, axes = plt.subplots(nrows=len(metric_list), ncols=len(subj_list), sharex='col', sharey='row')
    '''
    for metric, subjdict in pivtables.iteritems():
        for subj, pivtab in subjdict.iteritems():
            natlangdf = means_df[(means_df.metric == metric) & (means_df.subj == subj)]
            natlangpiv = pd.pivot_table(natlangdf, values='means', rows='lang', cols='nation')
            pivtables[metric][subj] = natlangpiv
    '''
    for axarr, metric in zip(axes, metric_list):
        for ax, subj in zip(axarr, subj_list):
            natlangdf = means_df[(means_df.metric == metric) & (means_df.subj == subj)]
            natlangpiv = pd.pivot_table(natlangdf, values='means', rows='lang', cols='nation')
            heatmap = ax.pcolor(natlangpiv, cmap='Blues')
            ax.set_yticks(np.arange(0.5, len(natlangpiv.index), 1))
            ax.set_yticklabels(natlangpiv.index)
            ax.set_xticks(np.arange(0.5, len(natlangpiv.columns), 1))
            ax.set_xticklabels(natlangpiv.columns)
            cbar = plt.colorbar(mappable=heatmap, ax=ax)
    fig.suptitle('Heatmap of Actionable Metrics by Country versus Wikipedia Language,\nby Subject Category', fontsize=18)
    fig.set_size_inches(12, 12, dpi=600)
    # fig.tight_layout()
    subj_titles = ['Economy', 'History', 'Geography']
    metric_titles = ['Wikilinks', 'Code & Images to Text Ratio', 'Section Count', 'Article Length',
                     'References per Article Length']
    for i in range(len(subj_titles)):
        axes[0][i].set_title(subj_titles[i])
    for j in range(len(metric_titles)):
        axes[j][0].set_ylabel(metric_titles[j])

means_df[(means_df.metric == 'referencerate') & (means_df.subj == 'geography')]

def load_ethnosaves():
    ethnosaves = !ls ethnosave
    subj_df_dict = {subj: pd.DataFrame(columns=['qid', 'subj', 'nation', 'lang', 'metric', 'val'])
                    for subj in ethnosaves}
    for ethnosavefile in ethnosaves:
        nameparts = ethnosavefile.split('-')
        subj = nameparts[0]
        dotparts = nameparts[1].split('.')
        nation = dotparts[0]
        arts = json.load(open('ethnosave/' + ethnosavefile, 'r'))
        print subj, nation
        sdf = subj_df_dict[ethnosavefile]
        for qid, attribdict in arts.iteritems():
            for attribname, langdict in attribdict.iteritems():
                if attribname == 'metrics':
                    for lang, metrics in langdict.iteritems():
                        try:
                            # sometimes there wasn't an article in that language and thus no corresponding metrics
                            for metric_name, metric_val in metrics.iteritems():
                                sdf = sdf.append({'qid': qid, 'subj': subj, 'nation': nation, 'lang': lang,
                                                  'metric': metric_name, 'val': float(metric_val)},
                                                 ignore_index=True)
                        except:
                            pass
        subj_df_dict[ethnosavefile] = sdf
    lens = map(lambda d: len(d), subj_df_dict.itervalues())
    print lens
    return subj_df_dict

subj_df_dict = load_ethnosaves()
subj_df = pd.concat(subj_df_dict)
assert(len(subj_df) == reduce(lambda a, b: a + b, map(lambda df: len(df), subj_df_dict.itervalues())))
subj_df = subj_df.convert_objects()

means_df = pd.DataFrame(columns=['subj', 'nation', 'lang', 'metric', 'means'])
for subj in ['geography', 'history', 'economy']:
    for metric in ['completeness', 'informativeness', 'numheadings', 'articlelength', 'referencerate']:
        for nation in ['usa', 'fra', 'cdi', 'uga']:
            for lang in ['en', 'fr', 'sw']:
                spec_df = subj_df[(subj_df.subj == subj) & (subj_df.nation == nation) &
                                  (subj_df.metric == metric) & (subj_df.lang == lang)]['val']
                mean = spec_df.mean()
                if not str(mean)[0] in '0123456789':
                    # guard against NaN when there were no matching rows
                    mean = 0.0
                if len(spec_df) <= 25:
                    # too few articles for a stable mean; zero it out
                    print len(spec_df), subj, metric, nation, lang
                    mean = 0.0
                means_df = means_df.append({'subj': subj, 'nation': nation, 'lang': lang,
                                            'metric': metric, 'means': mean}, ignore_index=True)
means_df = means_df.convert_objects(convert_numeric=True)

def top_sections_ethnoset(ethnoset_filename):
    print ethnoset_filename
    def defaultint():
        return defaultdict(int)
    section_count = defaultdict(defaultint)
    sorted_secs = defaultdict(list)
    total_articles = defaultdict(int)
    articles = json.load(open(ethnoset_filename, 'r'))
    for qid, attribdict in articles.iteritems():
        for attribname, langdict in attribdict.iteritems():
            if attribname == 'wikitext':
                for lang, wikitext in langdict.iteritems():
                    if wikitext:
                        total_articles[lang] += 1
                        wikicode = pfh.parse(wikitext)
                        secs = section_headings(wikicode)
                        for sec in secs:
                            sec = sec.strip()
                            section_count[lang][sec] += 1
    section_df = pd.DataFrame(columns=['lang', 'secname', 'freq'])
    for lang, sec_dict in section_count.iteritems():
        for secname, seccount in sec_dict.iteritems():
            freq = seccount / float(total_articles[lang])
            section_df = section_df.append({'lang': lang, 'secname': secname, 'freq': freq}, ignore_index=True)
    # section_df = section_df.convert_objects(convert_numeric=True)
    section_df.head()
    top_secs = section_df[section_df.freq > 0.1]
    sort_secs = top_secs.sort(columns='freq', ascending=False)
    return sort_secs

def top_templates_ethnoset(ethnoset_filename):
    def defaultint():
        return defaultdict(int)
    template_count = defaultdict(defaultint)
    sorted_templates = defaultdict(list)
    total_articles = defaultdict(int)
    articles = json.load(open(ethnoset_filename, 'r'))
    for qid, attribdict in articles.iteritems():
        for attribname, langdict in attribdict.iteritems():
            if attribname == 'wikitext':
                for lang, wikitext in langdict.iteritems():
                    if wikitext:
                        total_articles[lang] += 1
                        wikicode = pfh.parse(wikitext)
                        temps = wikicode.filter_templates()
                        for temp in temps:
                            tempname = temp.name
                            tempname = tempname.strip().lower()
                            template_count[lang][tempname] += 1
    temp_df = pd.DataFrame(columns=['lang', 'tempname', 'freq'])
    for lang, temp_dict in template_count.iteritems():
        for tempname, tempcount in temp_dict.iteritems():
            freq = tempcount / float(total_articles[lang])
            temp_df = temp_df.append({'lang': lang, 'tempname': tempname, 'freq': freq}, ignore_index=True)
    # section_df = section_df.convert_objects(convert_numeric=True)
    top_templates = temp_df[temp_df.freq > 0.1]
    sort_temps = top_templates.sort(columns='freq', ascending=False)
    temps_dict = dict()
    for lang in template_count.iterkeys():
        try:
            temps_dict[lang] = sort_temps[sort_temps.lang == lang].iloc[:20].convert_objects(convert_numeric=True)
        except:
            temps_dict[lang] = sort_temps[sort_temps.lang == lang].convert_objects(convert_numeric=True)
    return temps_dict

ethnosaves = !ls ethnosave
filenames = map(lambda name: 'ethnosave/' + name, ethnosaves)
sort_dfs = map(top_sections_ethnoset, filenames)

def make_cite_plot():
    citedf = pd.DataFrame(columns=['setname', 'cite', 'freq'])
    for i in range(len(filenames)):
        for lang, df in temp_dfs[i].iteritems():
            if lang == 'en':
                df = df[(df.tempname == 'cite web') | (df.tempname == 'cite book') |
                        (df.tempname == 'cite news') | (df.tempname == 'cite journal')]
                setname = filenames[i][10:-16]
                tot = 0
                for row in df.iterrows():
                    cols = row[1]
                    tot += cols['freq']
                for row in df.iterrows():
                    cols = row[1]
                    citedf = citedf.append({'setname': setname, 'cite': cols['tempname'],
                                            'freq': cols['freq'] / float(tot)}, ignore_index=True)
    cite_dict = {"cite book": 536258, "cite journal": 328129, "cite news": 444447, "cite web": 1560207}
    globaltot = reduce(lambda a, b: a + b, cite_dict.itervalues())
    globaltotfloat = float(globaltot)
    globciteratio = map(lambda cd: (cd[0], cd[1] / globaltotfloat), cite_dict.iteritems())
    for cite in globciteratio:
        citetype, freq = cite[0], cite[1]
        citedf = citedf.append({'setname': 'English WP Global', 'cite': citetype, 'freq': freq}, ignore_index=True)
    citedf = citedf.convert_objects(convert_numeric=True)
    citepiv = citedf.pivot(index='setname', columns='cite')
    citeplot = citepiv.plot(kind='bar', stacked=True)
    citeplot.legend(('Citation type', 'Cite book', 'Cite journal', 'Cite news', 'Cite web'), loc=9)
    citeplot.figure.set_size_inches(12, 8)
    citeplot.set_xlabel('subject-nation')
    citeplot.set_title('Composition of Citation Type, by Subject-Nation')

ethnosaves = !ls ethnosave
filenames = map(lambda name: 'ethnosave/' + name, ethnosaves)
temp_dfs = map(top_templates_ethnoset, filenames)

for i in range(len(filenames)):
    for lang, df in temp_dfs[i].iteritems():
        pass
        # print ''
        # print filenames[i]
        # print df

def coordinate_frequencies():
    def coord_templates_ethnoset(ethnosavefile, coord_df):
        nameparts = ethnosavefile.split('-')
        subj = nameparts[0]
        dotparts = nameparts[1].split('.')
        nation = dotparts[0]
        total_articles = defaultdict(int)
        coord_articles = defaultdict(int)
        articles = json.load(open('ethnosave/' + ethnosavefile, 'r'))
        for qid, attribdict in articles.iteritems():
            for attribname, langdict in attribdict.iteritems():
                if attribname == 'wikitext':
                    for lang, wikitext in langdict.iteritems():
                        if wikitext:
                            total_articles[lang] += 1
                            wikicode = pfh.parse(wikitext)
                            temps = wikicode.filter_templates()
                            for temp in temps:
                                tempname = temp.name
                                tempname = tempname.strip().lower()
                                if tempname == 'coord':
                                    # count {{coord}} occurrences for this language
                                    coord_articles[lang] += 1
        for lang, coord_count in coord_articles.iteritems():
            freq = coord_count / float(total_articles[lang])
            coord_df = coord_df.append({'subj': subj, 'nation': nation, 'lang': lang,
                                        'coord_count': coord_count, 'freq': freq}, ignore_index=True)
        return coord_df

    coord_df = pd.DataFrame(columns=['subj', 'nation', 'lang', 'coord_count', 'freq'])
    ethnosaves = !ls ethnosave
    for ethnosave in ethnosaves:
        coord_df = coord_templates_ethnoset(ethnosave, coord_df)
    coord_df_sort = coord_df.sort(columns='freq', ascending=False)
    return HTML(coord_df_sort.to_html(index=False, columns=['subj', 'nation', 'lang', 'coord_count', 'freq']))
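# Quick sanity check of the {{coord}} detection used above, on a made-up snippet
# (illustrative only; not a real article).
snippet = pfh.parse(u"'''Kampala''' {{Coord|0.31|N|32.58|E|display=title}} is the capital.")
names = [t.name.strip().lower() for t in snippet.filter_templates()]
print 'coord' in names   # expected: True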