#------------------------------------------------------------------------------------------------------
# imports, logger initiation, loading persistent data
#------------------------------------------------------------------------------------------------------
from IPython.core.display import display, HTML
from IPython.display import clear_output, Markdown
from IPython.display import FileLink, FileLinks
import logging
import dill
# import qgrid
# from wikidata.client import Client
import pandas as pd
from pandas.compat import StringIO
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import ipywidgets as widgets
from urllib.parse import unquote
from urllib.parse import quote
import wikipedia
from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm
import ipywidgets as widgetsM
from ipywidgets import HBox, VBox
from ipywidgets import Button, Layout
from operator import itemgetter
from collections import OrderedDict
import time
from datetime import datetime
import html2text
import re
import sys
import os
import io
from Bio import Entrez
from urllib.error import HTTPError
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process
from itertools import chain
from collections import Counter
# Widen the notebook's main container to use the full browser width.
display(HTML("<style>.container {width:100% !important;}</style>"))
# Logger setup: DEBUG and above go to an HTML log file, while only ERROR
# and above are echoed to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh1 = logging.FileHandler('./logs/log.html',mode='w')
fh1.setLevel(logging.DEBUG)
# create console handler with a higher log level
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR) #------------------change your log setting here DEBUG INFO WARNING ERROR CRITICAL
# Records are wrapped in <p> tags because the log file is viewed as HTML.
formatter = logging.Formatter('<p> %(asctime)s - %(name)s - %(levelname)s - %(message)s <p>' , datefmt='%d-%b-%y %H:%M:%S')
fh1.setFormatter(formatter)
console_handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh1)
logger.addHandler(console_handler)
# Load the most recent pickled DataFrame from persistent storage. The
# "newest" file is picked by reverse lexicographic filename order —
# assumes filenames sort chronologically and the folder contains only
# pickle files (NOTE(review): a stray non-pickle file would be picked
# up too — TODO confirm).
listOfFiles = os.listdir('./persistent_storage/')
pkl_file = (sorted(listOfFiles, reverse=True)[0])
display(HTML(f"<h2>Loading results of previous run on <u>\
{time.ctime(os.path.getmtime(f'./persistent_storage/{pkl_file}'))}</u> from ./persistent_storage/{pkl_file}"))
logger.info(f"Loading df from ./persistent_storage/{pkl_file}")
#with open('./persistent_storage/df.dill', 'rb') as in_strm:
#    df = dill.load(in_strm)
#dill.load_session('./persistent_storage/dill_session')
df=pd.read_pickle(f"./persistent_storage/{pkl_file}")
df.info()
# def on_button_load_clicked(b):
# display(HTML(f"<h2>Loading df from ./persistent_storage/..."))
# logger.info(f"Loading df from ./persistent_storage...")
# #with open('./persistent_storage/df.dill', 'rb') as in_strm:
# # df = dill.load(in_strm)
# #dill.load_session('./persistent_storage/dill_session')
# df=pd.read_pickle("./persistent_storage/df.pkl")
# df.info()
# text_area.value=df.info()
# button_load = widgets.Button(description="Load data from persistent storage",layout=Layout(width='20%', height='80px'))
# button_load.style.button_color = 'lightgreen'
# button_load.on_click(on_button_load_clicked)
# display(button_load)
# text_area= widgets.Textarea(value=df.info(),placeholder='',description='Loaded data summary:',disabled=False)
# display(text_area)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12954 entries, 0 to 12953 Data columns (total 9 columns): WkD_disease 12954 non-null object WkD_diseaseLabel 12954 non-null object WP_en_article 4715 non-null object class 4525 non-null object importance 4515 non-null object taskForces 2600 non-null object cochrane_reviews_html 4715 non-null object cochrane_reviews 4715 non-null object talkPage_categories 4668 non-null object dtypes: object(9) memory usage: 910.9+ KB
#------------------------------------------------------------------------------------------------------
# Statistical summary generator
#------------------------------------------------------------------------------------------------------
# HTML/JS snippet prepended to every exported results page: pulls jQuery
# and DataTables 1.10.19 from CDNs and turns each <table class="results">
# into a sortable, searchable, paginated DataTable.
datatables_js_script="""
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
<script>
$(document).ready( function () {
$('table.results').dataTable({
"lengthMenu": [[10, 25, 50, -1], [10, 25, 50, "All"]], "bAutoWidth": false,
});
//$('table.dataframe').dataTable();
} );
</script>
"""
def make_clickable_PMID(val):
    """Render a PMID as an HTML anchor to its PubMed page.

    Opens in a new tab (target=_blank). Falsy input yields None so that
    empty table cells stay blank.
    """
    if val:
        return f'<a target="_blank" href="https://www.ncbi.nlm.nih.gov/pubmed/{val}">{val}</a>'
    return None
def imp2int(WP_imp,targetImportance):
    """Return 1 when the article importance matches targetImportance, else 0.

    A missing/falsy importance rating matches the sentinel 'Unknown'.
    """
    if WP_imp:
        return 1 if targetImportance == WP_imp else 0
    return 1 if targetImportance == "Unknown" else 0
def class2int(WP_class,targetClass):
    """Return 1 when the article quality class matches targetClass, else 0.

    A missing/falsy class matches the sentinel 'Unknown'; the sentinel
    'Other' additionally matches the List/Disambig/FL classes.
    """
    if not WP_class:
        return 1 if targetClass == "Unknown" else 0
    if targetClass == WP_class:
        return 1
    if targetClass == "Other" and WP_class in ['List','Disambig','FL']:
        return 1
    return 0
# High-level counts of Wikidata diseases and their Wikipedia coverage.
display(Markdown("# Statistical Summaries:"))
display(Markdown(f"### Total number of diseases identified in Wikidata: {df.shape[0]}"))
display(Markdown(f"### Total number of diseases identified in Wikidata which have a corresponding article in Wikipedia:\
{df[df['WP_en_article'].notnull()].shape[0]}"))
display(Markdown("-----------------------------------------------------"))
# Distribution of WikiProject quality classes among existing articles
# (value_counts column keeps the original name, hence the rename below).
display(Markdown("### Wikipedia articles class:"))
display(df.loc[df['WP_en_article'].notnull()]['class'].value_counts(dropna=False).to_frame().reset_index().rename_axis('index').\
rename(columns = {'class': 'Total count','index': 'Article quality class'}).\
sort_values(by = 'Total count', ascending = False).style.hide_index())
# Distribution of WikiProject importance ratings among existing articles.
display(Markdown("### Wikipedia articles importance:"))
display(df.loc[df['WP_en_article'].notnull()]['importance'].value_counts(dropna=False).to_frame().reset_index().rename_axis('index').\
rename(columns = {'importance': 'Total count','index': 'Article quality category'}).\
sort_values(by = 'Total count', ascending = False).style.hide_index())
display(Markdown('### Wikipedia articles taskForces:'))
#display(pd.Series(Counter(chain.from_iterable( df.loc[df['taskForces'].notnull()]['taskForces'] ) )))
# Tally task-force membership. An article can belong to several task
# forces (so totals can exceed the article count); articles with none
# are counted under the 'NA' key.
taskForces_count = {'NA':0}
for taskForces in df.loc[df['WP_en_article'].notnull()]['taskForces']:
    if taskForces:
        for taskForce in taskForces:
            if taskForce in taskForces_count:
                taskForces_count[taskForce] += 1
            else :
                taskForces_count[taskForce] = 1
    else:
        taskForces_count['NA'] += 1
display(pd.DataFrame(taskForces_count.items(), columns = ['taskForce', 'Total count']).style.hide_index())
display(Markdown("-----------------------------------------------------"))
# Articles that cite at least one Cochrane review vs. those that cite
# none (an empty dict {} means reviews were searched but none cited).
display(Markdown(f"### Number of disease related Wikipedia articles which cite one or more Cochrane reviews: \
{df[df['cochrane_reviews'].notnull()].shape[0] - df[(df['cochrane_reviews']=={})].shape[0] }"))
display(Markdown(f"### Number of disease related Wikipedia articles which do not cite any Cochrane reviews: \
{ df[(df['cochrane_reviews']=={})].shape[0] }"))
display(Markdown("-----------------------------------------------------"))
#------------------------------------------------------------------------------------------------------
# PMID stats - PMIDs of Cochrane reviews (LATEST versions only)
#------------------------------------------------------------------------------------------------------
# Aggregate per-PMID citation statistics across all disease rows, keeping
# only review entries flagged as the LATEST version.
# Each cochrane_reviews dict maps PMID -> (title, cited-flag, latest-flag).
# Each uniquePMIDs_count value is a list laid out as:
#   [0] title, [1] times seen in search results, [2] times cited,
#   [3] latest-version flag (truthy throughout this section),
#   [4..8] citations bucketed by article importance (Top/High/Mid/Low/Unknown),
#   [9..17] citations bucketed by article quality
#          (FA/A/GA/B/C/Start/Stub/Other/Unknown).
uniquePMIDs_count={}
for index, row in df.iterrows():
    WP_class = row['class']
    WP_imp = row['importance']
    PIMD_paper_dict=row['cochrane_reviews']
    if PIMD_paper_dict:
        for PMID, paper in PIMD_paper_dict.items():
            title = paper[0]
            cited = paper[1]
            latestVersion = paper[2]
            if latestVersion:
                if PMID in uniquePMIDs_count:
                    # Repeat sighting: bump the appearance counter and add
                    # this article's citation to the matching buckets.
                    uniquePMIDs_count[PMID]=[title, uniquePMIDs_count[PMID][1]+1, uniquePMIDs_count[PMID][2]+int(cited),int(latestVersion)\
                    , uniquePMIDs_count[PMID][4]+(imp2int(WP_imp,'1-Top')*int(cited))
                    , uniquePMIDs_count[PMID][5]+(imp2int(WP_imp,'2-High')*int(cited))
                    , uniquePMIDs_count[PMID][6]+(imp2int(WP_imp,'3-Mid')*int(cited))
                    , uniquePMIDs_count[PMID][7]+(imp2int(WP_imp,'4-Low')*int(cited))
                    , uniquePMIDs_count[PMID][8]+(imp2int(WP_imp,'Unknown')*int(cited))
                    , uniquePMIDs_count[PMID][9]+(class2int(WP_class,'FA')*int(cited))
                    , uniquePMIDs_count[PMID][10]+(class2int(WP_class,'A')*int(cited))
                    , uniquePMIDs_count[PMID][11]+(class2int(WP_class,'GA')*int(cited))
                    , uniquePMIDs_count[PMID][12]+(class2int(WP_class,'B')*int(cited))
                    , uniquePMIDs_count[PMID][13]+(class2int(WP_class,'C')*int(cited))
                    , uniquePMIDs_count[PMID][14]+(class2int(WP_class,'Start')*int(cited))
                    , uniquePMIDs_count[PMID][15]+(class2int(WP_class,'Stub')*int(cited))
                    , uniquePMIDs_count[PMID][16]+(class2int(WP_class,'Other')*int(cited))
                    , uniquePMIDs_count[PMID][17]+(class2int(WP_class,'Unknown')*int(cited))]
                else:
                    # First sighting of this PMID: initialise every bucket.
                    uniquePMIDs_count[PMID]=[title, 1, int(cited),int(latestVersion)
                    , imp2int(WP_imp,'1-Top')*int(cited)
                    , imp2int(WP_imp,'2-High')*int(cited)
                    , imp2int(WP_imp,'3-Mid')*int(cited)
                    , imp2int(WP_imp,'4-Low')*int(cited)
                    , imp2int(WP_imp,'Unknown')*int(cited)
                    , class2int(WP_class,'FA')*int(cited)
                    , class2int(WP_class,'A')*int(cited)
                    , class2int(WP_class,'GA')*int(cited)
                    , class2int(WP_class,'B')*int(cited)
                    , class2int(WP_class,'C')*int(cited)
                    , class2int(WP_class,'Start')*int(cited)
                    , class2int(WP_class,'Stub')*int(cited)
                    , class2int(WP_class,'Other')*int(cited)
                    , class2int(WP_class,'Unknown')*int(cited)]
# Publish summary 1 — review entries flagged as the LATEST version.
# NOTE(fix): the 'index' column label below previously read
# "(old versions only)" — a copy/paste slip from summary 2 — which also
# meant the clickable-PMID formatter keyed "(latest versions only)" in
# the HTML export referenced a non-existent column. The label is now
# "(latest versions only)" and both formatters use it consistently.
display(Markdown('### PMIDs statistical summary 1 (LATEST VERSIONS of Cochrane reviews found and/or cited):'))
PMIDs_df = pd.DataFrame.from_dict(uniquePMIDs_count, orient='index').reset_index().rename_axis('index')\
    .rename(columns = {'index': 'PMIDs of Cochrane reviews (latest versions only)'
                       ,0: 'Title'
                       ,1: 'Times appeared in search results'
                       ,2: 'Times cited in Wikipedia'
                       ,3: 'Version (1=latest, 0=old)'
                       ,4: 'Times cited in a Top Importance WP articles'
                       ,5: 'Times cited in a High Importance WP articles'
                       ,6: 'Times cited in a Mid Importance WP articles'
                       ,7: 'Times cited in a Low Importance WP articles'
                       ,8: 'Times cited in an Unknown Importance WP articles'
                       ,9: 'Times cited in a FA quality WP articles'
                       ,10: 'Times cited in a A quality WP articles'
                       ,11: 'Times cited in a GA quality WP articles'
                       ,12: 'Times cited in a B quality WP articles'
                       ,13: 'Times cited in a C quality WP articles'
                       ,14: 'Times cited in a Start quality WP articles'
                       ,15: 'Times cited in a Stub quality WP articles'
                       ,16: 'Times cited in a Other quality WP articles'
                       ,17: 'Times cited in an Unknown quality WP articles'
                       }).sort_values(by = 'Times cited in Wikipedia', ascending = False)
# Grand-total row: sums for the count columns; for the version-flag
# column we report the number of PMIDs instead of a sum.
PMIDs_df.loc['Total'] = ['N/A','N/A',PMIDs_df['Times appeared in search results'].sum()
                         ,PMIDs_df['Times cited in Wikipedia'].sum()
                         ,PMIDs_df['Version (1=latest, 0=old)'].count()
                         ,PMIDs_df['Times cited in a Top Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a High Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a Mid Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a Low Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in an Unknown Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a FA quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a A quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a GA quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a B quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a C quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Start quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Stub quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Other quality WP articles'].sum()
                         ,PMIDs_df['Times cited in an Unknown quality WP articles'].sum()
                         ]
# Preview the 10 most-cited PMIDs inline, with PMIDs rendered as PubMed links.
display(PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False).head(10)
        .style.set_table_attributes('class="results"')
        .format({'PMIDs of Cochrane reviews (latest versions only)': make_clickable_PMID}))
display(HTML('<h2>⋮</h2>'))
# Export the complete table as a standalone, DataTables-enhanced HTML page.
with open("./results/PMIDs_latestVersions_only.html","w") as f:
    f.write(datatables_js_script + PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False)
            .style.set_table_attributes('class="results"')
            .format({'PMIDs of Cochrane reviews (latest versions only)': make_clickable_PMID}).render())
display(HTML('<H3>See the complete table here (sort, filter, and search):</H3>')
        ,FileLink('./results/PMIDs_latestVersions_only.html', result_html_prefix='<b>', result_html_suffix='</b>'))
display(Markdown("-----------------------------------------------------"))
#------------------------------------------------------------------------------------------------------
# PMID stats - PMIDs of Cochrane reviews (old versions only)
#------------------------------------------------------------------------------------------------------
# Aggregate per-PMID citation statistics again, this time keeping only
# review entries flagged as OLD (superseded) versions.
# Each cochrane_reviews dict maps PMID -> (title, cited-flag, latest-flag).
# Each uniquePMIDs_count value is a list laid out as:
#   [0] title, [1] times seen in search results, [2] times cited,
#   [3] latest-version flag (falsy throughout this section),
#   [4..8] citations bucketed by article importance (Top/High/Mid/Low/Unknown),
#   [9..17] citations bucketed by article quality
#          (FA/A/GA/B/C/Start/Stub/Other/Unknown).
uniquePMIDs_count={}
for index, row in df.iterrows():
    WP_class = row['class']
    WP_imp = row['importance']
    PIMD_paper_dict=row['cochrane_reviews']
    if PIMD_paper_dict:
        for PMID, paper in PIMD_paper_dict.items():
            title = paper[0]
            cited = paper[1]
            latestVersion = paper[2]
            if not latestVersion:
                if PMID in uniquePMIDs_count:
                    # Repeat sighting: bump the appearance counter and add
                    # this article's citation to the matching buckets.
                    uniquePMIDs_count[PMID]=[title, uniquePMIDs_count[PMID][1]+1, uniquePMIDs_count[PMID][2]+int(cited),int(latestVersion)\
                    , uniquePMIDs_count[PMID][4]+(imp2int(WP_imp,'1-Top')*int(cited))
                    , uniquePMIDs_count[PMID][5]+(imp2int(WP_imp,'2-High')*int(cited))
                    , uniquePMIDs_count[PMID][6]+(imp2int(WP_imp,'3-Mid')*int(cited))
                    , uniquePMIDs_count[PMID][7]+(imp2int(WP_imp,'4-Low')*int(cited))
                    , uniquePMIDs_count[PMID][8]+(imp2int(WP_imp,'Unknown')*int(cited))
                    , uniquePMIDs_count[PMID][9]+(class2int(WP_class,'FA')*int(cited))
                    , uniquePMIDs_count[PMID][10]+(class2int(WP_class,'A')*int(cited))
                    , uniquePMIDs_count[PMID][11]+(class2int(WP_class,'GA')*int(cited))
                    , uniquePMIDs_count[PMID][12]+(class2int(WP_class,'B')*int(cited))
                    , uniquePMIDs_count[PMID][13]+(class2int(WP_class,'C')*int(cited))
                    , uniquePMIDs_count[PMID][14]+(class2int(WP_class,'Start')*int(cited))
                    , uniquePMIDs_count[PMID][15]+(class2int(WP_class,'Stub')*int(cited))
                    , uniquePMIDs_count[PMID][16]+(class2int(WP_class,'Other')*int(cited))
                    , uniquePMIDs_count[PMID][17]+(class2int(WP_class,'Unknown')*int(cited))]
                else:
                    # First sighting of this PMID: initialise every bucket.
                    uniquePMIDs_count[PMID]=[title, 1, int(cited),int(latestVersion)
                    , imp2int(WP_imp,'1-Top')*int(cited)
                    , imp2int(WP_imp,'2-High')*int(cited)
                    , imp2int(WP_imp,'3-Mid')*int(cited)
                    , imp2int(WP_imp,'4-Low')*int(cited)
                    , imp2int(WP_imp,'Unknown')*int(cited)
                    , class2int(WP_class,'FA')*int(cited)
                    , class2int(WP_class,'A')*int(cited)
                    , class2int(WP_class,'GA')*int(cited)
                    , class2int(WP_class,'B')*int(cited)
                    , class2int(WP_class,'C')*int(cited)
                    , class2int(WP_class,'Start')*int(cited)
                    , class2int(WP_class,'Stub')*int(cited)
                    , class2int(WP_class,'Other')*int(cited)
                    , class2int(WP_class,'Unknown')*int(cited)]
# Publish summary 2 — superseded (OLD) review versions only. The
# "(old versions only)" column label matches this section's content.
display(Markdown(f'### PMIDs statistical summary 2 (OLD VERSIONS of Cochrane reviews found and/or cited):'))
PMIDs_df= pd.DataFrame.from_dict(uniquePMIDs_count,orient='index').reset_index().rename_axis('index')\
.rename(columns = {'index': 'PMIDs of Cochrane reviews (old versions only)'
,0: 'Title'
,1: 'Times appeared in search results'
,2: 'Times cited in Wikipedia'
,3: 'Version (1=latest, 0=old)'
,4: 'Times cited in a Top Importance WP articles'
,5: 'Times cited in a High Importance WP articles'
,6: 'Times cited in a Mid Importance WP articles'
,7: 'Times cited in a Low Importance WP articles'
,8: 'Times cited in an Unknown Importance WP articles'
,9: 'Times cited in a FA quality WP articles'
,10: 'Times cited in a A quality WP articles'
,11: 'Times cited in a GA quality WP articles'
,12: 'Times cited in a B quality WP articles'
,13: 'Times cited in a C quality WP articles'
,14: 'Times cited in a Start quality WP articles'
,15: 'Times cited in a Stub quality WP articles'
,16: 'Times cited in a Other quality WP articles'
,17: 'Times cited in an Unknown quality WP articles'
}).sort_values(by = 'Times cited in Wikipedia', ascending = False)
# Grand-total row: sums for the count columns; for the version-flag
# column the row count is reported instead of a sum.
PMIDs_df.loc['Total'] = ['N/A','N/A',PMIDs_df['Times appeared in search results'].sum()\
,PMIDs_df['Times cited in Wikipedia'].sum()
#,PMIDs_df['Latest version (1=Yes, 0=No)'].sum()
,PMIDs_df['Version (1=latest, 0=old)'].count()
,PMIDs_df['Times cited in a Top Importance WP articles'].sum()
,PMIDs_df['Times cited in a High Importance WP articles'].sum()
,PMIDs_df['Times cited in a Mid Importance WP articles'].sum()
,PMIDs_df['Times cited in a Low Importance WP articles'].sum()
,PMIDs_df['Times cited in an Unknown Importance WP articles'].sum()
,PMIDs_df['Times cited in a FA quality WP articles'].sum()
,PMIDs_df['Times cited in a A quality WP articles'].sum()
,PMIDs_df['Times cited in a GA quality WP articles'].sum()
,PMIDs_df['Times cited in a B quality WP articles'].sum()
,PMIDs_df['Times cited in a C quality WP articles'].sum()
,PMIDs_df['Times cited in a Start quality WP articles'].sum()
,PMIDs_df['Times cited in a Stub quality WP articles'].sum()
,PMIDs_df['Times cited in a Other quality WP articles'].sum()
,PMIDs_df['Times cited in an Unknown quality WP articles'].sum()
]
# Preview the 10 most-cited PMIDs inline, with PMIDs rendered as PubMed links.
display(PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False).head(10)
.style.set_table_attributes('class="results"')\
.format({'PMIDs of Cochrane reviews (old versions only)': make_clickable_PMID}))
display(HTML('<h2>⋮</h2>'))
# Export the complete table as a standalone, DataTables-enhanced HTML page.
f=open("./results/PMIDs_oldVersions_only.html","w")
f.write(datatables_js_script + PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False)
.style.set_table_attributes('class="results"')\
.format({'PMIDs of Cochrane reviews (old versions only)': make_clickable_PMID}).render())
f.close()
display(HTML('<H3>See the complete table here (sort, filter, and search):</H3>')
,FileLink('./results/PMIDs_oldVersions_only.html', result_html_prefix='<b>', result_html_suffix='</b>'))
display(Markdown("-----------------------------------------------------"))
Article quality class | Total count |
---|---|
Start | 1829 |
Stub | 1035 |
C | 991 |
B | 541 |
nan | 190 |
GA | 79 |
FA | 23 |
List | 16 |
Disambig | 11 |
Article quality category | Total count |
---|---|
3-Mid | 2151 |
4-Low | 2007 |
2-High | 292 |
nan | 200 |
1-Top | 65 |
taskForce | Total count |
---|---|
NA | 2115 |
Ophthalmology | 237 |
Dermatology | 951 |
Toxicology | 11 |
Cardiology | 186 |
Neurology | 398 |
Pathology | 247 |
Medical genetics | 462 |
Pulmonology | 65 |
Reproductive medicine | 72 |
Hematology-oncology | 251 |
Psychiatry | 74 |
Gastroenterology | 52 |
Nephrology | 68 |
Livestock | 2 |
Radiology | 4 |
Applied Linguistics | 1 |
Emergency medicine and EMS | 6 |
Ethics | 1 |
Sustainability | 1 |
Military logistics and medicine | 1 |
World War I | 1 |
Balkan military history | 1 |
European military history | 1 |
German military history | 1 |
World War II | 1 |
Theoretical Linguistics | 1 |
PMIDs of Cochrane reviews (old versions only) | Title | Times appeared in search results | Times cited in Wikipedia | Version (1=latest, 0=old) | Times cited in a Top Importance WP articles | Times cited in a High Importance WP articles | Times cited in a Mid Importance WP articles | Times cited in a Low Importance WP articles | Times cited in an Unknown Importance WP articles | Times cited in a FA quality WP articles | Times cited in a A quality WP articles | Times cited in a GA quality WP articles | Times cited in a B quality WP articles | Times cited in a C quality WP articles | Times cited in a Start quality WP articles | Times cited in a Stub quality WP articles | Times cited in a Other quality WP articles | Times cited in an Unknown quality WP articles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||
Total | N/A | N/A | 15382 | 1143 | 6894 | 425 | 391 | 286 | 30 | 11 | 86 | 0 | 288 | 521 | 169 | 62 | 5 | 1 | 11 |
1545 | 18254088 | Intravitreal steroids for macular edema in diabetes. | 4 | 2 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
5718 | 27245310 | Speech and language therapy for aphasia following stroke. | 2 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
4633 | 26241698 | Post-pyloric versus gastric tube feeding for preventing pneumonia and improving nutritional outcomes in critically ill adults. | 2 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
5015 | 29664187 | Exercise interventions and patient beliefs for people with hip, knee or hip and knee osteoarthritis: a mixed methods review. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
5022 | 27103611 | Topical NSAIDs for chronic musculoskeletal pain in adults. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
4713 | 26824399 | Surgical versus non-surgical treatment for lumbar spinal stenosis. | 3 | 2 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 |
1148 | 16856036 | Self-help and guided self-help for eating disorders. | 3 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
2661 | 20927726 | Grommets (ventilation tubes) for hearing loss associated with otitis media with effusion in children. | 2 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
6254 | 24170669 | Blood pressure targets for hypertension in people with diabetes mellitus. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
PMIDs of Cochrane reviews (old versions only) | Title | Times appeared in search results | Times cited in Wikipedia | Version (1=latest, 0=old) | Times cited in a Top Importance WP articles | Times cited in a High Importance WP articles | Times cited in a Mid Importance WP articles | Times cited in a Low Importance WP articles | Times cited in an Unknown Importance WP articles | Times cited in a FA quality WP articles | Times cited in a A quality WP articles | Times cited in a GA quality WP articles | Times cited in a B quality WP articles | Times cited in a C quality WP articles | Times cited in a Start quality WP articles | Times cited in a Stub quality WP articles | Times cited in a Other quality WP articles | Times cited in an Unknown quality WP articles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||
Total | N/A | N/A | 5103 | 34 | 2770 | 4 | 10 | 15 | 4 | 1 | 2 | 0 | 6 | 11 | 11 | 1 | 2 | 0 | 1 |
110 | 24142399 | Pharmacological treatment for pain in Guillain-Barré syndrome. | 14 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
480 | 25102015 | Antioxidant supplementation for lung disease in cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1943 | 26174592 | Assisted reproductive technology: an overview of Cochrane Reviews. | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1242 | 27089005 | Non-absorbable disaccharides versus placebo/no intervention and lactulose versus lactitol for the prevention and treatment of hepatic encephalopathy in people with cirrhosis. | 5 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2554 | 22161393 | Progestogen for treating threatened miscarriage. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
502 | 27552284 | Inhaled corticosteroids for cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1229 | 27884041 | Follow-up strategies for patients treated for non-metastatic colorectal cancer. | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
199 | 26932750 | Anti-vascular endothelial growth factor (VEGF) drugs for treatment of retinopathy of prematurity. | 5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
530 | 25093421 | Pneumococcal vaccines for cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
#------------------------------------------------------------------------------------------------------
# tabular data visualization, and storage in CSV and HTML format (find these files in ./results folder)
#------------------------------------------------------------------------------------------------------
# README text for the project's GitHub page: Binder launch badges plus
# links to the generated result files (served via GitHub Pages).
README="""
# WPM2Cochrane - a tool for linking WikiProject Medicine to the Cochrane Library
## Launch in JupyterLab (recommended) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ajoorabchi/WP2Cochrane/master?urlpath=lab/tree/index.ipynb)
## Launch in Jupyter Notebook [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ajoorabchi/WP2Cochrane/master?filepath=index.ipynb)
## Results of linking (full dataset)
- [Complete Tabular Results in HTML](https://ajoorabchi.github.io/WP2Cochrane/results/full_data.html)
- [Complete Tabular Results in CSV](https://ajoorabchi.github.io/WP2Cochrane/results/full_data.csv)
- [Complete Tabular Results in CSV (SPSS-friendly version)](https://ajoorabchi.github.io/WP2Cochrane/results/full_data_SPSS-friendy.csv)
- [PMIDs statistical summary 1 (LATEST VERSIONS of Cochrane reviews found and/or cited)](https://ajoorabchi.github.io/WP2Cochrane/results/PMIDs_latestVersions_only.html)
- [PMIDs statistical summary 2 (OLD VERSIONS of Cochrane reviews found and/or cited)](https://ajoorabchi.github.io/WP2Cochrane/results/PMIDs_oldVersions_only.html)
## Results of linking (Specialized HTML results per task force)
"""
# Re-declared DataTables bootstrap snippet for the full-data tables;
# this variant additionally pins column 7 (the Cochrane-reviews HTML
# column) to full width.
datatables_js_script="""
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
<script>
$(document).ready( function () {
$('table.results').dataTable({
"lengthMenu": [[10, 25, 50, -1], [10, 25, 50, "All"]], "bAutoWidth": false,
"columnDefs": [
{ "width": "100%", "targets": 7 }
]
});
//$('table.dataframe').dataTable();
} );
</script>
"""
def get_df_info(content: pd.DataFrame):
    """Capture ``DataFrame.info()`` output as text plus a parsed table.

    Adapted from: https://stackoverflow.com/a/44087453/2339926

    NOTE(fix): uses ``io.StringIO`` (already imported at module level)
    instead of ``pandas.compat.StringIO``, which was removed from public
    pandas, and a whitespace-regex ``sep`` instead of the deprecated
    ``delim_whitespace`` option. Parsing behaviour is otherwise unchanged.

    Parameters
    ----------
    content : pd.DataFrame
        Frame whose ``info()`` output should be captured.

    Returns
    -------
    tuple
        ``(info, datatypes)`` — ``info`` is the header/footer summary
        lines of ``info()``; ``datatypes`` is a DataFrame of the
        per-column rows, indexed by column name.
    """
    content_info = io.StringIO()
    content.info(buf=content_info)
    str_ = content_info.getvalue()
    lines = str_.split("\n")
    # The middle lines hold one row per column; the first 3 and last 3
    # lines are summary text (class, index range, dtypes, memory usage).
    table = io.StringIO("\n".join(lines[3:-3]))
    datatypes = pd.read_csv(table, sep=r"\s+",
                            names=["column", "count", "null", "dtype"])
    datatypes.set_index("column", inplace=True)
    info = "\n".join(lines[0:2] + lines[-2:-1])
    return info, datatypes
def make_clickable_wkd_items(val):
    """Render a Wikidata entity URL as an HTML link labelled with its Q-id.

    The label is everything after the fixed 31-character
    'http://www.wikidata.org/entity/' prefix; opens in a new tab.
    """
    entity_id = val[31:]
    return f'<a target="_blank" href="{val}">{entity_id}</a>'
def make_clickable_taskForces(val):
    """Render a list of task-force names as HTML links to their WikiProject
    Medicine pages (one <p> per task force, opening in a new tab).

    Returns None for a falsy value (no task forces) so table cells stay blank.
    """
    if not val:
        return None
    anchors = [
        f'<p><a target="_blank" href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/{name.replace(" ","_")}_task_force">{name}</a></p>'
        for name in val
    ]
    return "".join(anchors)
def make_clickable_WP_en_articles(val):
    """Render a Wikipedia article URL as an HTML link labelled with the
    human-readable title (URL-decoded, underscores turned into spaces).

    The title is everything after the fixed 30-character
    'https://en.wikipedia.org/wiki/' prefix. Falsy input yields None.
    """
    if not val:
        return None
    label = unquote(unquote(val[30:].replace("_", " ")))
    return f'<a target="_blank" href="{val}">{label}</a>'
def list2text(list_):
    """Join a list of strings into a single ", "-separated string.

    Returns "" for an empty list. Replaces the manual concatenate-and-trim
    loop (quadratic string building) with the idiomatic str.join, which
    produces identical output.
    """
    return ", ".join(list_)
#------------------------------------------------------------------------------------------------------
# full-data.csv generator
#------------------------------------------------------------------------------------------------------
# Export the full dataset to CSV, adding plain-text renderings of the
# HTML reviews column and of the task-force list for spreadsheet use.
h = html2text.HTML2Text()
h.ignore_links = False
df_plainText =df.copy()
df_plainText['cochrane_reviews_plainText'] = [h.handle(text) if text is not None else text for text in df['cochrane_reviews_html']]
df_plainText['taskForces_plainText'] = [list2text(text) if text is not None else text for text in df['taskForces']]
df_plainText.to_csv('./results/full_data.csv', index=False)
#------------------------------------------------------------------------------------------------------
# full_data_SPSS-friendy.csv generator (one review per row)
#------------------------------------------------------------------------------------------------------
df_spss_friendly=pd.DataFrame()
def _spss_row(row, pmid, title, cited, latest_version,
              search_results_count, cited_count, outofdate_cited_count):
    """Build a single-row DataFrame for one (disease, review) pair.

    The row Series is transposed into a one-row frame, the raw
    'cochrane_reviews' dict column is dropped, and the per-review /
    per-article summary columns are appended.
    """
    data = pd.DataFrame(row.items())
    data = data.transpose()
    data.columns = data.iloc[0]          # first transposed row holds the column names...
    data = data.drop(data.index[[0]])    # ...so remove it from the data rows
    data = data.drop(columns=['cochrane_reviews'])
    data['PMID'] = pmid
    data['Title'] = title
    data['Cited'] = cited
    data['LatestVersion'] = latest_version
    data['Search results count'] = search_results_count
    data['Cited count'] = cited_count
    data['outofdate_cited_count'] = outofdate_cited_count
    return data
# One output row per (disease, Cochrane review) pair; diseases with no
# reviews found get a single row of None placeholders.
# NOTE(fix): the original accumulated rows with DataFrame.append inside the
# loop, which is quadratic and was removed in pandas 2.0 — rows are now
# collected in a list and concatenated once at the end.
_spss_frames = []
for index, row in tqdm(df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class'
                           ,'importance','taskForces','cochrane_reviews','talkPage_categories']].iterrows()):
    PIMD_paper_dict = row['cochrane_reviews']
    if PIMD_paper_dict:
        # Per-article aggregates, repeated on each review row of the article.
        search_results_count = len(PIMD_paper_dict)
        cited_count = 0
        outofdate_cited_count = 0
        for paper in PIMD_paper_dict.values():
            cited = paper[1]
            latestVersion = paper[2]
            if cited:
                cited_count += 1
                if not latestVersion:
                    outofdate_cited_count += 1
        for PMID, paper in PIMD_paper_dict.items():
            _spss_frames.append(_spss_row(row, PMID, paper[0], paper[1], paper[2],
                                          search_results_count, cited_count,
                                          outofdate_cited_count))
    else:
        _spss_frames.append(_spss_row(row, None, None, None, None,
                                      None, None, None))
if _spss_frames:
    df_spss_friendly = pd.concat(_spss_frames)
# Filename typo ("friendy") kept deliberately: the published README links
# reference this exact name.
df_spss_friendly.to_csv('./results/full_data_SPSS-friendy.csv', index=False)
#------------------------------------------------------------------------------------------------------
# full-data.html generator
#------------------------------------------------------------------------------------------------------
# Per-selector CSS for the styled results tables: left-aligned, fixed max
# widths, top vertical alignment. (NOTE(review): 'blod' in the th font
# shorthand looks like a typo for 'bold' — kept as-is in this doc pass.)
th = dict(selector="th", props=[('text-align', 'left'),('font','blod 14px arial, sans-serif'),('vertical-align','top')])
rh = dict(selector=".row_heading", props=[("text-align", "left"),('font','bold 14px arial, sans-serif'),('vertical-align','top')])
col0 = dict(selector=".col0", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','100px'),('vertical-align','top')])
col1 = dict(selector=".col1", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','130px'),('vertical-align','top')])
col2 = dict(selector=".col2", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','110px'),('vertical-align','top'),('word-wrap','break-word')])
col3 = dict(selector=".col3", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','60px'),('vertical-align','top'),('word-wrap','break-word')])
col4 = dict(selector=".col4", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])
col5 = dict(selector=".col5", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])
col6 = dict(selector=".col6", props=[("text-align", "left"),('font','12px arial, sans-serif'),('vertical-align','top')])
# head(fullSize) effectively keeps every row; swap in testSize for a quick
# 10-row test render.
fullSize = sys.maxsize
testSize = 10
df_vizTable = df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class','importance','taskForces','cochrane_reviews_html']]
# Styled view: importance-sorted, with Wikidata items, Wikipedia articles
# and task forces rendered as clickable links.
df_vizTable_styled=df_vizTable\
    .head(fullSize).rename_axis('index')\
    .sort_values(by=['importance'],na_position='last')\
    .style.set_table_styles([th,rh,col0,col1,col2,col3,col4,col5,col6]).set_table_attributes('class="results"')\
    .format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles,
             'taskForces': make_clickable_taskForces})
# Legend shown above the exported table explaining row grouping and the
# per-review colour coding.
tableGuide='<H3>Table Guide:</H3>\
<p><b>Grouping:</b></p>\
<p>In cases, where there are multiple version of a Cochrane review, they are grouped toghther (showing the same background color)\
,and are listed chronologically,latest version first.</p>\
<p><b>Color Codes:</b></p>\
<ol>\
<li><p style="color:Green;">Green: up-to-date and CITED</p></li>\
<li><p style="color:Red;"> Red: up-to-date and NOT CITED</p></li>\
<li><p style="color:Orange;">Orange: out-of-date and CITED</p></li>\
<li><p style="color:Grey;">Grey: out-of-date and NOT CITED</p></li>\
</ol><hr>'
# Assemble the standalone HTML page: DataTables bootstrap + info summary +
# dtype table + legend + size notice + the styled results table.
f=open("./results/full_data.html","w")
info,datatypes = get_df_info(df_vizTable)
#display(datatypes,datatypes.at['WkD_disease','count'])
table_size_message=f"<H3>Results Table contains <i>{datatypes.at['WkD_disease','count']}</i> rows</H3><hr>"
table_size_warning=f"<H3>This is a large table, so it could take up to 30 seconds to fully load and render in your browser</H3><hr>"
if datatypes.at['WkD_disease','count']>1000:
    table_size_message += table_size_warning
# NOTE(review): Styler.render() is the pre-pandas-1.3 API (to_html() today).
f.write(datatables_js_script + info + datatypes.to_html() + tableGuide + table_size_message
        + df_vizTable_styled.render())
f.close()
#------------------------------------------------------------------------------------------------------
# Creates a dedicated HTML results file for each WikiProject medicine task force group in Wikipedia
#------------------------------------------------------------------------------------------------------
for taskForce, count in taskForces_count.items():
    if taskForce!="NA":
        # Keep only rows whose taskForces list mentions this task force
        # (null taskForces entries are dropped first so the membership lambda is safe).
        vizTable_per_taskForce = df_vizTable.loc[df_vizTable['taskForces'].notnull()]
        mask = vizTable_per_taskForce.taskForces.apply(lambda x: taskForce in x)
        vizTable_per_taskForce = vizTable_per_taskForce[mask]
        # Same sorting / styling / clickable-link formatting as the full table.
        vizTable_per_taskForce_styled = vizTable_per_taskForce.rename_axis('index')\
        .sort_values(by=['importance'],na_position='last')\
        .style.set_table_styles([th,rh,col0,col1,col2,col3,col4,col5,col6]).set_table_attributes('class="results"')\
        .format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles,
                'taskForces': make_clickable_taskForces})
        info,datatypes = get_df_info(vizTable_per_taskForce)
        #display(datatypes,datatypes.at['WkD_disease','count'])
        table_size_message=f"<H3>Results Table contains <i>{datatypes.at['WkD_disease','count']}</i> rows</H3><hr>"
        # (no placeholders here, so no f-prefix needed)
        table_size_warning="<H3>This is a large table so it needs ~10s to fully load and render in your browser</H3><hr>"
        if datatypes.at['WkD_disease','count']>1000:
            table_size_message += table_size_warning
        # Context manager guarantees each per-task-force file is closed even if rendering raises.
        with open(f"./results/HTML_results_per_task_force/{taskForce}_taskForce.html","w") as f:
            f.write(datatables_js_script + info + datatypes.to_html() + tableGuide + table_size_message
                    + vizTable_per_taskForce_styled.render())
        # Append a link to this page to the README index (URL-quote the name for the link).
        README += f"\n- [{taskForce}](https://ajoorabchi.github.io/WP2Cochrane/results/HTML_results_per_task_force/{quote(taskForce)}_taskForce.html)"
#------------------------------------------------------------------------------------------------------
# display CSV & HTML view/download options
#------------------------------------------------------------------------------------------------------
# List the generated files in ./results/ as clickable download links in the notebook.
display(HTML('<H3>Complete Tabular Results in CSV or HTML Format:</H3>'),FileLinks('./results/'\
            , result_html_prefix='<li><b>', result_html_suffix='</b></li>',recursive=False))
# Same for the per-task-force pages.
# NOTE(review): './results//HTML_results_per_task_force/' contains a doubled
# slash — harmless on POSIX paths, but worth normalising.
display(HTML('<H3>Specialized HTML results for each <a href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine">\
WikiProject medicine</a> task force \
(see list of active task forces <a href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Task_forces">\
here</a>):</H3>'),FileLinks('./results//HTML_results_per_task_force/'\
            , result_html_prefix='<li><b>', result_html_suffix='</b></li>',recursive=False))
#display(HTML(datatables_js_script + info + datatypes.to_html() + tableGuide + df_vizTable_styled.render()))
#------------------------------------------------------------------------------------------------------
# write README.md
#------------------------------------------------------------------------------------------------------
# Persist the README (augmented above with per-task-force links) to disk;
# the context manager replaces the manual open/close pair so the handle is
# released even if the write fails.
with open("./README.md","w") as f:
    f.write(README)
12954it [07:10, 30.06it/s]
###### ------------------------------------------------------------------------------------------------------
# WikiData search: searches WkD for a list of diseases
#------------------------------------------------------------------------------------------------------
"""
This is a modified version of code from:
1. https://lawlesst.github.io/notebook/sparql-dataframe.html
2. https://github.com/SuLab/sparql_to_pandas/blob/master/SPARQL_pandas.ipynb
Demonstrating how to get JupyterLab working with Binder:
https://github.com/binder-examples/jupyterlab
https://github.com/binder-examples/jupyter-extension/blob/master/index.ipynb
"""
def get_sparql_dataframe(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    # Issue the query against the endpoint and ask for a JSON response.
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    payload = json.load(response.response)
    # The JSON carries the projected variable names under head/vars and one
    # binding dict per result row; variables left unbound in a row are simply
    # absent, so default them to None via the nested .get() chain.
    columns = payload['head']['vars']
    rows = [
        [binding.get(col, {}).get('value') for col in columns]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
#help: https://en.wikibooks.org/wiki/SPARQL/Wikidata_Query_Service_-_Introduction
# Public Wikidata SPARQL endpoint.
wds = "https://query.wikidata.org/sparql"
# Query: every item that is an instance of (P31) "disease" (Q12136), with its
# English label and, when one exists, its English-Wikipedia article URL
# (the OPTIONAL clause leaves WP_en_article unbound/None otherwise).
rq = """
SELECT ?WkD_disease ?WkD_diseaseLabel ?WP_en_article
WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
?WkD_disease wdt:P31 wd:Q12136.
OPTIONAL {
?WP_en_article schema:about ?WkD_disease .
?WP_en_article schema:inLanguage "en" .
?WP_en_article schema:isPartOf <https://en.wikipedia.org/> .
}
}
#order by desc(?WkD_disease)
"""
# Materialise the result set as the main working DataFrame.
df = get_sparql_dataframe(wds, rq)
#WkD api sample
# client = Client() # doctest: +SKIP
# entity = client.get('Q1472', load=True)
# print (entity)
# print (entity.description)
# image_prop = client.get('P18')
# image = entity[image_prop]
# print (image)
# print(image.image_resolution)
# print(image.image_url)
#------------------------------------------------------------------------------------------------------
# PubMed search
#------------------------------------------------------------------------------------------------------
"""
This is a modified version of code from:
1. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d
2. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d
# Full discussion:
# https://marcobonzanini.wordpress.com/2015/01/12/searching-pubmed-with-python/
"""
def search(index,query):
    """Run a PubMed Entrez esearch for `query` and return the parsed result.

    index -- row number, used only as the log-message prefix.
    Returns the Entrez result dict (results['IdList'] holds up to 200 PMIDs,
    sorted by relevance).
    """
    Entrez.email = 'your.email@example.com'
    logger.debug (f"{index:>5}. PubMed search query: {query}")
    handle = Entrez.esearch(db='pubmed', sort='relevance', retmax='200', retmode='xml', term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Fix: the network handle was never closed (resource leak).
        handle.close()
    return results
def fetch_details(index, id_list):
    """Fetch full PubMed records for `id_list` (PMIDs) via Entrez.efetch.

    Retries with multiplicative back-off on any exception, since NCBI
    intermittently fails under load.

    index   -- row number, used only as the log-message prefix.
    id_list -- list of PMID strings, fetched in a single batch.
    Returns the parsed Entrez result structure.
    Raises the last caught exception if every retry fails.
    """
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    sleep_time = 10
    num_retries = 1000
    # Bug fix: the original used `except Exception as error`, but Python 3
    # unbinds that name when the except block exits, so the later
    # `if not error:` raised UnboundLocalError. Keep the caught exception in
    # a separate variable instead.
    last_error = None
    for _ in range(num_retries):
        try:
            logger.debug (f"{index:>5}. Fetching article details for PMIDs: {ids}")
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            results = Entrez.read(handle)
            handle.close()
            last_error = None
        except Exception as exc:
            last_error = exc
            logger.error (f"{index:>5}. I got a HTTPError - reason {last_error}- while trying to fetch the articles details from pubmed")
            logger.error (f"{index:>5}. Sleeping for {sleep_time} seconds before trying again...")
            # Bug fix: was a bare `sleep(...)` (NameError — only `import time`
            # is in scope). Wait before retrying, then back off.
            time.sleep(sleep_time)
            sleep_time *= 1.5
        if last_error is None:
            break
    if last_error is not None:
        logger.critical (f"{index:>5}. Retried fetching article's details {num_retries} times with no success!")
        raise last_error
    return results
def pubmed_search(index, disease, searchTitle, searchAbstract):
    """Search PubMed for Cochrane reviews about `disease` and return the PMIDs.

    searchTitle / searchAbstract select the fields searched:
      True, True  -> Title/Abstract
      True, False -> Title only
    "withdrawn" reviews are excluded, and hits are restricted to
    "The Cochrane database of systematic reviews".
    Returns the list of matching PMIDs (up to 200).
    Raises ValueError for unsupported flag combinations (the original fell
    through these branches and crashed with UnboundLocalError).
    """
    logger.debug (f"{index:>5}. Searching PubMed for {disease} in searchTitle:{searchTitle} , searchAbstract:{searchAbstract}")
    if searchTitle and searchAbstract:
        results = search(index,"("+disease+'[Title/Abstract] NOT "withdrawn"[Title]) AND "The Cochrane database of systematic reviews"[Journal]')
    elif searchTitle:
        results = search(index,"("+disease+'[Title] NOT "withdrawn"[Title]) AND "The Cochrane database of systematic reviews"[Journal]')
    else:
        raise ValueError("pubmed_search: searchTitle must be True (abstract-only search is not supported)")
    #pprint.pprint (results)
    return results['IdList']
# display(len(pubmed_search("Crohn's disease",True,True)))
# display(len(pubmed_search("Crohn's disease",True,False)))
#------------------------------------------------------------------------------------------------------
# string comparator
#------------------------------------------------------------------------------------------------------
# def searh_wp_refs_4title(pubmedArticleTitle,WPpageTitle):
# ''' looksup an article title in a WikiPedia page '''
# #display (wikipedia.WikipediaPage(WPpageTitle).html())
# WPpageHTML= wikipedia.WikipediaPage(WPpageTitle).html()
# ratio = fuzz.ratio(WPpageHTML, pubmedArticleTitle)
# print ("fuzz.ratio:",ratio)
# WPpageHTML=re.sub(r'\W+', '', WPpageHTML.lower())
# pubmedArticleTitle=re.sub(r'\W+', '', pubmedArticleTitle.lower())
# print(pubmedArticleTitle)
# if pubmedArticleTitle in WPpageHTML:
# return True
# else:
# return False
#print (searh_wp_refs_4title("interventions to slow progression of myopia in children...","Near-sightedness"))
def searh_wp_refs_4PMID(PMID,en_article_HTML):
    ''' looksup a PMIDs in a WikiPedia page '''
    # A plain substring test: the PMID string appearing anywhere in the
    # article's rendered HTML is taken to mean the review is cited.
    return PMID in en_article_HTML
#en_article_HTML= wikipedia.WikipediaPage('Near-sightedness').html()
#print (searh_wp_refs_4PMID("22161388",en_article_HTML))
"""------------------------------------------------------------------------------------------------------
Main: this is the main part of the code which encompasses the following steps:
1. content and metadata of WP articles which correspond to a disease in WikiData
are retrieved (title, class, importance, taskforce, content)
2. For each WP article a search query is submitted to PubMed to retrieve a list of relevant Cochrane reviews
3. the retrieved list of reviews are cross checked agains the WP articles reference section to identify cited, missing
and out of date reviews.
4. gathered infomation is converted tocolor-coded tabular format.
------------------------------------------------------------------------------------------------------"""
# HTML-to-plain-text converter (links stripped); used for log output.
h2t = html2text.HTML2Text()
h2t.ignore_links = True
# New result columns, populated row by row in the main loop below.
df['class']=None
df['importance']=None
df['taskForces']=None
df['cochrane_reviews_html']=None
df['cochrane_reviews']=None
df['talkPage_categories']=None
# Two live HTML widgets used as rolling status lines in the notebook:
# one for the disease currently being processed, one for the citation summary.
disease_search_log= widgets.HTML()
citation_search_log= widgets.HTML()
display(disease_search_log)
display(citation_search_log)
# Main loop — for every WikiData disease that has an English-Wikipedia article:
#   1. fetch the article HTML and its Talk-page categories (handling redirects,
#      disambiguation pages, missing talk pages and missing categories),
#   2. derive WikiProject Medicine class / importance / task forces from the
#      Talk-page category names,
#   3. query PubMed for matching Cochrane reviews (narrowing to title-only when
#      >200 hits, widening back, and retrying with the article title when the
#      WikiData label finds nothing), and
#   4. build a colour-coded HTML list marking each review cited/not-cited and
#      latest/old version.
# NOTE(review): df.iterrows() yields per-row *copies*; assignments such as
# row['class'] = ... are not documented to propagate back into df — confirm
# against the pandas docs before relying on the populated columns.
for index, row in tqdm_notebook(df.iterrows(),desc='Progress',total=df['WP_en_article'].count(), unit="WkD_disease"):
    if row['WP_en_article'] is not None:
    #and row['WP_en_article']=="https://en.wikipedia.org/wiki/Agalactia":
        if console_handler.level<=20:
            display(HTML(f"<b>START of row {index:>5}<b>"))
        else:
            logger.info(f"<{index:>5}------------------------------------------------")
        disease_search_log.value= f'<p style="color:blue;"><b>processing:</b> disease #{index:>5}\
  {row["WkD_diseaseLabel"]}   {row["WkD_disease"]}   {row["WP_en_article"]} </p>'
        # Strip the "https://en.wikipedia.org/wiki/" prefix (30 chars) and undo URL-encoding.
        WP_en_article_Title= unquote(unquote(row['WP_en_article'][30:]))
        #will raise a DisambiguationError if the page is a disambiguation page, or a PageError if
        #the page doesn’t exist (although by default, it tries to find the page you meant with suggest and search.)
        try:
            logger.info(f"{index:>5}. Getting the WikiPedia content for: {WP_en_article_Title}")
            WP_en_article_obj = wikipedia.WikipediaPage(WP_en_article_Title)
            WP_en_article_HTML = WP_en_article_obj.html()
            #checking for redirects
            if WP_en_article_Title.casefold() != WP_en_article_obj.title.casefold().replace(" ","_"):
                logger.info(f"{index:>5}. Redirected to: {WP_en_article_obj.title}")
            logger.info(f"{index:>5}. Getting the WikiPedia talkPage_categories for: {WP_en_article_obj.title}")
            WP_en_article_talk_obj = wikipedia.WikipediaPage("Talk:"+ WP_en_article_obj.title)
            row['talkPage_categories'] = WP_en_article_talk_obj.categories
        except wikipedia.exceptions.DisambiguationError as e:
            # NOTE(review): WP_en_article_talk_obj may be unbound (or stale from a
            # previous row) here — the DisambiguationError is raised before it is
            # assigned in the try block. Confirm this log line is safe.
            logger.warning (f"{index:>5}. '{WP_en_article_talk_obj.title}' is an Ambiguous title: {e.options}")
            logger.warning (f"{index:>5}. Picking the first sense in the list by default: {e.options[0]}")
            WP_en_article_obj = wikipedia.WikipediaPage(e.options[0])
            WP_en_article_HTML = WP_en_article_obj.html()
            #checking for redirects
            if e.options[0].casefold() != WP_en_article_obj.title.casefold().replace(" ","_"):
                logger.warning (f"{index:>5}. Redirected to: {WP_en_article_obj.title}")
            logger.warning (f"{index:>5}. Getting the talkPage_categories for: {WP_en_article_obj.title}")
            WP_en_article_talk_obj = wikipedia.WikipediaPage("Talk:"+ WP_en_article_obj.title)
            row['talkPage_categories'] = WP_en_article_talk_obj.categories
        except wikipedia.exceptions.PageError as e:
            logger.error (f"{index:>5}. {WP_en_article_obj.title} - I got a PageError - reason: {e} - Article has no talk page yet")
        except KeyError as e:
            logger.error (f"{index:>5}. {WP_en_article_obj.title} - I got a KeyError - reason: {e} - Article's Talk page has no Category")
        finally:
            if console_handler.level==10:
                display(row['talkPage_categories'])
            else:
                logger.debug(row['talkPage_categories'])
        logger.debug(f"{index:>5}. Extracting task forces, class, and importance")
        taskForces=[]
        # Talk-page categories look like "<Class>-class medicine articles",
        # "<Importance>-importance medicine articles" and "<Name> task force articles";
        # the fixed-length suffixes are sliced off to recover the embedded value.
        for cat in row['talkPage_categories'] or []:
            if cat.casefold().endswith("-class medicine articles"):
                row['class'] = cat[0:-24]
            if cat.casefold().endswith('-importance medicine articles'):
                # Map raw importance markers to sortable "<rank>-<name>" strings.
                imp = {
                    'NA' : None,
                    '???' : None,
                    'Unknown' : None,
                    'Low' : '4-Low',
                    'Mid' : '3-Mid',
                    'High' : '2-High',
                    'Top' : '1-Top'
                }
                if imp[cat[0:-29]]:
                    row['importance'] = imp[cat[0:-29]]
                else:
                    row['importance'] = None
            if cat.casefold().endswith(' task force articles') and "wikiproject" not in cat.casefold():
                taskForce = cat[0:-20]
                taskForces.append(taskForce)
        if taskForces:
            row['taskForces']=taskForces
        if console_handler.level<=20:
            display(HTML(f"{index:>5}. class: {row['class']}, importance: {row['importance']}, task forces: {row['taskForces']}"))
        else:
            logger.info(f"{index:>5}. class: {row['class']}, importance: {row['importance']}, task forces: {row['taskForces']}")
        matches=0
        PIMD_paper_dict={}
        # PubMed query strategy: start wide (title/abstract); if the 200-hit cap is
        # reached, narrow to title-only; if that finds nothing, widen back. If the
        # WikiData label finds nothing and differs from the article title (minus any
        # parenthetical qualifier), repeat the same dance with the article title.
        logger.info(f"{index:>5}. searching Pubmed for WkD_diseaseLabel: {row['WkD_diseaseLabel']}")
        id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,True)
        if len(id_list)==200:
            logger.warning(f"{index:>5}. (a) {row['WkD_diseaseLabel']} - Too many matches found (>200)")
            logger.warning(f"{index:>5}. (b) restrcting search to Titles only (excluding Abstracts)")
            time.sleep(1)
            id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,False)
            if len(id_list)==0:
                logger.warning(f"{index:>5}. (a) {row['WkD_diseaseLabel']} - Restrcting search to Titles only returned no results")
                logger.warning(f"{index:>5}. (b) reverting back to title/abstract")
                time.sleep(1)
                id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,True)
        if not id_list and row['WkD_diseaseLabel'].lower()!= re.sub(r" ?\([^)]+\)", "", WP_en_article_Title.replace("_", " ").lower()):
            logger.warning(f"{index:>5}. (a) searching for '{row['WkD_diseaseLabel']}' returned {len(id_list)} results")
            logger.warning(f"{index:>5}. (b) searching for '{WP_en_article_Title.replace('_', ' ')}' instead")
            time.sleep(1)
            id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,True)
            if len(id_list)==200:
                logger.warning(f"{index:>5}. (a) {WP_en_article_Title.replace('_',' ')} - Retruned too many matches (>200)")
                logger.warning(f"{index:>5}. (b) restrcting search to Titles only (excluding Abstracts)")
                time.sleep(1)
                id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,False)
                if len(id_list)==0:
                    logger.warning(f"{index:>5}. (a) {WP_en_article_Title.replace('_',' ')} - Restrcting search to Titles only returned no results")
                    logger.warning(f"{index:>5}. (b) reverting back to title/abstract")
                    time.sleep(1)
                    id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,True)
        logger.info(f"{index:>5}. {len(id_list)} matching PMIDs found.")
        if id_list:
            papers = fetch_details(index,id_list)
            #print(json.dumps(papers, indent=5))
            for i, paper in enumerate(papers['PubmedArticle']):
                articleTitle = paper['MedlineCitation']['Article']['ArticleTitle']
                PMID = int(paper['MedlineCitation']['PMID'])
                # cited == True iff the PMID string appears in the article HTML.
                cited = searh_wp_refs_4PMID(str(PMID),WP_en_article_HTML)
                if cited:
                    matches +=1
                #display(articleTitle,PMID,cited)
                # latestVersion is filled in during the rendering pass below.
                latestVersion=None
                PIMD_paper_dict[int(PMID)]=[articleTitle,cited,latestVersion]
            # Case-insensitive sort by title (the key lowercases the title
            # character by character, which orders the same as lowercasing it whole).
            PIMD_paper_dict = OrderedDict(sorted(PIMD_paper_dict.items(), key=lambda t: [str(title).lower() for title in t[1][0]]))
            #display(PIMD_paper_dict)
        row['cochrane_reviews']= PIMD_paper_dict
        if PIMD_paper_dict:
            row['cochrane_reviews_html']='<div align="left" style="margin:0px;"><ol start="1" style="margin-left:0px">'
            # NOTE(review): bgc starts as "White" but is only ever compared against
            # "white" below, so the very first comparison never matches — verify the
            # intended alternating-background behaviour.
            bgc="White"
            for PMID, paper in PIMD_paper_dict.items():
                cited=paper[1]
                if cited:
                    color="green"
                    cited_message="<b> [CITED] </b>"
                if not cited:
                    color="red"
                    cited_message="<b> [NOT CITED] </b>"
                title=paper[0]
                latestVersion=True
                # Other entries sharing this title are earlier/later versions of the
                # same review; a higher PMID among them marks this one as outdated.
                foundItems = (key for key, vals in PIMD_paper_dict.items() if title.lower() in [str(val).lower() for val in vals] and key!=PMID)
                for item in foundItems:
                    #display(item)
                    if item > PMID:
                        latestVersion=False
                paper[2]=latestVersion
                #display(title,PMID,cited,latestVersion)
                if latestVersion:
                    version_message="<b> [LATEST Version] </b>"
                else:
                    version_message="<b> [OLD Version] </b>"
                    # Old versions override the colour: orange if cited, grey otherwise.
                    if cited:
                        color="orange"
                        version_message +="<b> [UPDATE NEEDED] </b>"
                    else:
                        color="grey"
                # Alternate the background colour at each new latest-version entry so
                # version groups share a background.
                if latestVersion:
                    if bgc=="white":
                        bgc="#E0F5FE"
                    else:
                        bgc="white"
                row['cochrane_reviews_html']+='<li style="padding:5px;color:'+color+';background-color:'+bgc+';">'\
                +paper[0]+' <a target="_blank" href="https://www.ncbi.nlm.nih.gov/pubmed/'\
                +str(PMID)+'">PMID: '+str(PMID)+'</a>'+cited_message+version_message+"</li>"
            row['cochrane_reviews_html']+="</ol></div>"
        else:
            row['cochrane_reviews_html']="No matching publication found!"
        if console_handler.level<=10: #10=DEBUG
            display(HTML(row['cochrane_reviews_html']))
        else:
            #logger.debug(h2t.handle(str(row['cochrane_reviews_html'])))
            logger.debug(row['cochrane_reviews_html'])
        citation_search_log.value = f'<p style="color:green;"><b>processed</b>: disease #{index:>5}   \
        {row["WkD_diseaseLabel"]}   {row["WkD_disease"]}   {row["WP_en_article"]}</p> \
        <p style="color:green;"><b>{matches} of {len(id_list)}</b>\
        Cochrane reviews found (via PubMed) are cited in the Wikipedia article: {WP_en_article_Title}</p>'
        if console_handler.level<=20:
            display(HTML(f"<b>END of row {index:>5}<b><hr>"))
        else:
            logger.info(f"------------------------------------------------{index:>5}><hr>")
"""------------------------------------------------------------------------------------------------------
# Once all the tasks (above cells) are completed successfuly, the dataframe
# is sotored in persistent storage for future use (e.g., Binder)
#------------------------------------------------------------------------------------------------------"""
# Creating a datetime object
current_datetime = datetime.now()
# Converting a to string in the desired format (YYYYMMDD) using strftime
# and then to int.
current_datetime_int = int(current_datetime.strftime('%Y%m%d'))
display(HTML(f"<h2>Saving df in ./persistent_storage/{current_datetime_int}.pkl"))
logger.info(f"Saving df in ./persistent_storage/{current_datetime_int}.pkl")
# with open('./persistent_storage/df.dill', 'wb') as out_strm:
# dill.dump(df, out_strm)
#dill.dump_session('./persistent_storage/dill_session')
df.to_pickle(f"./persistent_storage/{current_datetime_int}.pkl")
HTML(value='')
HTML(value='')
HBox(children=(IntProgress(value=0, description='Progress', max=4717, style=ProgressStyle(description_width='i…
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 376 try: # Python 2.7, use buffering of HTTP responses --> 377 httplib_response = conn.getresponse(buffering=True) 378 except TypeError: # Python 3 TypeError: getresponse() got an unexpected keyword argument 'buffering' During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) <ipython-input-7-e0f69c66ef4e> in <module> 45 logger.info(f"{index:>5}. Getting the WikiPedia content for: {WP_en_article_Title}") 46 WP_en_article_obj = wikipedia.WikipediaPage(WP_en_article_Title) ---> 47 WP_en_article_HTML = WP_en_article_obj.html() 48 49 #checking for redirects ~/anaconda3/lib/python3.6/site-packages/wikipedia/wikipedia.py in html(self) 452 } 453 --> 454 request = _wiki_request(query_params) 455 self._html = request['query']['pages'][self.pageid]['revisions'][0]['*'] 456 ~/anaconda3/lib/python3.6/site-packages/wikipedia/wikipedia.py in _wiki_request(params) 735 time.sleep(int(wait_time.total_seconds())) 736 --> 737 r = requests.get(API_URL, params=params, headers=headers) 738 739 if RATE_LIMIT: ~/anaconda3/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs) 73 74 kwargs.setdefault('allow_redirects', True) ---> 75 return request('get', url, params=params, **kwargs) 76 77 ~/anaconda3/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs) 58 # cases, and look like a memory leak in others. 
59 with sessions.Session() as session: ---> 60 return session.request(method=method, url=url, **kwargs) 61 62 ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 531 } 532 send_kwargs.update(settings) --> 533 resp = self.send(prep, **send_kwargs) 534 535 return resp ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs) 666 667 # Resolve redirects if allowed. --> 668 history = [resp for resp in gen] if allow_redirects else [] 669 670 # Shuffle things around if there's history. ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in <listcomp>(.0) 666 667 # Resolve redirects if allowed. --> 668 history = [resp for resp in gen] if allow_redirects else [] 669 670 # Shuffle things around if there's history. ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs) 245 proxies=proxies, 246 allow_redirects=False, --> 247 **adapter_kwargs 248 ) 249 ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs) 644 645 # Send the request --> 646 r = adapter.send(request, **kwargs) 647 648 # Total elapsed time of the request (approximately) ~/anaconda3/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 447 decode_content=False, 448 retries=self.max_retries, --> 449 timeout=timeout 450 ) 451 ~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 598 timeout=timeout_obj, 599 body=body, headers=headers, --> 600 chunked=chunked) 601 602 # If we're going to release the connection in ``finally:``, then 
~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 378 except TypeError: # Python 3 379 try: --> 380 httplib_response = conn.getresponse() 381 except Exception as e: 382 # Remove the TypeError from the exception chain in Python 3; ~/anaconda3/lib/python3.6/http/client.py in getresponse(self) 1329 try: 1330 try: -> 1331 response.begin() 1332 except ConnectionError: 1333 self.close() ~/anaconda3/lib/python3.6/http/client.py in begin(self) 295 # read until we get a non-100 response 296 while True: --> 297 version, status, reason = self._read_status() 298 if status != CONTINUE: 299 break ~/anaconda3/lib/python3.6/http/client.py in _read_status(self) 256 257 def _read_status(self): --> 258 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 259 if len(line) > _MAXLINE: 260 raise LineTooLong("status line") ~/anaconda3/lib/python3.6/socket.py in readinto(self, b) 584 while True: 585 try: --> 586 return self._sock.recv_into(b) 587 except timeout: 588 self._timeout_occurred = True ~/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py in recv_into(self, *args, **kwargs) 292 def recv_into(self, *args, **kwargs): 293 try: --> 294 return self.connection.recv_into(*args, **kwargs) 295 except OpenSSL.SSL.SysCallError as e: 296 if self.suppress_ragged_eofs and e.args == (-1, 'Unexpected EOF'): ~/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in recv_into(self, buffer, nbytes, flags) 1819 result = _lib.SSL_peek(self._ssl, buf, nbytes) 1820 else: -> 1821 result = _lib.SSL_read(self._ssl, buf, nbytes) 1822 self._raise_ssl_error(self._ssl, result) 1823 KeyboardInterrupt: