#------------------------------------------------------------------------------------------------------
# imports, logger initiation, loading persistent data
#------------------------------------------------------------------------------------------------------
from IPython.core.display import display, HTML
from IPython.display import clear_output, Markdown
from IPython.display import FileLink, FileLinks
import logging
import dill
# import qgrid
# from wikidata.client import Client
import pandas as pd
from pandas.compat import StringIO
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import ipywidgets as widgets
from urllib.parse import unquote
from urllib.parse import quote
import wikipedia
from tqdm import tnrange, tqdm_notebook
from tqdm import tqdm
import ipywidgets as widgetsM
from ipywidgets import HBox, VBox
from ipywidgets import Button, Layout
from operator import itemgetter
from collections import OrderedDict
import time
from datetime import datetime
import html2text
import re
import sys
import os
import io
from Bio import Entrez
from urllib.error import HTTPError
# from fuzzywuzzy import fuzz
# from fuzzywuzzy import process
from itertools import chain
from collections import Counter
# Widen the notebook's main container to use the full browser width.
display(HTML("<style>.container {width:100% !important;}</style>"))
# Logger setup: DEBUG and above go to an HTML log file, while only ERROR
# and above are echoed to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh1 = logging.FileHandler('./logs/log.html',mode='w')
fh1.setLevel(logging.DEBUG)
# create console handler with a higher log level
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR) #------------------change your log setting here DEBUG INFO WARNING ERROR CRITICAL
# Records are wrapped in <p> tags because the log file is viewed as HTML.
formatter = logging.Formatter('<p> %(asctime)s - %(name)s - %(levelname)s - %(message)s <p>' , datefmt='%d-%b-%y %H:%M:%S')
fh1.setFormatter(formatter)
console_handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh1)
logger.addHandler(console_handler)
# Load the most recent pickled DataFrame from persistent storage. The
# "newest" file is picked by reverse lexicographic filename order —
# assumes filenames sort chronologically and the folder contains only
# pickle files (NOTE(review): a stray non-pickle file would be picked
# up too — TODO confirm).
listOfFiles = os.listdir('./persistent_storage/')
pkl_file = (sorted(listOfFiles, reverse=True)[0])
display(HTML(f"<h2>Loading results of previous run on <u>\
{time.ctime(os.path.getmtime(f'./persistent_storage/{pkl_file}'))}</u> from ./persistent_storage/{pkl_file}"))
logger.info(f"Loading df from ./persistent_storage/{pkl_file}")
#with open('./persistent_storage/df.dill', 'rb') as in_strm:
#    df = dill.load(in_strm)
#dill.load_session('./persistent_storage/dill_session')
df=pd.read_pickle(f"./persistent_storage/{pkl_file}")
df.info()
# def on_button_load_clicked(b):
# display(HTML(f"<h2>Loading df from ./persistent_storage/..."))
# logger.info(f"Loading df from ./persistent_storage...")
# #with open('./persistent_storage/df.dill', 'rb') as in_strm:
# # df = dill.load(in_strm)
# #dill.load_session('./persistent_storage/dill_session')
# df=pd.read_pickle("./persistent_storage/df.pkl")
# df.info()
# text_area.value=df.info()
# button_load = widgets.Button(description="Load data from persistent storage",layout=Layout(width='20%', height='80px'))
# button_load.style.button_color = 'lightgreen'
# button_load.on_click(on_button_load_clicked)
# display(button_load)
# text_area= widgets.Textarea(value=df.info(),placeholder='',description='Loaded data summary:',disabled=False)
# display(text_area)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12954 entries, 0 to 12953 Data columns (total 9 columns): WkD_disease 12954 non-null object WkD_diseaseLabel 12954 non-null object WP_en_article 4715 non-null object class 4525 non-null object importance 4515 non-null object taskForces 2600 non-null object cochrane_reviews_html 4715 non-null object cochrane_reviews 4715 non-null object talkPage_categories 4668 non-null object dtypes: object(9) memory usage: 910.9+ KB
#------------------------------------------------------------------------------------------------------
# Statistical summary generator
#------------------------------------------------------------------------------------------------------
# HTML/JS snippet prepended to every exported results page: pulls jQuery
# and DataTables 1.10.19 from CDNs and turns each <table class="results">
# into a sortable, searchable, paginated DataTable.
datatables_js_script="""
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
<script>
$(document).ready( function () {
$('table.results').dataTable({
"lengthMenu": [[10, 25, 50, -1], [10, 25, 50, "All"]], "bAutoWidth": false,
});
//$('table.dataframe').dataTable();
} );
</script>
"""
def make_clickable_PMID(val):
    """Render a PMID as an HTML anchor to its PubMed page.

    Opens in a new tab (target=_blank). Falsy input yields None so that
    empty table cells stay blank.
    """
    if val:
        return f'<a target="_blank" href="https://www.ncbi.nlm.nih.gov/pubmed/{val}">{val}</a>'
    return None
def imp2int(WP_imp,targetImportance):
    """Return 1 when the article importance matches targetImportance, else 0.

    A missing/falsy importance rating matches the sentinel 'Unknown'.
    """
    if WP_imp:
        return 1 if targetImportance == WP_imp else 0
    return 1 if targetImportance == "Unknown" else 0
def class2int(WP_class,targetClass):
    """Return 1 when the article quality class matches targetClass, else 0.

    A missing/falsy class matches the sentinel 'Unknown'; the sentinel
    'Other' additionally matches the List/Disambig/FL classes.
    """
    if not WP_class:
        return 1 if targetClass == "Unknown" else 0
    if targetClass == WP_class:
        return 1
    if targetClass == "Other" and WP_class in ['List','Disambig','FL']:
        return 1
    return 0
# High-level counts of Wikidata diseases and their Wikipedia coverage.
display(Markdown("# Statistical Summaries:"))
display(Markdown(f"### Total number of diseases identified in Wikidata: {df.shape[0]}"))
display(Markdown(f"### Total number of diseases identified in Wikidata which have a corresponding article in Wikipedia:\
{df[df['WP_en_article'].notnull()].shape[0]}"))
display(Markdown("-----------------------------------------------------"))
# Distribution of WikiProject quality classes among existing articles
# (value_counts column keeps the original name, hence the rename below).
display(Markdown("### Wikipedia articles class:"))
display(df.loc[df['WP_en_article'].notnull()]['class'].value_counts(dropna=False).to_frame().reset_index().rename_axis('index').\
rename(columns = {'class': 'Total count','index': 'Article quality class'}).\
sort_values(by = 'Total count', ascending = False).style.hide_index())
# Distribution of WikiProject importance ratings among existing articles.
display(Markdown("### Wikipedia articles importance:"))
display(df.loc[df['WP_en_article'].notnull()]['importance'].value_counts(dropna=False).to_frame().reset_index().rename_axis('index').\
rename(columns = {'importance': 'Total count','index': 'Article quality category'}).\
sort_values(by = 'Total count', ascending = False).style.hide_index())
display(Markdown('### Wikipedia articles taskForces:'))
#display(pd.Series(Counter(chain.from_iterable( df.loc[df['taskForces'].notnull()]['taskForces'] ) )))
# Tally task-force membership. An article can belong to several task
# forces (so totals can exceed the article count); articles with none
# are counted under the 'NA' key.
taskForces_count = {'NA':0}
for taskForces in df.loc[df['WP_en_article'].notnull()]['taskForces']:
    if taskForces:
        for taskForce in taskForces:
            if taskForce in taskForces_count:
                taskForces_count[taskForce] += 1
            else :
                taskForces_count[taskForce] = 1
    else:
        taskForces_count['NA'] += 1
display(pd.DataFrame(taskForces_count.items(), columns = ['taskForce', 'Total count']).style.hide_index())
display(Markdown("-----------------------------------------------------"))
# Articles that cite at least one Cochrane review vs. those that cite
# none (an empty dict {} means reviews were searched but none cited).
display(Markdown(f"### Number of disease related Wikipedia articles which cite one or more Cochrane reviews: \
{df[df['cochrane_reviews'].notnull()].shape[0] - df[(df['cochrane_reviews']=={})].shape[0] }"))
display(Markdown(f"### Number of disease related Wikipedia articles which do not cite any Cochrane reviews: \
{ df[(df['cochrane_reviews']=={})].shape[0] }"))
display(Markdown("-----------------------------------------------------"))
#------------------------------------------------------------------------------------------------------
# PMID stats - PMIDs of Cochrane reviews (LATEST versions only)
#------------------------------------------------------------------------------------------------------
# Aggregate per-PMID citation statistics across all disease rows, keeping
# only review entries flagged as the LATEST version.
# Each cochrane_reviews dict maps PMID -> (title, cited-flag, latest-flag).
# Each uniquePMIDs_count value is a list laid out as:
#   [0] title, [1] times seen in search results, [2] times cited,
#   [3] latest-version flag (truthy throughout this section),
#   [4..8] citations bucketed by article importance (Top/High/Mid/Low/Unknown),
#   [9..17] citations bucketed by article quality
#          (FA/A/GA/B/C/Start/Stub/Other/Unknown).
uniquePMIDs_count={}
for index, row in df.iterrows():
    WP_class = row['class']
    WP_imp = row['importance']
    PIMD_paper_dict=row['cochrane_reviews']
    if PIMD_paper_dict:
        for PMID, paper in PIMD_paper_dict.items():
            title = paper[0]
            cited = paper[1]
            latestVersion = paper[2]
            if latestVersion:
                if PMID in uniquePMIDs_count:
                    # Repeat sighting: bump the appearance counter and add
                    # this article's citation to the matching buckets.
                    uniquePMIDs_count[PMID]=[title, uniquePMIDs_count[PMID][1]+1, uniquePMIDs_count[PMID][2]+int(cited),int(latestVersion)\
                    , uniquePMIDs_count[PMID][4]+(imp2int(WP_imp,'1-Top')*int(cited))
                    , uniquePMIDs_count[PMID][5]+(imp2int(WP_imp,'2-High')*int(cited))
                    , uniquePMIDs_count[PMID][6]+(imp2int(WP_imp,'3-Mid')*int(cited))
                    , uniquePMIDs_count[PMID][7]+(imp2int(WP_imp,'4-Low')*int(cited))
                    , uniquePMIDs_count[PMID][8]+(imp2int(WP_imp,'Unknown')*int(cited))
                    , uniquePMIDs_count[PMID][9]+(class2int(WP_class,'FA')*int(cited))
                    , uniquePMIDs_count[PMID][10]+(class2int(WP_class,'A')*int(cited))
                    , uniquePMIDs_count[PMID][11]+(class2int(WP_class,'GA')*int(cited))
                    , uniquePMIDs_count[PMID][12]+(class2int(WP_class,'B')*int(cited))
                    , uniquePMIDs_count[PMID][13]+(class2int(WP_class,'C')*int(cited))
                    , uniquePMIDs_count[PMID][14]+(class2int(WP_class,'Start')*int(cited))
                    , uniquePMIDs_count[PMID][15]+(class2int(WP_class,'Stub')*int(cited))
                    , uniquePMIDs_count[PMID][16]+(class2int(WP_class,'Other')*int(cited))
                    , uniquePMIDs_count[PMID][17]+(class2int(WP_class,'Unknown')*int(cited))]
                else:
                    # First sighting of this PMID: initialise every bucket.
                    uniquePMIDs_count[PMID]=[title, 1, int(cited),int(latestVersion)
                    , imp2int(WP_imp,'1-Top')*int(cited)
                    , imp2int(WP_imp,'2-High')*int(cited)
                    , imp2int(WP_imp,'3-Mid')*int(cited)
                    , imp2int(WP_imp,'4-Low')*int(cited)
                    , imp2int(WP_imp,'Unknown')*int(cited)
                    , class2int(WP_class,'FA')*int(cited)
                    , class2int(WP_class,'A')*int(cited)
                    , class2int(WP_class,'GA')*int(cited)
                    , class2int(WP_class,'B')*int(cited)
                    , class2int(WP_class,'C')*int(cited)
                    , class2int(WP_class,'Start')*int(cited)
                    , class2int(WP_class,'Stub')*int(cited)
                    , class2int(WP_class,'Other')*int(cited)
                    , class2int(WP_class,'Unknown')*int(cited)]
# Publish summary 1 — review entries flagged as the LATEST version.
# NOTE(fix): the 'index' column label below previously read
# "(old versions only)" — a copy/paste slip from summary 2 — which also
# meant the clickable-PMID formatter keyed "(latest versions only)" in
# the HTML export referenced a non-existent column. The label is now
# "(latest versions only)" and both formatters use it consistently.
display(Markdown('### PMIDs statistical summary 1 (LATEST VERSIONS of Cochrane reviews found and/or cited):'))
PMIDs_df = pd.DataFrame.from_dict(uniquePMIDs_count, orient='index').reset_index().rename_axis('index')\
    .rename(columns = {'index': 'PMIDs of Cochrane reviews (latest versions only)'
                       ,0: 'Title'
                       ,1: 'Times appeared in search results'
                       ,2: 'Times cited in Wikipedia'
                       ,3: 'Version (1=latest, 0=old)'
                       ,4: 'Times cited in a Top Importance WP articles'
                       ,5: 'Times cited in a High Importance WP articles'
                       ,6: 'Times cited in a Mid Importance WP articles'
                       ,7: 'Times cited in a Low Importance WP articles'
                       ,8: 'Times cited in an Unknown Importance WP articles'
                       ,9: 'Times cited in a FA quality WP articles'
                       ,10: 'Times cited in a A quality WP articles'
                       ,11: 'Times cited in a GA quality WP articles'
                       ,12: 'Times cited in a B quality WP articles'
                       ,13: 'Times cited in a C quality WP articles'
                       ,14: 'Times cited in a Start quality WP articles'
                       ,15: 'Times cited in a Stub quality WP articles'
                       ,16: 'Times cited in a Other quality WP articles'
                       ,17: 'Times cited in an Unknown quality WP articles'
                       }).sort_values(by = 'Times cited in Wikipedia', ascending = False)
# Grand-total row: sums for the count columns; for the version-flag
# column we report the number of PMIDs instead of a sum.
PMIDs_df.loc['Total'] = ['N/A','N/A',PMIDs_df['Times appeared in search results'].sum()
                         ,PMIDs_df['Times cited in Wikipedia'].sum()
                         ,PMIDs_df['Version (1=latest, 0=old)'].count()
                         ,PMIDs_df['Times cited in a Top Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a High Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a Mid Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a Low Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in an Unknown Importance WP articles'].sum()
                         ,PMIDs_df['Times cited in a FA quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a A quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a GA quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a B quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a C quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Start quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Stub quality WP articles'].sum()
                         ,PMIDs_df['Times cited in a Other quality WP articles'].sum()
                         ,PMIDs_df['Times cited in an Unknown quality WP articles'].sum()
                         ]
# Preview the 10 most-cited PMIDs inline, with PMIDs rendered as PubMed links.
display(PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False).head(10)
        .style.set_table_attributes('class="results"')
        .format({'PMIDs of Cochrane reviews (latest versions only)': make_clickable_PMID}))
display(HTML('<h2>⋮</h2>'))
# Export the complete table as a standalone, DataTables-enhanced HTML page.
with open("./results/PMIDs_latestVersions_only.html","w") as f:
    f.write(datatables_js_script + PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False)
            .style.set_table_attributes('class="results"')
            .format({'PMIDs of Cochrane reviews (latest versions only)': make_clickable_PMID}).render())
display(HTML('<H3>See the complete table here (sort, filter, and search):</H3>')
        ,FileLink('./results/PMIDs_latestVersions_only.html', result_html_prefix='<b>', result_html_suffix='</b>'))
display(Markdown("-----------------------------------------------------"))
#------------------------------------------------------------------------------------------------------
# PMID stats - PMIDs of Cochrane reviews (old versions only)
#------------------------------------------------------------------------------------------------------
# Aggregate per-PMID citation statistics again, this time keeping only
# review entries flagged as OLD (superseded) versions.
# Each cochrane_reviews dict maps PMID -> (title, cited-flag, latest-flag).
# Each uniquePMIDs_count value is a list laid out as:
#   [0] title, [1] times seen in search results, [2] times cited,
#   [3] latest-version flag (falsy throughout this section),
#   [4..8] citations bucketed by article importance (Top/High/Mid/Low/Unknown),
#   [9..17] citations bucketed by article quality
#          (FA/A/GA/B/C/Start/Stub/Other/Unknown).
uniquePMIDs_count={}
for index, row in df.iterrows():
    WP_class = row['class']
    WP_imp = row['importance']
    PIMD_paper_dict=row['cochrane_reviews']
    if PIMD_paper_dict:
        for PMID, paper in PIMD_paper_dict.items():
            title = paper[0]
            cited = paper[1]
            latestVersion = paper[2]
            if not latestVersion:
                if PMID in uniquePMIDs_count:
                    # Repeat sighting: bump the appearance counter and add
                    # this article's citation to the matching buckets.
                    uniquePMIDs_count[PMID]=[title, uniquePMIDs_count[PMID][1]+1, uniquePMIDs_count[PMID][2]+int(cited),int(latestVersion)\
                    , uniquePMIDs_count[PMID][4]+(imp2int(WP_imp,'1-Top')*int(cited))
                    , uniquePMIDs_count[PMID][5]+(imp2int(WP_imp,'2-High')*int(cited))
                    , uniquePMIDs_count[PMID][6]+(imp2int(WP_imp,'3-Mid')*int(cited))
                    , uniquePMIDs_count[PMID][7]+(imp2int(WP_imp,'4-Low')*int(cited))
                    , uniquePMIDs_count[PMID][8]+(imp2int(WP_imp,'Unknown')*int(cited))
                    , uniquePMIDs_count[PMID][9]+(class2int(WP_class,'FA')*int(cited))
                    , uniquePMIDs_count[PMID][10]+(class2int(WP_class,'A')*int(cited))
                    , uniquePMIDs_count[PMID][11]+(class2int(WP_class,'GA')*int(cited))
                    , uniquePMIDs_count[PMID][12]+(class2int(WP_class,'B')*int(cited))
                    , uniquePMIDs_count[PMID][13]+(class2int(WP_class,'C')*int(cited))
                    , uniquePMIDs_count[PMID][14]+(class2int(WP_class,'Start')*int(cited))
                    , uniquePMIDs_count[PMID][15]+(class2int(WP_class,'Stub')*int(cited))
                    , uniquePMIDs_count[PMID][16]+(class2int(WP_class,'Other')*int(cited))
                    , uniquePMIDs_count[PMID][17]+(class2int(WP_class,'Unknown')*int(cited))]
                else:
                    # First sighting of this PMID: initialise every bucket.
                    uniquePMIDs_count[PMID]=[title, 1, int(cited),int(latestVersion)
                    , imp2int(WP_imp,'1-Top')*int(cited)
                    , imp2int(WP_imp,'2-High')*int(cited)
                    , imp2int(WP_imp,'3-Mid')*int(cited)
                    , imp2int(WP_imp,'4-Low')*int(cited)
                    , imp2int(WP_imp,'Unknown')*int(cited)
                    , class2int(WP_class,'FA')*int(cited)
                    , class2int(WP_class,'A')*int(cited)
                    , class2int(WP_class,'GA')*int(cited)
                    , class2int(WP_class,'B')*int(cited)
                    , class2int(WP_class,'C')*int(cited)
                    , class2int(WP_class,'Start')*int(cited)
                    , class2int(WP_class,'Stub')*int(cited)
                    , class2int(WP_class,'Other')*int(cited)
                    , class2int(WP_class,'Unknown')*int(cited)]
# Publish summary 2 — superseded (OLD) review versions only. The
# "(old versions only)" column label matches this section's content.
display(Markdown(f'### PMIDs statistical summary 2 (OLD VERSIONS of Cochrane reviews found and/or cited):'))
PMIDs_df= pd.DataFrame.from_dict(uniquePMIDs_count,orient='index').reset_index().rename_axis('index')\
.rename(columns = {'index': 'PMIDs of Cochrane reviews (old versions only)'
,0: 'Title'
,1: 'Times appeared in search results'
,2: 'Times cited in Wikipedia'
,3: 'Version (1=latest, 0=old)'
,4: 'Times cited in a Top Importance WP articles'
,5: 'Times cited in a High Importance WP articles'
,6: 'Times cited in a Mid Importance WP articles'
,7: 'Times cited in a Low Importance WP articles'
,8: 'Times cited in an Unknown Importance WP articles'
,9: 'Times cited in a FA quality WP articles'
,10: 'Times cited in a A quality WP articles'
,11: 'Times cited in a GA quality WP articles'
,12: 'Times cited in a B quality WP articles'
,13: 'Times cited in a C quality WP articles'
,14: 'Times cited in a Start quality WP articles'
,15: 'Times cited in a Stub quality WP articles'
,16: 'Times cited in a Other quality WP articles'
,17: 'Times cited in an Unknown quality WP articles'
}).sort_values(by = 'Times cited in Wikipedia', ascending = False)
# Grand-total row: sums for the count columns; for the version-flag
# column the row count is reported instead of a sum.
PMIDs_df.loc['Total'] = ['N/A','N/A',PMIDs_df['Times appeared in search results'].sum()\
,PMIDs_df['Times cited in Wikipedia'].sum()
#,PMIDs_df['Latest version (1=Yes, 0=No)'].sum()
,PMIDs_df['Version (1=latest, 0=old)'].count()
,PMIDs_df['Times cited in a Top Importance WP articles'].sum()
,PMIDs_df['Times cited in a High Importance WP articles'].sum()
,PMIDs_df['Times cited in a Mid Importance WP articles'].sum()
,PMIDs_df['Times cited in a Low Importance WP articles'].sum()
,PMIDs_df['Times cited in an Unknown Importance WP articles'].sum()
,PMIDs_df['Times cited in a FA quality WP articles'].sum()
,PMIDs_df['Times cited in a A quality WP articles'].sum()
,PMIDs_df['Times cited in a GA quality WP articles'].sum()
,PMIDs_df['Times cited in a B quality WP articles'].sum()
,PMIDs_df['Times cited in a C quality WP articles'].sum()
,PMIDs_df['Times cited in a Start quality WP articles'].sum()
,PMIDs_df['Times cited in a Stub quality WP articles'].sum()
,PMIDs_df['Times cited in a Other quality WP articles'].sum()
,PMIDs_df['Times cited in an Unknown quality WP articles'].sum()
]
# Preview the 10 most-cited PMIDs inline, with PMIDs rendered as PubMed links.
display(PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False).head(10)
.style.set_table_attributes('class="results"')\
.format({'PMIDs of Cochrane reviews (old versions only)': make_clickable_PMID}))
display(HTML('<h2>⋮</h2>'))
# Export the complete table as a standalone, DataTables-enhanced HTML page.
f=open("./results/PMIDs_oldVersions_only.html","w")
f.write(datatables_js_script + PMIDs_df.sort_values(by = 'Times cited in Wikipedia', ascending = False)
.style.set_table_attributes('class="results"')\
.format({'PMIDs of Cochrane reviews (old versions only)': make_clickable_PMID}).render())
f.close()
display(HTML('<H3>See the complete table here (sort, filter, and search):</H3>')
,FileLink('./results/PMIDs_oldVersions_only.html', result_html_prefix='<b>', result_html_suffix='</b>'))
display(Markdown("-----------------------------------------------------"))
Article quality class | Total count |
---|---|
Start | 1829 |
Stub | 1035 |
C | 991 |
B | 541 |
nan | 190 |
GA | 79 |
FA | 23 |
List | 16 |
Disambig | 11 |
Article quality category | Total count |
---|---|
3-Mid | 2151 |
4-Low | 2007 |
2-High | 292 |
nan | 200 |
1-Top | 65 |
taskForce | Total count |
---|---|
NA | 2115 |
Ophthalmology | 237 |
Dermatology | 951 |
Toxicology | 11 |
Cardiology | 186 |
Neurology | 398 |
Pathology | 247 |
Medical genetics | 462 |
Pulmonology | 65 |
Reproductive medicine | 72 |
Hematology-oncology | 251 |
Psychiatry | 74 |
Gastroenterology | 52 |
Nephrology | 68 |
Livestock | 2 |
Radiology | 4 |
Applied Linguistics | 1 |
Emergency medicine and EMS | 6 |
Ethics | 1 |
Sustainability | 1 |
Military logistics and medicine | 1 |
World War I | 1 |
Balkan military history | 1 |
European military history | 1 |
German military history | 1 |
World War II | 1 |
Theoretical Linguistics | 1 |
PMIDs of Cochrane reviews (old versions only) | Title | Times appeared in search results | Times cited in Wikipedia | Version (1=latest, 0=old) | Times cited in a Top Importance WP articles | Times cited in a High Importance WP articles | Times cited in a Mid Importance WP articles | Times cited in a Low Importance WP articles | Times cited in an Unknown Importance WP articles | Times cited in a FA quality WP articles | Times cited in a A quality WP articles | Times cited in a GA quality WP articles | Times cited in a B quality WP articles | Times cited in a C quality WP articles | Times cited in a Start quality WP articles | Times cited in a Stub quality WP articles | Times cited in a Other quality WP articles | Times cited in an Unknown quality WP articles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||
Total | N/A | N/A | 15382 | 1143 | 6894 | 425 | 391 | 286 | 30 | 11 | 86 | 0 | 288 | 521 | 169 | 62 | 5 | 1 | 11 |
1545 | 18254088 | Intravitreal steroids for macular edema in diabetes. | 4 | 2 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
5718 | 27245310 | Speech and language therapy for aphasia following stroke. | 2 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
4633 | 26241698 | Post-pyloric versus gastric tube feeding for preventing pneumonia and improving nutritional outcomes in critically ill adults. | 2 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
5015 | 29664187 | Exercise interventions and patient beliefs for people with hip, knee or hip and knee osteoarthritis: a mixed methods review. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
5022 | 27103611 | Topical NSAIDs for chronic musculoskeletal pain in adults. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
4713 | 26824399 | Surgical versus non-surgical treatment for lumbar spinal stenosis. | 3 | 2 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 |
1148 | 16856036 | Self-help and guided self-help for eating disorders. | 3 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
2661 | 20927726 | Grommets (ventilation tubes) for hearing loss associated with otitis media with effusion in children. | 2 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
6254 | 24170669 | Blood pressure targets for hypertension in people with diabetes mellitus. | 2 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
PMIDs of Cochrane reviews (old versions only) | Title | Times appeared in search results | Times cited in Wikipedia | Version (1=latest, 0=old) | Times cited in a Top Importance WP articles | Times cited in a High Importance WP articles | Times cited in a Mid Importance WP articles | Times cited in a Low Importance WP articles | Times cited in an Unknown Importance WP articles | Times cited in a FA quality WP articles | Times cited in a A quality WP articles | Times cited in a GA quality WP articles | Times cited in a B quality WP articles | Times cited in a C quality WP articles | Times cited in a Start quality WP articles | Times cited in a Stub quality WP articles | Times cited in a Other quality WP articles | Times cited in an Unknown quality WP articles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||
Total | N/A | N/A | 5103 | 34 | 2770 | 4 | 10 | 15 | 4 | 1 | 2 | 0 | 6 | 11 | 11 | 1 | 2 | 0 | 1 |
110 | 24142399 | Pharmacological treatment for pain in Guillain-Barré syndrome. | 14 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
480 | 25102015 | Antioxidant supplementation for lung disease in cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1943 | 26174592 | Assisted reproductive technology: an overview of Cochrane Reviews. | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1242 | 27089005 | Non-absorbable disaccharides versus placebo/no intervention and lactulose versus lactitol for the prevention and treatment of hepatic encephalopathy in people with cirrhosis. | 5 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2554 | 22161393 | Progestogen for treating threatened miscarriage. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
502 | 27552284 | Inhaled corticosteroids for cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1229 | 27884041 | Follow-up strategies for patients treated for non-metastatic colorectal cancer. | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
199 | 26932750 | Anti-vascular endothelial growth factor (VEGF) drugs for treatment of retinopathy of prematurity. | 5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
530 | 25093421 | Pneumococcal vaccines for cystic fibrosis. | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
#------------------------------------------------------------------------------------------------------
# tabular data visualization, and storage in CSV and HTML format (find these files in ./results folder)
#------------------------------------------------------------------------------------------------------
# README text for the project's GitHub page: Binder launch badges plus
# links to the generated result files (served via GitHub Pages).
README="""
# WPM2Cochrane - a tool for linking WikiProject Medicine to the Cochrane Library
## Launch in JupyterLab (recommended) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ajoorabchi/WP2Cochrane/master?urlpath=lab/tree/index.ipynb)
## Launch in Jupyter Notebook [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ajoorabchi/WP2Cochrane/master?filepath=index.ipynb)
## Results of linking (full dataset)
- [Complete Tabular Results in HTML](https://ajoorabchi.github.io/WP2Cochrane/results/full_data.html)
- [Complete Tabular Results in CSV](https://ajoorabchi.github.io/WP2Cochrane/results/full_data.csv)
- [Complete Tabular Results in CSV (SPSS-friendly version)](https://ajoorabchi.github.io/WP2Cochrane/results/full_data_SPSS-friendy.csv)
- [PMIDs statistical summary 1 (LATEST VERSIONS of Cochrane reviews found and/or cited)](https://ajoorabchi.github.io/WP2Cochrane/results/PMIDs_latestVersions_only.html)
- [PMIDs statistical summary 2 (OLD VERSIONS of Cochrane reviews found and/or cited)](https://ajoorabchi.github.io/WP2Cochrane/results/PMIDs_oldVersions_only.html)
## Results of linking (Specialized HTML results per task force)
"""
# Re-declared DataTables bootstrap snippet for the full-data tables;
# this variant additionally pins column 7 (the Cochrane-reviews HTML
# column) to full width.
datatables_js_script="""
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
<script>
$(document).ready( function () {
$('table.results').dataTable({
"lengthMenu": [[10, 25, 50, -1], [10, 25, 50, "All"]], "bAutoWidth": false,
"columnDefs": [
{ "width": "100%", "targets": 7 }
]
});
//$('table.dataframe').dataTable();
} );
</script>
"""
def get_df_info(content: pd.DataFrame):
    """Capture ``DataFrame.info()`` output as text plus a parsed table.

    Adapted from: https://stackoverflow.com/a/44087453/2339926

    NOTE(fix): uses ``io.StringIO`` (already imported at module level)
    instead of ``pandas.compat.StringIO``, which was removed from public
    pandas, and a whitespace-regex ``sep`` instead of the deprecated
    ``delim_whitespace`` option. Parsing behaviour is otherwise unchanged.

    Parameters
    ----------
    content : pd.DataFrame
        Frame whose ``info()`` output should be captured.

    Returns
    -------
    tuple
        ``(info, datatypes)`` — ``info`` is the header/footer summary
        lines of ``info()``; ``datatypes`` is a DataFrame of the
        per-column rows, indexed by column name.
    """
    content_info = io.StringIO()
    content.info(buf=content_info)
    str_ = content_info.getvalue()
    lines = str_.split("\n")
    # The middle lines hold one row per column; the first 3 and last 3
    # lines are summary text (class, index range, dtypes, memory usage).
    table = io.StringIO("\n".join(lines[3:-3]))
    datatypes = pd.read_csv(table, sep=r"\s+",
                            names=["column", "count", "null", "dtype"])
    datatypes.set_index("column", inplace=True)
    info = "\n".join(lines[0:2] + lines[-2:-1])
    return info, datatypes
def make_clickable_wkd_items(val):
    """Render a Wikidata entity URL as an HTML link labelled with its Q-id.

    The label is everything after the fixed 31-character
    'http://www.wikidata.org/entity/' prefix; opens in a new tab.
    """
    entity_id = val[31:]
    return f'<a target="_blank" href="{val}">{entity_id}</a>'
def make_clickable_taskForces(val):
    """Render a list of task-force names as HTML links to their WikiProject
    Medicine pages (one <p> per task force, opening in a new tab).

    Returns None for a falsy value (no task forces) so table cells stay blank.
    """
    if not val:
        return None
    anchors = [
        f'<p><a target="_blank" href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/{name.replace(" ","_")}_task_force">{name}</a></p>'
        for name in val
    ]
    return "".join(anchors)
def make_clickable_WP_en_articles(val):
    """Render a Wikipedia article URL as an HTML link labelled with the
    human-readable title (URL-decoded, underscores turned into spaces).

    The title is everything after the fixed 30-character
    'https://en.wikipedia.org/wiki/' prefix. Falsy input yields None.
    """
    if not val:
        return None
    label = unquote(unquote(val[30:].replace("_", " ")))
    return f'<a target="_blank" href="{val}">{label}</a>'
def list2text(list_):
    """Join a list of strings into a single ", "-separated string.

    Returns "" for an empty list. Replaces the manual concatenate-and-trim
    loop (quadratic string building) with the idiomatic str.join, which
    produces identical output.
    """
    return ", ".join(list_)
#------------------------------------------------------------------------------------------------------
# full-data.csv generator
#------------------------------------------------------------------------------------------------------
# Export the full dataset to CSV, adding plain-text renderings of the
# HTML reviews column and of the task-force list for spreadsheet use.
h = html2text.HTML2Text()
h.ignore_links = False
df_plainText =df.copy()
df_plainText['cochrane_reviews_plainText'] = [h.handle(text) if text is not None else text for text in df['cochrane_reviews_html']]
df_plainText['taskForces_plainText'] = [list2text(text) if text is not None else text for text in df['taskForces']]
df_plainText.to_csv('./results/full_data.csv', index=False)
#------------------------------------------------------------------------------------------------------
# full_data_SPSS-friendy.csv generator (one review per row)
#------------------------------------------------------------------------------------------------------
df_spss_friendly=pd.DataFrame()
def _spss_row(row, pmid, title, cited, latest_version,
              search_results_count, cited_count, outofdate_cited_count):
    """Build a single-row DataFrame for one (disease, review) pair.

    The row Series is transposed into a one-row frame, the raw
    'cochrane_reviews' dict column is dropped, and the per-review /
    per-article summary columns are appended.
    """
    data = pd.DataFrame(row.items())
    data = data.transpose()
    data.columns = data.iloc[0]          # first transposed row holds the column names...
    data = data.drop(data.index[[0]])    # ...so remove it from the data rows
    data = data.drop(columns=['cochrane_reviews'])
    data['PMID'] = pmid
    data['Title'] = title
    data['Cited'] = cited
    data['LatestVersion'] = latest_version
    data['Search results count'] = search_results_count
    data['Cited count'] = cited_count
    data['outofdate_cited_count'] = outofdate_cited_count
    return data
# One output row per (disease, Cochrane review) pair; diseases with no
# reviews found get a single row of None placeholders.
# NOTE(fix): the original accumulated rows with DataFrame.append inside the
# loop, which is quadratic and was removed in pandas 2.0 — rows are now
# collected in a list and concatenated once at the end.
_spss_frames = []
for index, row in tqdm(df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class'
                           ,'importance','taskForces','cochrane_reviews','talkPage_categories']].iterrows()):
    PIMD_paper_dict = row['cochrane_reviews']
    if PIMD_paper_dict:
        # Per-article aggregates, repeated on each review row of the article.
        search_results_count = len(PIMD_paper_dict)
        cited_count = 0
        outofdate_cited_count = 0
        for paper in PIMD_paper_dict.values():
            cited = paper[1]
            latestVersion = paper[2]
            if cited:
                cited_count += 1
                if not latestVersion:
                    outofdate_cited_count += 1
        for PMID, paper in PIMD_paper_dict.items():
            _spss_frames.append(_spss_row(row, PMID, paper[0], paper[1], paper[2],
                                          search_results_count, cited_count,
                                          outofdate_cited_count))
    else:
        _spss_frames.append(_spss_row(row, None, None, None, None,
                                      None, None, None))
if _spss_frames:
    df_spss_friendly = pd.concat(_spss_frames)
# Filename typo ("friendy") kept deliberately: the published README links
# reference this exact name.
df_spss_friendly.to_csv('./results/full_data_SPSS-friendy.csv', index=False)
#------------------------------------------------------------------------------------------------------
# full-data.html generator
#------------------------------------------------------------------------------------------------------
# Per-selector CSS for the styled results tables: left-aligned, fixed max
# widths, top vertical alignment. (NOTE(review): 'blod' in the th font
# shorthand looks like a typo for 'bold' — kept as-is in this doc pass.)
th = dict(selector="th", props=[('text-align', 'left'),('font','blod 14px arial, sans-serif'),('vertical-align','top')])
rh = dict(selector=".row_heading", props=[("text-align", "left"),('font','bold 14px arial, sans-serif'),('vertical-align','top')])
col0 = dict(selector=".col0", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','100px'),('vertical-align','top')])
col1 = dict(selector=".col1", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','130px'),('vertical-align','top')])
col2 = dict(selector=".col2", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','110px'),('vertical-align','top'),('word-wrap','break-word')])
col3 = dict(selector=".col3", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','60px'),('vertical-align','top'),('word-wrap','break-word')])
col4 = dict(selector=".col4", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])
col5 = dict(selector=".col5", props=[("text-align", "left"),('font','bold 12px arial, sans-serif'),('max-width','90px'),('vertical-align','top'),('word-wrap','break-word')])
col6 = dict(selector=".col6", props=[("text-align", "left"),('font','12px arial, sans-serif'),('vertical-align','top')])
# head(fullSize) effectively keeps every row; swap in testSize for a quick
# 10-row test render.
fullSize = sys.maxsize
testSize = 10
df_vizTable = df[['WkD_disease', 'WkD_diseaseLabel', 'WP_en_article','class','importance','taskForces','cochrane_reviews_html']]
# Styled view: importance-sorted, with Wikidata items, Wikipedia articles
# and task forces rendered as clickable links.
df_vizTable_styled=df_vizTable\
    .head(fullSize).rename_axis('index')\
    .sort_values(by=['importance'],na_position='last')\
    .style.set_table_styles([th,rh,col0,col1,col2,col3,col4,col5,col6]).set_table_attributes('class="results"')\
    .format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles,
             'taskForces': make_clickable_taskForces})
# Legend shown above the exported table explaining row grouping and the
# per-review colour coding.
tableGuide='<H3>Table Guide:</H3>\
<p><b>Grouping:</b></p>\
<p>In cases, where there are multiple version of a Cochrane review, they are grouped toghther (showing the same background color)\
,and are listed chronologically,latest version first.</p>\
<p><b>Color Codes:</b></p>\
<ol>\
<li><p style="color:Green;">Green: up-to-date and CITED</p></li>\
<li><p style="color:Red;"> Red: up-to-date and NOT CITED</p></li>\
<li><p style="color:Orange;">Orange: out-of-date and CITED</p></li>\
<li><p style="color:Grey;">Grey: out-of-date and NOT CITED</p></li>\
</ol><hr>'
# Assemble the standalone HTML page: DataTables bootstrap + info summary +
# dtype table + legend + size notice + the styled results table.
f=open("./results/full_data.html","w")
info,datatypes = get_df_info(df_vizTable)
#display(datatypes,datatypes.at['WkD_disease','count'])
table_size_message=f"<H3>Results Table contains <i>{datatypes.at['WkD_disease','count']}</i> rows</H3><hr>"
table_size_warning=f"<H3>This is a large table, so it could take up to 30 seconds to fully load and render in your browser</H3><hr>"
if datatypes.at['WkD_disease','count']>1000:
    table_size_message += table_size_warning
# NOTE(review): Styler.render() is the pre-pandas-1.3 API (to_html() today).
f.write(datatables_js_script + info + datatypes.to_html() + tableGuide + table_size_message
        + df_vizTable_styled.render())
f.close()
#------------------------------------------------------------------------------------------------------
# Creates a dedicated HTML results file for each WikiProject medicine task force group in Wikipedia
#------------------------------------------------------------------------------------------------------
for taskForce, count in taskForces_count.items():
    if taskForce!="NA":
        # Keep only rows whose taskForces list mentions this task force
        # (null taskForces entries are dropped first so the membership lambda is safe).
        vizTable_per_taskForce = df_vizTable.loc[df_vizTable['taskForces'].notnull()]
        mask = vizTable_per_taskForce.taskForces.apply(lambda x: taskForce in x)
        vizTable_per_taskForce = vizTable_per_taskForce[mask]
        # Same sorting / styling / clickable-link formatting as the full table.
        vizTable_per_taskForce_styled = vizTable_per_taskForce.rename_axis('index')\
        .sort_values(by=['importance'],na_position='last')\
        .style.set_table_styles([th,rh,col0,col1,col2,col3,col4,col5,col6]).set_table_attributes('class="results"')\
        .format({'WkD_disease': make_clickable_wkd_items,'WP_en_article': make_clickable_WP_en_articles,
                'taskForces': make_clickable_taskForces})
        info,datatypes = get_df_info(vizTable_per_taskForce)
        #display(datatypes,datatypes.at['WkD_disease','count'])
        table_size_message=f"<H3>Results Table contains <i>{datatypes.at['WkD_disease','count']}</i> rows</H3><hr>"
        # (no placeholders here, so no f-prefix needed)
        table_size_warning="<H3>This is a large table so it needs ~10s to fully load and render in your browser</H3><hr>"
        if datatypes.at['WkD_disease','count']>1000:
            table_size_message += table_size_warning
        # Context manager guarantees each per-task-force file is closed even if rendering raises.
        with open(f"./results/HTML_results_per_task_force/{taskForce}_taskForce.html","w") as f:
            f.write(datatables_js_script + info + datatypes.to_html() + tableGuide + table_size_message
                    + vizTable_per_taskForce_styled.render())
        # Append a link to this page to the README index (URL-quote the name for the link).
        README += f"\n- [{taskForce}](https://ajoorabchi.github.io/WP2Cochrane/results/HTML_results_per_task_force/{quote(taskForce)}_taskForce.html)"
#------------------------------------------------------------------------------------------------------
# display CSV & HTML view/download options
#------------------------------------------------------------------------------------------------------
# List the generated files in ./results/ as clickable download links in the notebook.
display(HTML('<H3>Complete Tabular Results in CSV or HTML Format:</H3>'),FileLinks('./results/'\
            , result_html_prefix='<li><b>', result_html_suffix='</b></li>',recursive=False))
# Same for the per-task-force pages.
# NOTE(review): './results//HTML_results_per_task_force/' contains a doubled
# slash — harmless on POSIX paths, but worth normalising.
display(HTML('<H3>Specialized HTML results for each <a href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine">\
WikiProject medicine</a> task force \
(see list of active task forces <a href="https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Task_forces">\
here</a>):</H3>'),FileLinks('./results//HTML_results_per_task_force/'\
            , result_html_prefix='<li><b>', result_html_suffix='</b></li>',recursive=False))
#display(HTML(datatables_js_script + info + datatypes.to_html() + tableGuide + df_vizTable_styled.render()))
#------------------------------------------------------------------------------------------------------
# write README.md
#------------------------------------------------------------------------------------------------------
# Persist the README (augmented above with per-task-force links) to disk;
# the context manager replaces the manual open/close pair so the handle is
# released even if the write fails.
with open("./README.md","w") as f:
    f.write(README)
12954it [07:10, 30.06it/s]
###### ------------------------------------------------------------------------------------------------------
# WikiData search: searches WkD for a list of diseases
#------------------------------------------------------------------------------------------------------
"""
This is a modified version of code from:
1. https://lawlesst.github.io/notebook/sparql-dataframe.html
2. https://github.com/SuLab/sparql_to_pandas/blob/master/SPARQL_pandas.ipynb
Demonstrating how to get JupyterLab working with Binder:
https://github.com/binder-examples/jupyterlab
https://github.com/binder-examples/jupyter-extension/blob/master/index.ipynb
"""
def get_sparql_dataframe(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    # Issue the query against the endpoint and ask for a JSON response.
    client = SPARQLWrapper(service)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    payload = json.load(response.response)
    # The JSON carries the projected variable names under head/vars and one
    # binding dict per result row; variables left unbound in a row are simply
    # absent, so default them to None via the nested .get() chain.
    columns = payload['head']['vars']
    rows = [
        [binding.get(col, {}).get('value') for col in columns]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
#help: https://en.wikibooks.org/wiki/SPARQL/Wikidata_Query_Service_-_Introduction
# Public Wikidata SPARQL endpoint.
wds = "https://query.wikidata.org/sparql"
# Query: every item that is an instance of (P31) "disease" (Q12136), with its
# English label and, when one exists, its English-Wikipedia article URL
# (the OPTIONAL clause leaves WP_en_article unbound/None otherwise).
rq = """
SELECT ?WkD_disease ?WkD_diseaseLabel ?WP_en_article
WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
?WkD_disease wdt:P31 wd:Q12136.
OPTIONAL {
?WP_en_article schema:about ?WkD_disease .
?WP_en_article schema:inLanguage "en" .
?WP_en_article schema:isPartOf <https://en.wikipedia.org/> .
}
}
#order by desc(?WkD_disease)
"""
# Materialise the result set as the main working DataFrame.
df = get_sparql_dataframe(wds, rq)
#WkD api sample
# client = Client() # doctest: +SKIP
# entity = client.get('Q1472', load=True)
# print (entity)
# print (entity.description)
# image_prop = client.get('P18')
# image = entity[image_prop]
# print (image)
# print(image.image_resolution)
# print(image.image_url)
#------------------------------------------------------------------------------------------------------
# PubMed search
#------------------------------------------------------------------------------------------------------
"""
This is a modified version of code from:
1. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d
2. https://gist.github.com/bonzanini/5a4c39e4c02502a8451d
# Full discussion:
# https://marcobonzanini.wordpress.com/2015/01/12/searching-pubmed-with-python/
"""
def search(index,query):
    """Run a PubMed Entrez esearch for `query` and return the parsed result.

    index -- row number, used only as the log-message prefix.
    Returns the Entrez result dict (results['IdList'] holds up to 200 PMIDs,
    sorted by relevance).
    """
    Entrez.email = 'your.email@example.com'
    logger.debug (f"{index:>5}. PubMed search query: {query}")
    handle = Entrez.esearch(db='pubmed', sort='relevance', retmax='200', retmode='xml', term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Fix: the network handle was never closed (resource leak).
        handle.close()
    return results
def fetch_details(index, id_list):
    """Fetch full PubMed records for `id_list` (PMIDs) via Entrez.efetch.

    Retries with multiplicative back-off on any exception, since NCBI
    intermittently fails under load.

    index   -- row number, used only as the log-message prefix.
    id_list -- list of PMID strings, fetched in a single batch.
    Returns the parsed Entrez result structure.
    Raises the last caught exception if every retry fails.
    """
    ids = ','.join(id_list)
    Entrez.email = 'your.email@example.com'
    sleep_time = 10
    num_retries = 1000
    # Bug fix: the original used `except Exception as error`, but Python 3
    # unbinds that name when the except block exits, so the later
    # `if not error:` raised UnboundLocalError. Keep the caught exception in
    # a separate variable instead.
    last_error = None
    for _ in range(num_retries):
        try:
            logger.debug (f"{index:>5}. Fetching article details for PMIDs: {ids}")
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            results = Entrez.read(handle)
            handle.close()
            last_error = None
        except Exception as exc:
            last_error = exc
            logger.error (f"{index:>5}. I got a HTTPError - reason {last_error}- while trying to fetch the articles details from pubmed")
            logger.error (f"{index:>5}. Sleeping for {sleep_time} seconds before trying again...")
            # Bug fix: was a bare `sleep(...)` (NameError — only `import time`
            # is in scope). Wait before retrying, then back off.
            time.sleep(sleep_time)
            sleep_time *= 1.5
        if last_error is None:
            break
    if last_error is not None:
        logger.critical (f"{index:>5}. Retried fetching article's details {num_retries} times with no success!")
        raise last_error
    return results
def pubmed_search(index, disease, searchTitle, searchAbstract):
    """Search PubMed for Cochrane reviews about `disease` and return the PMIDs.

    searchTitle / searchAbstract select the fields searched:
      True, True  -> Title/Abstract
      True, False -> Title only
    "withdrawn" reviews are excluded, and hits are restricted to
    "The Cochrane database of systematic reviews".
    Returns the list of matching PMIDs (up to 200).
    Raises ValueError for unsupported flag combinations (the original fell
    through these branches and crashed with UnboundLocalError).
    """
    logger.debug (f"{index:>5}. Searching PubMed for {disease} in searchTitle:{searchTitle} , searchAbstract:{searchAbstract}")
    if searchTitle and searchAbstract:
        results = search(index,"("+disease+'[Title/Abstract] NOT "withdrawn"[Title]) AND "The Cochrane database of systematic reviews"[Journal]')
    elif searchTitle:
        results = search(index,"("+disease+'[Title] NOT "withdrawn"[Title]) AND "The Cochrane database of systematic reviews"[Journal]')
    else:
        raise ValueError("pubmed_search: searchTitle must be True (abstract-only search is not supported)")
    #pprint.pprint (results)
    return results['IdList']
# display(len(pubmed_search("Crohn's disease",True,True)))
# display(len(pubmed_search("Crohn's disease",True,False)))
#------------------------------------------------------------------------------------------------------
# string comparator
#------------------------------------------------------------------------------------------------------
# def searh_wp_refs_4title(pubmedArticleTitle,WPpageTitle):
# ''' looksup an article title in a WikiPedia page '''
# #display (wikipedia.WikipediaPage(WPpageTitle).html())
# WPpageHTML= wikipedia.WikipediaPage(WPpageTitle).html()
# ratio = fuzz.ratio(WPpageHTML, pubmedArticleTitle)
# print ("fuzz.ratio:",ratio)
# WPpageHTML=re.sub(r'\W+', '', WPpageHTML.lower())
# pubmedArticleTitle=re.sub(r'\W+', '', pubmedArticleTitle.lower())
# print(pubmedArticleTitle)
# if pubmedArticleTitle in WPpageHTML:
# return True
# else:
# return False
#print (searh_wp_refs_4title("interventions to slow progression of myopia in children...","Near-sightedness"))
def searh_wp_refs_4PMID(PMID,en_article_HTML):
    ''' looksup a PMIDs in a WikiPedia page '''
    # A plain substring test: the PMID string appearing anywhere in the
    # article's rendered HTML is taken to mean the review is cited.
    return PMID in en_article_HTML
#en_article_HTML= wikipedia.WikipediaPage('Near-sightedness').html()
#print (searh_wp_refs_4PMID("22161388",en_article_HTML))
"""------------------------------------------------------------------------------------------------------
Main: this is the main part of the code which encompasses the following steps:
1. content and metadata of WP articles which correspond to a disease in WikiData
are retrieved (title, class, importance, taskforce, content)
2. For each WP article a search query is submitted to PubMed to retrieve a list of relevant Cochrane reviews
3. the retrieved list of reviews are cross checked agains the WP articles reference section to identify cited, missing
and out of date reviews.
4. gathered infomation is converted tocolor-coded tabular format.
------------------------------------------------------------------------------------------------------"""
# HTML-to-plain-text converter (links stripped); used for log output.
h2t = html2text.HTML2Text()
h2t.ignore_links = True
# New result columns, populated row by row in the main loop below.
df['class']=None
df['importance']=None
df['taskForces']=None
df['cochrane_reviews_html']=None
df['cochrane_reviews']=None
df['talkPage_categories']=None
# Two live HTML widgets used as rolling status lines in the notebook:
# one for the disease currently being processed, one for the citation summary.
disease_search_log= widgets.HTML()
citation_search_log= widgets.HTML()
display(disease_search_log)
display(citation_search_log)
# Main loop — for every WikiData disease that has an English-Wikipedia article:
#   1. fetch the article HTML and its Talk-page categories (handling redirects,
#      disambiguation pages, missing talk pages and missing categories),
#   2. derive WikiProject Medicine class / importance / task forces from the
#      Talk-page category names,
#   3. query PubMed for matching Cochrane reviews (narrowing to title-only when
#      >200 hits, widening back, and retrying with the article title when the
#      WikiData label finds nothing), and
#   4. build a colour-coded HTML list marking each review cited/not-cited and
#      latest/old version.
# NOTE(review): df.iterrows() yields per-row *copies*; assignments such as
# row['class'] = ... are not documented to propagate back into df — confirm
# against the pandas docs before relying on the populated columns.
for index, row in tqdm_notebook(df.iterrows(),desc='Progress',total=df['WP_en_article'].count(), unit="WkD_disease"):
    if row['WP_en_article'] is not None:
    #and row['WP_en_article']=="https://en.wikipedia.org/wiki/Agalactia":
        if console_handler.level<=20:
            display(HTML(f"<b>START of row {index:>5}<b>"))
        else:
            logger.info(f"<{index:>5}------------------------------------------------")
        disease_search_log.value= f'<p style="color:blue;"><b>processing:</b> disease #{index:>5}\
  {row["WkD_diseaseLabel"]}   {row["WkD_disease"]}   {row["WP_en_article"]} </p>'
        # Strip the "https://en.wikipedia.org/wiki/" prefix (30 chars) and undo URL-encoding.
        WP_en_article_Title= unquote(unquote(row['WP_en_article'][30:]))
        #will raise a DisambiguationError if the page is a disambiguation page, or a PageError if
        #the page doesn’t exist (although by default, it tries to find the page you meant with suggest and search.)
        try:
            logger.info(f"{index:>5}. Getting the WikiPedia content for: {WP_en_article_Title}")
            WP_en_article_obj = wikipedia.WikipediaPage(WP_en_article_Title)
            WP_en_article_HTML = WP_en_article_obj.html()
            #checking for redirects
            if WP_en_article_Title.casefold() != WP_en_article_obj.title.casefold().replace(" ","_"):
                logger.info(f"{index:>5}. Redirected to: {WP_en_article_obj.title}")
            logger.info(f"{index:>5}. Getting the WikiPedia talkPage_categories for: {WP_en_article_obj.title}")
            WP_en_article_talk_obj = wikipedia.WikipediaPage("Talk:"+ WP_en_article_obj.title)
            row['talkPage_categories'] = WP_en_article_talk_obj.categories
        except wikipedia.exceptions.DisambiguationError as e:
            # NOTE(review): WP_en_article_talk_obj may be unbound (or stale from a
            # previous row) here — the DisambiguationError is raised before it is
            # assigned in the try block. Confirm this log line is safe.
            logger.warning (f"{index:>5}. '{WP_en_article_talk_obj.title}' is an Ambiguous title: {e.options}")
            logger.warning (f"{index:>5}. Picking the first sense in the list by default: {e.options[0]}")
            WP_en_article_obj = wikipedia.WikipediaPage(e.options[0])
            WP_en_article_HTML = WP_en_article_obj.html()
            #checking for redirects
            if e.options[0].casefold() != WP_en_article_obj.title.casefold().replace(" ","_"):
                logger.warning (f"{index:>5}. Redirected to: {WP_en_article_obj.title}")
            logger.warning (f"{index:>5}. Getting the talkPage_categories for: {WP_en_article_obj.title}")
            WP_en_article_talk_obj = wikipedia.WikipediaPage("Talk:"+ WP_en_article_obj.title)
            row['talkPage_categories'] = WP_en_article_talk_obj.categories
        except wikipedia.exceptions.PageError as e:
            logger.error (f"{index:>5}. {WP_en_article_obj.title} - I got a PageError - reason: {e} - Article has no talk page yet")
        except KeyError as e:
            logger.error (f"{index:>5}. {WP_en_article_obj.title} - I got a KeyError - reason: {e} - Article's Talk page has no Category")
        finally:
            if console_handler.level==10:
                display(row['talkPage_categories'])
            else:
                logger.debug(row['talkPage_categories'])
        logger.debug(f"{index:>5}. Extracting task forces, class, and importance")
        taskForces=[]
        # Talk-page categories look like "<Class>-class medicine articles",
        # "<Importance>-importance medicine articles" and "<Name> task force articles";
        # the fixed-length suffixes are sliced off to recover the embedded value.
        for cat in row['talkPage_categories'] or []:
            if cat.casefold().endswith("-class medicine articles"):
                row['class'] = cat[0:-24]
            if cat.casefold().endswith('-importance medicine articles'):
                # Map raw importance markers to sortable "<rank>-<name>" strings.
                imp = {
                    'NA' : None,
                    '???' : None,
                    'Unknown' : None,
                    'Low' : '4-Low',
                    'Mid' : '3-Mid',
                    'High' : '2-High',
                    'Top' : '1-Top'
                }
                if imp[cat[0:-29]]:
                    row['importance'] = imp[cat[0:-29]]
                else:
                    row['importance'] = None
            if cat.casefold().endswith(' task force articles') and "wikiproject" not in cat.casefold():
                taskForce = cat[0:-20]
                taskForces.append(taskForce)
        if taskForces:
            row['taskForces']=taskForces
        if console_handler.level<=20:
            display(HTML(f"{index:>5}. class: {row['class']}, importance: {row['importance']}, task forces: {row['taskForces']}"))
        else:
            logger.info(f"{index:>5}. class: {row['class']}, importance: {row['importance']}, task forces: {row['taskForces']}")
        matches=0
        PIMD_paper_dict={}
        # PubMed query strategy: start wide (title/abstract); if the 200-hit cap is
        # reached, narrow to title-only; if that finds nothing, widen back. If the
        # WikiData label finds nothing and differs from the article title (minus any
        # parenthetical qualifier), repeat the same dance with the article title.
        logger.info(f"{index:>5}. searching Pubmed for WkD_diseaseLabel: {row['WkD_diseaseLabel']}")
        id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,True)
        if len(id_list)==200:
            logger.warning(f"{index:>5}. (a) {row['WkD_diseaseLabel']} - Too many matches found (>200)")
            logger.warning(f"{index:>5}. (b) restrcting search to Titles only (excluding Abstracts)")
            time.sleep(1)
            id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,False)
            if len(id_list)==0:
                logger.warning(f"{index:>5}. (a) {row['WkD_diseaseLabel']} - Restrcting search to Titles only returned no results")
                logger.warning(f"{index:>5}. (b) reverting back to title/abstract")
                time.sleep(1)
                id_list= pubmed_search(index,row['WkD_diseaseLabel'],True,True)
        if not id_list and row['WkD_diseaseLabel'].lower()!= re.sub(r" ?\([^)]+\)", "", WP_en_article_Title.replace("_", " ").lower()):
            logger.warning(f"{index:>5}. (a) searching for '{row['WkD_diseaseLabel']}' returned {len(id_list)} results")
            logger.warning(f"{index:>5}. (b) searching for '{WP_en_article_Title.replace('_', ' ')}' instead")
            time.sleep(1)
            id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,True)
            if len(id_list)==200:
                logger.warning(f"{index:>5}. (a) {WP_en_article_Title.replace('_',' ')} - Retruned too many matches (>200)")
                logger.warning(f"{index:>5}. (b) restrcting search to Titles only (excluding Abstracts)")
                time.sleep(1)
                id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,False)
                if len(id_list)==0:
                    logger.warning(f"{index:>5}. (a) {WP_en_article_Title.replace('_',' ')} - Restrcting search to Titles only returned no results")
                    logger.warning(f"{index:>5}. (b) reverting back to title/abstract")
                    time.sleep(1)
                    id_list= pubmed_search(index,WP_en_article_Title.replace("_", " "),True,True)
        logger.info(f"{index:>5}. {len(id_list)} matching PMIDs found.")
        if id_list:
            papers = fetch_details(index,id_list)
            #print(json.dumps(papers, indent=5))
            for i, paper in enumerate(papers['PubmedArticle']):
                articleTitle = paper['MedlineCitation']['Article']['ArticleTitle']
                PMID = int(paper['MedlineCitation']['PMID'])
                # cited == True iff the PMID string appears in the article HTML.
                cited = searh_wp_refs_4PMID(str(PMID),WP_en_article_HTML)
                if cited:
                    matches +=1
                #display(articleTitle,PMID,cited)
                # latestVersion is filled in during the rendering pass below.
                latestVersion=None
                PIMD_paper_dict[int(PMID)]=[articleTitle,cited,latestVersion]
            # Case-insensitive sort by title (the key lowercases the title
            # character by character, which orders the same as lowercasing it whole).
            PIMD_paper_dict = OrderedDict(sorted(PIMD_paper_dict.items(), key=lambda t: [str(title).lower() for title in t[1][0]]))
            #display(PIMD_paper_dict)
        row['cochrane_reviews']= PIMD_paper_dict
        if PIMD_paper_dict:
            row['cochrane_reviews_html']='<div align="left" style="margin:0px;"><ol start="1" style="margin-left:0px">'
            # NOTE(review): bgc starts as "White" but is only ever compared against
            # "white" below, so the very first comparison never matches — verify the
            # intended alternating-background behaviour.
            bgc="White"
            for PMID, paper in PIMD_paper_dict.items():
                cited=paper[1]
                if cited:
                    color="green"
                    cited_message="<b> [CITED] </b>"
                if not cited:
                    color="red"
                    cited_message="<b> [NOT CITED] </b>"
                title=paper[0]
                latestVersion=True
                # Other entries sharing this title are earlier/later versions of the
                # same review; a higher PMID among them marks this one as outdated.
                foundItems = (key for key, vals in PIMD_paper_dict.items() if title.lower() in [str(val).lower() for val in vals] and key!=PMID)
                for item in foundItems:
                    #display(item)
                    if item > PMID:
                        latestVersion=False
                paper[2]=latestVersion
                #display(title,PMID,cited,latestVersion)
                if latestVersion:
                    version_message="<b> [LATEST Version] </b>"
                else:
                    version_message="<b> [OLD Version] </b>"
                    # Old versions override the colour: orange if cited, grey otherwise.
                    if cited:
                        color="orange"
                        version_message +="<b> [UPDATE NEEDED] </b>"
                    else:
                        color="grey"
                # Alternate the background colour at each new latest-version entry so
                # version groups share a background.
                if latestVersion:
                    if bgc=="white":
                        bgc="#E0F5FE"
                    else:
                        bgc="white"
                row['cochrane_reviews_html']+='<li style="padding:5px;color:'+color+';background-color:'+bgc+';">'\
                +paper[0]+' <a target="_blank" href="https://www.ncbi.nlm.nih.gov/pubmed/'\
                +str(PMID)+'">PMID: '+str(PMID)+'</a>'+cited_message+version_message+"</li>"
            row['cochrane_reviews_html']+="</ol></div>"
        else:
            row['cochrane_reviews_html']="No matching publication found!"
        if console_handler.level<=10: #10=DEBUG
            display(HTML(row['cochrane_reviews_html']))
        else:
            #logger.debug(h2t.handle(str(row['cochrane_reviews_html'])))
            logger.debug(row['cochrane_reviews_html'])
        citation_search_log.value = f'<p style="color:green;"><b>processed</b>: disease #{index:>5}   \
        {row["WkD_diseaseLabel"]}   {row["WkD_disease"]}   {row["WP_en_article"]}</p> \
        <p style="color:green;"><b>{matches} of {len(id_list)}</b>\
        Cochrane reviews found (via PubMed) are cited in the Wikipedia article: {WP_en_article_Title}</p>'
        if console_handler.level<=20:
            display(HTML(f"<b>END of row {index:>5}<b><hr>"))
        else:
            logger.info(f"------------------------------------------------{index:>5}><hr>")
"""------------------------------------------------------------------------------------------------------
# Once all the tasks (above cells) are completed successfuly, the dataframe
# is sotored in persistent storage for future use (e.g., Binder)
#------------------------------------------------------------------------------------------------------"""
# Creating a datetime object
current_datetime = datetime.now()
# Converting a to string in the desired format (YYYYMMDD) using strftime
# and then to int.
current_datetime_int = int(current_datetime.strftime('%Y%m%d'))
display(HTML(f"<h2>Saving df in ./persistent_storage/{current_datetime_int}.pkl"))
logger.info(f"Saving df in ./persistent_storage/{current_datetime_int}.pkl")
# with open('./persistent_storage/df.dill', 'wb') as out_strm:
# dill.dump(df, out_strm)
#dill.dump_session('./persistent_storage/dill_session')
df.to_pickle(f"./persistent_storage/{current_datetime_int}.pkl")
HTML(value='')
HTML(value='')
HBox(children=(IntProgress(value=0, description='Progress', max=4717, style=ProgressStyle(description_width='i…
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 376 try: # Python 2.7, use buffering of HTTP responses --> 377 httplib_response = conn.getresponse(buffering=True) 378 except TypeError: # Python 3 TypeError: getresponse() got an unexpected keyword argument 'buffering' During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) <ipython-input-7-e0f69c66ef4e> in <module> 45 logger.info(f"{index:>5}. Getting the WikiPedia content for: {WP_en_article_Title}") 46 WP_en_article_obj = wikipedia.WikipediaPage(WP_en_article_Title) ---> 47 WP_en_article_HTML = WP_en_article_obj.html() 48 49 #checking for redirects ~/anaconda3/lib/python3.6/site-packages/wikipedia/wikipedia.py in html(self) 452 } 453 --> 454 request = _wiki_request(query_params) 455 self._html = request['query']['pages'][self.pageid]['revisions'][0]['*'] 456 ~/anaconda3/lib/python3.6/site-packages/wikipedia/wikipedia.py in _wiki_request(params) 735 time.sleep(int(wait_time.total_seconds())) 736 --> 737 r = requests.get(API_URL, params=params, headers=headers) 738 739 if RATE_LIMIT: ~/anaconda3/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs) 73 74 kwargs.setdefault('allow_redirects', True) ---> 75 return request('get', url, params=params, **kwargs) 76 77 ~/anaconda3/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs) 58 # cases, and look like a memory leak in others. 
59 with sessions.Session() as session: ---> 60 return session.request(method=method, url=url, **kwargs) 61 62 ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 531 } 532 send_kwargs.update(settings) --> 533 resp = self.send(prep, **send_kwargs) 534 535 return resp ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs) 666 667 # Resolve redirects if allowed. --> 668 history = [resp for resp in gen] if allow_redirects else [] 669 670 # Shuffle things around if there's history. ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in <listcomp>(.0) 666 667 # Resolve redirects if allowed. --> 668 history = [resp for resp in gen] if allow_redirects else [] 669 670 # Shuffle things around if there's history. ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in resolve_redirects(self, resp, req, stream, timeout, verify, cert, proxies, yield_requests, **adapter_kwargs) 245 proxies=proxies, 246 allow_redirects=False, --> 247 **adapter_kwargs 248 ) 249 ~/anaconda3/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs) 644 645 # Send the request --> 646 r = adapter.send(request, **kwargs) 647 648 # Total elapsed time of the request (approximately) ~/anaconda3/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 447 decode_content=False, 448 retries=self.max_retries, --> 449 timeout=timeout 450 ) 451 ~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 598 timeout=timeout_obj, 599 body=body, headers=headers, --> 600 chunked=chunked) 601 602 # If we're going to release the connection in ``finally:``, then 
~/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 378 except TypeError: # Python 3 379 try: --> 380 httplib_response = conn.getresponse() 381 except Exception as e: 382 # Remove the TypeError from the exception chain in Python 3; ~/anaconda3/lib/python3.6/http/client.py in getresponse(self) 1329 try: 1330 try: -> 1331 response.begin() 1332 except ConnectionError: 1333 self.close() ~/anaconda3/lib/python3.6/http/client.py in begin(self) 295 # read until we get a non-100 response 296 while True: --> 297 version, status, reason = self._read_status() 298 if status != CONTINUE: 299 break ~/anaconda3/lib/python3.6/http/client.py in _read_status(self) 256 257 def _read_status(self): --> 258 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1") 259 if len(line) > _MAXLINE: 260 raise LineTooLong("status line") ~/anaconda3/lib/python3.6/socket.py in readinto(self, b) 584 while True: 585 try: --> 586 return self._sock.recv_into(b) 587 except timeout: 588 self._timeout_occurred = True ~/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py in recv_into(self, *args, **kwargs) 292 def recv_into(self, *args, **kwargs): 293 try: --> 294 return self.connection.recv_into(*args, **kwargs) 295 except OpenSSL.SSL.SysCallError as e: 296 if self.suppress_ragged_eofs and e.args == (-1, 'Unexpected EOF'): ~/anaconda3/lib/python3.6/site-packages/OpenSSL/SSL.py in recv_into(self, buffer, nbytes, flags) 1819 result = _lib.SSL_peek(self._ssl, buf, nbytes) 1820 else: -> 1821 result = _lib.SSL_read(self._ssl, buf, nbytes) 1822 self._raise_ssl_error(self._ssl, result) 1823 KeyboardInterrupt: