import os
import gzip
import io
import json
import xml.etree.ElementTree as ET
import networkx
import pandas
#!wget --timestamping --directory-prefix download/ ftp://nlmpubs.nlm.nih.gov/online/mesh/.xmlmesh/desc2015.gz
# Parse the gzipped MeSH 2015 descriptor XML release into an ElementTree
# and keep a handle on the document root for the extraction passes below.
xml_path = os.path.join('download', 'desc2015.gz')
with gzip.open(xml_path) as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()
# Extract mesh terms: flatten every (descriptor, concept, term) triple in
# the release into one row per term, carrying along the XML attributes of
# the Concept and Term elements (e.g. PreferredConceptYN, LexicalTag).
term_dicts = []
for descriptor in root:
    descriptor_ui = descriptor.findtext('DescriptorUI')
    for concept in descriptor.findall('ConceptList/Concept'):
        for term in concept.findall('TermList/Term'):
            row = {
                'DescriptorUI': descriptor_ui,
                'ConceptUI': concept.findtext('ConceptUI'),
                'TermUI': term.findtext('TermUI'),
                'TermName': term.findtext('String'),
            }
            row.update(concept.attrib)
            row.update(term.attrib)
            term_dicts.append(row)

columns = ['DescriptorUI', 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName',
           'ConceptPreferredTermYN', 'IsPermutedTermYN', 'LexicalTag', 'PrintFlagYN', 'RecordPreferredTermYN']
term_df = pandas.DataFrame(term_dicts)[columns]
term_df.to_csv('data/descriptor-terms.tsv', index=False, sep='\t')

# Test whether MeSH term names are unique (notebook output was True)
len(term_df) == len(set(term_df.TermName))
# Parse MeSH xml release: one record per descriptor with its id, name,
# distinct semantic-type UIs, and tree numbers.
terms = []
for descriptor in root:
    record = {
        'mesh_id': descriptor.findtext('DescriptorUI'),
        'mesh_name': descriptor.findtext('DescriptorName/String'),
        'semantic_types': list({st.text for st in descriptor.findall(
            'ConceptList/Concept/SemanticTypeList/SemanticType/SemanticTypeUI')}),
        'tree_numbers': [tn.text for tn in descriptor.findall('TreeNumberList/TreeNumber')],
    }
    terms.append(record)

# Determine ontology parents: the parent of a tree number is obtained by
# dropping its last dotted component; top-level numbers (no dot) have none.
tree_number_to_id = {tn: record['mesh_id'] for record in terms for tn in record['tree_numbers']}
for record in terms:
    parent_ids = set()
    for tree_number in record['tree_numbers']:
        if '.' in tree_number:
            parent_tn = tree_number.rsplit('.', 1)[0]
            parent_ids.add(tree_number_to_id[parent_tn])
    record['parents'] = list(parent_ids)

path = os.path.join('data', 'mesh.json')
with open(path, 'w') as write_file:
    json.dump(terms, write_file, indent=2)
# Create a networkx directed graph representing the MeSH ontology,
# with edges pointing from parent to child descriptor.
network = networkx.DiGraph()

# nodes: one per descriptor, keyed by mesh_id
for record in terms:
    network.add_node(record['mesh_id'], name=record['mesh_name'])

# edges: parent -> child
for record in terms:
    for parent_id in record['parents']:
        network.add_edge(parent_id, record['mesh_id'])

# An ontology derived from tree numbers must be acyclic.
assert networkx.is_directed_acyclic_graph(network)
networkx.write_gexf(network, 'data/ontology.gexf.gz')
# Read mesh back from the serialized JSON and export two tables:
# an id/name table and a (mesh_id, mesh_name, mesh_tree_number) table.
path = os.path.join('data', 'mesh.json')
with open(path) as read_file:
    mesh = json.load(read_file)

mesh_df = pandas.DataFrame.from_dict(mesh)[['mesh_id', 'mesh_name']]
mesh_df.to_csv('data/terms.tsv', sep='\t', index=False)

# Extract (mesh_id, mesh_tree_number) pairs -- one row per tree number
rows = [
    [entry['mesh_id'], entry['mesh_name'], tree_number]
    for entry in mesh
    for tree_number in entry['tree_numbers']
]
tn_df = pandas.DataFrame(rows, columns=['mesh_id', 'mesh_name', 'mesh_tree_number'])
tn_df.to_csv('data/tree-numbers.tsv', sep='\t', index=False)
def is_human_disease(tn):
    """
    Given a tree number, return whether the hierarchical path suggests a
    human disease.

    Included branches: F03 (mental disorders), C01 through C21,
    C24 through C26, and C23 excluding C23.888 (Symptoms and Signs).
    """
    # F03 (mental disorders)
    if tn.startswith('F03'):
        return True
    # C01 through C21 and C24 -- C26.
    # Bug fix: the original iterated over range(1, 22) plus the strings
    # 'C24'/'C25'/'C26' and built 'C' + str(i).zfill(2), which turned
    # 'C24' into 'CC24' (zfill is a no-op on a 3-char string) -- a prefix
    # that can never match, so C24-C26 were silently excluded.
    for i in list(range(1, 22)) + [24, 25, 26]:
        if tn.startswith('C{:02d}'.format(i)):
            return True
    # C23 excluding C23.888 (Symptoms and Signs)
    if tn.startswith('C23') and not tn.startswith('C23.888'):
        return True
    return False
# Descriptors with at least one tree number on a human-disease branch.
diseases = set()
for record in terms:
    if any(is_human_disease(tn) for tn in record['tree_numbers']):
        diseases.add(record['mesh_id'])
len(diseases)  # notebook output was 4501
# Read HSDN symptom occurrences (Human Symptoms-Disease Network)
url = 'https://raw.githubusercontent.com/LABrueggs/HSDN/master/Symptom-Occurence-Output.tsv'
hsdn_symptom_df = pandas.read_table(url, index_col=0)
hsdn_symptoms = hsdn_symptom_df['MeSH Symptom ID']

# find MeSH symptoms: all descendants of D012816 (signs and symptoms)
symptoms = networkx.descendants(network, 'D012816')
# Bug fix: the original assigned a new column into a slice of mesh_df and
# globally set pandas.options.mode.chained_assignment = None to silence the
# resulting SettingWithCopyWarning. Taking an explicit copy makes the
# assignment unambiguous and removes the need to mutate global pandas state.
symptom_df = mesh_df[mesh_df.mesh_id.isin(symptoms)].copy()
symptom_df['in_hsdn'] = symptom_df.mesh_id.isin(hsdn_symptoms).astype(int)
symptom_df.to_csv('data/symptoms.tsv', index=False, sep='\t')
sum(symptom_df.in_hsdn)  # notebook output was 319
# All descendants of D064420 (Drug-Related Side Effects and Adverse Reactions)
side_effects = networkx.descendants(network, 'D064420')
side_effect_df = mesh_df[mesh_df.mesh_id.isin(side_effects)]
len(side_effect_df)  # count of side-effect terms