#!/usr/bin/env python
# coding: utf-8

# ## Fear of Bees: Extracting Ontologies from Wikidata
#
# Wikidata includes links between entities using predicates such as SubClassOf (P279). These form a classification hierarchy,
# although as this comes from multiple sources, it may not conform to the same rules as ontology hierarchies.
#
# OntoBio includes a wikidata ontology factory, so we can transparently create an Ontology object from wikidata,
# and leverage the same methods available in ontobio.
#
# This example is focused around [Anxiety disorders](https://www.wikidata.org/wiki/Q544006)
#

# In[1]:

from ontobio.ontol_factory import OntologyFactory
f = OntologyFactory()

## OntologyFactory recognizes the prefix wdq for wikidata queries;
## We use this to make a sub-ontology
## (currently we have no lazy wrapper for WD, only Eager, so we limit the size)
ont = f.create('wdq:Q544006')  # Anxiety disorder


# In[2]:

## Find terms starting with Anxiety in the sub-ontology
qids = ont.search('Anxiety%')
qids  # bare expression: displayed by the notebook, a no-op when run as a script


# In[3]:

## Traverse up and down from query node in our sub-ontology
nodes = ont.traverse_nodes(qids, up=True, down=True)
labels = [ont.label(n) for n in nodes]
labels[:25]  # notebook display of the first 25 labels


# In[16]:

## Test for cycles
import networkx as nx
g = ont.get_graph()


def show_cycle(nl):
    """Print the nodes of one cycle as a list of "<id> <label>" strings.

    nl -- iterable of node ids; each label is looked up on the
          module-level ontology ``ont``.
    """
    print(["{} {}".format(n, ont.label(n)) for n in nl])


cycles_list = list(nx.simple_cycles(g))
# A sub-ontology without cycles yields an empty list; indexing [0] on it
# would raise IndexError, so only show a cycle when one was actually found.
if cycles_list:
    show_cycle(cycles_list[0])
else:
    print("No cycles found")


# In[5]:

## Show our extract of the sub-ontology as an ascii tree
## (note this is resilient to cycles)

## only traverse down from our query nodes
## (including ancestors causes multiple paths, and a verbose display)
nodes = ont.traverse_nodes(qids, up=False, down=True)

from ontobio.io.ontol_renderers import GraphRenderer
w = GraphRenderer.create('tree')
w.write_subgraph(ont, nodes, query_ids=qids)


# In[6]:

## Show as graph using GraphViz
## We can do this for both descendants and ancestors
nodes = ont.traverse_nodes(qids, up=True, down=True)
w = GraphRenderer.create('png')
w.outfile = 'output/anxiety-disorder.png'
w.write_subgraph(ont, nodes, query_ids=qids)


# ![img](output/anxiety-disorder.png)

# ## Querying for associated entities
#
# TODO: Drugs
#

# In[4]:

## What proteins are associated with PTSD? (via GWAS)
# The single-element unpacking requires the search to return exactly one
# match; any other number of results raises ValueError here.
[ptsd] = ont.search('post-traumatic stress disorder')

import ontobio.sparql.wikidata as wd
proteins = wd.canned_query('disease2protein', ptsd)


# In[5]:

proteins  # notebook display of the PTSD query result


# In[10]:

## Find GO terms for all genes/products associated with all nodes in Anxiety sub-ontology

## First create a GO handle and get association sets for GO (in human)
go = f.create('go')

from ontobio.assoc_factory import AssociationSetFactory
afactory = AssociationSetFactory()
aset = afactory.create(ontology=go,
                       subject_category='gene',
                       object_category='function',
                       taxon='NCBITaxon:9606')


# In[19]:

# For every node of the sub-ontology, query its associated proteins and
# print the annotations found for them, labelled via the GO handle.
# NOTE: this rebinds the module-level name `proteins` on each iteration.
for n in ont.nodes():
    proteins = wd.canned_query('disease2protein', n)
    anns = [a for p in proteins for a in aset.annotations(p)]
    if anns:
        print("{} {}".format(n, ont.label(n)))
        for a in anns:
            print(" {} {}".format(a, go.label(a)))


# In[ ]: