Implementation of diffusion hetmech¶

In [1]:

import pandas
from neo4j.v1 import GraphDatabase
import hetio.readwrite

from hetmech.diffusion import diffuse

In [2]:

url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph

In [3]:

# MetaGraph node/edge count
metagraph.n_nodes, metagraph.n_edges

Out[3]:

(11, 24)

In [4]:

# Graph node/edge count
graph.n_nodes, graph.n_edges

Out[4]:

(47031, 2250197)

In [5]:

# Uses the official neo4j-python-driver. See https://github.com/neo4j/neo4j-python-driver

query = '''
MATCH (disease:Disease)-[assoc:ASSOCIATES_DaG]-(gene:Gene)
WHERE disease.name = 'epilepsy syndrome'
RETURN
 gene.name AS gene_symbol,
 gene.description AS gene_name,
 gene.identifier AS entrez_gene_id,
 assoc.sources AS sources
ORDER BY gene_symbol
'''

driver = GraphDatabase.driver("bolt://neo4j.het.io")
with driver.session() as session:
    result = session.run(query)
    gene_df = pandas.DataFrame((x.values() for x in result), columns=result.keys())

gene_df.head()

Out[5]:

	gene_symbol	gene_name	entrez_gene_id	sources
0	ABAT	4-aminobutyrate aminotransferase	18	[DisGeNET]
1	ABCB1	ATP-binding cassette, sub-family B (MDR/TAP), ...	5243	[DISEASES, DOAF, DisGeNET]
2	ABCC2	ATP-binding cassette, sub-family C (CFTR/MRP),...	1244	[DisGeNET]
3	ABCG2	ATP-binding cassette, sub-family G (WHITE), me...	9429	[DisGeNET]
4	ACKR4	atypical chemokine receptor 4	51554	[DISEASES]

In [6]:

epilepsy_genes = list()
for entrez_gene_id in gene_df.entrez_gene_id:
    node_id = 'Gene', entrez_gene_id
    node = graph.node_dict.get(node_id)
    if node:
        epilepsy_genes.append(node)
len(epilepsy_genes)

Out[6]:

In [7]:

metapath = metagraph.metapath_from_abbrev('GiGpBP')
source_node_weights = {gene: 1 for gene in epilepsy_genes}
pathway_scores = diffuse(graph, metapath, source_node_weights, column_damping=1, row_damping=1)
target_df = pandas.DataFrame(list(pathway_scores.items()), columns=['target_node', 'score'])
target_df['target_name'] = target_df.target_node.map(lambda x: graph.node_dict[('Biological Process', x)].name)
target_df = target_df.sort_values('score', ascending=False)

In [8]:

len(target_df)

Out[8]:

In [9]:

sum(target_df.score)

Out[9]:

353.7693384197814

In [10]:

metapath

Out[10]:

GiGpBP

In [11]:

target_df.head()

Out[11]:

	target_node	score	target_name
4751	GO:0035235	1.091022	ionotropic glutamate receptor signaling pathway
2530	GO:0010992	1.038370	ubiquitin homeostasis
1783	GO:0007586	0.971243	digestion
7663	GO:0060081	0.948960	membrane hyperpolarization
1485	GO:0006895	0.907327	Golgi to endosome transport

Diagnosing ubiquitin homeostasis¶

ubiquitin homeostasis contains 3 genes: [UBB, UBC, IDE]

MATCH (bp:BiologicalProcess)-[rel:PARTICIPATES_GpBP]-(gene)-[INTERACTS_GiG]-(gene_target)
WHERE bp.name ='ubiquitin homeostasis'
RETURN
  gene.name AS ubiquitin_homeostasis_gene,
  count(gene_target) AS n_interacting_genes

Returns the following table:

ubiquitin_homeostasis_gene	n_interacting_genes
IDE	243
UBC	9371
UBB	1040