import pandas
from neo4j.v1 import GraphDatabase
import hetio.readwrite
from hetmech.diffusion import diffuse
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0.json.bz2'
graph = hetio.readwrite.read_graph(url)
metagraph = graph.metagraph
# MetaGraph node/edge count
metagraph.n_nodes, metagraph.n_edges
(11, 24)
# Graph node/edge count
graph.n_nodes, graph.n_edges
(47031, 2250197)
# Uses the official neo4j-python-driver. See https://github.com/neo4j/neo4j-python-driver
query = '''
MATCH (disease:Disease)-[assoc:ASSOCIATES_DaG]-(gene:Gene)
WHERE disease.name = 'epilepsy syndrome'
RETURN
gene.name AS gene_symbol,
gene.description AS gene_name,
gene.identifier AS entrez_gene_id,
assoc.sources AS sources
ORDER BY gene_symbol
'''
driver = GraphDatabase.driver("bolt://neo4j.het.io")
with driver.session() as session:
result = session.run(query)
gene_df = pandas.DataFrame((x.values() for x in result), columns=result.keys())
gene_df.head()
gene_symbol | gene_name | entrez_gene_id | sources | |
---|---|---|---|---|
0 | ABAT | 4-aminobutyrate aminotransferase | 18 | [DisGeNET] |
1 | ABCB1 | ATP-binding cassette, sub-family B (MDR/TAP), ... | 5243 | [DISEASES, DOAF, DisGeNET] |
2 | ABCC2 | ATP-binding cassette, sub-family C (CFTR/MRP),... | 1244 | [DisGeNET] |
3 | ABCG2 | ATP-binding cassette, sub-family G (WHITE), me... | 9429 | [DisGeNET] |
4 | ACKR4 | atypical chemokine receptor 4 | 51554 | [DISEASES] |
epilepsy_genes = list()
for entrez_gene_id in gene_df.entrez_gene_id:
node_id = 'Gene', entrez_gene_id
node = graph.node_dict.get(node_id)
if node:
epilepsy_genes.append(node)
len(epilepsy_genes)
399
metapath = metagraph.metapath_from_abbrev('GiGpBP')
source_node_weights = {gene: 1 for gene in epilepsy_genes}
pathway_scores = diffuse(graph, metapath, source_node_weights, column_damping=1, row_damping=1)
target_df = pandas.DataFrame(list(pathway_scores.items()), columns=['target_node', 'score'])
target_df['target_name'] = target_df.target_node.map(lambda x: graph.node_dict[('Biological Process', x)].name)
target_df = target_df.sort_values('score', ascending=False)
len(target_df)
11381
sum(target_df.score)
353.7693384197814
metapath
GiGpBP
target_df.head()
target_node | score | target_name | |
---|---|---|---|
4751 | GO:0035235 | 1.091022 | ionotropic glutamate receptor signaling pathway |
2530 | GO:0010992 | 1.038370 | ubiquitin homeostasis |
1783 | GO:0007586 | 0.971243 | digestion |
7663 | GO:0060081 | 0.948960 | membrane hyperpolarization |
1485 | GO:0006895 | 0.907327 | Golgi to endosome transport |
ubiquitin homeostasis contains 3 genes: [UBB, UBC, IDE]
MATCH (bp:BiologicalProcess)-[rel:PARTICIPATES_GpBP]-(gene)-[INTERACTS_GiG]-(gene_target)
WHERE bp.name ='ubiquitin homeostasis'
RETURN
gene.name AS ubiquitin_homeostasis_gene,
count(gene_target) AS n_interacting_genes
Returns the following table:
ubiquitin_homeostasis_gene | n_interacting_genes |
---|---|
IDE | 243 |
UBC | 9371 |
UBB | 1040 |