import configparser
import math
import psycopg2
import pandas
from neo4j import GraphDatabase
import tqdm
import hetio.readwrite
import hetio.neo4j
# Identifier of the disease node (epilepsy) whose paths are analysed below
epilepsy_id = 'DOID:1826'

# The database password is kept out of the source, in the [psql] section of database.ini
parser = configparser.ConfigParser()
parser.read('database.ini')
psql_section = parser['psql']
db_password = psql_section['password']
# Fetch path-count rows (dwpc, p_value, metapath_id) for every node pair involving epilepsy.
# The subquery `top_ids` finds path-count rows whose source or target node has the identifier
# `epilepsy_id` and attaches both node names; the outer join then pulls every path-count row
# for those same pairs in either source/target orientation. Rows come back ordered by
# ascending p-value. There is no LIMIT clause, so all matching rows are returned.
# NOTE(review): the pairs are presumably all compound-disease pairs -- confirm against the data.
query = f'''SELECT outer_pc.dwpc as dwpc, outer_pc.p_value as p_value, outer_pc.metapath_id as metapath_id,
top_ids.source_name as source_name, top_ids.target_name as target_name
FROM
(SELECT dwpc, p_value, metapath_id, source_id, target_id, n1.name AS source_name, n2.name AS target_name
FROM dj_hetmech_app_pathcount pc
JOIN dj_hetmech_app_node join_node
ON pc.target_id=join_node.id OR pc.source_id=join_node.id
JOIN dj_hetmech_app_node n1
ON pc.source_id = n1.id
JOIN dj_hetmech_app_node n2
ON pc.target_id = n2.id
WHERE join_node.identifier='{epilepsy_id}'
ORDER BY pc.p_value) AS top_ids
JOIN dj_hetmech_app_pathcount outer_pc
ON (top_ids.source_id = outer_pc.source_id AND
top_ids.target_id = outer_pc.target_id) OR
(top_ids.source_id = outer_pc.target_id AND
top_ids.target_id = outer_pc.source_id)
ORDER BY outer_pc.p_value;
'''
# Run the path-count query against the read-only hetmech database.
connection = psycopg2.connect(
    host='hetmech-db-dev.cobepk65dd7j.us-east-1.rds.amazonaws.com',
    database='dj_hetmech',
    user='read_only_user',
    password=db_password,
)
try:
    top_metapaths = pandas.read_sql(query, connection)
finally:
    # The connection is only needed for this one read; close it explicitly so it
    # is not left open for the rest of the session (psycopg2's `with connection:`
    # scopes a transaction and would not close it).
    connection.close()

# Ensure that you only have one copy of each (source_name, metapath_id) pair
top_metapaths = top_metapaths.sort_values(by=['source_name', 'metapath_id'])
top_metapaths = top_metapaths.drop_duplicates(subset=['source_name', 'metapath_id'])
top_metapaths = top_metapaths.sort_values(by='p_value')

# Remove any rows with NaN values
top_metapaths = top_metapaths.dropna()

# A p-value of exactly 0 has no log10, so replace zeros with the smallest
# nonzero p-value observed before taking the negative log.
min_p_value = top_metapaths[top_metapaths.p_value != 0].p_value.min()
top_metapaths.loc[top_metapaths.p_value == 0, 'p_value'] = min_p_value
print(top_metapaths.p_value.min())

# Larger neg_log_p_value means a more significant metapath
top_metapaths['neg_log_p_value'] = top_metapaths.p_value.apply(lambda x: -math.log10(x))
top_metapaths.head()
3.13181113155575e-17
 | dwpc | p_value | metapath_id | source_name | target_name | neg_log_p_value |
---|---|---|---|---|---|---|
0 | 3.509434 | 3.131811e-17 | CcSEcCtD | Nitrazepam | epilepsy syndrome | 16.504204 |
9 | 3.296422 | 5.733828e-17 | CcSEcCtD | Bromazepam | epilepsy syndrome | 16.241555 |
16 | 3.579689 | 7.032840e-17 | CcSEcCtD | Lorazepam | epilepsy syndrome | 16.152869 |
28 | 3.369589 | 7.210640e-17 | CcSEcCtD | Phenobarbital | epilepsy syndrome | 16.142026 |
34 | 3.346266 | 2.518406e-16 | CcSEcCtD | Ezogabine | epilepsy syndrome | 15.598874 |
# Load the Hetionet v1.0 metagraph JSON from a commit-pinned GitHub URL; it is passed to
# get_paths_for_metapath, which uses it to resolve metapath abbreviations.
url = 'https://github.com/dhimmel/hetionet/raw/76550e6c93fbe92124edc71725e8c7dd4ca8b1f5/hetnet/json/hetionet-v1.0-metagraph.json'
metagraph = hetio.readwrite.read_metagraph(url)
def get_paths_for_metapath(metagraph, row):
    '''
    Return a DataFrame with one row per path found for a given source, target, and metapath

    Each path record returned by the Neo4j query is annotated with the metapath
    abbreviation, the metapath's importance (the row's negative log10 p-value), a
    path importance (metapath importance multiplied by the path's percent_of_DWPC),
    and the source node name.

    Parameters
    ----------
    metagraph : a hetio.hetnet.Metagraph instance to interpret metapath abbreviations
    row : a row from a pandas dataframe with information about the given metapath, source, and target.
        The keys 'metapath_id', 'source_name', 'target_name' and 'neg_log_p_value' are read.

    Returns
    -------
    pandas.DataFrame built from the annotated query records (empty if the query returns no paths)
    '''
    damping_exponent = .5

    metapath_data = metagraph.metapath_from_abbrev(row['metapath_id'])
    query = hetio.neo4j.construct_pdp_query(metapath_data, path_style='string', property='name')

    params = {
        'source': row['source_name'],
        'target': row['target_name'],
        'w': damping_exponent
    }

    driver = GraphDatabase.driver("bolt://neo4j.het.io")
    try:
        with driver.session() as session:
            # .data() materialises the records, so they stay usable after the session closes
            metapath_result = session.run(query, params).data()
    finally:
        # This function is called once per metapath row; without an explicit close
        # every call would leave a driver (and its connections) open.
        driver.close()

    for path in metapath_result:
        path['metapath'] = row['metapath_id']
        path['metapath_importance'] = row['neg_log_p_value']
        path['path_importance'] = path['metapath_importance'] * path['percent_of_DWPC']
        path['source'] = row['source_name']

    return pandas.DataFrame(metapath_result)
# Query the paths for every (source, metapath) row, showing a notebook progress bar,
# then stack the per-row tables into a single frame with a fresh index
row_iterator = tqdm.tqdm_notebook(top_metapaths.iterrows(), total=len(top_metapaths.index))
result_list = [get_paths_for_metapath(metagraph, row) for _, row in row_iterator]
result_df = pandas.concat(result_list, ignore_index=True)
HBox(children=(IntProgress(value=0, max=6740), HTML(value='')))
# Order by source name, then most important path first, with metapath as the tie-breaker
sort_columns = ['source', 'path_importance', 'metapath']
sort_ascending = [True, False, True]
result_df = result_df.sort_values(by=sort_columns, ascending=sort_ascending)
result_df.head()
 | PDP | metapath | metapath_importance | path | path_importance | percent_of_DWPC | source |
---|---|---|---|---|---|---|---|
1181486 | 0.011800 | CbGaD | 1.968488 | Abacavir–ADK–epilepsy syndrome | 196.848818 | 100.000000 | Abacavir |
1133753 | 0.000303 | CbGdAlD | 2.186818 | Abacavir–ADH6–telencephalon–epilepsy syndrome | 68.693397 | 31.412493 | Abacavir |
1133754 | 0.000155 | CbGdAlD | 2.186818 | Abacavir–ADH6–medulla oblongata–epilepsy syndrome | 35.187079 | 16.090540 | Abacavir |
1133755 | 0.000153 | CbGdAlD | 2.186818 | Abacavir–ADH6–cerebellum–epilepsy syndrome | 34.732125 | 15.882496 | Abacavir |
1410458 | 0.000846 | CtDdGaD | 1.106620 | Abacavir–acquired immunodeficiency syndrome–HS... | 24.962412 | 22.557351 | Abacavir |
# Write the sorted paths as a tab-separated file, without the DataFrame index column
result_df.to_csv(
    'data/epilepsy_paths.tsv.gz',
    sep='\t',
    index=False,
)