Proposed in https://github.com/greenelab/hetmech/pull/77.
import collections
import pandas
import hetio.readwrite
import numpy
from hetmech.degree_weight import dwpc
repo_url = 'https://github.com/dhimmel/hetionet'
commit = '6d26d15e9055b33b4fd97a180fa288e4f2060b96'
names = ['hetionet-v1.0'] + [f'hetionet-v1.0-perm-{i + 1}' for i in range(5)]
paths = ['hetnet/json/hetionet-v1.0.json.bz2'] + [
f'hetnet/permuted/json/{name}.json.bz2' for name in names[1:]
]
hetnets = collections.OrderedDict()
for name, path in zip(names, paths):
url = f'{repo_url}/raw/{commit}/{path}'
hetnets[name] = hetio.readwrite.read_graph(url)
list(hetnets)
['hetionet-v1.0', 'hetionet-v1.0-perm-1', 'hetionet-v1.0-perm-2', 'hetionet-v1.0-perm-3', 'hetionet-v1.0-perm-4', 'hetionet-v1.0-perm-5']
DWPCs = collections.OrderedDict()
for name, graph in hetnets.items():
metapath = graph.metagraph.metapath_from_abbrev('GiGpBP')
rows, cols, dwpc_matrix, seconds = dwpc(graph, metapath, damping=0.4)
DWPCs[name] = dwpc_matrix
print(f'Computing DWPC matrix for the {metapath} metapath in {name} took {seconds:.1f} seconds')
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0 took 449.6 seconds Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-1 took 180.7 seconds Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-2 took 178.7 seconds Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-3 took 178.7 seconds Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-4 took 174.0 seconds Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-5 took 176.4 seconds
Note that gneerating the DWPC matrices on the unpermuted network took longer. We may want to investigate the cause of this differential runtime, as it may provide a valuable insight.
metapath.get_unicode_str()
'Gene–interacts–Gene–participates–Biological Process'
# Differentially expressed blood pressure genes from https://doi.org/10.1371/journal.pgen.1005035
url = 'https://doi.org/10.1371/journal.pgen.1005035.s006'
bp_df = (
pandas.read_excel(url, skiprows=[0, 2])
.rename(columns={
'EntrezGeneID_FHS': 'entrez_gene_id',
})
.dropna(subset=['entrez_gene_id'])
.drop_duplicates(subset=['entrez_gene_id'])
.query("BP_sixCohort_meta_p < 0.001")
[['entrez_gene_id', 'BP_sixCohort_meta_TE', 'BP_sixCohort_meta_p']]
)
# Entrez Genes should be ints
bp_df.entrez_gene_id = bp_df.entrez_gene_id.astype(int)
# Replace p-values that are zero
bp_df.loc[bp_df.BP_sixCohort_meta_p == 0, 'BP_sixCohort_meta_p'] = 1e-15
bp_df['weight'] = bp_df.BP_sixCohort_meta_TE * -numpy.log10(bp_df.BP_sixCohort_meta_p)
bp_df['weight_down'] = numpy.maximum(-bp_df.weight, 0)
bp_df['weight_up'] = numpy.maximum(bp_df.weight, 0)
bp_df.head(2)
entrez_gene_id | BP_sixCohort_meta_TE | BP_sixCohort_meta_p | weight | weight_down | weight_up | |
---|---|---|---|---|---|---|
0 | 1318 | 0.002282 | 1.000000e-15 | 0.034224 | 0.0 | 0.034224 |
1 | 91663 | 0.002578 | 1.000000e-15 | 0.038671 | 0.0 | 0.038671 |
pandas.Series(numpy.sign(bp_df.weight)).value_counts()
1.0 68 -1.0 65 Name: weight, dtype: int64
gene_df = (
pandas.DataFrame({
'entrez_gene_id': rows,
'gene_symbol': [graph.get_node((metapath.source().identifier, x)).name for x in rows],
})
.merge(bp_df, how='left')
[['entrez_gene_id', 'gene_symbol', 'weight', 'weight_down', 'weight_up']]
.fillna(0)
)
gene_df.head(2)
entrez_gene_id | gene_symbol | weight | weight_down | weight_up | |
---|---|---|---|---|---|
0 | 1 | A1BG | 0.0 | 0.0 | 0.0 |
1 | 2 | A2M | 0.0 | 0.0 | 0.0 |
target_df = pandas.DataFrame({
'metapath': str(metapath),
'target_id': cols,
'target_name': [graph.get_node((metapath.target().identifier, x)).name for x in cols],
}).set_index(['metapath', 'target_id', 'target_name'])
for name, array in DWPCs.items():
target_df[name] = gene_df.weight_up @ array
# Scaling as per https://think-lab.github.io/d/193/#4
dwpc_scaler = target_df['hetionet-v1.0'].mean()
target_df = numpy.arcsinh(target_df / dwpc_scaler)
perm_df = target_df.iloc[:, 1:]
target_df['z-score'] = (target_df.iloc[:, 0] - perm_df.mean(axis='columns')) / perm_df.std(axis='columns')
(
target_df
# Remove targets without sufficient nonzero DWPCs
[(perm_df > 0).sum(axis='columns') >= 3]
.sort_values('z-score', ascending=False)
.head(20)
)
hetionet-v1.0 | hetionet-v1.0-perm-1 | hetionet-v1.0-perm-2 | hetionet-v1.0-perm-3 | hetionet-v1.0-perm-4 | hetionet-v1.0-perm-5 | z-score | |||
---|---|---|---|---|---|---|---|---|---|
metapath | target_id | target_name | |||||||
GiGpBP | GO:0051208 | sequestering of calcium ion | 1.293332 | 0.000000 | 0.053656 | 0.033142 | 0.000000 | 0.016187 | 55.307699 |
GO:0072236 | metanephric loop of Henle development | 1.240933 | 0.048121 | 0.076781 | 0.000000 | 0.035537 | 0.000000 | 36.759604 | |
GO:0061299 | retina vasculature morphogenesis in camera-type eye | 0.835832 | 0.075949 | 0.110748 | 0.050562 | 0.067146 | 0.051620 | 31.135702 | |
GO:0070426 | positive regulation of nucleotide-binding oligomerization domain containing signaling pathway | 1.759714 | 0.063897 | 0.000000 | 0.053867 | 0.000000 | 0.140074 | 29.612934 | |
GO:0070318 | positive regulation of G0 to G1 transition | 1.376872 | 0.102426 | 0.000000 | 0.044912 | 0.076564 | 0.000000 | 29.166387 | |
GO:0048936 | peripheral nervous system neuron axonogenesis | 1.826185 | 0.121804 | 0.154761 | 0.046665 | 0.024011 | 0.000000 | 26.615738 | |
GO:0030316 | osteoclast differentiation | 1.415508 | 0.593321 | 0.524189 | 0.560869 | 0.573532 | 0.514871 | 26.017101 | |
GO:0032464 | positive regulation of protein homooligomerization | 2.260878 | 0.000000 | 0.083897 | 0.017376 | 0.206801 | 0.000000 | 24.931638 | |
GO:0090400 | stress-induced premature senescence | 1.383377 | 0.000000 | 0.000000 | 0.113039 | 0.078198 | 0.094745 | 24.720282 | |
GO:1901099 | negative regulation of signal transduction in absence of ligand | 1.658419 | 0.286770 | 0.165110 | 0.313619 | 0.265006 | 0.273724 | 24.700407 | |
GO:0030050 | vesicle transport along actin filament | 1.457428 | 0.000000 | 0.024260 | 0.127136 | 0.117742 | 0.080365 | 24.679673 | |
GO:1903265 | positive regulation of tumor necrosis factor-mediated signaling pathway | 1.821209 | 0.102778 | 0.000000 | 0.000000 | 0.174518 | 0.081794 | 23.631223 | |
GO:0071287 | cellular response to manganese ion | 1.607277 | 0.216493 | 0.151072 | 0.276680 | 0.119977 | 0.164598 | 23.068022 | |
GO:0090435 | protein localization to nuclear envelope | 0.634820 | 0.048058 | 0.034562 | 0.068980 | 0.026258 | 0.089055 | 22.657843 | |
GO:0036015 | response to interleukin-3 | 1.332852 | 0.217064 | 0.115212 | 0.086108 | 0.165861 | 0.085956 | 21.134876 | |
GO:0033693 | neurofilament bundle assembly | 2.810967 | 0.284976 | 0.167546 | 0.000000 | 0.000000 | 0.215103 | 20.779777 | |
GO:0052040 | modulation by symbiont of host programmed cell death | 1.382360 | 0.017887 | 0.037502 | 0.000000 | 0.165182 | 0.079007 | 20.106359 | |
GO:2000644 | regulation of receptor catabolic process | 0.808170 | 0.094987 | 0.018814 | 0.034342 | 0.000000 | 0.074711 | 19.387848 | |
GO:0002758 | innate immune response-activating signal transduction | 2.778018 | 1.789522 | 1.839803 | 1.900086 | 1.857775 | 1.781762 | 19.200993 | |
GO:0097084 | vascular smooth muscle cell development | 2.300940 | 0.000000 | 0.137458 | 0.164608 | 0.000000 | 0.274812 | 18.646273 |