This notebooks loads and processes the Dataset 1 and analyzes its mapping with PathMe, stratificating it by omic and by database.
Finally, it exports to the format ready to be used by DiffuPy/DiffuPath
import os
import pybel
from pybel_tools.summary import plot_summary, count_top_centrality
import matplotlib.pyplot as plt
import json
import networkx as nx
from diffupath.constants import DEFAULT_DIFFUPATH_DIR, ROOT_RESULTS_DIR
from diffupath.pathme_processing import get_labels_by_db_and_omic_from_pathme, get_labels_by_db_and_omic_from_graph
from diffupath.utils import print_dict_dimensions, get_labels_set_from_dict, reduce_dict_dimension, get_count_and_labels_from_two_dim_dict, from_pickle, to_pickle, get_three_venn_intersections, random_disjoint_intersection_three_subsets
from diffupath.views import show_heatmap
from diffupy.process_network import process_graph_from_file
from diffupy.process_input import process_input_data
from diffupy.utils import lists_combinations, parse_xls_to_df, to_json
from diffupy.constants import *
VALIDATION_PATH = os.path.join(ROOT_RESULTS_DIR, 'data', 'validation_datasets')
VALSET_1_PATH = os.path.join(VALIDATION_PATH, 'set1.xlsx')
set_entity_type = ['Genes', 'microRNA', 'Metabolite']
set_concentration = ['3µM', '20µM']
set_time_points = ['12h', '48h', '72h', '72h Pathways']
relevant_sheets = lists_combinations(set_entity_type,
lists_combinations(set_concentration,
set_time_points)
)
relevant_sheets
['Genes 3µM 12h', 'Genes 3µM 48h', 'Genes 3µM 72h', 'Genes 3µM 72h Pathways', 'Genes 20µM 12h', 'Genes 20µM 48h', 'Genes 20µM 72h', 'Genes 20µM 72h Pathways', 'microRNA 3µM 12h', 'microRNA 3µM 48h', 'microRNA 3µM 72h', 'microRNA 3µM 72h Pathways', 'microRNA 20µM 12h', 'microRNA 20µM 48h', 'microRNA 20µM 72h', 'microRNA 20µM 72h Pathways', 'Metabolite 3µM 12h', 'Metabolite 3µM 48h', 'Metabolite 3µM 72h', 'Metabolite 3µM 72h Pathways', 'Metabolite 20µM 12h', 'Metabolite 20µM 48h', 'Metabolite 20µM 72h', 'Metabolite 20µM 72h Pathways']
relevant_cols = ['hgnc symbol',
'MicroRNA',
'Metabolite',
'Mean p-value',
'Mean FC',
'Mean p-value',
'FC',
'fc',
't.test',
'Q.value',
'Pathway'
]
cols_titles_mapping = {'hgnc symbol':NODE,
'MicroRNA': NODE,
'Metabolite': NODE,
'Pathway':NODE,
'Mean p-value': P_VALUE,
'Mean FC': LOG_FC,
'FC': LOG_FC,
'fc': LOG_FC,
}
parsed_xls = parse_xls_to_df(VALSET_1_PATH,
min_row = 4,
relevant_sheets = relevant_sheets,
relevant_cols = relevant_cols
)
processed_input_data = process_input_data(parsed_xls,
cols_titles_mapping = cols_titles_mapping
)
selected_input = {'gene_nodes': processed_input_data['genes 20µm 72h'],
'mirna_nodes': processed_input_data['microrna 20µm 72h'],
'metabolite_nodes': processed_input_data['metabolite 20µm 72h'],
'bp_nodes': processed_input_data['genes 20µm 72h pathways'],
}
selected_input_size = {'gene_nodes': len(processed_input_data['genes 20µm 72h']),
'mirna_nodes': len(processed_input_data['microrna 20µm 72h']),
'metabolite_nodes': len(processed_input_data['metabolite 20µm 72h']),
'bp_nodes': len(processed_input_data['genes 20µm 72h pathways']),
}
selected_input_size
{'gene_nodes': 3080, 'mirna_nodes': 43, 'metabolite_nodes': 16, 'bp_nodes': 224}
GRAPH_PATH = os.path.join(DEFAULT_DIFFUPATH_DIR, 'pickles', 'universe', 'pathme_universe_non_flatten_collapsed_names_no_isolates_16_03_2020.pickle')
pathme_universe_graph_no_flatten = process_graph_from_file(GRAPH_PATH)
pathme_universe_graph_no_flatten.summarize()
plot_summary(pathme_universe_graph_no_flatten, plt, figsize=(10, 4))
plt.show()
PathMe Universe v1.0.0 Number of Nodes: 42272 Number of Edges: 105871 Number of Citations: 7695 Number of Authors: 0 Network Density: 5.92E-05 Number of Components: 435 Number of Warnings: 0
bg_labels_from_pathmeuniverse_by_db_and_omic, bg_labels_from_pathmeuniverse_by_omic_and_db = get_labels_by_db_and_omic_from_graph(pathme_universe_graph_no_flatten)
print_dict_dimensions(bg_labels_from_pathmeuniverse_by_db_and_omic, 'Entities in PathMeUniverse :')
Total number of wikipathways: gene_nodes(2555), mirna_nodes(91), metabolite_nodes(637), bp_nodes(138), Total number of kegg: gene_nodes(6048), mirna_nodes(122), metabolite_nodes(1770), bp_nodes(107), Total number of reactome: gene_nodes(8617), mirna_nodes(13), metabolite_nodes(2682), bp_nodes(3), Total: 18001
bg_labels_from_pathmeuniverse_by_db = reduce_dict_dimension(bg_labels_from_pathmeuniverse_by_db_and_omic)
print_dict_dimensions(bg_labels_from_pathmeuniverse_by_db, 'Entities in PathMeUniverse :')
Total number of wikipathways: 3421 Total number of kegg: 8047 Total number of reactome: 11315 Total: 18001
bg_labels_from_pathmeuniverse_by_omic = reduce_dict_dimension(bg_labels_from_pathmeuniverse_by_omic_and_db)
print_dict_dimensions(bg_labels_from_pathmeuniverse_by_omic, 'Entities in PathMeUniverse :')
Total number of gene_nodes: 13282 Total number of mirna_nodes: 171 Total number of metabolite_nodes: 4306 Total number of bp_nodes: 248 Total: 18001
from diffupy.process_input import map_labels_input
from diffupy.process_input import mapping_statistics
mapping_by_database_and_entity = map_labels_input(input_labels=selected_input,
background_labels=bg_labels_from_pathmeuniverse_by_db_and_omic,
show_descriptive_stat = True
)
Mapping descriptive statistics wikipathways: gene_nodes (474, 0.1538961038961039) mirna_nodes (2, 0.046511627906976744) metabolite_nodes (12, 0.75) bp_nodes (1, 0.004464285714285714) total (489, 0.14540588760035683) kegg: gene_nodes (1041, 0.337987012987013) mirna_nodes (3, 0.06976744186046512) metabolite_nodes (6, 0.375) bp_nodes (12, 0.05357142857142857) total (1062, 0.3157894736842105) reactome: gene_nodes (709, 0.2301948051948052) mirna_nodes (1, 0.023255813953488372) metabolite_nodes (6, 0.375) total (716, 0.22809812042051608) total: gene_nodes (1461, 0.4344335414808207) mirna_nodes (4, 0.0011894142134998512) metabolite_nodes (13, 0.003865596193874517) bp_nodes (13, 0.003865596193874517) total (1491, 0.4433541480820696)
mapping_by_database_and_entity_statistics = mapping_statistics(selected_input,
mapping_by_database_and_entity)
total1 = mapping_by_database_and_entity_statistics['wikipathways'].pop('metabolite_nodes')
total3 = mapping_by_database_and_entity_statistics['wikipathways'].pop('bp_nodes')
total2 = mapping_by_database_and_entity_statistics['wikipathways'].pop('total')
mapping_by_database_and_entity_statistics['wikipathways']['metabolite_nodes'] = total1
mapping_by_database_and_entity_statistics['wikipathways']['mirna_nodes'] = (0,0)
mapping_by_database_and_entity_statistics['wikipathways']['bp_nodes'] = total3
mapping_by_database_and_entity_statistics['wikipathways']['total'] = total2
total = mapping_by_database_and_entity_statistics['kegg'].pop('mirna_nodes')
total1 = mapping_by_database_and_entity_statistics['kegg'].pop('metabolite_nodes')
total3 = mapping_by_database_and_entity_statistics['kegg'].pop('bp_nodes')
total2 = mapping_by_database_and_entity_statistics['kegg'].pop('total')
mapping_by_database_and_entity_statistics['kegg']['metabolite_nodes'] = total1
mapping_by_database_and_entity_statistics['kegg']['mirna_nodes'] = total
mapping_by_database_and_entity_statistics['kegg']['bp_nodes'] = total3
mapping_by_database_and_entity_statistics['kegg']['total'] = total2
total1 = mapping_by_database_and_entity_statistics['reactome'].pop('metabolite_nodes')
total2 = mapping_by_database_and_entity_statistics['reactome'].pop('total')
mapping_by_database_and_entity_statistics['reactome']['metabolite_nodes'] = total1
mapping_by_database_and_entity_statistics['reactome']['mirna_nodes'] = (0,0)
mapping_by_database_and_entity_statistics['reactome']['bp_nodes'] = (0,0)
mapping_by_database_and_entity_statistics['reactome']['total'] = total2
total = mapping_by_database_and_entity_statistics['total'].pop('mirna_nodes')
total1 = mapping_by_database_and_entity_statistics['total'].pop('metabolite_nodes')
total2 = mapping_by_database_and_entity_statistics['total'].pop('total')
total3 = mapping_by_database_and_entity_statistics['total'].pop('bp_nodes')
mapping_by_database_and_entity_statistics['total']['metabolite_nodes'] = total1
mapping_by_database_and_entity_statistics['total']['mirna_nodes'] = total
mapping_by_database_and_entity_statistics['total']['bp_nodes'] = total3
mapping_by_database_and_entity_statistics['total']['total'] = total2
mapping_by_database_and_entity_statistics
{'wikipathways': {'gene_nodes': (474, 0.1538961038961039), 'mirna_nodes': (0, 0), 'metabolite_nodes': (12, 0.75), 'bp_nodes': (1, 0.004464285714285714), 'total': (489, 0.14540588760035683)}, 'kegg': {'gene_nodes': (1041, 0.337987012987013), 'metabolite_nodes': (6, 0.375), 'mirna_nodes': (3, 0.06976744186046512), 'bp_nodes': (12, 0.05357142857142857), 'total': (1062, 0.3157894736842105)}, 'reactome': {'gene_nodes': (709, 0.2301948051948052), 'mirna_nodes': (0, 0), 'metabolite_nodes': (6, 0.375), 'bp_nodes': (0, 0), 'total': (716, 0.22809812042051608)}, 'total': {'gene_nodes': (1461, 0.4344335414808207), 'metabolite_nodes': (13, 0.003865596193874517), 'mirna_nodes': (4, 0.0011894142134998512), 'bp_nodes': (13, 0.003865596193874517), 'total': (1491, 0.4433541480820696)}}
count, percentage, db_labels, entity_labels = get_count_and_labels_from_two_dim_dict(mapping_by_database_and_entity_statistics)
show_heatmap(count, percentage, db_labels, entity_labels, 'Dataset 1 DiffuPath Mapping')
from diffupath.utils import get_three_venn_intersections
from diffupath.views import show_venn_diagram
mapping_by_database = reduce_dict_dimension(mapping_by_database_and_entity)
intersecctions = get_three_venn_intersections(mapping_by_database['kegg'], mapping_by_database['reactome'], mapping_by_database['wikipathways'])
show_venn_diagram(intersecctions)
from diffupath.utils import reduce_dict_two_dimensional, subvert_twodim_dict
from diffupath.views import show_distribution
mapping_by_database = reduce_dict_dimension(mapping_by_database_and_entity)
mapping_by_entity = reduce_dict_dimension(subvert_twodim_dict(mapping_by_database_and_entity))
show_distribution(mapping_by_database, "Input measures distribution by database", "FC distribution")
mapping_by_database_absolute = process_input_data(mapping_by_database, absolute_value = True)
show_distribution(mapping_by_database_absolute)
show_distribution(mapping_by_entity, "Input measures distribution by entity type", "FC distribution")
/usr/local/lib/python3.7/site-packages/statsmodels/nonparametric/kde.py:488: RuntimeWarning: invalid value encountered in true_divide binned = fast_linbin(X, a, b, gridsize) / (delta * nobs) /usr/local/lib/python3.7/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars FAC1 = 2*(np.pi*bw/RANGE)**2
mapping_by_enitity_absolute = process_input_data(mapping_by_entity, absolute_value = True)
show_distribution(mapping_by_enitity_absolute)
mapping_by_database_binn = process_input_data(mapping_by_database, binning = True)
show_distribution(mapping_by_database_binn)
MAPPING_OUTPUT_PATH = os.path.join(ROOT_RESULTS_DIR, 'data', 'input_mappings', 'dataset_1_mapping.json')
to_json(mapping_by_database_and_entity, MAPPING_OUTPUT_PATH)
from collections import defaultdict
mapping_by_database_and_entity_abs = defaultdict(lambda: defaultdict(set))
for db_label, v1 in mapping_by_database_and_entity.items():
for omic_label, v2 in v1.items():
mapping_by_database_and_entity_abs[db_label][omic_label] = process_input_data(v2, absolute_value = True)
MAPPING_OUTPUT_PATH = os.path.join(ROOT_RESULTS_DIR, 'data', 'input_mappings', 'dataset_1_mapping_absolute_value_bp.json')
to_json(mapping_by_database_and_entity_abs, MAPPING_OUTPUT_PATH)