import json
import matplotlib.pyplot
import pandas
import numpy
import seaborn
import mpld3
%matplotlib inline
# Parse the metapaths JSON document into `metapaths`.
path = '../all-features/data/metapaths.json'
# JSON text is UTF-8; name the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open(path, encoding='utf-8') as fp:
    metapaths = json.load(fp)

# Tab-separated table; pandas infers the bz2 compression from the file extension.
dwpc_df = pandas.read_table('../all-features/data/dwpc.tsv.bz2')
dwpc_df.head(2)
hetnet | compound_id | disease_id | metapath | PC | w | DWPC | seconds | |
---|---|---|---|---|---|---|---|---|
0 | rephetio-v2.0 | DB00014 | DOID:0050741 | CpDpCpD | 0 | 0.4 | 0.0 | 0.7353 |
1 | rephetio-v2.0 | DB00014 | DOID:10283 | CpDpCpD | 0 | 0.4 | 0.0 | 0.7317 |
# Number of queries: each row of dwpc_df is counted as one query.
dwpc_df.shape[0]
27308958
# Average the `seconds` column within each metapath, then turn the grouped
# index back into an ordinary `metapath` column.
time_df = (
    dwpc_df
    .groupby('metapath')['seconds']
    .mean()
    .reset_index()
)
# Number of rows in the per-metapath table.
len(time_df)
1206
# NOTE(review): `cols` is not referenced anywhere in the visible code and its
# names do not match the columns built below -- confirm whether it is still needed.
cols = ['sequential_complexity', 'optimal_join_complexity', 'midpoint_join_complexity']

# One row per entry of `metapaths`: its abbreviation plus four values picked
# from its `join_complexities` list (at `midpoint_index`, at
# `optimal_join_index`, the last element and the first element).
complexity_columns = [
    'metapath',
    'midpoint_complexity',
    'optimal_complexity',
    'forward_complexity',
    'backward_complexity',
]
rows = [
    [
        item['abbreviation'],
        item['join_complexities'][item['midpoint_index']],
        item['join_complexities'][item['optimal_join_index']],
        item['join_complexities'][-1],
        item['join_complexities'][0],
    ]
    for item in metapaths
]
complexity_df = pandas.DataFrame(rows, columns=complexity_columns)

# Join on the metapath abbreviation explicitly rather than letting pandas infer
# the key from whichever column names the two frames happen to share.
complexity_df = time_df.merge(complexity_df, on='metapath')
complexity_df['log10_seconds_per_query'] = numpy.log10(complexity_df['seconds'])
complexity_df.head(2)
metapath | seconds | midpoint_complexity | optimal_complexity | forward_complexity | backward_complexity | log10_seconds_per_query | |
---|---|---|---|---|---|---|---|
0 | CbG<rG<rGaD | 0.035545 | 3.10150 | 2.859092 | 2.859092 | 3.913263 | -1.449222 |
1 | CbG<rG<rGdD | 0.023686 | 2.90328 | 2.640056 | 2.640056 | 3.694227 | -1.625503 |
# Scatter of log10_seconds_per_query against forward_complexity with a LOWESS
# trend line; hovering a point shows the label taken from the metapath column.
fig = matplotlib.pyplot.figure(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ci=None is the documented way to switch off the confidence band.
ax = seaborn.regplot(
    x='forward_complexity',
    y='log10_seconds_per_query',
    data=complexity_df,
    lowess=True,
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'black'},
    ci=None,
)
# First collection on the axes -- presumably the scatter points drawn by regplot.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
# Use the figure created above explicitly instead of the implicit current figure.
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
# Scatter of log10_seconds_per_query against optimal_complexity with a LOWESS
# trend line; hovering a point shows the label taken from the metapath column.
fig = matplotlib.pyplot.figure(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ci=None is the documented way to switch off the confidence band.
ax = seaborn.regplot(
    x='optimal_complexity',
    y='log10_seconds_per_query',
    data=complexity_df,
    lowess=True,
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'black'},
    ci=None,
)
# First collection on the axes -- presumably the scatter points drawn by regplot.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
# Use the figure created above explicitly instead of the implicit current figure.
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
# Scatter of log10_seconds_per_query against midpoint_complexity with a LOWESS
# trend line; hovering a point shows the label taken from the metapath column.
fig = matplotlib.pyplot.figure(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. ci=None is the documented way to switch off the confidence band.
ax = seaborn.regplot(
    x='midpoint_complexity',
    y='log10_seconds_per_query',
    data=complexity_df,
    lowess=True,
    scatter_kws={'alpha': 0.5},
    line_kws={'color': 'black'},
    ci=None,
)
# First collection on the axes -- presumably the scatter points drawn by regplot.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
# Use the figure created above explicitly instead of the implicit current figure.
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)