%matplotlib inline
import collections
import numpy as np
import pandas as pd
from hetmech.degree_weight import categorize, get_segments, get_all_segments, order_segments
from hetmech.hetmat import HetMat
# Open the HetMat stored on disk at this relative path (Hetionet v1.0, per the directory name).
hetmat = HetMat('../data/hetionet-v1.0.hetmat/')
# Ask the metagraph for all metapaths with max_length=5
# (presumably every metapath of length <= 5 — confirm against extract_all_metapaths).
metapaths = hetmat.metagraph.extract_all_metapaths(max_length=5)
When splitting a metapath like `CrCbGaDrDaG`, which will be split more than once, I think it is helpful to include intermediate splits. This is because, in light of our desire to cache, we would like to cache the results of larger computations if they are reused. By counting this way, we will discover if there are frequently-repeated longer segments that would otherwise be sub-split.
Basically:
`CrCbGaDrDaG` -> [`CrC`, `CbG`, `GaDrDaG`] -> [`CrC`, `CbG`, `GaDrDaG`, `GaD`, `DrD`, `DaG`]
instead of
`CrCbGaDrDaG` -> [`CrC`, `CbG`, `GaDrDaG`] -> [`CrC`, `CbG`, `GaD`, `DrD`, `DaG`]
%%time
# Tally the segments obtained from splitting every metapath.
# NOTE(review): the result is used below like a collections.Counter keyed by
# segment (most_common / values) — confirm against order_segments.
segments_to_cache = order_segments(hetmat.metagraph, metapaths)
CPU times: user 1min 1s, sys: 15.2 ms, total: 1min 1s Wall time: 1min 1s
# Show the ten segments with the highest counts, as (segment, count) pairs.
segments_to_cache.most_common(10)
[(GaD, 62653), (GdD, 62653), (GuD, 62653), (GbC, 60118), (GuC, 60118), (GdC, 60118), (GdA, 40542), (GeA, 40542), (GuA, 40542), (Gr>G, 34768)]
# Number of distinct segments that were counted.
len(segments_to_cache)
15562
# How many distinct segments occur more than 100 times.
sum(1 for count in segments_to_cache.values() if count > 100)
405
# How many distinct segments occur more than 500 times.
sum(1 for count in segments_to_cache.values() if count > 500)
126
# Histogram of the per-segment counts across all segments.
pd.Series(list(segments_to_cache.values())).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7ff8c46b2208>
# Same histogram, restricted to segments counted more than 100 times.
pd.Series([v for v in segments_to_cache.values() if v > 100]).hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7ff8c45f35f8>
# Build the cumulative coverage curve: for each k, the fraction of all segment
# uses accounted for by the k most frequently used segments.

# One row per segment, single column 0 holding its count, most frequent first.
segments = (
    pd.DataFrame.from_dict(segments_to_cache, orient='index')
    .sort_values(by=0, ascending=False)
)

# Total number of segment uses across all segments.
num_calls = segments[0].sum()

# Running totals computed in a single pass instead of re-summing a growing
# prefix on every iteration. Shifting by one (prepend 0, drop the last entry)
# makes entry k the total of the top k segments, for k = 0 .. n-1, which is
# exactly the range the original loop produced.
running_totals = np.cumsum(segments[0].values)
covered_by_top_k = np.concatenate(([0], running_totals[:-1]))
fractions = covered_by_top_k / num_calls

# Rows of [number of cached segments, fraction of calls covered].
arr = [[num_cached, cached_values] for num_cached, cached_values in enumerate(fractions)]

calls = pd.DataFrame(arr)
# x axis: number of cached segments; y axis: fraction of calls covered.
calls.plot(x=0, y=1)
<matplotlib.axes._subplots.AxesSubplot at 0x7ff8c4561400>