import numpy
import pathlib
import pandas
import regex
import plotnine as pln
/home/dhimmel/anaconda3/envs/hetmech/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
# Read the table of per-metapath hetmech DWPC runtimes.
path = pathlib.Path('data/rephetio-DWPCs-hetmech-runtime.tsv')
runtime_df = pandas.read_table(path)

# Remove, in place, every row that has no runtime value.
runtime_df.dropna(subset=['dwpc_hetmech_runtime'], inplace=True)

# Convert length to an ordered categorical (presumably so the plots below
# treat it as a discrete variable -- TODO confirm).
runtime_df['length'] = pandas.Categorical(runtime_df['length'], ordered=True)

# Preview the first two rows.
runtime_df.head(2)
abbreviation | category | length | dwpc_hetmech_runtime | |
---|---|---|---|---|
0 | CbGaD | no_repeats | 2 | 0.77151 |
1 | CbGdD | no_repeats | 2 | 0.72944 |
# Set the plotnine figure-size option used by the plots in this notebook.
pln.options.figure_size = (4, 3)

# Histogram of runtimes over every row of runtime_df. The bin edges are
# the values 0, 1, 2, ... generated up to just past the largest runtime.
(
    pln.ggplot(data=runtime_df, mapping=pln.aes(x='dwpc_hetmech_runtime'))
    + pln.geom_histogram(
        breaks=numpy.arange(runtime_df['dwpc_hetmech_runtime'].max() + 1)
    )
    + pln.xlab('Matrix DWPC Runtime (seconds)')
    + pln.ylab('Number of metapaths')
    + pln.ggtitle('All supported metapaths')
    + pln.theme_bw()
)
<ggplot: (-9223363263509039237)>
This shows that the nine longest-running metapaths were all of the G_X_G form.
# Approximate the number of Gene metanodes by counting 'G' characters in the
# abbreviation. This is "dangerous" because it is only correct while no other
# metanode abbreviation contains an uppercase 'G'.
runtime_df['n_genes'] = runtime_df['abbreviation'].apply(
    lambda abbrev: abbrev.count('G')
)

# Preview the first two rows with the new column.
runtime_df.head(2)
abbreviation | category | length | dwpc_hetmech_runtime | n_genes | |
---|---|---|---|---|---|
0 | CbGaD | no_repeats | 2 | 0.77151 | 1 |
1 | CbGdD | no_repeats | 2 | 0.72944 | 1 |
# Jittered scatter of runtime against the per-metapath 'G' count.
(
    pln.ggplot(
        data=runtime_df,
        mapping=pln.aes(x='n_genes', y='dwpc_hetmech_runtime'),
    ) +
    pln.geom_jitter(width=0.35, size=2, fill='#C44E52') +
    pln.xlab('Number of Gene metanodes') +
    pln.ylab('Metapath runtime (s)') +
    pln.theme_bw()
)
<ggplot: (-9223363263511225449)>
# Match a Gene-X-Gene segment anywhere in a metapath abbreviation:
#   'G', an edge part (1-2 characters from lowercase letters, '<', '>'),
#   a metanode part (1-2 uppercase letters), another edge part, then 'G'.
# The {1,2} quantifiers must sit OUTSIDE the brackets. Inside a character
# class they are literal characters, which restricted every part to exactly
# one character and so could not match two-character edge or metanode
# abbreviations between the two genes.
pattern = regex.compile(r'G[a-z<>]{1,2}[A-Z]{1,2}[a-z<>]{1,2}G')

# Keep an independent copy of the rows whose abbreviation contains the
# segment. search() returns a match object or None, which astype(bool)
# turns into a boolean row mask.
contains_gxg = runtime_df['abbreviation'].map(pattern.search).astype(bool)
G_G_runtime_df = runtime_df[contains_gxg].copy()

# Runtime histogram restricted to the G_X_G rows. The bin edges are still
# derived from the full runtime_df, so this plot uses the same bins as the
# all-metapath histogram.
(
    pln.ggplot(G_G_runtime_df, pln.aes(x='dwpc_hetmech_runtime'))
    + pln.geom_histogram(
        breaks=numpy.arange(1 + runtime_df['dwpc_hetmech_runtime'].max())
    )
    + pln.xlab('Matrix DWPC Runtime (seconds)')
    + pln.ylab('Number of metapaths')
    + pln.ggtitle('G_X_G metapaths')
    + pln.theme_bw()
)
<ggplot: (8773343555304)>
# Gene-Anatomy-Gene segment: 'G', an edge part (1-2 characters from lowercase
# letters, '<', '>'), the metanode 'A', another edge part, then 'G'.
# The {1,2} quantifiers belong outside the brackets; written inside a
# character class they are literal characters rather than repetition counts.
pattern = regex.compile(r'G[a-z<>]{1,2}A[a-z<>]{1,2}G')

# Label each row 'G_A_G' when its abbreviation contains such a segment,
# otherwise 'G_X_G'.
G_G_runtime_df['gene_pattern'] = G_G_runtime_df['abbreviation'].map(
    lambda abbrev: 'G_A_G' if pattern.search(abbrev) else 'G_X_G'
)

# Rows containing the specific 'GeAeG' segment get their own label,
# overriding the 'G_A_G' label assigned above.
is_geaeg = G_G_runtime_df['abbreviation'].str.contains('GeAeG')
G_G_runtime_df.loc[is_geaeg, 'gene_pattern'] = 'GeAeG'

# Preview the first two rows with the new column.
G_G_runtime_df.head(2)
abbreviation | category | length | dwpc_hetmech_runtime | n_genes | gene_pattern | |
---|---|---|---|---|---|---|
158 | CbGaDaGaD | BABA | 4 | 1.5764 | 2 | G_X_G |
159 | CbGaDaGdD | BABA | 4 | 1.5876 | 2 | G_X_G |
# Jittered scatter of runtime for each gene_pattern label within the
# G_X_G subset. The x-axis label is intentionally left blank.
(
    pln.ggplot(
        data=G_G_runtime_df,
        mapping=pln.aes(x='gene_pattern', y='dwpc_hetmech_runtime'),
    ) +
    pln.geom_jitter(width=0.35, size=2, fill='#C44E52') +
    pln.xlab('') +
    pln.ylab('Metapath runtime (s)') +
    pln.ggtitle('DWPC times within G_X_G') +
    pln.theme_bw()
)
<ggplot: (8773343514239)>
# Jittered scatter of runtime against the length column of runtime_df.
(
    pln.ggplot(
        data=runtime_df,
        mapping=pln.aes(x='length', y='dwpc_hetmech_runtime'),
    ) +
    pln.geom_jitter(width=0.35, size=2, fill='#C44E52') +
    pln.xlab('Metapath length') +
    pln.ylab('Metapath runtime (s)') +
    pln.theme_bw()
)
<ggplot: (8773343594381)>
# Put the length categories in descending order (presumably so longer
# metapaths come first in the fill legend -- TODO confirm intent).
# The result is assigned back instead of using inplace=True, which newer
# pandas releases no longer accept. The order is derived from the existing
# categories rather than hard-coded, so it yields [4, 3, 2] for lengths
# 2-4 and still works if other lengths are present.
descending_lengths = sorted(runtime_df['length'].cat.categories, reverse=True)
runtime_df['length'] = runtime_df['length'].cat.reorder_categories(
    descending_lengths
)

# Jittered scatter of runtime per metapath category, filled by length.
# The x axis shows the category column, so it is labelled as such; the
# previous 'Metapath length' label described the fill, not the axis.
(
    pln.ggplot(
        runtime_df,
        pln.aes(x='category', y='dwpc_hetmech_runtime', fill='length'),
    )
    + pln.geom_jitter(width=0.35, size=2)
    + pln.xlab('Metapath category')
    + pln.ylab('Metapath runtime (s)')
    + pln.theme_bw()
)
<ggplot: (8773343515126)>