%matplotlib inline
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool
from tqdm import tqdm
from scipy import stats
## init
mySpecie='Homo_sapiens'
prealigned_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.prealigned.pickle'
targetted_align_dir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'
#!ls /cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle
prealigned_df=pd.read_pickle(prealigned_dir).loc['TCGA']
targetted_df=pd.read_pickle(targetted_align_dir).loc["TCGA"]
#prealigned_df.head()
Run_digits_inter=np.intersect1d(targetted_df.index.get_level_values('Run_digits').unique(),
prealigned_df.index.get_level_values('Run_digits').unique())
len(Run_digits_inter)
342
myRunDigit='2c4ed1e6-85ae-45c1-8c5b-5ef8bfc3aba0'
myDict={}
for myRunDigit in tqdm(Run_digits_inter):
tmpS1=targetted_df.loc[myRunDigit]['ReadDepth']
tmpS1=tmpS1.groupby(tmpS1.index.names).first()
tmpS2=prealigned_df.loc[myRunDigit]['ReadDepth']
tmpS2=tmpS2.groupby(tmpS2.index.names).first()
mergedDf=pd.DataFrame({'targetted':tmpS1,
'prealigned':tmpS2}).dropna()
mergedDf=np.log2(mergedDf+1)
g=stats.linregress(mergedDf['targetted'],mergedDf['prealigned'])
r,p=stats.pearsonr(mergedDf['targetted'],mergedDf['prealigned'])
fractMoreThanOneFold=((mergedDf['targetted']-mergedDf['prealigned']).abs()>1).mean()
myDict[myRunDigit]={'r':r,'n':mergedDf.shape[0],'slope':g.slope,'fold':fractMoreThanOneFold}
#print (fractMoreThanOneFold)
0%| | 0/342 [00:00<?, ?it/s] 0%| | 1/342 [00:02<16:36, 2.92s/it]/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/scipy/stats/_stats_mstats_common.py:107: RuntimeWarning: invalid value encountered in double_scalars slope = r_num / ssxm /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/scipy/stats/_stats_mstats_common.py:117: RuntimeWarning: invalid value encountered in sqrt t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY))) /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/scipy/stats/_stats_mstats_common.py:119: RuntimeWarning: invalid value encountered in double_scalars sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df) /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3010: RuntimeWarning: invalid value encountered in double_scalars r = r_num / r_den 1%| | 2/342 [00:05<14:39, 2.59s/it]/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/scipy/stats/stats.py:3020: RuntimeWarning: invalid value encountered in double_scalars prob = _betai(0.5*df, 0.5, df/(df+t_squared)) 1%| | 3/342 [00:07<14:40, 2.60s/it] 1%| | 4/342 [00:11<15:48, 2.81s/it] 1%|▏ | 5/342 [00:14<16:01, 2.85s/it] 2%|▏ | 6/342 [00:16<15:39, 2.80s/it] 2%|▏ | 7/342 [00:19<15:22, 2.75s/it] 2%|▏ | 8/342 [00:21<15:12, 2.73s/it] 3%|▎ | 9/342 [00:24<15:03, 2.71s/it] 3%|▎ | 10/342 [00:27<15:13, 2.75s/it] 3%|▎ | 11/342 [00:30<15:17, 2.77s/it] 4%|▎ | 12/342 [00:32<15:04, 2.74s/it] 4%|▍ | 13/342 [00:36<15:17, 2.79s/it] 4%|▍ | 14/342 [00:39<15:17, 2.80s/it] 4%|▍ | 15/342 [00:41<15:03, 2.76s/it] 5%|▍ | 16/342 [00:44<15:03, 2.77s/it] 5%|▍ | 17/342 [00:47<15:00, 2.77s/it] 5%|▌ | 18/342 [00:49<14:55, 2.76s/it] 6%|▌ | 19/342 [00:52<14:50, 2.76s/it] 6%|▌ | 20/342 [00:54<14:43, 2.74s/it] 6%|▌ | 21/342 [00:58<14:46, 2.76s/it] 6%|▋ | 22/342 [01:00<14:44, 2.76s/it] 7%|▋ | 23/342 [01:03<14:44, 2.77s/it] 7%|▋ | 24/342 [01:06<14:39, 2.77s/it] 7%|▋ | 25/342 [01:08<14:33, 2.75s/it] 8%|▊ | 26/342 [01:11<14:32, 2.76s/it] 8%|▊ | 27/342 [01:14<14:31, 2.77s/it] 8%|▊ | 28/342 [01:16<14:21, 2.74s/it] 8%|▊ | 29/342 [01:19<14:17, 2.74s/it] 9%|▉ | 30/342 [01:22<14:13, 2.74s/it] 9%|▉ | 31/342 [01:24<14:09, 2.73s/it] 9%|▉ | 32/342 [01:27<14:04, 2.72s/it] 10%|▉ | 33/342 [01:29<13:59, 2.72s/it] 10%|▉ | 34/342 [01:32<13:59, 2.72s/it] 10%|█ | 35/342 [01:35<13:55, 2.72s/it] 11%|█ | 36/342 [01:38<13:55, 2.73s/it] 11%|█ | 37/342 [01:40<13:51, 2.73s/it] 11%|█ | 38/342 [01:43<13:46, 2.72s/it] 11%|█▏ | 39/342 [01:45<13:43, 2.72s/it] 12%|█▏ | 40/342 [01:48<13:39, 2.71s/it] 12%|█▏ | 41/342 [01:51<13:38, 2.72s/it] 12%|█▏ | 42/342 [01:54<13:35, 2.72s/it] 13%|█▎ | 43/342 [01:56<13:32, 2.72s/it] 13%|█▎ | 44/342 [01:59<13:29, 2.71s/it] 13%|█▎ | 45/342 [02:02<13:25, 2.71s/it] 13%|█▎ | 46/342 [02:04<13:18, 2.70s/it] 14%|█▎ | 47/342 [02:06<13:14, 2.69s/it] 14%|█▍ | 48/342 [02:09<13:14, 2.70s/it] 14%|█▍ | 49/342 [02:12<13:11, 2.70s/it] 15%|█▍ | 50/342 [02:14<13:07, 2.70s/it] 15%|█▍ | 51/342 [02:17<13:02, 2.69s/it] 15%|█▌ | 52/342 [02:19<12:57, 2.68s/it] 15%|█▌ | 53/342 [02:23<13:00, 2.70s/it] 16%|█▌ | 54/342 [02:26<12:59, 2.71s/it] 16%|█▌ | 55/342 [02:28<12:56, 2.71s/it] 16%|█▋ | 56/342 [02:31<12:51, 2.70s/it] 17%|█▋ | 57/342 [02:33<12:47, 2.69s/it] 17%|█▋ | 58/342 [02:36<12:44, 2.69s/it] 17%|█▋ | 59/342 [02:38<12:39, 2.68s/it] 18%|█▊ | 60/342 [02:40<12:36, 2.68s/it] 18%|█▊ | 61/342 [02:43<12:33, 2.68s/it] 18%|█▊ | 62/342 [02:45<12:27, 2.67s/it] 18%|█▊ | 63/342 [02:48<12:28, 2.68s/it] 19%|█▊ | 64/342 [02:51<12:24, 2.68s/it] 19%|█▉ | 65/342 [02:54<12:23, 2.69s/it] 19%|█▉ | 66/342 [02:57<12:20, 2.68s/it] 20%|█▉ | 67/342 [02:59<12:17, 2.68s/it] 20%|█▉ | 68/342 [03:01<12:13, 2.68s/it] 20%|██ | 69/342 [03:03<12:07, 2.67s/it] 20%|██ | 70/342 [03:06<12:06, 2.67s/it] 21%|██ | 71/342 [03:09<12:05, 2.68s/it] 21%|██ | 72/342 [03:12<12:02, 2.68s/it] 21%|██▏ | 73/342 [03:15<12:01, 2.68s/it] 22%|██▏ | 74/342 [03:18<11:59, 2.68s/it] 22%|██▏ | 75/342 [03:21<11:57, 2.69s/it] 22%|██▏ | 76/342 [03:23<11:53, 2.68s/it] 23%|██▎ | 77/342 [03:26<11:49, 2.68s/it] 23%|██▎ | 78/342 [03:29<11:47, 2.68s/it] 23%|██▎ | 79/342 [03:32<11:46, 2.68s/it] 23%|██▎ | 80/342 [03:35<11:46, 2.70s/it] 24%|██▎ | 81/342 [03:38<11:43, 2.70s/it] 24%|██▍ | 82/342 [03:41<11:41, 2.70s/it] 24%|██▍ | 83/342 [03:43<11:37, 2.69s/it] 25%|██▍ | 84/342 [03:46<11:35, 2.69s/it] 25%|██▍ | 85/342 [03:49<11:35, 2.71s/it] 25%|██▌ | 86/342 [03:53<11:33, 2.71s/it] 25%|██▌ | 87/342 [03:55<11:30, 2.71s/it] 26%|██▌ | 88/342 [03:58<11:27, 2.71s/it] 26%|██▌ | 89/342 [04:00<11:24, 2.71s/it]
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-36-7d339300a869> in <module>() 1 myDict={} 2 for myRunDigit in tqdm(Run_digits_inter): ----> 3 tmpS1=targetted_df.loc[myRunDigit]['ReadDepth'] 4 tmpS1=tmpS1.groupby(tmpS1.index.names).first() 5 ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key) 1476 1477 maybe_callable = com._apply_if_callable(key, self.obj) -> 1478 return self._getitem_axis(maybe_callable, axis=axis) 1479 1480 def _is_scalar_access(self, key): ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1910 # fall thru to straight lookup 1911 self._validate_key(key, axis) -> 1912 return self._get_label(key, axis=axis) 1913 1914 ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _get_label(self, label, axis) 138 raise IndexingError('no slices here, handle elsewhere') 139 --> 140 return self.obj._xs(label, axis=axis) 141 142 def _get_loc(self, key, axis=None): ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level) 2980 if isinstance(index, MultiIndex): 2981 loc, new_index = self.index.get_loc_level(key, -> 2982 drop_level=drop_level) 2983 else: 2984 loc = self.index.get_loc(key) ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in get_loc_level(self, key, level, drop_level) 2441 return indexer, maybe_droplevels(indexer, ilevels, drop_level) 2442 else: -> 2443 indexer = self._get_level_indexer(key, level=level) 2444 return indexer, maybe_droplevels(indexer, [level], drop_level) 2445 ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 2525 return loc 2526 elif level > 0 or self.lexsort_depth == 0: -> 2527 return np.array(labels == loc, dtype=bool) 2528 2529 i = labels.searchsorted(loc, side='left') KeyboardInterrupt:
0.037037037037037035
#stats.lingress(mergedDf['targetted'],mergedDf['prealigned'])
statDf=pd.DataFrame(myDict).T.dropna()
statDf[statDf.n>10]['fold'].median()
0.018018018018018018
statDf[statDf.n>10]['slope'].median()
0.9483168308839256
statDf[statDf.n>10]['slope'].quantile(0.025)
0.9189229304525588
import seaborn as sns
sns.jointplot(data=statDf,x='n',y='r')
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<seaborn.axisgrid.JointGrid at 0x2b00110840f0>
#0.94, 0.92
statDf['r'].quantile(0.025)
0.9277931805387337
statDf['r'].median()
0.9847056959600939
import matplotlib.pyplot as plt
fig,ax=plt.subplots(figsize=(3,4))
statDf[statDf['n']>100].r.plot.hist(normed=True,ax=ax)
ax.set_ylabel('% of TCGA WXS BAMS')
ax.set_xlim([0.85,1.0])
xlabel='Allelic read count correlation\n'\
'between targetted alignment and full alignment\n'
ax.set_xlabel(xlabel)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
Text(0.5,0,'Allelic read count correlation\nbetween targetted alignment and full alignment\n')
statDf['r'].quantile(0.025),statDf['r'].quantile(1-0.025)
(0.9277931805387337, 0.9921622434631636)
pd.Series(myDict).quantile(0.05)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-18-51c9c6bfb9fc> in <module>() ----> 1 pd.Series(myDict).quantile(0.05) ~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in quantile(self, q, interpolation) 1874 self._check_percentile(q) 1875 -> 1876 result = self._data.quantile(qs=q, interpolation=interpolation) 1877 1878 if is_list_like(q): ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in quantile(self, **kwargs) 3688 3689 def quantile(self, **kwargs): -> 3690 return self.reduction('quantile', **kwargs) 3691 3692 def setitem(self, **kwargs): ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in reduction(self, f, axis, consolidate, transposed, **kwargs) 3617 for b in self.blocks: 3618 kwargs['mgr'] = self -> 3619 axe, block = getattr(b, f)(axis=axis, **kwargs) 3620 3621 axes.append(axe) ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in quantile(self, qs, interpolation, axis, mgr) 1681 result = np.array([self._na_value] * len(self)) 1682 else: -> 1683 result = _nanpercentile(values, qs * 100, axis=axis, **kw) 1684 1685 ndim = getattr(result, 'ndim', None) or 0 ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _nanpercentile(values, q, axis, **kw) 1637 return result 1638 else: -> 1639 return np.percentile(values, q, axis=axis, **kw) 1640 1641 from pandas import Float64Index ~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in percentile(a, q, axis, out, overwrite_input, interpolation, keepdims) 4289 r, k = _ureduce(a, func=_percentile, q=q, axis=axis, out=out, 4290 overwrite_input=overwrite_input, -> 4291 interpolation=interpolation) 4292 if keepdims: 4293 return r.reshape(q.shape + k) ~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in _ureduce(a, func, **kwargs) 4031 keepdim = (1,) * a.ndim 4032 -> 4033 r = func(a, **kwargs) 4034 return r, keepdim 4035 ~/anaconda3/lib/python3.6/site-packages/numpy/lib/function_base.py in _percentile(a, q, axis, out, overwrite_input, interpolation, keepdims) 4390 weights_above.shape = weights_shape 4391 -> 4392 ap.partition(concatenate((indices_below, indices_above)), axis=axis) 4393 4394 # ensure axis with qth is first TypeError: '<' not supported between instances of 'dict' and 'dict'