# Input pickle read into `wDupRemoved` below. The "with_pcr_rm" suffix suggests
# PCR duplicates were removed from this table -- TODO confirm with the producer.
wDupRemoveDir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.with_pcr_rm.pickle'
# Input pickle read into `woDupRemoved` below (same file name without that suffix).
woDupRemoveDir='/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
# Table loaded from `wDupRemoveDir`, restricted to the 'TCGA' entry of the outer
# index level. Its 'Run_digits' values define which runs are compared.
wDupRemoved = pd.read_pickle(wDupRemoveDir).loc['TCGA']
ProcessedRunDigits = (
    wDupRemoved.index
    .get_level_values('Run_digits')
    .unique()
)

# Table loaded from `woDupRemoveDir`: keep only the rows whose 'Run_digits' also
# occur in `wDupRemoved`, then take the same 'TCGA' slice.
woDupRemoved = pd.read_pickle(woDupRemoveDir)
runDigitsPerRow = woDupRemoved.index.get_level_values('Run_digits')
m = runDigitsPerRow.isin(ProcessedRunDigits)
woDupRemoved_inDf = woDupRemoved[m].loc['TCGA']
# For every run, compare log10(ReadDepth + 1) between the table without duplicate
# removal ('wo') and the one with duplicate removal ('w'):
#   overall_r  - Pearson correlation over all shared index entries
#   l_l_slope  - slope of a linear fit on entries with wo <= fitCutoff
#   l_h_slope  - slope of a linear fit on entries with wo >  fitCutoff
# The two fits split the data without a gap: entries sitting exactly on the
# cutoff (log10(count + 1) == 2, i.e. count == 99) belong to the low-range fit.
fitCutoff = 2
corrDict = {}
for queryUUID in tqdm(ProcessedRunDigits):
    woDupRemoved_inDf_tmp = woDupRemoved_inDf['ReadDepth'].loc[queryUUID]
    wDupRemoved_inDf_tmp = wDupRemoved['ReadDepth'].loc[queryUUID]
    # Building the frame aligns both Series on their index; dropna() then keeps
    # only entries present in both. Cast to float before adding 1 so that a
    # fixed-width unsigned integer column cannot wrap around to 0 at its maximum.
    countDf = pd.DataFrame({'wo': woDupRemoved_inDf_tmp,
                            'w': wDupRemoved_inDf_tmp}).astype(float)
    mergedDf = np.log10(countDf + 1).dropna()
    r, p = stats.pearsonr(mergedDf['wo'], mergedDf['w'])
    ### fit for high allelic read counts
    mergedDfh = mergedDf[mergedDf['wo'] > fitCutoff]
    l_h = stats.linregress(mergedDfh['wo'], mergedDfh['w'])
    ### fit for low allelic read counts
    mergedDfl = mergedDf[mergedDf['wo'] <= fitCutoff]
    l_l = stats.linregress(mergedDfl['wo'], mergedDfl['w'])
    corrDict[queryUUID] = {'overall_r': r,
                           'l_l_slope': l_l.slope,
                           'l_h_slope': l_h.slope}
0%| | 0/452 [00:00<?, ?it/s] 0%| | 1/452 [00:09<1:10:13, 9.34s/it] 0%| | 2/452 [00:19<1:12:51, 9.71s/it] 1%| | 3/452 [00:29<1:12:29, 9.69s/it] 1%| | 4/452 [00:39<1:13:45, 9.88s/it] 1%| | 5/452 [00:49<1:14:02, 9.94s/it] 1%|▏ | 6/452 [00:59<1:14:02, 9.96s/it] 2%|▏ | 7/452 [01:08<1:12:52, 9.83s/it] 2%|▏ | 8/452 [01:16<1:10:45, 9.56s/it] 2%|▏ | 9/452 [01:24<1:09:18, 9.39s/it] 2%|▏ | 10/452 [01:32<1:08:04, 9.24s/it] 2%|▏ | 11/452 [01:37<1:05:26, 8.90s/it] 3%|▎ | 12/452 [01:46<1:04:54, 8.85s/it] 3%|▎ | 13/452 [01:53<1:03:41, 8.70s/it] 3%|▎ | 14/452 [02:00<1:03:03, 8.64s/it] 3%|▎ | 15/452 [02:08<1:02:31, 8.58s/it] 4%|▎ | 16/452 [02:16<1:02:01, 8.53s/it] 4%|▍ | 17/452 [02:25<1:01:56, 8.54s/it] 4%|▍ | 18/452 [02:32<1:01:23, 8.49s/it] 4%|▍ | 19/452 [02:39<1:00:40, 8.41s/it] 4%|▍ | 20/452 [02:47<1:00:18, 8.38s/it] 5%|▍ | 21/452 [02:54<59:31, 8.29s/it] 5%|▍ | 22/452 [03:01<59:12, 8.26s/it] 5%|▌ | 23/452 [03:09<58:47, 8.22s/it] 5%|▌ | 24/452 [03:17<58:38, 8.22s/it] 6%|▌ | 25/452 [03:24<58:08, 8.17s/it] 6%|▌ | 26/452 [03:31<57:52, 8.15s/it] 6%|▌ | 27/452 [03:40<57:57, 8.18s/it] 6%|▌ | 28/452 [03:48<57:43, 8.17s/it] 6%|▋ | 29/452 [03:55<57:18, 8.13s/it] 7%|▋ | 30/452 [04:03<57:07, 8.12s/it] 7%|▋ | 31/452 [04:11<56:51, 8.10s/it] 7%|▋ | 32/452 [04:17<56:17, 8.04s/it] 7%|▋ | 33/452 [04:25<56:06, 8.03s/it] 8%|▊ | 34/452 [04:32<55:51, 8.02s/it] 8%|▊ | 35/452 [04:38<55:21, 7.97s/it] 8%|▊ | 36/452 [04:45<55:02, 7.94s/it] 8%|▊ | 37/452 [04:52<54:41, 7.91s/it] 8%|▊ | 38/452 [05:00<54:31, 7.90s/it] 9%|▊ | 39/452 [05:07<54:20, 7.90s/it] 9%|▉ | 40/452 [05:15<54:10, 7.89s/it] 9%|▉ | 41/452 [05:23<54:05, 7.90s/it] 9%|▉ | 42/452 [05:30<53:47, 7.87s/it] 10%|▉ | 43/452 [05:38<53:36, 7.87s/it] 10%|▉ | 44/452 [05:45<53:26, 7.86s/it] 10%|▉ | 45/452 [05:52<53:08, 7.83s/it] 10%|█ | 46/452 [06:00<52:58, 7.83s/it] 10%|█ | 47/452 [06:08<52:52, 7.83s/it] 11%|█ | 48/452 [06:15<52:43, 7.83s/it] 11%|█ | 49/452 [06:21<52:21, 7.79s/it] 11%|█ | 50/452 [06:29<52:11, 7.79s/it] 11%|█▏ | 51/452 [06:37<52:02, 
7.79s/it] 12%|█▏ | 52/452 [06:45<51:56, 7.79s/it] 12%|█▏ | 53/452 [06:53<51:51, 7.80s/it] 12%|█▏ | 54/452 [07:00<51:38, 7.79s/it] 12%|█▏ | 55/452 [07:06<51:21, 7.76s/it] 12%|█▏ | 56/452 [07:14<51:09, 7.75s/it] 13%|█▎ | 57/452 [07:21<51:01, 7.75s/it] 13%|█▎ | 58/452 [07:30<50:58, 7.76s/it] 13%|█▎ | 59/452 [07:37<50:49, 7.76s/it] 13%|█▎ | 60/452 [07:46<50:47, 7.77s/it] 13%|█▎ | 61/452 [07:54<50:38, 7.77s/it] 14%|█▎ | 62/452 [08:01<50:30, 7.77s/it] 14%|█▍ | 63/452 [08:09<50:21, 7.77s/it] 14%|█▍ | 64/452 [08:16<50:10, 7.76s/it] 14%|█▍ | 65/452 [08:24<50:03, 7.76s/it] 15%|█▍ | 66/452 [08:32<49:54, 7.76s/it] 15%|█▍ | 67/452 [08:39<49:45, 7.76s/it] 15%|█▌ | 68/452 [08:46<49:35, 7.75s/it] 15%|█▌ | 69/452 [08:54<49:27, 7.75s/it] 15%|█▌ | 70/452 [09:02<49:19, 7.75s/it] 16%|█▌ | 71/452 [09:09<49:11, 7.75s/it] 16%|█▌ | 72/452 [09:16<48:58, 7.73s/it] 16%|█▌ | 73/452 [09:24<48:50, 7.73s/it] 16%|█▋ | 74/452 [09:31<48:36, 7.72s/it] 17%|█▋ | 75/452 [09:38<48:27, 7.71s/it] 17%|█▋ | 76/452 [09:46<48:20, 7.71s/it] 17%|█▋ | 77/452 [09:54<48:14, 7.72s/it] 17%|█▋ | 78/452 [10:02<48:07, 7.72s/it] 17%|█▋ | 79/452 [10:09<47:59, 7.72s/it] 18%|█▊ | 80/452 [10:17<47:52, 7.72s/it] 18%|█▊ | 81/452 [10:26<47:49, 7.73s/it] 18%|█▊ | 82/452 [10:33<47:39, 7.73s/it] 18%|█▊ | 83/452 [10:41<47:32, 7.73s/it] 19%|█▊ | 84/452 [10:49<47:25, 7.73s/it] 19%|█▉ | 85/452 [10:57<47:18, 7.74s/it] 19%|█▉ | 86/452 [11:03<47:05, 7.72s/it] 19%|█▉ | 87/452 [11:09<46:48, 7.70s/it] 19%|█▉ | 88/452 [11:14<46:32, 7.67s/it] 20%|█▉ | 89/452 [11:19<46:11, 7.63s/it] 20%|█▉ | 90/452 [11:27<46:04, 7.64s/it] 20%|██ | 91/452 [11:35<45:58, 7.64s/it] 20%|██ | 92/452 [11:41<45:45, 7.63s/it] 21%|██ | 93/452 [11:49<45:38, 7.63s/it] 21%|██ | 94/452 [11:56<45:30, 7.63s/it] 21%|██ | 95/452 [12:05<45:24, 7.63s/it] 21%|██ | 96/452 [12:12<45:16, 7.63s/it] 21%|██▏ | 97/452 [12:20<45:09, 7.63s/it] 22%|██▏ | 98/452 [12:28<45:04, 7.64s/it] 22%|██▏ | 99/452 [12:34<44:48, 7.62s/it] 22%|██▏ | 100/452 [12:42<44:44, 7.63s/it] 22%|██▏ | 101/452 
[12:50<44:37, 7.63s/it] 23%|██▎ | 102/452 [12:56<44:25, 7.62s/it] 23%|██▎ | 103/452 [13:07<44:26, 7.64s/it] 23%|██▎ | 104/452 [13:16<44:26, 7.66s/it] 23%|██▎ | 105/452 [13:25<44:20, 7.67s/it] 23%|██▎ | 106/452 [13:32<44:13, 7.67s/it] 24%|██▎ | 107/452 [13:40<44:06, 7.67s/it] 24%|██▍ | 108/452 [13:49<44:02, 7.68s/it] 24%|██▍ | 109/452 [13:57<43:55, 7.68s/it] 24%|██▍ | 110/452 [14:05<43:48, 7.69s/it] 25%|██▍ | 111/452 [14:13<43:41, 7.69s/it] 25%|██▍ | 112/452 [14:21<43:35, 7.69s/it] 25%|██▌ | 113/452 [14:29<43:29, 7.70s/it] 25%|██▌ | 114/452 [14:38<43:24, 7.71s/it] 25%|██▌ | 115/452 [14:46<43:18, 7.71s/it] 26%|██▌ | 116/452 [14:53<43:06, 7.70s/it] 26%|██▌ | 117/452 [15:01<42:59, 7.70s/it] 26%|██▌ | 118/452 [15:08<42:50, 7.70s/it] 26%|██▋ | 119/452 [15:16<42:43, 7.70s/it] 27%|██▋ | 120/452 [15:23<42:36, 7.70s/it] 27%|██▋ | 121/452 [15:32<42:30, 7.71s/it] 27%|██▋ | 122/452 [15:40<42:24, 7.71s/it] 27%|██▋ | 123/452 [15:48<42:16, 7.71s/it] 27%|██▋ | 124/452 [15:54<42:03, 7.70s/it] 28%|██▊ | 125/452 [15:59<41:50, 7.68s/it] 28%|██▊ | 126/452 [16:04<41:35, 7.66s/it] 28%|██▊ | 127/452 [16:08<41:19, 7.63s/it] 28%|██▊ | 128/452 [16:14<41:05, 7.61s/it] 29%|██▊ | 129/452 [16:20<40:53, 7.60s/it] 29%|██▉ | 130/452 [16:25<40:41, 7.58s/it] 29%|██▉ | 131/452 [16:31<40:28, 7.57s/it] 29%|██▉ | 132/452 [16:37<40:18, 7.56s/it] 29%|██▉ | 133/452 [16:43<40:08, 7.55s/it] 30%|██▉ | 134/452 [16:48<39:52, 7.53s/it] 30%|██▉ | 135/452 [16:52<39:37, 7.50s/it] 30%|███ | 136/452 [16:56<39:22, 7.48s/it] 30%|███ | 137/452 [17:02<39:10, 7.46s/it] 31%|███ | 138/452 [17:06<38:55, 7.44s/it] 31%|███ | 139/452 [17:10<38:40, 7.41s/it] 31%|███ | 140/452 [17:15<38:27, 7.39s/it] 31%|███ | 141/452 [17:20<38:14, 7.38s/it] 31%|███▏ | 142/452 [17:24<38:01, 7.36s/it] 32%|███▏ | 143/452 [17:30<37:50, 7.35s/it] 32%|███▏ | 144/452 [17:35<37:37, 7.33s/it] 32%|███▏ | 145/452 [17:39<37:24, 7.31s/it] 32%|███▏ | 146/452 [17:46<37:15, 7.31s/it] 33%|███▎ | 147/452 [17:52<37:05, 7.30s/it] 33%|███▎ | 148/452 [17:58<36:55, 
7.29s/it] 33%|███▎ | 149/452 [18:04<36:45, 7.28s/it] 33%|███▎ | 150/452 [18:10<36:35, 7.27s/it] 33%|███▎ | 151/452 [18:15<36:24, 7.26s/it] 34%|███▎ | 152/452 [18:21<36:13, 7.25s/it] 34%|███▍ | 153/452 [18:27<36:04, 7.24s/it] 34%|███▍ | 154/452 [18:32<35:52, 7.22s/it] 34%|███▍ | 155/452 [18:38<35:42, 7.21s/it] 35%|███▍ | 156/452 [18:43<35:32, 7.20s/it] 35%|███▍ | 157/452 [18:49<35:22, 7.20s/it] 35%|███▍ | 158/452 [18:55<35:12, 7.19s/it] 35%|███▌ | 159/452 [19:01<35:03, 7.18s/it] 35%|███▌ | 160/452 [19:07<34:54, 7.17s/it] 36%|███▌ | 161/452 [19:12<34:43, 7.16s/it] 36%|███▌ | 162/452 [19:18<34:33, 7.15s/it] 36%|███▌ | 163/452 [19:23<34:23, 7.14s/it] 36%|███▋ | 164/452 [19:29<34:12, 7.13s/it] 37%|███▋ | 165/452 [19:34<34:02, 7.12s/it] 37%|███▋ | 166/452 [19:39<33:52, 7.11s/it] 37%|███▋ | 167/452 [19:45<33:43, 7.10s/it] 37%|███▋ | 168/452 [19:51<33:34, 7.09s/it] 37%|███▋ | 169/452 [19:56<33:24, 7.08s/it] 38%|███▊ | 170/452 [20:02<33:14, 7.07s/it] 38%|███▊ | 171/452 [20:08<33:05, 7.07s/it] 38%|███▊ | 172/452 [20:13<32:55, 7.06s/it] 38%|███▊ | 173/452 [20:19<32:46, 7.05s/it] 38%|███▊ | 174/452 [20:24<32:36, 7.04s/it] 39%|███▊ | 175/452 [20:29<32:26, 7.03s/it] 39%|███▉ | 176/452 [20:35<32:17, 7.02s/it] 39%|███▉ | 177/452 [20:41<32:08, 7.01s/it] 39%|███▉ | 178/452 [20:47<32:01, 7.01s/it] 40%|███▉ | 179/452 [20:53<31:52, 7.00s/it] 40%|███▉ | 180/452 [20:59<31:43, 7.00s/it] 40%|████ | 181/452 [21:05<31:34, 6.99s/it] 40%|████ | 182/452 [21:10<31:25, 6.98s/it] 40%|████ | 183/452 [21:16<31:16, 6.98s/it] 41%|████ | 184/452 [21:21<31:07, 6.97s/it] 41%|████ | 185/452 [21:27<30:57, 6.96s/it] 41%|████ | 186/452 [21:32<30:47, 6.95s/it] 41%|████▏ | 187/452 [21:38<30:39, 6.94s/it] 42%|████▏ | 188/452 [21:44<30:31, 6.94s/it] 42%|████▏ | 189/452 [21:48<30:20, 6.92s/it] 42%|████▏ | 190/452 [21:52<30:10, 6.91s/it] 42%|████▏ | 191/452 [21:57<30:00, 6.90s/it] 42%|████▏ | 192/452 [22:01<29:49, 6.88s/it] 43%|████▎ | 193/452 [22:05<29:39, 6.87s/it] 43%|████▎ | 194/452 [22:10<29:28, 6.86s/it] 
43%|████▎ | 195/452 [22:14<29:18, 6.84s/it] 43%|████▎ | 196/452 [22:19<29:09, 6.83s/it] 44%|████▎ | 197/452 [22:23<28:59, 6.82s/it] 44%|████▍ | 198/452 [22:28<28:49, 6.81s/it] 44%|████▍ | 199/452 [22:32<28:39, 6.80s/it] 44%|████▍ | 200/452 [22:36<28:28, 6.78s/it] 44%|████▍ | 201/452 [22:40<28:18, 6.77s/it] 45%|████▍ | 202/452 [22:44<28:09, 6.76s/it] 45%|████▍ | 203/452 [22:48<27:58, 6.74s/it] 45%|████▌ | 204/452 [22:53<27:49, 6.73s/it] 45%|████▌ | 205/452 [22:58<27:41, 6.73s/it] 46%|████▌ | 206/452 [23:03<27:32, 6.72s/it] 46%|████▌ | 207/452 [23:08<27:22, 6.71s/it] 46%|████▌ | 208/452 [23:12<27:13, 6.70s/it] 46%|████▌ | 209/452 [23:17<27:04, 6.69s/it] 46%|████▋ | 210/452 [23:22<26:55, 6.68s/it] 47%|████▋ | 211/452 [23:27<26:47, 6.67s/it] 47%|████▋ | 212/452 [23:31<26:37, 6.66s/it] 47%|████▋ | 213/452 [23:35<26:28, 6.65s/it] 47%|████▋ | 214/452 [23:40<26:19, 6.64s/it] 48%|████▊ | 215/452 [23:45<26:11, 6.63s/it] 48%|████▊ | 216/452 [23:50<26:02, 6.62s/it] 48%|████▊ | 217/452 [23:55<25:54, 6.61s/it] 48%|████▊ | 218/452 [24:01<25:46, 6.61s/it] 48%|████▊ | 219/452 [24:06<25:39, 6.61s/it] 49%|████▊ | 220/452 [24:13<25:32, 6.61s/it] 49%|████▉ | 221/452 [24:18<25:24, 6.60s/it] 49%|████▉ | 222/452 [24:23<25:16, 6.59s/it] 49%|████▉ | 223/452 [24:28<25:07, 6.58s/it] 50%|████▉ | 224/452 [24:33<24:59, 6.58s/it] 50%|████▉ | 225/452 [24:39<24:52, 6.58s/it] 50%|█████ | 226/452 [24:44<24:44, 6.57s/it] 50%|█████ | 227/452 [24:51<24:37, 6.57s/it] 50%|█████ | 228/452 [24:55<24:29, 6.56s/it] 51%|█████ | 229/452 [25:00<24:21, 6.55s/it] 51%|█████ | 230/452 [25:05<24:13, 6.55s/it] 51%|█████ | 231/452 [25:11<24:05, 6.54s/it] 51%|█████▏ | 232/452 [25:16<23:58, 6.54s/it] 52%|█████▏ | 233/452 [25:22<23:51, 6.53s/it] 52%|█████▏ | 234/452 [25:27<23:43, 6.53s/it] 52%|█████▏ | 235/452 [25:33<23:35, 6.52s/it] 52%|█████▏ | 236/452 [25:39<23:29, 6.52s/it] 52%|█████▏ | 237/452 [25:45<23:21, 6.52s/it] 53%|█████▎ | 238/452 [25:50<23:14, 6.51s/it] 53%|█████▎ | 239/452 [25:56<23:06, 6.51s/it] 53%|█████▎ 
| 240/452 [26:02<23:00, 6.51s/it] 53%|█████▎ | 241/452 [26:08<22:53, 6.51s/it] 54%|█████▎ | 242/452 [26:14<22:45, 6.50s/it] 54%|█████▍ | 243/452 [26:20<22:39, 6.50s/it] 54%|█████▍ | 244/452 [26:25<22:31, 6.50s/it] 54%|█████▍ | 245/452 [26:31<22:24, 6.50s/it] 54%|█████▍ | 246/452 [26:37<22:17, 6.49s/it] 55%|█████▍ | 247/452 [26:44<22:11, 6.50s/it] 55%|█████▍ | 248/452 [26:51<22:05, 6.50s/it] 55%|█████▌ | 249/452 [26:57<21:58, 6.50s/it] 55%|█████▌ | 250/452 [27:02<21:51, 6.49s/it] 56%|█████▌ | 251/452 [27:06<21:42, 6.48s/it] 56%|█████▌ | 252/452 [27:11<21:34, 6.47s/it] 56%|█████▌ | 253/452 [27:17<21:27, 6.47s/it] 56%|█████▌ | 254/452 [27:23<21:21, 6.47s/it] 56%|█████▋ | 255/452 [27:29<21:14, 6.47s/it] 57%|█████▋ | 256/452 [27:35<21:07, 6.47s/it] 57%|█████▋ | 257/452 [27:40<21:00, 6.46s/it] 57%|█████▋ | 258/452 [27:47<20:53, 6.46s/it] 57%|█████▋ | 259/452 [27:54<20:47, 6.46s/it] 58%|█████▊ | 260/452 [28:00<20:40, 6.46s/it] 58%|█████▊ | 261/452 [28:06<20:34, 6.46s/it] 58%|█████▊ | 262/452 [28:11<20:26, 6.46s/it] 58%|█████▊ | 263/452 [28:17<20:20, 6.46s/it] 58%|█████▊ | 264/452 [28:23<20:13, 6.45s/it] 59%|█████▊ | 265/452 [28:30<20:07, 6.46s/it] 59%|█████▉ | 266/452 [28:36<20:00, 6.45s/it] 59%|█████▉ | 267/452 [28:45<19:55, 6.46s/it] 59%|█████▉ | 268/452 [28:51<19:48, 6.46s/it] 60%|█████▉ | 269/452 [28:58<19:42, 6.46s/it] 60%|█████▉ | 270/452 [29:06<19:37, 6.47s/it] 60%|█████▉ | 271/452 [29:13<19:31, 6.47s/it] 60%|██████ | 272/452 [29:19<19:24, 6.47s/it] 60%|██████ | 273/452 [29:25<19:17, 6.47s/it] 61%|██████ | 274/452 [29:30<19:10, 6.46s/it] 61%|██████ | 275/452 [29:35<19:02, 6.45s/it] 61%|██████ | 276/452 [29:40<18:55, 6.45s/it] 61%|██████▏ | 277/452 [29:47<18:49, 6.45s/it] 62%|██████▏ | 278/452 [29:55<18:44, 6.46s/it] 62%|██████▏ | 279/452 [30:03<18:38, 6.47s/it] 62%|██████▏ | 280/452 [30:11<18:32, 6.47s/it] 62%|██████▏ | 281/452 [30:19<18:27, 6.47s/it] 62%|██████▏ | 282/452 [30:26<18:21, 6.48s/it] 63%|██████▎ | 283/452 [30:34<18:15, 6.48s/it] 63%|██████▎ | 284/452 
[30:41<18:09, 6.49s/it] 63%|██████▎ | 285/452 [30:50<18:04, 6.49s/it] 63%|██████▎ | 286/452 [30:57<17:58, 6.50s/it] 63%|██████▎ | 287/452 [31:03<17:51, 6.49s/it] 64%|██████▎ | 288/452 [31:08<17:44, 6.49s/it] 64%|██████▍ | 289/452 [31:13<17:36, 6.48s/it] 64%|██████▍ | 290/452 [31:18<17:29, 6.48s/it] 64%|██████▍ | 291/452 [31:22<17:21, 6.47s/it] 65%|██████▍ | 292/452 [31:29<17:15, 6.47s/it] 65%|██████▍ | 293/452 [31:36<17:09, 6.47s/it] 65%|██████▌ | 294/452 [31:44<17:03, 6.48s/it] 65%|██████▌ | 295/452 [31:50<16:56, 6.48s/it] 65%|██████▌ | 296/452 [31:56<16:50, 6.48s/it] 66%|██████▌ | 297/452 [32:02<16:43, 6.47s/it] 66%|██████▌ | 298/452 [32:09<16:37, 6.47s/it] 66%|██████▌ | 299/452 [32:14<16:30, 6.47s/it] 66%|██████▋ | 300/452 [32:21<16:23, 6.47s/it] 67%|██████▋ | 301/452 [32:28<16:17, 6.47s/it] 67%|██████▋ | 302/452 [32:34<16:10, 6.47s/it] 67%|██████▋ | 303/452 [32:40<16:03, 6.47s/it] 67%|██████▋ | 304/452 [32:47<15:57, 6.47s/it] 67%|██████▋ | 305/452 [32:54<15:51, 6.47s/it] 68%|██████▊ | 306/452 [33:01<15:45, 6.48s/it] 68%|██████▊ | 307/452 [33:07<15:38, 6.47s/it] 68%|██████▊ | 308/452 [33:12<15:31, 6.47s/it] 68%|██████▊ | 309/452 [33:21<15:26, 6.48s/it] 69%|██████▊ | 310/452 [33:27<15:19, 6.48s/it] 69%|██████▉ | 311/452 [33:35<15:13, 6.48s/it] 69%|██████▉ | 312/452 [33:40<15:06, 6.48s/it] 69%|██████▉ | 313/452 [33:48<15:01, 6.48s/it] 69%|██████▉ | 314/452 [33:57<14:55, 6.49s/it] 70%|██████▉ | 315/452 [34:02<14:48, 6.48s/it] 70%|██████▉ | 316/452 [34:08<14:41, 6.48s/it] 70%|███████ | 317/452 [34:16<14:35, 6.49s/it] 70%|███████ | 318/452 [34:24<14:29, 6.49s/it] 71%|███████ | 319/452 [34:31<14:23, 6.49s/it] 71%|███████ | 320/452 [34:39<14:17, 6.50s/it] 71%|███████ | 321/452 [34:46<14:11, 6.50s/it] 71%|███████ | 322/452 [34:54<14:05, 6.51s/it] 71%|███████▏ | 323/452 [35:02<13:59, 6.51s/it] 72%|███████▏ | 324/452 [35:09<13:53, 6.51s/it] 72%|███████▏ | 325/452 [35:17<13:47, 6.52s/it] 72%|███████▏ | 326/452 [35:25<13:41, 6.52s/it] 72%|███████▏ | 327/452 [35:32<13:35, 
6.52s/it] 73%|███████▎ | 328/452 [35:39<13:29, 6.52s/it] 73%|███████▎ | 329/452 [35:48<13:23, 6.53s/it] 73%|███████▎ | 330/452 [35:56<13:17, 6.53s/it] 73%|███████▎ | 331/452 [36:04<13:11, 6.54s/it] 73%|███████▎ | 332/452 [36:11<13:05, 6.54s/it] 74%|███████▎ | 333/452 [36:20<12:59, 6.55s/it] 74%|███████▍ | 334/452 [36:29<12:53, 6.55s/it] 74%|███████▍ | 335/452 [36:34<12:46, 6.55s/it] 74%|███████▍ | 336/452 [36:42<12:40, 6.56s/it] 75%|███████▍ | 337/452 [36:49<12:34, 6.56s/it] 75%|███████▍ | 338/452 [36:57<12:27, 6.56s/it] 75%|███████▌ | 339/452 [37:04<12:21, 6.56s/it] 75%|███████▌ | 340/452 [37:11<12:15, 6.56s/it] 75%|███████▌ | 341/452 [37:19<12:08, 6.57s/it] 76%|███████▌ | 342/452 [37:27<12:03, 6.57s/it] 76%|███████▌ | 343/452 [37:35<11:56, 6.58s/it] 76%|███████▌ | 344/452 [37:40<11:49, 6.57s/it] 76%|███████▋ | 345/452 [37:48<11:43, 6.57s/it] 77%|███████▋ | 346/452 [37:55<11:37, 6.58s/it] 77%|███████▋ | 347/452 [38:05<11:31, 6.59s/it] 77%|███████▋ | 348/452 [38:11<11:24, 6.59s/it] 77%|███████▋ | 349/452 [38:19<11:18, 6.59s/it] 77%|███████▋ | 350/452 [38:27<11:12, 6.59s/it] 78%|███████▊ | 351/452 [38:35<11:06, 6.60s/it] 78%|███████▊ | 352/452 [38:44<11:00, 6.60s/it] 78%|███████▊ | 353/452 [38:52<10:54, 6.61s/it] 78%|███████▊ | 354/452 [38:59<10:47, 6.61s/it] 79%|███████▊ | 355/452 [39:06<10:41, 6.61s/it] 79%|███████▉ | 356/452 [39:13<10:34, 6.61s/it] 79%|███████▉ | 357/452 [39:20<10:28, 6.61s/it] 79%|███████▉ | 358/452 [39:27<10:21, 6.61s/it] 79%|███████▉ | 359/452 [39:34<10:15, 6.61s/it] 80%|███████▉ | 360/452 [39:43<10:08, 6.62s/it] 80%|███████▉ | 361/452 [39:48<10:02, 6.62s/it] 80%|████████ | 362/452 [39:54<09:55, 6.62s/it] 80%|████████ | 363/452 [40:00<09:48, 6.61s/it] 81%|████████ | 364/452 [40:07<09:42, 6.61s/it] 81%|████████ | 365/452 [40:13<09:35, 6.61s/it] 81%|████████ | 366/452 [40:18<09:28, 6.61s/it] 81%|████████ | 367/452 [40:23<09:21, 6.60s/it] 81%|████████▏ | 368/452 [40:28<09:14, 6.60s/it] 82%|████████▏ | 369/452 [40:34<09:07, 6.60s/it] 82%|████████▏ 
| 370/452 [40:39<09:00, 6.59s/it] 82%|████████▏ | 371/452 [40:45<08:53, 6.59s/it] 82%|████████▏ | 372/452 [40:50<08:47, 6.59s/it] 83%|████████▎ | 373/452 [40:56<08:40, 6.59s/it] 83%|████████▎ | 374/452 [41:01<08:33, 6.58s/it] 83%|████████▎ | 375/452 [41:07<08:26, 6.58s/it] 83%|████████▎ | 376/452 [41:13<08:20, 6.58s/it] 83%|████████▎ | 377/452 [41:19<08:13, 6.58s/it] 84%|████████▎ | 378/452 [41:26<08:06, 6.58s/it] 84%|████████▍ | 379/452 [41:32<08:00, 6.58s/it] 84%|████████▍ | 380/452 [41:38<07:53, 6.58s/it] 84%|████████▍ | 381/452 [41:44<07:46, 6.57s/it] 85%|████████▍ | 382/452 [41:50<07:40, 6.57s/it] 85%|████████▍ | 383/452 [41:57<07:33, 6.57s/it] 85%|████████▍ | 384/452 [42:02<07:26, 6.57s/it] 85%|████████▌ | 385/452 [42:07<07:19, 6.56s/it] 85%|████████▌ | 386/452 [42:12<07:12, 6.56s/it] 86%|████████▌ | 387/452 [42:16<07:06, 6.56s/it] 86%|████████▌ | 388/452 [42:21<06:59, 6.55s/it] 86%|████████▌ | 389/452 [42:26<06:52, 6.55s/it] 86%|████████▋ | 390/452 [42:32<06:45, 6.54s/it] 87%|████████▋ | 391/452 [42:37<06:39, 6.54s/it] 87%|████████▋ | 392/452 [42:42<06:32, 6.54s/it] 87%|████████▋ | 393/452 [42:48<06:25, 6.53s/it] 87%|████████▋ | 394/452 [42:52<06:18, 6.53s/it] 87%|████████▋ | 395/452 [42:58<06:12, 6.53s/it] 88%|████████▊ | 396/452 [43:03<06:05, 6.52s/it] 88%|████████▊ | 397/452 [43:07<05:58, 6.52s/it] 88%|████████▊ | 398/452 [43:12<05:51, 6.51s/it] 88%|████████▊ | 399/452 [43:17<05:45, 6.51s/it] 88%|████████▊ | 400/452 [43:22<05:38, 6.51s/it] 89%|████████▊ | 401/452 [43:27<05:31, 6.50s/it] 89%|████████▉ | 402/452 [43:31<05:24, 6.50s/it] 89%|████████▉ | 403/452 [43:37<05:18, 6.50s/it] 89%|████████▉ | 404/452 [43:43<05:11, 6.49s/it] 90%|████████▉ | 405/452 [43:49<05:05, 6.49s/it] 90%|████████▉ | 406/452 [43:54<04:58, 6.49s/it] 90%|█████████ | 407/452 [43:59<04:51, 6.49s/it] 90%|█████████ | 408/452 [44:04<04:45, 6.48s/it] 90%|█████████ | 409/452 [44:08<04:38, 6.48s/it] 91%|█████████ | 410/452 [44:14<04:31, 6.47s/it] 91%|█████████ | 411/452 [44:18<04:25, 
6.47s/it] 91%|█████████ | 412/452 [44:24<04:18, 6.47s/it] 91%|█████████▏| 413/452 [44:30<04:12, 6.47s/it] 92%|█████████▏| 414/452 [44:35<04:05, 6.46s/it] 92%|█████████▏| 415/452 [44:41<03:59, 6.46s/it] 92%|█████████▏| 416/452 [44:46<03:52, 6.46s/it] 92%|█████████▏| 417/452 [44:53<03:46, 6.46s/it] 92%|█████████▏| 418/452 [44:59<03:39, 6.46s/it] 93%|█████████▎| 419/452 [45:07<03:33, 6.46s/it] 93%|█████████▎| 420/452 [45:14<03:26, 6.46s/it] 93%|█████████▎| 421/452 [45:22<03:20, 6.47s/it] 93%|█████████▎| 422/452 [45:28<03:13, 6.46s/it] 94%|█████████▎| 423/452 [45:33<03:07, 6.46s/it] 94%|█████████▍| 424/452 [45:41<03:01, 6.47s/it] 94%|█████████▍| 425/452 [45:49<02:54, 6.47s/it] 94%|█████████▍| 426/452 [45:54<02:48, 6.47s/it] 94%|█████████▍| 427/452 [46:01<02:41, 6.47s/it] 95%|█████████▍| 428/452 [46:07<02:35, 6.47s/it] 95%|█████████▍| 429/452 [46:12<02:28, 6.46s/it] 95%|█████████▌| 430/452 [46:18<02:22, 6.46s/it] 95%|█████████▌| 431/452 [46:26<02:15, 6.46s/it] 96%|█████████▌| 432/452 [46:34<02:09, 6.47s/it] 96%|█████████▌| 433/452 [46:43<02:03, 6.47s/it] 96%|█████████▌| 434/452 [46:50<01:56, 6.48s/it] 96%|█████████▌| 435/452 [46:58<01:50, 6.48s/it] 96%|█████████▋| 436/452 [47:06<01:43, 6.48s/it] 97%|█████████▋| 437/452 [47:14<01:37, 6.49s/it] 97%|█████████▋| 438/452 [47:21<01:30, 6.49s/it] 97%|█████████▋| 439/452 [47:27<01:24, 6.49s/it] 97%|█████████▋| 440/452 [47:35<01:17, 6.49s/it] 98%|█████████▊| 441/452 [47:43<01:11, 6.49s/it] 98%|█████████▊| 442/452 [47:50<01:04, 6.49s/it] 98%|█████████▊| 443/452 [47:58<00:58, 6.50s/it] 98%|█████████▊| 444/452 [48:06<00:52, 6.50s/it] 98%|█████████▊| 445/452 [48:13<00:45, 6.50s/it] 99%|█████████▊| 446/452 [48:20<00:39, 6.50s/it] 99%|█████████▉| 447/452 [48:26<00:32, 6.50s/it] 99%|█████████▉| 448/452 [48:34<00:26, 6.51s/it] 99%|█████████▉| 449/452 [48:40<00:19, 6.50s/it] 100%|█████████▉| 450/452 [48:44<00:12, 6.50s/it] 100%|█████████▉| 451/452 [48:49<00:06, 6.50s/it] 100%|██████████| 452/452 [48:55<00:00, 6.49s/it]
#mergedStatDf.loc[ProcessedRunDigits[0]]
# corrDict is {run: {statistic: value}}; transposing puts one run per row and
# one statistic per column.
mergedStatDf=pd.DataFrame(corrDict).T
# Median of every statistic across runs.
mergedStatDf.quantile(0.5,axis=0)
l_h_slope 0.337517 l_l_slope 0.889199 overall_r 0.986250 Name: 0.5, dtype: float64
# 2.5th percentile of every statistic across runs.
mergedStatDf.quantile(0.025,axis=0)
l_h_slope 0.281840 l_l_slope 0.877407 overall_r 0.982995 Name: 0.025, dtype: float64
# `plt` and `sns` are only imported further down in this file; import them here
# so this cell also works when the file is executed top to bottom.
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the per-run high-range and low-range fit slopes.
fig, ax = plt.subplots(figsize=(3, 2))
# Draw onto the axes created above explicitly instead of relying on the
# implicit "current axes".
ax = sns.boxplot(data=mergedStatDf[['l_h_slope', 'l_l_slope']], ax=ax)
ax.set_title('slope')
<matplotlib.axes._subplots.AxesSubplot at 0x2b5171677eb8>
# One histogram per slope column, sharing the y-axis; grid lines switched off
# on every panel.
slopeDf = mergedStatDf[['l_h_slope', 'l_l_slope']]
axes = slopeDf.hist(sharey=True)
for ax in axes.ravel():
    ax.grid(False)
# NOTE(review): `r_l` is not assigned anywhere in the visible code, so evaluating
# it raises NameError. The statement is left disabled; the low-range fit result
# is stored under the name `l_l` -- TODO confirm what was meant here.
#r_l
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-255-6401eb861faf> in <module>() ----> 1 r_l NameError: name 'r_l' is not defined
# corrDict maps every run to a dict of statistics, so the Pearson r has to be
# pulled out explicitly: a Series built directly from corrDict would hold dicts,
# which quantile() cannot work with.
corrS = pd.Series({runDigits: runStats['overall_r']
                   for runDigits, runStats in corrDict.items()})
# 2.5%, 5% and 97.5% quantiles of the per-run Pearson r.
corrS.quantile(0.025), corrS.quantile(0.05), corrS.quantile(1 - 0.025)
(0.9828144840044503, 0.9831999657939651, 0.990159227830007)
import seaborn as sns
%matplotlib inline
from scipy import stats
# Hexbin joint plot of the two log10 count columns held in `mergedDf` (whatever
# run the loop above processed last), overlaid with two straight-line fits: one
# for rows whose mean of both columns is <= cutoff, one for the remaining rows.
tmpMergedDf = mergedDf
g = sns.jointplot(
    data=mergedDf, x='wo', y='w', kind='hex',
    xlim=[0, 4], ylim=[0, 4], size=5, stat_func=None,
)
g.set_axis_labels(
    xlabel='without duplicate removal\n log10(allelic read count)',
    ylabel='with duplicate removal\nlog10(allelic read count)',
)
#g.ax_joint.axhline(6)
base = 3  # not used by anything in this cell
cutoff = 2
rowMean = tmpMergedDf.mean(axis=1)

# Low range: fit, report, and draw the line at x = 0 .. cutoff.
tmpSubDf = tmpMergedDf[rowMean <= cutoff]
l_l = stats.linregress(tmpSubDf['wo'], tmpSubDf['w'])
print(l_l)
xLow = np.arange(0, cutoff + 1)
g.ax_joint.plot(xLow, xLow * l_l.slope + l_l.intercept,
                alpha=0.5, linestyle='-.', color='black')

# High range: fit, report, and draw the line at x = cutoff .. 2*cutoff.
tmpSubDf = tmpMergedDf[rowMean > cutoff]
l_h = stats.linregress(tmpSubDf['wo'], tmpSubDf['w'])
print(l_h)
xHigh = np.arange(cutoff, 2 * cutoff + 1)
g.ax_joint.plot(xHigh, xHigh * l_h.slope + l_h.intercept,
                linestyle='--', color='red')
#g.savefig('./')
#g.ax_joint.grid(True)
#g.ax_joint.axvline(6)
LinregressResult(slope=0.8857445760190211, intercept=0.035048564933380266, rvalue=0.989165368230053, pvalue=0.0, stderr=0.0003237167782106638) LinregressResult(slope=0.30473605250695773, intercept=1.2763011970689413, rvalue=0.7949899160609349, pvalue=0.0, stderr=0.000864140900160121)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
[<matplotlib.lines.Line2D at 0x2b5171cc34e0>]
# y-values of the low-range fit line evaluated at the integers 0 .. cutoff.
np.arange(0,cutoff+1)*l_l.slope+l_l.intercept
array([0.05092114, 0.86934295])
#woDupRemoved_inDf_tmp
array([0, 1])
woDupRemoved_inDf_DepthS=(woDupRemoved_inDf_tmp.groupby(['Chr','Pos']).sum())
wDupRemoved_inDf_DepthS=wDupRemoved_inDf_tmp.groupby(['Chr','Pos']).sum()
#wDupRemoved_inDf_DepthS
tmpI=woDupRemoved_inDf_tmp.index.to_frame().set_index(['Chr','Pos']).index
woDupRemoved_inDf_DepthS_algned=woDupRemoved_inDf_DepthS[tmpI]
tmpI=wDupRemoved_inDf_tmp.index.to_frame().set_index(['Chr','Pos']).index
wDupRemoved_inDf_DepthS_algned=wDupRemoved_inDf_DepthS[tmpI]
woDupRemoved_inDf_AllelicFreqS=woDupRemoved_inDf_tmp/woDupRemoved_inDf_DepthS_algned.values
wDupRemoved_inDf_AllelicFreqS=wDupRemoved_inDf_tmp/wDupRemoved_inDf_DepthS_algned.values
%time allelicDf=pd.DataFrame({'woDupRemoved':woDupRemoved_inDf_AllelicFreqS,'wDupRemoved':wDupRemoved_inDf_AllelicFreqS}).dropna()
CPU times: user 13.1 s, sys: 540 ms, total: 13.7 s Wall time: 2.27 s
# Tab-separated, header-less, gzip-compressed table that is given VCF-style
# column names below.
inVcfDir = '/data/cellardata/users/btsui/dbsnp/Homo_sapiens/All_20170710.f1_byte2_not_00.vcf.gz'
# Parse the first column as text from the start: it mixes numeric and
# non-numeric labels, which otherwise triggers pandas' mixed-type DtypeWarning.
vcfDf = pd.read_csv(inVcfDir, sep='\t', header=None, dtype={0: str})
vcfDf.columns = ['Chr', 'Pos', 'RsId', 'RefBase', 'AltBase', '', '', 'Annot']
# `np.str` was only an alias of the builtin `str` and no longer exists in
# current NumPy releases; the builtin works on every version.
vcfDf['Chr'] = vcfDf['Chr'].astype(str)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
# Turn the index levels of allelicDf into ordinary columns.
allelicDfResetDf = allelicDf.reset_index()

# Build comparable keys: (Chr, Pos, RefBase) from the VCF table and
# (Chr, Pos, base) from the observed rows.
siteCols = ['Chr', 'Pos']
refI = vcfDf.set_index(siteCols + ['RefBase']).index
ChrPosI = allelicDfResetDf.set_index(siteCols + ['base']).index

# True where the row's (Chr, Pos, base) occurs among the VCF (Chr, Pos, RefBase) keys.
ref_m = ChrPosI.isin(refI)
allelicDfResetDf['is_ref'] = ref_m
# 11 evenly spaced edges from 0 to 1 (step 0.1).
bins = np.linspace(0, 1, 11)
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
# Use dedicated bin edges rather than the global `bins`: that name is rebound
# by the `n, bins, patches = ax.hist(...)` unpacking in the cumulative-histogram
# cell, so re-running this cell afterwards would silently bin against histogram
# edges instead of 0, 0.1, ..., 1.
fracBinEdges = np.linspace(0, 1, 11)
allelicDfResetDf['wDupRemoved_bin'] = np.digitize(allelicDfResetDf['wDupRemoved'], fracBinEdges)
# Absolute difference between the two allelic fractions of each row
# (computed once; the original evaluated it twice and discarded the first result).
allelicDfResetDf['abs_diff'] = (
    allelicDfResetDf['woDupRemoved'] - allelicDfResetDf['wDupRemoved']
).abs()
# Preview the per-row summed (Chr, Pos) depth.
woDupRemoved_inDf_DepthS_algned.head()
Chr Pos 1 14727 700 14727 700 630825 7 630833 5 850609 1 Name: ReadDepth, dtype: uint16
# The per-site depth Series is indexed by (Chr, Pos) only, so look it up with a
# two-level key built from this frame; the three-level (Chr, Pos, base) index
# `ChrPosI` does not match that Series' index.
sitePerRowI = allelicDfResetDf.set_index(['Chr', 'Pos']).index
allelicDfResetDf['woDupRemoved_inDf_DepthS'] = woDupRemoved_inDf_DepthS[sitePerRowI].values
# Cast to float before adding 1 so a fixed-width unsigned integer depth cannot
# wrap around to 0 at its maximum value.
allelicDfResetDf['woDupRemoved_inDf_DepthS_log10'] = np.log10(
    allelicDfResetDf['woDupRemoved_inDf_DepthS'].astype(float) + 1
)
# Bin label = log10(depth + 1) truncated down to a multiple of `binsize`.
binsize = 1
allelicDfResetDf['woDupRemoved_inDf_DepthS_bin'] = (
    allelicDfResetDf['woDupRemoved_inDf_DepthS_log10'] / binsize
).astype(int) * binsize
# Distribution of the absolute allelic-fraction difference within each depth bin.
ax = sns.violinplot(data=allelicDfResetDf, x="woDupRemoved_inDf_DepthS_bin", y="abs_diff",
                    inner=None, color=".8")
# Preview the first rows of the per-row table.
allelicDfResetDf.head()
Chr | Pos | base | woDupRemoved | wDupRemoved | is_ref | wDupRemoved_bin | abs_diff | woDupRemoved_inDf_DepthS | woDupRemoved_inDf_DepthS_bin | woDupRemoved_inDf_DepthS_log | woDupRemoved_inDf_DepthS_log10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14727 | A | 0.062857 | 0.085714 | False | 1 | 0.022857 | 700 | 2 | 2.845718 | 2.845718 |
1 | 1 | 14727 | G | 0.937143 | 0.914286 | True | 10 | 0.022857 | 700 | 2 | 2.845718 | 2.845718 |
2 | 1 | 630825 | T | 1.000000 | 1.000000 | True | 11 | 0.000000 | 7 | 0 | 0.903090 | 0.903090 |
3 | 1 | 630833 | C | 1.000000 | 1.000000 | True | 11 | 0.000000 | 5 | 0 | 0.778151 | 0.778151 |
4 | 1 | 850609 | T | 1.000000 | 1.000000 | True | 11 | 0.000000 | 1 | 0 | 0.301030 | 0.301030 |
# Preview the first rows of the per-row table.
allelicDfResetDf.head()
Chr | Pos | base | woDupRemoved | wDupRemoved | is_ref | wDupRemoved_bin | abs_diff | woDupRemoved_inDf_DepthS | woDupRemoved_inDf_DepthS_bin | woDupRemoved_inDf_DepthS_log | woDupRemoved_inDf_DepthS_log10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14727 | A | 0.062857 | 0.085714 | False | 1 | 0.022857 | 700 | 2 | 2.845718 | 2.845718 |
1 | 1 | 14727 | G | 0.937143 | 0.914286 | True | 10 | 0.022857 | 700 | 2 | 2.845718 | 2.845718 |
2 | 1 | 630825 | T | 1.000000 | 1.000000 | True | 11 | 0.000000 | 7 | 0 | 0.903090 | 0.903090 |
3 | 1 | 630833 | C | 1.000000 | 1.000000 | True | 11 | 0.000000 | 5 | 0 | 0.778151 | 0.778151 |
4 | 1 | 850609 | T | 1.000000 | 1.000000 | True | 11 | 0.000000 | 1 | 0 | 0.301030 | 0.301030 |
import matplotlib.pyplot as plt
array([0, 1, 2, 3])
n_bins = 100
fig, ax = plt.subplots(figsize=(3, 2))
# plot the cumulative histogram
# One step curve per depth bin (0, 1, 2), restricted to rows not flagged is_ref.
nonRefM = ~allelicDfResetDf.is_ref
for myBin in np.arange(0, 3):
    inBinM = allelicDfResetDf['woDupRemoved_inDf_DepthS_bin'] == myBin
    x = allelicDfResetDf['abs_diff'][inBinM & nonRefM]  # .sample(1000)
    # `density=True` is matplotlib's replacement for the deprecated `normed=1`.
    # The return value is deliberately not unpacked, so the module-level `bins`
    # array used for np.digitize is not overwritten by histogram edges.
    ax.hist(x, n_bins, density=True, histtype='step', cumulative=True,
            label='{} to {}'.format(10**(myBin), 10**(myBin+1)), range=[0, 1.1])
ax.set_xlim([-0.02, 1.0])
ax.set_ylim([0.5, 1.02])
ax.set_ylabel('Cumulative % of variants')
ax.set_xlabel('Absoluate differenece in allelic fraction\n due to duplicates')
ax.legend(loc=(0.3, 0.2))
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<matplotlib.legend.Legend at 0x2b5172505898>
#x, 0 to 1
# Cumulative histogram + KDE of the absolute allelic-fraction difference for a
# random sample of 200 rows from depth bin 2.
ax = sns.distplot(
    allelicDfResetDf['abs_diff'][allelicDfResetDf['woDupRemoved_inDf_DepthS_bin'] == 2].sample(200),
    hist_kws=dict(cumulative=True),
    kde_kws=dict(cumulative=True),
)
ax.set_ylabel('Cumulative % of samples')
# This label describes the plotted quantity, so it belongs on the x-axis; the
# original called set_ylabel a second time and overwrote the label set above.
ax.set_xlabel('Absoluate differenece in allelic fraction due to duplicates')
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
Text(0,0.5,'Absoluate differenece in allelic fraction due to duplicates')
# Box plot of the absolute allelic-fraction difference per depth bin.
sns.boxplot(data=allelicDfResetDf,x='woDupRemoved_inDf_DepthS_bin',y='abs_diff')
<matplotlib.axes._subplots.AxesSubplot at 0x2b52769376a0>
##
# Same data as individual points per depth bin, jittered horizontally by 0.3.
sns.stripplot(data=allelicDfResetDf,x='woDupRemoved_inDf_DepthS_bin',y='abs_diff',jitter=0.3)
<matplotlib.axes._subplots.AxesSubplot at 0x2b52768af9b0>
# NOTE(review): the original statement here was `ax=.hist()`, which has nothing
# in front of `.hist()` and is therefore a SyntaxError. The intended receiver
# cannot be determined from this file, so the call is left disabled --
# TODO restore it with the correct Series/DataFrame.
#ax=.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2b52723dec50>
# Preview the first rows of the per-row table.
allelicDfResetDf.head()
Chr | Pos | base | woDupRemoved | wDupRemoved | is_ref | wDupRemoved_bin | |
---|---|---|---|---|---|---|---|
0 | 1 | 14727 | A | 0.062857 | 0.085714 | False | 1 |
1 | 1 | 14727 | G | 0.937143 | 0.914286 | True | 10 |
2 | 1 | 630825 | T | 1.000000 | 1.000000 | True | 11 |
3 | 1 | 630833 | C | 1.000000 | 1.000000 | True | 11 |
4 | 1 | 850609 | T | 1.000000 | 1.000000 | True | 11 |
# For rows not flagged is_ref: distribution of the 'woDupRemoved' fraction
# within each 'wDupRemoved_bin'.
sns.boxplot(data=allelicDfResetDf[~allelicDfResetDf.is_ref],x='wDupRemoved_bin',y='woDupRemoved')
<matplotlib.axes._subplots.AxesSubplot at 0x2b5272168668>
# np.digitize expects the array of bin edges, not a number of bins; passing the
# scalar 10 raises "object of too small depth for desired array". Use the same
# 0, 0.1, ..., 1 edges as the 'wDupRemoved_bin' column.
np.digitize(allelicDfResetDf['wDupRemoved'], bins=np.linspace(0, 1, 11))
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-47-7937bfdb0e4e> in <module>() ----> 1 np.digitize(allelicDfResetDf['wDupRemoved'],bins=10) ValueError: object of too small depth for desired array
# Hex-bin joint plot of 'woDupRemoved' against 'wDupRemoved' for rows where
# 'is_ref' is negated and 'woDupRemoved' lies strictly between 0 and 1.
sns.jointplot(
    x='woDupRemoved',
    y='wDupRemoved',
    kind='hex',
    data=allelicDfResetDf[
        (~allelicDfResetDf['is_ref'])
        &(allelicDfResetDf['woDupRemoved']>0)
        &(allelicDfResetDf['woDupRemoved']<1.0)
    ],
)
/cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " /cellar/users/btsui/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
<seaborn.axisgrid.JointGrid at 0x2b5271f9af98>
### variant allele only
# NOTE(review): allelicDf is passed unfiltered in this cell; any restriction
# to variant alleles would have to happen before this point - confirm.
sns.jointplot(
    x='woDup',
    y='wDup',
    data=allelicDf,
)
woDup | wDup | |||
---|---|---|---|---|
Chr | Pos | base | ||
1 | 14727 | A | 0.062857 | 0.085714 |
G | 0.937143 | 0.914286 | ||
630825 | T | 1.000000 | 1.000000 | |
630833 | C | 1.000000 | 1.000000 | |
850609 | T | 1.000000 | 1.000000 | |
948136 | G | 1.000000 | 1.000000 | |
955964 | G | 1.000000 | 1.000000 | |
970788 | G | 1.000000 | 1.000000 | |
1013541 | C | 0.581967 | 0.510204 | |
T | 0.409836 | 0.489796 | ||
1014143 | C | 1.000000 | 1.000000 | |
1014228 | G | 1.000000 | 1.000000 | |
1014316 | C | 0.997067 | 1.000000 | |
1014359 | G | 1.000000 | 1.000000 | |
1020217 | G | 1.000000 | 1.000000 | |
1020221 | C | 1.000000 | 1.000000 | |
1020239 | G | 1.000000 | 1.000000 | |
1022188 | A | 1.000000 | 1.000000 | |
1022225 | A | 0.007092 | 0.008929 | |
G | 0.992908 | 0.991071 | ||
1022260 | C | 0.997783 | 1.000000 | |
1022313 | A | 0.993318 | 1.000000 | |
1040679 | C | 1.000000 | 1.000000 | |
1041174 | C | 1.000000 | 1.000000 | |
1041183 | C | 1.000000 | 1.000000 | |
1041218 | A | 0.066667 | 0.076923 | |
C | 0.933333 | 0.923077 | ||
1041249 | C | 1.000000 | 1.000000 | |
1041582 | C | 1.000000 | 1.000000 | |
1041583 | A | 0.987013 | 1.000000 | |
... | ... | ... | ... | ... |
Y | 26420477 | G | 0.888889 | 0.904762 |
26508094 | A | 0.750000 | 0.750000 | |
G | 0.250000 | 0.250000 | ||
26509986 | C | 1.000000 | 1.000000 | |
26510310 | G | 1.000000 | 1.000000 | |
26510337 | A | 1.000000 | 1.000000 | |
26510425 | G | 1.000000 | 1.000000 | |
26515535 | A | 0.181818 | 0.181818 | |
G | 0.818182 | 0.818182 | ||
26515550 | C | 0.833333 | 0.833333 | |
T | 0.166667 | 0.166667 | ||
26515553 | A | 0.833333 | 0.833333 | |
G | 0.166667 | 0.166667 | ||
26515558 | A | 0.866667 | 0.857143 | |
C | 0.133333 | 0.142857 | ||
26530858 | G | 1.000000 | 1.000000 | |
26530883 | A | 1.000000 | 1.000000 | |
26530902 | C | 1.000000 | 1.000000 | |
26541498 | C | 1.000000 | 1.000000 | |
26541512 | A | 1.000000 | 1.000000 | |
26543653 | C | 1.000000 | 1.000000 | |
26554088 | A | 1.000000 | 1.000000 | |
26557077 | C | 0.053763 | 0.013158 | |
T | 0.946237 | 0.986842 | ||
26557102 | C | 0.747664 | 0.769231 | |
T | 0.252336 | 0.230769 | ||
26566504 | G | 1.000000 | 1.000000 | |
26567812 | C | 1.000000 | 1.000000 | |
26567857 | C | 1.000000 | 1.000000 | |
26568527 | C | 1.000000 | 1.000000 |
237318 rows × 2 columns
# Pairwise correlation matrix of the columns of allelicDf
# (DataFrame.corr uses Pearson correlation unless told otherwise).
allelicDf.corr()
wo_allele | w | |
---|---|---|
wo_allele | 1.00000 | 0.99838 |
w | 0.99838 | 1.00000 |
# Combine the three Series into one frame, aligned on their indexes.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: operands could not be broadcast together with shapes
# (447902,2) (3,)" while pandas reindexed a MultiIndex. Presumably one input
# is indexed by fewer levels than the others - verify the index levels of all
# three Series before relying on tmpDf2.
tmpDf2=pd.DataFrame({'wo_alle':woDupRemoved_inDf_tmp,'wo_allel_site_depth':woDupRemoved_inDf_DepthS_algned,
'w':wDupRemoved_inDf_tmp})
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-118-13ce9989fae3> in <module>() 1 tmpDf2=pd.DataFrame({'wo_alle':woDupRemoved_inDf_tmp,'wo_allel_site_depth':woDupRemoved_inDf_DepthS_algned, ----> 2 'w':wDupRemoved_inDf_tmp}) ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy) 346 dtype=dtype, copy=copy) 347 elif isinstance(data, dict): --> 348 mgr = self._init_dict(data, index, columns, dtype=dtype) 349 elif isinstance(data, ma.MaskedArray): 350 import numpy.ma.mrecords as mrecords ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype) 457 arrays = [data[k] for k in keys] 458 --> 459 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) 460 461 def _init_ndarray(self, values, index, columns, dtype=None, copy=False): ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype) 7316 7317 # don't force copy because getting jammed in an ndarray anyway -> 7318 arrays = _homogenize(arrays, index, dtype) 7319 7320 # from BlockManager perspective ~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _homogenize(data, index, dtype) 7614 # Forces alignment. 
No need to copy data since we 7615 # are putting it into an ndarray later -> 7616 v = v.reindex(index, copy=False) 7617 else: 7618 if isinstance(v, dict): ~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in reindex(self, index, **kwargs) 3320 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs) 3321 def reindex(self, index=None, **kwargs): -> 3322 return super(Series, self).reindex(index=index, **kwargs) 3323 3324 def drop(self, labels=None, axis=0, index=None, columns=None, ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs) 3683 # perform the reindex on the axes 3684 return self._reindex_axes(axes, level, limit, tolerance, method, -> 3685 fill_value, copy).__finalize__(self) 3686 3687 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) 3696 ax = self._get_axis(a) 3697 new_index, indexer = ax.reindex(labels, level=level, limit=limit, -> 3698 tolerance=tolerance, method=method) 3699 3700 axis = self._get_axis_number(a) ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in reindex(self, target, method, level, limit, tolerance) 2091 indexer = self.get_indexer(target, method=method, 2092 limit=limit, -> 2093 tolerance=tolerance) 2094 else: 2095 raise Exception("cannot handle a non-unique multi-index!") ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in get_indexer(self, target, method, limit, tolerance) 2040 'for MultiIndex; see GitHub issue 9365') 2041 else: -> 2042 indexer = self._engine.get_indexer(target) 2043 2044 return _ensure_platform_int(indexer) pandas/_libs/index.pyx in pandas._libs.index.BaseMultiIndexCodesEngine.get_indexer() pandas/_libs/index.pyx in pandas._libs.index.BaseMultiIndexCodesEngine._extract_level_codes() ~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py 
in _codes_to_ints(self, codes) 72 # Shift the representation of each level by the pre-calculated number 73 # of bits: ---> 74 codes <<= self.offsets 75 76 # Now sum and OR are in fact interchangeable. This is a simple ValueError: operands could not be broadcast together with shapes (447902,2) (3,) (447902,2)
%time allelicFracDf=tmpDf2.groupby(['Chr','Pos']).sum(axis=0)
# Scratch calculation: ratio of two hand-entered numbers.
# NOTE(review): presumably two measurements being compared - their origin is
# not shown in this notebook section; confirm or remove.
0.873/0.220
3.9681818181818183
# Pull the rows of woDupRemoved_inDf stored under one label of its outermost
# index level, then display the resulting frame.
tmpDf10=woDupRemoved_inDf.loc['51da3bb2-6045-4f15-a7da-0dec84dda0ed']
tmpDf10
features | ReadDepth | AverageBaseQuality | ||
---|---|---|---|---|
Chr | Pos | base | ||
1 | 14727 | A | 9 | 27 |
G | 95 | 32 | ||
T | 1 | 2 | ||
630825 | T | 4 | 34 | |
630833 | C | 4 | 33 | |
948136 | G | 4 | 33 | |
955964 | G | 102 | 32 | |
970788 | G | 39 | 32 | |
1013541 | C | 39 | 31 | |
1014143 | C | 51 | 31 | |
1014228 | A | 28 | 33 | |
G | 41 | 33 | ||
1014316 | C | 96 | 29 | |
1014359 | G | 83 | 28 | |
1020217 | G | 1 | 26 | |
1020221 | C | 1 | 33 | |
1020239 | G | 1 | 22 | |
1022188 | A | 87 | 31 | |
1022225 | G | 121 | 31 | |
T | 1 | 19 | ||
1022260 | C | 204 | 29 | |
1022313 | A | 217 | 31 | |
G | 1 | 6 | ||
T | 1 | 2 | ||
1041218 | C | 1 | 31 | |
1041249 | C | 2 | 11 | |
1041582 | C | 48 | 30 | |
1041583 | A | 48 | 28 | |
C | 1 | 2 | ||
1041648 | G | 75 | 29 | |
... | ... | ... | ... | ... |
MT | 15553 | G | 15 | 34 |
15572 | T | 8 | 32 | |
15579 | A | 8 | 32 | |
15607 | A | 5 | 31 | |
15615 | G | 2 | 34 | |
15637 | C | 1 | 36 | |
15649 | A | 2 | 35 | |
15670 | T | 8 | 31 | |
15682 | A | 8 | 32 | |
15746 | A | 9 | 33 | |
15758 | A | 13 | 32 | |
15784 | T | 12 | 33 | |
15812 | G | 12 | 30 | |
15833 | C | 8 | 33 | |
15848 | A | 6 | 32 | |
15884 | G | 1 | 34 | |
15890 | C | 1 | 33 | |
15923 | A | 4 | 33 | |
15927 | G | 4 | 33 | |
15928 | G | 4 | 34 | |
15932 | T | 4 | 32 | |
15943 | T | 3 | 33 | |
15950 | G | 4 | 33 | |
15965 | A | 4 | 33 | |
15967 | G | 4 | 32 | |
15990 | C | 1 | 33 | |
16188 | C | 15 | 26 | |
16278 | C | 30 | 33 | |
16390 | G | 18 | 33 | |
16519 | T | 7 | 32 |
242114 rows × 2 columns
# Pivot the innermost index level of tmpDf10 into columns, replace the
# missing combinations with 0, and keep the 'ReadDepth' block of columns.
(
    tmpDf10
    .unstack(level=-1)
    .fillna(value=0)
    ['ReadDepth']
)
base | A | C | G | T | |
---|---|---|---|---|---|
Chr | Pos | ||||
1 | 14727 | 9.0 | 0.0 | 95.0 | 1.0 |
630825 | 0.0 | 0.0 | 0.0 | 4.0 | |
630833 | 0.0 | 4.0 | 0.0 | 0.0 | |
948136 | 0.0 | 0.0 | 4.0 | 0.0 | |
955964 | 0.0 | 0.0 | 102.0 | 0.0 | |
970788 | 0.0 | 0.0 | 39.0 | 0.0 | |
1013541 | 0.0 | 39.0 | 0.0 | 0.0 | |
1014143 | 0.0 | 51.0 | 0.0 | 0.0 | |
1014228 | 28.0 | 0.0 | 41.0 | 0.0 | |
1014316 | 0.0 | 96.0 | 0.0 | 0.0 | |
1014359 | 0.0 | 0.0 | 83.0 | 0.0 | |
1020217 | 0.0 | 0.0 | 1.0 | 0.0 | |
1020221 | 0.0 | 1.0 | 0.0 | 0.0 | |
1020239 | 0.0 | 0.0 | 1.0 | 0.0 | |
1022188 | 87.0 | 0.0 | 0.0 | 0.0 | |
1022225 | 0.0 | 0.0 | 121.0 | 1.0 | |
1022260 | 0.0 | 204.0 | 0.0 | 0.0 | |
1022313 | 217.0 | 0.0 | 1.0 | 1.0 | |
1041218 | 0.0 | 1.0 | 0.0 | 0.0 | |
1041249 | 0.0 | 2.0 | 0.0 | 0.0 | |
1041582 | 0.0 | 48.0 | 0.0 | 0.0 | |
1041583 | 48.0 | 1.0 | 0.0 | 0.0 | |
1041648 | 0.0 | 0.0 | 75.0 | 1.0 | |
1041950 | 0.0 | 48.0 | 0.0 | 0.0 | |
1042136 | 0.0 | 0.0 | 0.0 | 33.0 | |
1042190 | 15.0 | 0.0 | 0.0 | 0.0 | |
1043223 | 0.0 | 9.0 | 0.0 | 0.0 | |
1043248 | 0.0 | 16.0 | 0.0 | 0.0 | |
1043288 | 0.0 | 0.0 | 28.0 | 0.0 | |
1043382 | 0.0 | 0.0 | 12.0 | 0.0 | |
... | ... | ... | ... | ... | ... |
MT | 15553 | 0.0 | 0.0 | 15.0 | 0.0 |
15572 | 0.0 | 0.0 | 0.0 | 8.0 | |
15579 | 8.0 | 0.0 | 0.0 | 0.0 | |
15607 | 5.0 | 0.0 | 0.0 | 0.0 | |
15615 | 0.0 | 0.0 | 2.0 | 0.0 | |
15637 | 0.0 | 1.0 | 0.0 | 0.0 | |
15649 | 2.0 | 0.0 | 0.0 | 0.0 | |
15670 | 0.0 | 0.0 | 0.0 | 8.0 | |
15682 | 8.0 | 0.0 | 0.0 | 0.0 | |
15746 | 9.0 | 0.0 | 0.0 | 0.0 | |
15758 | 13.0 | 0.0 | 0.0 | 0.0 | |
15784 | 0.0 | 0.0 | 0.0 | 12.0 | |
15812 | 0.0 | 0.0 | 12.0 | 0.0 | |
15833 | 0.0 | 8.0 | 0.0 | 0.0 | |
15848 | 6.0 | 0.0 | 0.0 | 0.0 | |
15884 | 0.0 | 0.0 | 1.0 | 0.0 | |
15890 | 0.0 | 1.0 | 0.0 | 0.0 | |
15923 | 4.0 | 0.0 | 0.0 | 0.0 | |
15927 | 0.0 | 0.0 | 4.0 | 0.0 | |
15928 | 0.0 | 0.0 | 4.0 | 0.0 | |
15932 | 0.0 | 0.0 | 0.0 | 4.0 | |
15943 | 0.0 | 0.0 | 0.0 | 3.0 | |
15950 | 0.0 | 0.0 | 4.0 | 0.0 | |
15965 | 4.0 | 0.0 | 0.0 | 0.0 | |
15967 | 0.0 | 0.0 | 4.0 | 0.0 | |
15990 | 0.0 | 1.0 | 0.0 | 0.0 | |
16188 | 0.0 | 15.0 | 0.0 | 0.0 | |
16278 | 0.0 | 30.0 | 0.0 | 0.0 | |
16390 | 0.0 | 0.0 | 18.0 | 0.0 | |
16519 | 0.0 | 0.0 | 0.0 | 7.0 |
205539 rows × 4 columns