%matplotlib inline
import pandas as pd
import numpy as np
import re
import os
import math
from multiprocessing import Pool
from tqdm import tqdm
from scipy import stats
## init
# Species label for this notebook.
mySpecie = 'Homo_sapiens'

# Alternative input that is currently disabled (prealigned variant of the same table):
#   /cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.prealigned.pickle
targetted_align_dir = '/cellar/users/btsui/all_seq_snp/Homo_sapiens_all_merged_snp.TCGA.pickle'

# Load the pickled table and keep only the rows under the "TCGA" label of its
# outermost index level.
targetted_df = (
    pd.read_pickle(targetted_align_dir)
    .loc["TCGA"]
)

# Distinct values of the 'Run_digits' index level; the manifest cell below
# matches these against the manifest's file_id column.
all_UUIDs = (
    targetted_df.index
    .get_level_values('Run_digits')
    .unique()
)

# NOTE(review): the original comment listed "883, 1427" here, presumably the
# UUID counts seen on earlier runs; confirm.
print('n UUID:', len(all_UUIDs))
n UUID: 1427
manifest_dir = '/cellar/users/btsui/Project/METAMAP/notebook/RapMapTest/XGS_WGS/./tcga_lgg_wgs_bams.df.wxs_rnaseq.pickle'

# Load the manifest table.
# NOTE(review): the original note says this uses "andrea mapping" to go from
# TCGA barcode to UUID; the code here only reads the pickle, so confirm where
# that mapping is applied.
manifest_df = pd.read_pickle(manifest_dir)

# Flag every manifest row whose file_id appears among the Run_digits values
# collected in all_UUIDs.
manifest_df['processed'] = manifest_df['file_id'].isin(all_UUIDs)

# Show only the flagged rows.
manifest_df.loc[manifest_df['processed']]
 | access | annotations | cases | data_category | data_format | data_type | experimental_strategy | file_id | file_name | file_size | platform | processed |
---|---|---|---|---|---|---|---|---|---|---|---|---|
546 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 1985b367-00c4-4c25-b049-5858e937cc6d | 948a7b5a-de83-41db-85f1-93a1a9b109ba_gdc_realn... | 3293981085 | Illumina | True |
1019 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | a7aff6a7-cea4-42e9-8d56-b211843a0302 | 8e4c32f5-1ed2-453b-b029-db7836249d3b_gdc_realn... | 4070637137 | Illumina | True |
860 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 4038984b-21d1-45d3-a5bb-208f9e71aa78 | b52aeb27-8d06-4a1c-b322-ee947261e95c_gdc_realn... | 4195199173 | Illumina | True |
791 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 7b17a5ef-c778-4753-8c30-427af9171f7a | a9bfcb6f-ce2b-47e2-a2e2-dd777b6d1344_gdc_realn... | 4436516715 | Illumina | True |
1285 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 93e20db7-4892-4ef8-9331-ecd665e4cb91 | 33262d51-82cb-4fe6-89d1-b7e1c4673896_gdc_realn... | 4501820893 | Illumina | True |
1412 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | ceb1a38c-fc22-4d27-9ada-553c1765f1f6 | C494.TCGA-HT-A4DV-10A-01D-A26K-08.4_gdc_realn.bam | 4546249959 | Illumina | True |
1395 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 35a18f9e-0ef8-4e55-b655-33397b63fd1a | b2a58478-d5d2-443c-9f20-222d43b6c326_gdc_realn... | 4601200721 | Illumina | True |
1843 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 9ac78029-3c36-4e00-b3e2-06415649ec43 | C494.TCGA-HT-A4DS-10A-01D-A26K-08.4_gdc_realn.bam | 4659688085 | Illumina | True |
1421 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | bc27f71a-4222-4940-8a64-246e2b9f3d44 | C494.TCGA-DB-A4XC-01A-11D-A26M-08.3_gdc_realn.bam | 4709420043 | Illumina | True |
1131 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 84943493-e5ef-4099-8706-09afa625b076 | C494.TCGA-DB-A4X9-01A-11D-A26M-08.5_gdc_realn.bam | 4720213753 | Illumina | True |
697 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | f36d778a-2c49-43aa-a577-7a6a921ad6cc | C494.TCGA-HT-A4DV-01A-11D-A26M-08.4_gdc_realn.bam | 4776368233 | Illumina | True |
1363 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 243e9ccb-7c93-4c6a-9e3e-105c379e7f78 | de2603fc-4fbd-42bf-8601-5807195e8e58_gdc_realn... | 4936527695 | Illumina | True |
333 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | f895b46c-7811-4ebb-ac9c-970bbfed657a | fa9ba06b-49aa-4794-a8a4-458245f0f0d0_gdc_realn... | 5038123177 | Illumina | True |
302 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 3594993e-f65c-434b-a64e-b1873f4b04ca | C494.TCGA-DB-A4XA-01A-11D-A26M-08.4_gdc_realn.bam | 5103387809 | Illumina | True |
610 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 4522becf-1566-4de8-b760-79bb93513ecf | C494.TCGA-FG-A4MY-01A-11D-A26M-08.3_gdc_realn.bam | 5106736217 | Illumina | True |
1593 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 65c95f19-84f0-4e8b-99c3-186f881181a4 | eb40595a-e557-40f1-bf04-1d2bced6b1ea_gdc_realn... | 5119970361 | Illumina | True |
567 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | eb4dda76-4215-458b-b9a0-60f9ff24bd6e | 450ad55c-9632-4249-9308-0054f93f2c1d_gdc_realn... | 5199614743 | Illumina | True |
398 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | f4db57f0-ba62-4a60-b65d-d2354612eb7b | ec7f0c4f-eac0-4aaa-9bc1-6a2116770962_gdc_realn... | 5227895868 | Illumina | True |
933 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | ce74f5ce-d92f-4c8d-8234-65a372edb92d | C494.TCGA-FG-A4MX-10A-01D-A26K-08.4_gdc_realn.bam | 5308076839 | Illumina | True |
744 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 00b53e00-d640-49e5-b2bd-3a3bdf867998 | a13a5da8-86e7-40a1-83b9-d2f42cde39c3_gdc_realn... | 5493253789 | Illumina | True |
1824 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | e55c8de3-3a92-42f2-bc1e-70a21355d696 | C494.TCGA-HT-A4DS-01A-11D-A26M-08.4_gdc_realn.bam | 5548625577 | Illumina | True |
1345 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 3a0e5ae0-dc79-468d-b459-a6d43b612851 | C494.TCGA-DB-A4XB-01A-11D-A26M-08.3_gdc_realn.bam | 5566530125 | Illumina | True |
580 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | 3011867e-1c3b-4791-849e-4e7d636ddc88 | 46ac364a-6eee-4ddf-8a86-30d5d93e69d9_gdc_realn... | 5578347546 | Illumina | True |
433 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | fd5d9171-979e-4742-adec-179f19bf6c06 | C494.TCGA-FG-A4MW-01A-11D-A26M-08.4_gdc_realn.bam | 5610837831 | Illumina | True |
2039 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | aeebf360-49c4-4db3-bdf4-daffdc5279cc | 9d7eba88-a95f-475a-a86f-49a42c7b6420_gdc_realn... | 5640103253 | Illumina | True |
2013 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 25650a4b-775b-485b-92c2-3f8f30ba4169 | C494.TCGA-DU-A6S2-01A-21D-A32B-08.1_gdc_realn.bam | 5680657764 | Illumina | True |
1768 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | d8d284a7-6a45-4938-bdf7-5da558a19c79 | C494.TCGA-DB-A4XC-10A-01D-A26K-08.3_gdc_realn.bam | 5745632232 | Illumina | True |
1289 | controlled | [{'annotation_id': '1c9a57d0-d4bf-5ecc-bb2a-1d... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | b2bd014e-44fe-4d24-9b75-91baf36b3c0a | ff71a6a7-0752-4a83-b2b8-c4d155b7d8ae_gdc_realn... | 5811473591 | Illumina | True |
1754 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | RNA-Seq | dcbaf670-8100-4c98-bd4d-2880a5805f23 | d5d72a65-f064-43cc-8872-7b08f0a31722_gdc_realn... | 5858397208 | Illumina | True |
684 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | a6218259-7690-4646-9756-eaf97e326cb1 | C494.TCGA-FG-A4MT-10A-01D-A26K-08.7_gdc_realn.bam | 5884382571 | Illumina | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
350 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | de3dfa71-a490-4344-95dc-99301d1dee08 | C494.TCGA-S9-A6U6-01A-12D-A33T-08.1_gdc_realn.bam | 25764491559 | Illumina | True |
786 | controlled | [{'annotation_id': '8fce0c90-2c27-5289-bd76-c8... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | d5c9ab65-7aa8-4efc-a570-f935c861d29d | C494.TCGA-S9-A6WG-10A-01D-A33W-08.1_gdc_realn.bam | 25819033618 | Illumina | True |
194 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | cdb99f45-9d8a-4f5f-8e10-9b3ef51eac2e | C494.TCGA-S9-A6WD-01A-12D-A33T-08.1_gdc_realn.bam | 25899694434 | Illumina | True |
295 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 6bd2f0f6-df8f-4b4e-804b-328923ec382b | C494.TCGA-FG-A711-01A-21D-A33T-08.1_gdc_realn.bam | 25954568773 | Illumina | True |
301 | controlled | [{'annotation_id': 'fcaf6ab7-6f32-578f-993e-4c... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 489cc5f3-9f88-4051-87c8-a242123d9ed9 | C494.TCGA-S9-A6U6-10A-01D-A33W-08.1_gdc_realn.bam | 25971722247 | Illumina | True |
1572 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 4e30cef6-d1af-4d77-ad93-6e31ec58fc35 | C494.TCGA-VW-A7QS-01A-12D-A33T-08.1_gdc_realn.bam | 26077683267 | Illumina | True |
1526 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 3772a453-c9cb-4084-b73a-181304516637 | C494.TCGA-DU-A7TA-01A-11D-A33T-08.1_gdc_realn.bam | 26276177578 | Illumina | True |
404 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 5de82077-8540-474f-a569-c4b3b951f81c | C494.TCGA-TQ-A7RG-01A-11D-A33T-08.1_gdc_realn.bam | 26344403523 | Illumina | True |
2077 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 704f5788-ca6e-499f-8d43-47134c5ba275 | C494.TCGA-S9-A6UA-01A-12D-A33T-08.1_gdc_realn.bam | 26357408848 | Illumina | True |
1585 | controlled | [{'annotation_id': '06c4a1d9-1b68-53ec-af55-65... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 1f46ad29-0ebf-4882-9e90-09428f6242ca | C494.TCGA-S9-A6U5-10A-01D-A33W-08.1_gdc_realn.bam | 26373199753 | Illumina | True |
1694 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 3ba871b3-2d7f-4266-922b-627785c9e2e5 | C494.TCGA-TQ-A7RS-10A-01D-A33W-08.1_gdc_realn.bam | 26379422195 | Illumina | True |
309 | controlled | [{'annotation_id': '47718055-fc95-5478-b8b4-c8... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 4b6fa49f-791e-45ee-8355-be690c9f127d | C494.TCGA-S9-A6WH-10A-01D-A33W-08.1_gdc_realn.bam | 26381599891 | Illumina | True |
1025 | controlled | [{'annotation_id': '28fb238d-7868-547e-b643-a3... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 1ed49bd2-38ef-4747-b572-76bb16d0cb7b | C494.TCGA-S9-A6WI-10A-01D-A33W-08.1_gdc_realn.bam | 26422463992 | Illumina | True |
769 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 2207155d-348a-4ed0-91f1-71b60f179ee2 | C494.TCGA-R8-A6MO-01A-11D-A33T-08.1_gdc_realn.bam | 26443229606 | Illumina | True |
206 | controlled | [{'annotation_id': 'ca544dc9-4f40-5589-99ab-26... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | e45b1fae-1a83-438e-a6e3-967813831346 | C494.TCGA-S9-A6U8-01A-21D-A33T-08.1_gdc_realn.bam | 26444187783 | Illumina | True |
1057 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 67a2edc2-4776-4442-b74c-ff8b9b91867e | C494.TCGA-E1-5322-01A-01D-1468-08.9_gdc_realn.bam | 26715379366 | Illumina | True |
1539 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 2d622f8a-a91d-44ed-a818-ea7c2fd698cd | C494.TCGA-TQ-A7RO-01A-11D-A33T-08.1_gdc_realn.bam | 27083238331 | Illumina | True |
231 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 06350102-0c44-4846-935d-515a5b0da989 | C494.TCGA-TQ-A7RS-01A-12D-A33T-08.1_gdc_realn.bam | 27099567535 | Illumina | True |
97 | controlled | [{'annotation_id': '672848f7-5cd4-54d4-b025-fe... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | d942c172-b9b5-452c-bb0d-83ec7f6a5f33 | C494.TCGA-S9-A6U1-10A-01D-A33W-08.1_gdc_realn.bam | 27142848890 | Illumina | True |
1348 | controlled | [{'annotation_id': 'e16ce51f-59d8-5b7e-9d90-2c... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 0831b36d-1b92-4c65-83e4-baba1a37120c | C494.TCGA-S9-A6WM-10A-01D-A33W-08.1_gdc_realn.bam | 27268755793 | Illumina | True |
1407 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 86cd60d0-91b1-402a-b900-b3dbdd556efa | C494.TCGA-TQ-A7RJ-10A-01D-A33W-08.1_gdc_realn.bam | 27300150050 | Illumina | True |
1716 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 17e5b895-6ec1-4bfc-a55f-a2114adb0af1 | C494.TCGA-DU-A76K-01A-11D-A33T-08.1_gdc_realn.bam | 27312491024 | Illumina | True |
35 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | db95c082-fe8d-4c85-a4d8-b63d4a39c1ea | C494.TCGA-S9-A6U2-01A-21D-A33T-08.1_gdc_realn.bam | 27452781065 | Illumina | True |
1669 | controlled | [{'annotation_id': '233d4f67-1289-563f-8f18-cf... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 39ea5309-579b-49a6-a9b7-dc45a935be1f | C494.TCGA-S9-A6U8-10A-01D-A33W-08.1_gdc_realn.bam | 27462233818 | Illumina | True |
825 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 5b96100f-1815-453d-b2a4-b32bb747b4ad | C494.TCGA-DH-A7US-01A-11D-A33T-08.1_gdc_realn.bam | 27741243341 | Illumina | True |
570 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | d1ff8258-be02-44bf-9cbb-0bbc895452bf | C494.TCGA-S9-A6WM-01A-12D-A33T-08.1_gdc_realn.bam | 27769132884 | Illumina | True |
197 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | e81ecc22-c399-4d35-b4b0-abcec7f895d5 | C494.TCGA-TQ-A7RM-01A-11D-A33T-08.1_gdc_realn.bam | 27972622913 | Illumina | True |
983 | controlled | [{'annotation_id': 'b9ac0f52-67e2-56fc-a794-aa... | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 66ffda7b-6998-4690-a9b4-14b29cf52aa8 | C494.TCGA-S9-A6WE-10A-01D-A33W-08.1_gdc_realn.bam | 28087416994 | Illumina | True |
1063 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 2b0048e0-a062-40d2-a1e1-4bb763ea0ead | C494.TCGA-S9-A6U1-01A-21D-A33T-08.1_gdc_realn.bam | 28910247215 | Illumina | True |
631 | controlled | NaN | [{'project': {'project_id': 'TCGA-LGG'}, 'case... | Raw Sequencing Data | BAM | Aligned Reads | WXS | 65170c46-a72e-41e7-84bb-2e3fe2f90667 | C494.TCGA-DU-A7TB-10A-01D-A33W-08.1_gdc_realn.bam | 30154920343 | Illumina | True |
1427 rows × 12 columns
# Count manifest rows per experimental strategy (e.g. WXS vs RNA-Seq).
# Reuses the manifest_df already loaded from manifest_dir instead of reading
# the pickle from disk a second time; the extra 'processed' column on
# manifest_df does not affect this count.
manifest_df['experimental_strategy'].value_counts()
WXS 1045 RNA-Seq 530 Name: experimental_strategy, dtype: int64
### generate the correlation between the data