#Import pdVCFsingle package
import matplotlib as plt
import sys
sys.path.append( '../src/multi_sample/' )
from pandasVCFmulti import *
%config InlineBackend.figure_format = 'retina'
pd.options.mode.chained_assignment = None #supressing the chained assignment warnings
vcf_path = '/Users/ers_vader/datasets_raw/1000genomes/phase3/CDS/ALL.chr21.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_CDS.vcf.gz'
#vcf_path = '/Users/ers_vader//Dropbox/TSRI/SWGR_titin.vcf.gz'
vcf_chunk = Vcf(vcf_path, sample_id='all', cols=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], chunksize=1000, n_cores=20)
vcf_chunk.get_vcf_df_chunk()
0
vcf_chunk.df.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 878 entries, (21, 10906922, C, T) to (21, 27945256, C, T) Columns: 2509 entries, CHROM to NA21144 dtypes: int64(1), object(2508) memory usage: 16.8+ MB
vcf_chunk.df.head()
CHROM | POS | REF | ALT | FORMAT | HG00096 | HG00097 | HG00099 | HG00100 | HG00101 | ... | NA21128 | NA21129 | NA21130 | NA21133 | NA21135 | NA21137 | NA21141 | NA21142 | NA21143 | NA21144 | ||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CHROM | POS | REF | ALT | |||||||||||||||||||||
21 | 10906922 | C | T | 21 | 10906922 | C | T | GT | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | ... | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 |
10906931 | T | C | 21 | 10906931 | T | C | GT | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | ... | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | |
10906933 | G | A | 21 | 10906933 | G | A | GT | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | ... | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | |
10906965 | G | A | 21 | 10906965 | G | A | GT | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | ... | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | |
10906982 | A | G | 21 | 10906982 | A | G | GT | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | ... | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 | 0|0 |
5 rows × 2509 columns
#checking stopIteration flag
vcf_chunk.stopIteration
False
%time vcf_chunk.add_variant_annotations(inplace=True) #split_columns={'AD':2, 'HQ':2},
CPU times: user 813 ms, sys: 148 ms, total: 961 ms Wall time: 2.35 s
0
vcf_chunk.df
multiallele | phase | GT1 | GT2 | a1 | a2 | zygosity | vartype1 | vartype2 | GT | FORMAT | hom_ref_counts | |||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CHROM | POS | REF | ALT | sample_ids | ||||||||||||
21 | 10906922 | C | T | NA19452 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2499 |
HG02485 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
HG03520 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
NA19461 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
NA20287 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
10906931 | T | C | NA18995 | 0 | | | 1 | 0 | C | T | het-ref | snp | ref | 1|0 | GT | 2502 | |
NA19078 | 0 | | | 1 | 0 | C | T | het-ref | snp | ref | 1|0 | GT | 2502 | ||||
10906933 | G | A | HG01970 | 0 | | | 0 | 1 | G | A | het-ref | ref | snp | 0|1 | GT | 2503 | |
10906965 | G | A | HG03673 | 0 | | | 0 | 1 | G | A | het-ref | ref | snp | 0|1 | GT | 2503 | |
10906982 | A | G | HG00699 | 0 | | | 0 | 1 | A | G | het-ref | ref | snp | 0|1 | GT | 2503 | |
10906985 | T | G | HG03900 | 0 | | | 0 | 1 | T | G | het-ref | ref | snp | 0|1 | GT | 2502 | |
HG03021 | 0 | | | 1 | 0 | G | T | het-ref | snp | ref | 1|0 | GT | 2502 | ||||
10906987 | C | A | HG02716 | 0 | | | 0 | 1 | C | A | het-ref | ref | snp | 0|1 | GT | 2502 | |
HG03081 | 0 | | | 1 | 0 | A | C | het-ref | snp | ref | 1|0 | GT | 2502 | ||||
10906989 | C | T | HG00096 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | |
HG00101 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00116 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00120 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00128 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00137 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00239 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00258 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00275 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00280 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00320 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00327 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00342 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00380 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00404 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
HG00409 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 1245 | ||||
10906989 | C | T | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | |
27938630 | G | A | HG02629 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | |
HG02702 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03052 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03103 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03105 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03351 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03376 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03394 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03397 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03469 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03556 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
HG03559 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA18867 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19113 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19189 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19307 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19327 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19346 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
NA19920 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2460 | ||||
27938652 | A | T | HG00611 | 0 | | | 0 | 1 | A | T | het-ref | ref | snp | 0|1 | GT | 2503 | |
27945220 | G | A | NA20332 | 0 | | | 0 | 1 | G | A | het-ref | ref | snp | 0|1 | GT | 2503 | |
27945250 | G | A | HG03802 | 0 | | | 0 | 1 | G | A | het-ref | ref | snp | 0|1 | GT | 2503 | |
27945254 | G | A | NA19093 | 0 | | | 1 | 0 | A | G | het-ref | snp | ref | 1|0 | GT | 2503 | |
27945256 | C | T | HG02481 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2497 | |
HG03139 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2497 | ||||
NA18877 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2497 | ||||
NA19130 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2497 | ||||
HG03114 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2497 | ||||
HG03352 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2497 | ||||
NA20287 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2497 |
67268 rows × 12 columns
vcf_chunk.df.info()
<class 'pandas.core.frame.DataFrame'> MultiIndex: 10 entries, (21, 10906922, C, T) to (21, 10907033, G, A) Columns: 2509 entries, CHROM to NA21144 dtypes: int64(1), object(2508) memory usage: 196.4+ KB
vcf_chunk.df.head()
multiallele | phase | GT1 | GT2 | a1 | a2 | zygosity | vartype1 | vartype2 | GT | FORMAT | hom_ref_counts | |||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CHROM | POS | REF | ALT | sample_ids | ||||||||||||
21 | 10906922 | C | T | NA19452 | 0 | | | 0 | 1 | C | T | het-ref | ref | snp | 0|1 | GT | 2499 |
HG02485 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
HG03520 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
NA19461 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 | ||||
NA20287 | 0 | | | 1 | 0 | T | C | het-ref | snp | ref | 1|0 | GT | 2499 |
#unstack dataframe by sample - QUITE SPARSE DUE TO RARE VARIANTS
vcf_chunk.df.unstack(level=4).tail()
multiallele | ... | AD_1 | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sample_ids | SHUFFLE_WELLDERLY_EA_0 | SHUFFLE_WELLDERLY_EA_1 | SHUFFLE_WELLDERLY_EA_10 | SHUFFLE_WELLDERLY_EA_100 | SHUFFLE_WELLDERLY_EA_101 | SHUFFLE_WELLDERLY_EA_102 | SHUFFLE_WELLDERLY_EA_103 | SHUFFLE_WELLDERLY_EA_104 | SHUFFLE_WELLDERLY_EA_105 | SHUFFLE_WELLDERLY_EA_106 | ... | SHUFFLE_WELLDERLY_EA_90 | SHUFFLE_WELLDERLY_EA_91 | SHUFFLE_WELLDERLY_EA_92 | SHUFFLE_WELLDERLY_EA_93 | SHUFFLE_WELLDERLY_EA_94 | SHUFFLE_WELLDERLY_EA_95 | SHUFFLE_WELLDERLY_EA_96 | SHUFFLE_WELLDERLY_EA_97 | SHUFFLE_WELLDERLY_EA_98 | SHUFFLE_WELLDERLY_EA_99 | |||
CHROM | POS | REF | ALT | |||||||||||||||||||||
2 | 179636668 | T | C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
179636679 | AC | A | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
179636928 | TA | T | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
179636983 | C | T | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
179636989 | C | G | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 8626 columns
def get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'], \
add_variant_annotations=True, split_columns='', chunksize=5000, inplace=True, n_cores=1):
'''
This function will parse the whole multi-sample vcf file
and return a dataframe.
Note using multiple cores with add_variant_annotations will be
very memory intensive as the parsed dataframe is copied to each process.
'''
vcf_df_obj = Vcf(vcf_path, sample_id=sample_ids, cols=columns, chunksize=chunksize, n_cores=n_cores) #initiate object
stopIteration = False #initiating stopIteration flag
data = [] #aggregation df list
while stopIteration == False:
vcf_df_obj.get_vcf_df_chunk() #retrieving df chunk
if vcf_df_obj.stopIteration == True: break #checking for end of file
if add_variant_annotations:
vcf_df_obj.add_variant_annotations(split_columns=split_columns, inplace=inplace) #parsing df and adding annotations
if inplace:
data.append(vcf_df_obj.df)
else:
data.append(vcf_df_obj.df_annot) #aggregating annotation data
else:
vcf_df_obj.append(vcf_df_obj.df)
df = pd.concat(data)
return df
master_df = get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], \
chunksize=5000, n_cores=4)
End of File Reached
master_df.zygosity.value_counts().plot(kind='bar', log=True)
<matplotlib.axes.AxesSubplot at 0x116380b10>
master_df.vartype2.value_counts()
ref 31553 snp 24159 ins 681 mnp 367 del 215 indel 63 dtype: int64
len(master_df)
57038
master_df.head(20)
multiallele | phase | GT1 | GT2 | a1 | a2 | zygosity | vartype1 | vartype2 | GT | FORMAT | hom_ref_counts | FT | GQ | HQ | DP | AD | |||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CHROM | POS | REF | ALT | sample_ids | |||||||||||||||||
2 | 179392051 | CA | CAA | SHUFFLE_WELLDERLY_EA_170 | 0 | / | 1 | 0 | CAA | CA | het-ref | ins | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 448 | VQLOW | 20 | 20,20 | 30 | 3,27 |
179392075 | GA | GAA | SHUFFLE_WELLDERLY_EA_52 | 0 | / | 1 | 0 | GAA | GA | het-ref | ins | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 447 | VQLOW | 20 | 20,20 | 34 | 6,28 | |
179392080 | A | T | SHUFFLE_WELLDERLY_EA_8 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 359 | 454,359 | 49 | 26,23 | |
SHUFFLE_WELLDERLY_EA_12 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 248 | 248,248 | 30 | 14,16 | ||||
SHUFFLE_WELLDERLY_EA_13 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 206 | 206,206 | 28 | 12,16 | ||||
SHUFFLE_WELLDERLY_EA_19 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 463 | 629,463 | 56 | 31,25 | ||||
SHUFFLE_WELLDERLY_EA_28 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 273 | 388,273 | 38 | 23,15 | ||||
SHUFFLE_WELLDERLY_EA_29 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 344 | 344,344 | 34 | 17,17 | ||||
SHUFFLE_WELLDERLY_EA_30 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 240 | 240,277 | 52 | 17,35 | ||||
SHUFFLE_WELLDERLY_EA_39 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 423 | 423,423 | 67 | 24,43 | ||||
SHUFFLE_WELLDERLY_EA_41 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 481 | 481,481 | 59 | 26,33 | ||||
SHUFFLE_WELLDERLY_EA_42 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 298 | 494,298 | 40 | 24,16 | ||||
SHUFFLE_WELLDERLY_EA_44 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 180 | 180,180 | 28 | 10,18 | ||||
SHUFFLE_WELLDERLY_EA_45 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 354 | 354,354 | 56 | 22,34 | ||||
SHUFFLE_WELLDERLY_EA_47 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 644 | 644,644 | 74 | 33,41 | ||||
SHUFFLE_WELLDERLY_EA_54 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 210 | 210,210 | 44 | 15,29 | ||||
SHUFFLE_WELLDERLY_EA_60 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 355 | 656,355 | 50 | 33,17 | ||||
SHUFFLE_WELLDERLY_EA_65 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 340 | 340,340 | 33 | 17,16 | ||||
SHUFFLE_WELLDERLY_EA_75 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 897 | 897,897 | 104 | 53,51 | ||||
SHUFFLE_WELLDERLY_EA_81 | 0 | / | 1 | 0 | T | A | het-ref | snp | ref | 1/0 | GT:FT:GQ:HQ:DP:AD | 337 | PASS | 454 | 456,454 | 49 | 25,24 |