#Import pdVCFsingle package import matplotlib as plt import sys sys.path.append( '../src/multi_sample/' ) from pandasVCFmulti import * %config InlineBackend.figure_format = 'retina' pd.options.mode.chained_assignment = None #supressing the chained assignment warnings vcf_path = '/Users/ers_vader/datasets_raw/1000genomes/phase3/CDS/ALL.chr21.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_CDS.vcf.gz' #vcf_path = '/Users/ers_vader//Dropbox/TSRI/SWGR_titin.vcf.gz' vcf_chunk = Vcf(vcf_path, sample_id='all', cols=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], chunksize=1000, n_cores=20) vcf_chunk.get_vcf_df_chunk() vcf_chunk.df.info() vcf_chunk.df.head() #checking stopIteration flag vcf_chunk.stopIteration %time vcf_chunk.add_variant_annotations(inplace=True) #split_columns={'AD':2, 'HQ':2}, vcf_chunk.df vcf_chunk.df.info() vcf_chunk.df.head() #unstack dataframe by sample - QUITE SPARSE DUE TO RARE VARIANTS vcf_chunk.df.unstack(level=4).tail() def get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'], \ add_variant_annotations=True, split_columns='', chunksize=5000, inplace=True, n_cores=1): ''' This function will parse the whole multi-sample vcf file and return a dataframe. Note using multiple cores with add_variant_annotations will be very memory intensive as the parsed dataframe is copied to each process. ''' vcf_df_obj = Vcf(vcf_path, sample_id=sample_ids, cols=columns, chunksize=chunksize, n_cores=n_cores) #initiate object stopIteration = False #initiating stopIteration flag data = [] #aggregation df list while stopIteration == False: vcf_df_obj.get_vcf_df_chunk() #retrieving df chunk if vcf_df_obj.stopIteration == True: break #checking for end of file if add_variant_annotations: vcf_df_obj.add_variant_annotations(split_columns=split_columns, inplace=inplace) #parsing df and adding annotations if inplace: data.append(vcf_df_obj.df) else: data.append(vcf_df_obj.df_annot) #aggregating annotation data else: vcf_df_obj.append(vcf_df_obj.df) df = pd.concat(data) return df master_df = get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], \ chunksize=5000, n_cores=4) master_df.zygosity.value_counts().plot(kind='bar', log=True) master_df.vartype2.value_counts() len(master_df) master_df.head(20)