#Import pdVCFsingle package
import matplotlib as plt
import sys
sys.path.append( '../src/multi_sample/' )
from pandasVCFmulti import *
%config InlineBackend.figure_format = 'retina'
pd.options.mode.chained_assignment = None #supressing the chained assignment warnings

vcf_path = '/Users/ers_vader/datasets_raw/1000genomes/phase3/CDS/ALL.chr21.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_CDS.vcf.gz'
#vcf_path = '/Users/ers_vader//Dropbox/TSRI/SWGR_titin.vcf.gz'

vcf_chunk = Vcf(vcf_path, sample_id='all', cols=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], chunksize=1000, n_cores=20)

vcf_chunk.get_vcf_df_chunk()


vcf_chunk.df.info()

vcf_chunk.df.head()

#checking stopIteration flag
vcf_chunk.stopIteration

%time vcf_chunk.add_variant_annotations(inplace=True)  #split_columns={'AD':2, 'HQ':2},

vcf_chunk.df

vcf_chunk.df.info()

vcf_chunk.df.head()

#unstack dataframe by sample - QUITE SPARSE DUE TO RARE VARIANTS
vcf_chunk.df.unstack(level=4).tail()

def get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'], \
                   add_variant_annotations=True, split_columns='', chunksize=5000, inplace=True, n_cores=1):
    '''
    This function will parse the whole multi-sample vcf file
    and return a dataframe.
    
    Note using multiple cores with add_variant_annotations will be 
    very memory intensive as the parsed dataframe is copied to each process.
    '''
    
    vcf_df_obj = Vcf(vcf_path, sample_id=sample_ids, cols=columns, chunksize=chunksize, n_cores=n_cores)  #initiate object
    stopIteration = False  #initiating stopIteration flag
    data = []  #aggregation df list
    
    while stopIteration == False:

        vcf_df_obj.get_vcf_df_chunk()  #retrieving df chunk
        if vcf_df_obj.stopIteration == True: break  #checking for end of file
        
        if add_variant_annotations:  
            vcf_df_obj.add_variant_annotations(split_columns=split_columns, inplace=inplace)  #parsing df and adding annotations
            if inplace:
                data.append(vcf_df_obj.df)
            else:
                data.append(vcf_df_obj.df_annot)  #aggregating annotation data
        else:
            vcf_df_obj.append(vcf_df_obj.df)

    df = pd.concat(data)
    return df

master_df = get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT','FORMAT'], \
                           chunksize=5000, n_cores=4)


master_df.zygosity.value_counts().plot(kind='bar', log=True)

master_df.vartype2.value_counts()

len(master_df)

master_df.head(20)