Pfeiffer syndrom quartet analysis

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
!head -n 135 Pfeiffer-quartet.vcf
##fileformat=VCFv4.1
##ApplyRecalibration="analysis_type=ApplyRecalibration input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/tmp/generate_APPLY_RECALIBRATOR.py/d2cd22743d3eea79f59dd21ebb84a0d7/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=/storage/gluster/insilico/data/GenomicsData/Series//ISDB11122GPL11154/apply_recalibrator/d2cd22743d3eea79f59dd21ebb84a0d7/SM.recal.snps.vcf.log help=false input=[(RodBinding name=input source=/tmp/generate_APPLY_RECALIBRATOR.py/d2cd22743d3eea79f59dd21ebb84a0d7/ISDB11122.snps.raw.vcf)] recal_file=(RodBinding name=recal_file source=/tmp/generate_APPLY_RECALIBRATOR.py/d2cd22743d3eea79f59dd21ebb84a0d7/ISDB11122.snps.VarRecal.recal) tranches_file=/tmp/generate_APPLY_RECALIBRATOR.py/d2cd22743d3eea79f59dd21ebb84a0d7/ISDB11122.snps.VarRecal.tranches out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub ts_filter_level=99.0 ignore_filter=null mode=SNP filter_mismatching_base_and_quals=false"
##CombineVariants="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/tmp/generate_COMBINE_VARIANTS.py/00e90c599e331929a09028693bed91f7/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=/storage/gluster/insilico/data/GenomicsData/Series//ISDB11122GPL11154/combine_variants/00e90c599e331929a09028693bed91f7/ISDB11122.SNPrecal.IndelFiltered.vcf.log help=false variant=[(RodBinding name=variant source=/tmp/generate_COMBINE_VARIANTS.py/00e90c599e331929a09028693bed91f7/ISDB11122.indel.filtered.vcf), (RodBinding name=variant2 source=/tmp/generate_COMBINE_VARIANTS.py/00e90c599e331929a09028693bed91f7/SM.recal.snps.vcf)] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=set assumeIdenticalSamples=false minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_mismatching_base_and_quals=false"
##FILTER=<ID=GATKStandard,Description="QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -5.6494 <= x < 1.6606">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -2277.7485">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2277.7485 <= x < -5.6494">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=ABHet,Number=1,Type=Float,Description="Allele Balance for hets (ref/(ref+alt))">
##INFO=<ID=ABHom,Number=1,Type=Float,Description="Allele Balance for homs (A/(A+O))">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=OND,Number=1,Type=Float,Description="Overall non-diploid ratio (alleles/(alleles+non-alleles))">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##SelectVariants="analysis_type=SelectVariants input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/tmp/generate_SELECTEDVARIANTS.py/3aed2f246e8c0249ff8a031bd209dc10/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=/tmp/generate_SELECTEDVARIANTS.py/3aed2f246e8c0249ff8a031bd209dc10/ISDB11122.variants.raw.vcf) discordance=(RodBinding name= source=UNBOUND) concordance=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sample_name=[] sample_expressions=null sample_file=null exclude_sample_name=[] exclude_sample_file=[] select_expressions=[] excludeNonVariants=false excludeFiltered=false regenotype=false restrictAllelesTo=ALL keepOriginalAC=false mendelianViolation=false mendelianViolationQualThreshold=0.0 select_random_number=0 select_random_fraction=0.0 remove_fraction_genotypes=0.0 selectTypeToInclude=[SNP] keepIDs=null fullyDecode=false forceGenotypesDecode=false justRead=false filter_mismatching_base_and_quals=false"
##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/storage/gluster/insilico/data/GenomicsData/ISDBM32/ISDBM322016/printreads/880ddadd1566253466a7e6761e854445/ISDBM322016.recal.bam, /storage/gluster/insilico/data/GenomicsData/ISDBM32/ISDBM322018/printreads/c3a712cdc76c431297effdb013de7bd2/ISDBM322018.recal.bam, /storage/gluster/insilico/data/GenomicsData/ISDBM32/ISDBM322017/printreads/b43e7abc909888a6a066fc3da4c305e7/ISDBM322017.recal.bam, /storage/gluster/insilico/data/GenomicsData/ISDBM32/ISDBM322015/printreads/8eb77fc396f7325f7da846402d1a6df9/ISDBM322015.recal.bam] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/tmp/generate_UNIFIEDGENOTYPER.py/dbe01e32e5f0143c1e3d62bfa4a46bd0/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=6 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false genotype_likelihoods_model=BOTH p_nonref_model=EXACT pcr_error_rate=1.0E-4 noSLOD=false annotateNDA=false min_base_quality_score=17 max_deletion_fraction=0.05 cap_max_alternate_alleles_for_indels=false min_indel_count_for_genotyping=5 min_indel_fraction_per_sample=0.25 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10 indelGapOpenPenalty=45 indelHaplotypeSize=80 noBandedIndel=false indelDebug=false ignoreSNPAlleles=false allReadsSP=false ignoreLaneInfo=false reference_sample_calls=(RodBinding name= source=UNBOUND) reference_sample_name=null sample_ploidy=2 min_quality_score=1 max_quality_score=40 site_quality_prior=20 min_power_threshold_for_calling=0.95 min_reference_depth=100 exclude_filtered_reference_sites=false heterozygosity=0.0010 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=3 dbsnp=(RodBinding name=dbsnp source=/tmp/generate_UNIFIEDGENOTYPER.py/dbe01e32e5f0143c1e3d62bfa4a46bd0/dbsnp_135.b37.vcf) comp=[] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub debug_file=null metrics_file=null annotation=[AlleleBalance, DepthOfCoverage, FisherStrand] excludeAnnotation=[] filter_mismatching_base_and_quals=false"
##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null phone_home=STANDARD gatk_key=null read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/tmp/generate_VARIANT_FILTRATION.py/af94a8015760f0c3d105ae21bdcee583/human_g1k_v37.fasta nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=/storage/gluster/insilico/data/GenomicsData/Series//ISDB11122GPL11154/variant_filtration/af94a8015760f0c3d105ae21bdcee583/ISDB11122.indel.filtered.vcf.log help=false variant=(RodBinding name=variant source=/tmp/generate_VARIANT_FILTRATION.py/af94a8015760f0c3d105ae21bdcee583/ISDB11122.indels.raw.vcf) mask=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub filterExpression=[QD < 2.0 || ReadPosRankSum < -20.0 || FS > 200.0] filterName=[GATKStandard] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskExtension=0 maskName=Mask missingValuesInExpressionsShouldEvaluateAsFailing=true invalidatePreviousFilters=false filter_mismatching_base_and_quals=false"
##contig=<ID=1,length=249250621,assembly=b37>
##contig=<ID=2,length=243199373,assembly=b37>
##contig=<ID=3,length=198022430,assembly=b37>
##contig=<ID=4,length=191154276,assembly=b37>
##contig=<ID=5,length=180915260,assembly=b37>
##contig=<ID=6,length=171115067,assembly=b37>
##contig=<ID=7,length=159138663,assembly=b37>
##contig=<ID=8,length=146364022,assembly=b37>
##contig=<ID=9,length=141213431,assembly=b37>
##contig=<ID=10,length=135534747,assembly=b37>
##contig=<ID=11,length=135006516,assembly=b37>
##contig=<ID=12,length=133851895,assembly=b37>
##contig=<ID=13,length=115169878,assembly=b37>
##contig=<ID=14,length=107349540,assembly=b37>
##contig=<ID=15,length=102531392,assembly=b37>
##contig=<ID=16,length=90354753,assembly=b37>
##contig=<ID=17,length=81195210,assembly=b37>
##contig=<ID=18,length=78077248,assembly=b37>
##contig=<ID=19,length=59128983,assembly=b37>
##contig=<ID=20,length=63025520,assembly=b37>
##contig=<ID=21,length=48129895,assembly=b37>
##contig=<ID=22,length=51304566,assembly=b37>
##contig=<ID=X,length=155270560,assembly=b37>
##contig=<ID=Y,length=59373566,assembly=b37>
##contig=<ID=MT,length=16569,assembly=b37>
##contig=<ID=GL000207.1,length=4262,assembly=b37>
##contig=<ID=GL000226.1,length=15008,assembly=b37>
##contig=<ID=GL000229.1,length=19913,assembly=b37>
##contig=<ID=GL000231.1,length=27386,assembly=b37>
##contig=<ID=GL000210.1,length=27682,assembly=b37>
##contig=<ID=GL000239.1,length=33824,assembly=b37>
##contig=<ID=GL000235.1,length=34474,assembly=b37>
##contig=<ID=GL000201.1,length=36148,assembly=b37>
##contig=<ID=GL000247.1,length=36422,assembly=b37>
##contig=<ID=GL000245.1,length=36651,assembly=b37>
##contig=<ID=GL000197.1,length=37175,assembly=b37>
##contig=<ID=GL000203.1,length=37498,assembly=b37>
##contig=<ID=GL000246.1,length=38154,assembly=b37>
##contig=<ID=GL000249.1,length=38502,assembly=b37>
##contig=<ID=GL000196.1,length=38914,assembly=b37>
##contig=<ID=GL000248.1,length=39786,assembly=b37>
##contig=<ID=GL000244.1,length=39929,assembly=b37>
##contig=<ID=GL000238.1,length=39939,assembly=b37>
##contig=<ID=GL000202.1,length=40103,assembly=b37>
##contig=<ID=GL000234.1,length=40531,assembly=b37>
##contig=<ID=GL000232.1,length=40652,assembly=b37>
##contig=<ID=GL000206.1,length=41001,assembly=b37>
##contig=<ID=GL000240.1,length=41933,assembly=b37>
##contig=<ID=GL000236.1,length=41934,assembly=b37>
##contig=<ID=GL000241.1,length=42152,assembly=b37>
##contig=<ID=GL000243.1,length=43341,assembly=b37>
##contig=<ID=GL000242.1,length=43523,assembly=b37>
##contig=<ID=GL000230.1,length=43691,assembly=b37>
##contig=<ID=GL000237.1,length=45867,assembly=b37>
##contig=<ID=GL000233.1,length=45941,assembly=b37>
##contig=<ID=GL000204.1,length=81310,assembly=b37>
##contig=<ID=GL000198.1,length=90085,assembly=b37>
##contig=<ID=GL000208.1,length=92689,assembly=b37>
##contig=<ID=GL000191.1,length=106433,assembly=b37>
##contig=<ID=GL000227.1,length=128374,assembly=b37>
##contig=<ID=GL000228.1,length=129120,assembly=b37>
##contig=<ID=GL000214.1,length=137718,assembly=b37>
##contig=<ID=GL000221.1,length=155397,assembly=b37>
##contig=<ID=GL000209.1,length=159169,assembly=b37>
##contig=<ID=GL000218.1,length=161147,assembly=b37>
##contig=<ID=GL000220.1,length=161802,assembly=b37>
##contig=<ID=GL000213.1,length=164239,assembly=b37>
##contig=<ID=GL000211.1,length=166566,assembly=b37>
##contig=<ID=GL000199.1,length=169874,assembly=b37>
##contig=<ID=GL000217.1,length=172149,assembly=b37>
##contig=<ID=GL000216.1,length=172294,assembly=b37>
##contig=<ID=GL000215.1,length=172545,assembly=b37>
##contig=<ID=GL000205.1,length=174588,assembly=b37>
##contig=<ID=GL000219.1,length=179198,assembly=b37>
##contig=<ID=GL000224.1,length=179693,assembly=b37>
##contig=<ID=GL000223.1,length=180455,assembly=b37>
##contig=<ID=GL000195.1,length=182896,assembly=b37>
##contig=<ID=GL000212.1,length=186858,assembly=b37>
##contig=<ID=GL000222.1,length=186861,assembly=b37>
##contig=<ID=GL000200.1,length=187035,assembly=b37>
##contig=<ID=GL000193.1,length=189789,assembly=b37>
##contig=<ID=GL000194.1,length=191469,assembly=b37>
##contig=<ID=GL000225.1,length=211173,assembly=b37>
##contig=<ID=GL000192.1,length=547496,assembly=b37>
##reference=file:///tmp/generate_COMBINE_VARIANTS.py/00e90c599e331929a09028693bed91f7/human_g1k_v37.fasta
##source=SelectVariants
##SnpEffVersion="3.3c (build 2013-06-28), by Pablo Cingolani"
##SnpEffCmd="SnpEff  -t hg19 /tmp/generate_SNPEFF.py/98b17a7321fce5a67e5dcd5a1c24a311/ISDB11122.SNPrecal.IndelFiltered.vcf "
##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon  | GenotypeNum [ | ERRORS | WARNINGS ] )' ">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	ISDBM322015	ISDBM322016	ISDBM322017	ISDBM322018

데이터 로드하기. 135줄부터 로드해야 함.

In [3]:
df = pd.read_table('Pfeiffer-quartet.vcf', skiprows=134, dtype={'#CHROM': str})
df
Out[3]:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ISDBM322015 ISDBM322016 ISDBM322017 ISDBM322018
0 1 14907 rs79585140 A G 514.87 VQSRTrancheSNP99.00to99.90 ABHet=0.628;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/0:6,0:6:18:0,18,178 0/1:17,10:27:99:208,0,341 0/1:14,9:23:99:176,0,258 0/1:20,11:31:99:174,0,380
1 1 14930 rs75454623 A G 780.25 VQSRTrancheSNP99.00to99.90 ABHet=0.644;AC=4;AF=0.500;AN=8;BaseQRankSum=-5... GT:AD:DP:GQ:PL 0/1:6,1:7:7:7,0,150 0/1:16,13:29:99:296,0,378 0/1:19,13:32:99:245,0,420 0/1:24,15:39:99:274,0,535
2 1 14948 . G A 47.48 VQSRTrancheSNP99.90to100.00 ABHet=0.846;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:10,0:10:24:0,24,228 0/0:32,0:32:81:0,81,734 0/1:28,6:34:79:79,0,549 0/1:41,6:47:8:8,0,850
3 1 15211 rs144718396 T G 110.28 VQSRTrancheSNP99.90to100.00 ABHet=0.359;ABHom=1.00;AC=5;AF=0.625;AN=8;Base... GT:AD:DP:GQ:PL 0/1:1,1:2:17:17,0,17 0/1:3,5:8:8:76,0,8 1/1:0,1:1:3:23,3,0 0/1:1,4:5:14:36,0,14
4 1 17538 . C A 151.64 VQSRTrancheSNP99.00to99.90 ABHet=0.735;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:2,0:2:6:0,6,80 0/1:10,2:13:51:51,0,138 0/0:23,1:24:0:0,0,418 0/1:14,8:22:99:139,0,243
5 1 63336 . C T 460.26 VQSRTrancheSNP99.90to100.00 ABHet=0.705;ABHom=1.00;AC=2;AF=0.333;AN=6;Base... GT:AD:DP:GQ:PL ./. 0/1:31,15:46:99:340,0,308 0/1:25,9:34:99:159,0,394 0/0:52,0:52:90:0,90,830
6 1 63735 . CCTA C 193.23 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-0.960;DP=42;F... GT:AD:DP:GQ:PL ./. 1/1:6,2:4:6:116,6,0 0/1:8,4:8:99:128,0,301 0/0:18,2:15:30:0,30,666
7 1 69511 rs75062661 A G 4293.01 VQSRTrancheSNP99.90to100.00 ABHom=0.982;AC=8;AF=1.00;AN=8;BaseQRankSum=2.0... GT:AD:DP:GQ:PL 1/1:2,171:173:99:2218,228,0 1/1:0,33:34:60:508,60,0 1/1:0,61:63:93:777,93,0 1/1:0,61:61:96:790,96,0
8 1 121009 rs1851943 C T 35.27 VQSRTrancheSNP99.00to99.90 ABHet=0.636;ABHom=0.929;AC=1;AF=0.167;AN=6;Bas... GT:AD:DP:GQ:PL ./. 0/0:11,0:11:33:0,33,436 0/0:6,1:7:18:0,18,240 0/1:7,4:11:70:70,0,229
9 1 133160 . G A 88.69 VQSRTrancheSNP99.00to99.90 ABHet=0.750;ABHom=0.841;AC=3;AF=0.500;AN=6;Bas... GT:AD:DP:GQ:PL 1/1:1,3:4:9:109,9,0 0/1:3,1:4:20:20,0,107 ./. 0/0:1,0:1:3:0,3,40
10 1 546952 rs9438487 T C 43.22 VQSRTrancheSNP99.90to100.00 ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:49,6,0 ./. ./. 1/1:0,1:1:3:28,3,0
11 1 745347 . T C 59.16 VQSRTrancheSNP99.90to100.00 ABHet=0.702;ABHom=0.938;AC=3;AF=0.375;AN=8;Bas... GT:AD:DP:GQ:PL 0/1:3,1:4:13:13,0,60 0/0:15,1:16:8:0,8,212 0/1:4,4:8:71:75,0,71 0/1:13,2:15:13:13,0,203
12 1 745370 rs146246821 TA T 510.54 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=1.905;DB;DP=64... GT:AD:DP:GQ:PL 0/1:4,1:5:32:32,0,185 0/1:17,4:18:99:132,0,623 0/1:9,7:16:99:273,0,395 0/1:16,4:19:99:125,0,661
13 1 752566 rs3094315 G A 118.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:79,6,0 1/1:0,1:1:3:39,3,0 1/1:0,1:1:3:36,3,0
14 1 752721 rs3131972 A G 242.70 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=7;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:80,6,0 1/1:0,4:4:12:159,12,0 1/1:0,1:1:3:40,3,0
15 1 753405 rs61770173 C A 432.40 VQSRTrancheSNP99.90to100.00 ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=31;Dels=0.0... GT:AD:DP:GQ:PL ./. 1/1:0,14:14:24:221,24,0 1/1:0,8:8:12:111,12,0 1/1:0,9:9:15:138,15,0
16 1 753474 rs2073814 C G 271.92 VQSRTrancheSNP99.00to99.90 ABHom=0.952;AC=6;AF=1.00;AN=6;DB;DP=15;Dels=0.... GT:AD:DP:GQ:PL ./. 1/1:0,3:3:9:118,9,0 1/1:0,6:7:9:83,9,0 1/1:0,5:5:9:108,9,0
17 1 758324 rs3131955 T C 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:77,6,0 ./. ./. ./.
18 1 780027 rs2977613 G T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 ./. 1/1:0,1:1:3:40,3,0 ./.
19 1 808631 rs11240779 G A 1853.99 PASS ABHom=1.000;AC=8;AF=1.00;AN=8;DB;DP=52;Dels=0.... GT:AD:DP:GQ:PL 1/1:0,7:7:21:223,21,0 1/1:0,16:16:48:602,48,0 1/1:0,13:13:36:457,36,0 1/1:0,16:16:48:611,48,0
20 1 808922 rs6594027 G A 4960.01 VQSRTrancheSNP99.00to99.90 ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=222;Dels=0.... GT:AD:DP:GQ:PL 1/1:0,43:43:96:1199,96,0 1/1:0,62:62:99:1304,102,0 1/1:0,58:58:93:1169,93,0 1/1:0,59:59:99:1288,102,0
21 1 808928 rs11240780 C T 5605.01 VQSRTrancheSNP99.00to99.90 ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=234;Dels=0.... GT:AD:DP:GQ:PL 1/1:0,48:48:99:1318,111,0 1/1:0,61:61:99:1456,114,0 1/1:0,63:63:99:1383,111,0 1/1:0,62:62:99:1448,114,0
22 1 809171 . G A 103.55 VQSRTrancheSNP99.90to100.00 ABHet=0.800;ABHom=0.936;AC=2;AF=0.250;AN=8;Bas... GT:AD:DP:GQ:PL 0/1:35,6:41:36:36,0,520 0/1:44,15:59:99:109,0,657 0/0:47,5:52:57:0,57,722 0/0:31,0:32:78:0,78,634
23 1 809732 rs147199422 T C 44.02 VQSRTrancheSNP99.90to100.00 ABHet=0.865;ABHom=0.932;AC=1;AF=0.125;AN=8;Bas... GT:AD:DP:GQ:PL 0/1:64,10:74:80:80,0,2123 0/0:58,2:60:99:0,121,2171 0/0:42,4:46:17:0,17,1486 0/0:54,5:59:29:0,29,1912
24 1 809744 . A G 40.99 VQSRTrancheSNP99.90to100.00 ABHet=0.864;ABHom=0.950;AC=1;AF=0.125;AN=8;Bas... GT:AD:DP:GQ:PL 0/1:57,9:67:77:77,0,1851 0/0:57,2:60:99:0,115,2057 0/0:46,3:49:59:0,59,1686 0/0:53,3:56:74:0,74,1828
25 1 812267 rs7541694 A G 385.67 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,3:3:9:95,9,0 1/1:0,5:5:15:175,15,0 1/1:0,2:2:6:55,6,0 1/1:0,3:3:9:99,9,0
26 1 812284 rs7545373 C G 428.75 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,5:5:15:185,15,0 1/1:0,4:4:12:148,12,0 1/1:0,2:2:6:55,6,0 1/1:0,2:2:6:79,6,0
27 1 823790 rs143626389 G A 38.82 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.3... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:75,6,0 ./. 0/0:3,0:3:9:0,9,99
28 1 834832 rs4411087 G C 48.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:80,6,0 ./. ./.
29 1 849998 rs13303222 A G 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:77,6,0 ./. ./. ./.
... ... ... ... ... ... ... ... ... ... ... ... ... ...
300006 GL000225.1 198637 . C T 80.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,3:3:9:113,9,0 ./. ./.
300007 GL000225.1 198643 . A G 86.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,3:3:9:119,9,0 ./. ./.
300008 GL000225.1 202533 . C T 46.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. ./. ./. 1/1:0,2:2:6:78,6,0
300009 GL000225.1 203673 . C T 78.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,2:2:6:80,6,0 1/1:0,1:1:3:32,3,0 ./. ./.
300010 GL000192.1 99120 . C T 1132.79 VQSRTrancheSNP99.90to100.00 ABHet=0.594;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/1:37,35:72:99:680,0,597 0/1:30,11:41:99:211,0,460 0/1:15,13:28:99:285,0,257 0/0:45,0:45:99:0,123,1131
300011 GL000192.1 99283 . T C 775.57 VQSRTrancheSNP99.90to100.00 ABHet=0.648;ABHom=0.800;AC=5;AF=0.625;AN=8;Bas... GT:AD:DP:GQ:PL 0/1:21,12:33:99:143,0,226 0/1:13,12:25:99:223,0,167 0/1:26,7:33:99:127,0,155 1/1:4,16:20:36:328,36,0
300012 GL000192.1 99390 . C G 74.63 VQSRTrancheSNP99.90to100.00 ABHet=0.527;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:3,0:3:6:0,6,55 0/0:7,0:7:12:0,12,111 0/1:5,4:9:13:96,0,13 0/1:5,5:10:19:19,0,46
300013 GL000192.1 100085 . A G 128.56 VQSRTrancheSNP99.00to99.90 ABHet=0.442;ABHom=1.00;AC=4;AF=0.667;AN=6;Base... GT:AD:DP:GQ:PL ./. 0/1:2,1:3:19:19,0,66 0/1:1,4:5:11:120,0,11 1/1:0,1:1:3:28,3,0
300014 GL000192.1 101228 . T C 493.56 VQSRTrancheSNP99.00to99.90 ABHet=0.378;ABHom=1.00;AC=5;AF=0.625;AN=8;Base... GT:AD:DP:GQ:PL 0/1:6,3:9:93:93,0,186 0/1:3,7:10:75:222,0,75 0/1:1,5:6:23:133,0,23 1/1:0,3:3:9:90,9,0
300015 GL000192.1 101317 . A C 311.87 VQSRTrancheSNP99.90to100.00 ABHet=0.429;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/1:10,7:17:99:146,0,201 0/1:3,7:10:28:166,0,28 0/0:7,0:7:18:0,18,166 0/1:2,3:5:43:43,0,43
300016 GL000192.1 107709 . G A 149.56 VQSRTrancheSNP99.00to99.90 ABHet=0.662;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:24,0:24:45:0,45,511 0/0:12,0:12:27:0,27,328 0/1:6,5:12:99:124,0,131 0/1:14,4:18:67:67,0,313
300017 GL000192.1 111042 . C A 440.55 VQSRTrancheSNP99.00to99.90 ABHet=0.460;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:39,0:39:78:0,78,1000 0/0:24,0:24:45:0,45,579 0/1:9,11:20:45:310,0,45 0/1:8,9:17:99:172,0,133
300018 GL000192.1 121717 . G A 286.55 VQSRTrancheSNP99.90to100.00 ABHet=0.663;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:45,0:45:51:0,51,470 0/0:29,0:29:51:0,51,463 0/1:20,7:27:87:87,0,281 0/1:17,12:29:99:241,0,209
300019 GL000192.1 121977 . G A 1200.79 VQSRTrancheSNP99.00to99.90 ABHet=0.450;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/1:19,19:38:99:492,0,361 0/1:16,15:31:99:394,0,386 0/0:28,0:28:84:0,84,918 0/1:6,12:18:99:358,0,137
300020 GL000192.1 131599 . C G 360.79 VQSRTrancheSNP99.90to100.00 ABHet=0.460;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/1:3,3:6:65:65,0,65 0/1:10,9:19:99:155,0,125 0/0:15,0:15:36:0,36,332 0/1:6,11:17:75:184,0,75
300021 GL000192.1 139953 . C G 165.56 VQSRTrancheSNP99.90to100.00 ABHet=0.451;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:27,0:27:57:0,57,725 0/0:18,0:18:30:0,30,363 0/1:11,4:15:48:48,0,280 0/1:1,5:6:21:159,0,21
300022 GL000192.1 139953 . CTG C 1104.53 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=-3.358;DP=66;F... GT:AD:DP:GQ:PL 1/1:7,18:20:51:677,51,0 0/1:9,6:10:33:204,0,33 0/1:8,7:10:19:280,0,19 0/0:6,0:6:9:0,9,141
300023 GL000192.1 160087 . C T 157.22 PASS ABHet=0.625;ABHom=1.00;AC=6;AF=0.750;AN=8;Base... GT:AD:DP:GQ:PL 0/1:2,2:4:52:60,0,52 0/1:3,1:4:25:25,0,105 1/1:0,2:2:6:78,6,0 1/1:0,1:1:3:37,3,0
300024 GL000192.1 197562 . T C 158.55 VQSRTrancheSNP99.90to100.00 ABHom=1.00;AC=4;AF=1.00;AN=4;DP=7;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. ./. 1/1:0,4:4:12:111,12,0 1/1:0,3:3:9:83,9,0
300025 GL000192.1 212151 . G C 566.79 VQSRTrancheSNP99.90to100.00 ABHet=0.498;AC=4;AF=0.500;AN=8;BaseQRankSum=0.... GT:AD:DP:GQ:PL 0/1:18,23:41:99:291,0,208 0/1:6,12:18:99:253,0,114 0/1:7,4:11:62:62,0,90 0/1:11,1:12:1:1,0,194
300026 GL000192.1 216599 . C A 223.55 VQSRTrancheSNP99.90to100.00 ABHet=0.492;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:46,0:46:99:0,102,941 0/0:19,0:19:33:0,33,304 0/1:5,6:11:56:139,0,56 0/1:9,8:17:99:126,0,130
300027 GL000192.1 228788 . G T 46.77 VQSRTrancheSNP99.90to100.00 ABHom=0.500;AC=2;AF=1.00;AN=2;DP=10;Dels=0.00;... GT:AD:DP:GQ:PL ./. 1/1:2,2:4:6:78,6,0 ./. ./.
300028 GL000192.1 229867 . C T 2232.98 VQSRTrancheSNP99.90to100.00 ABHom=0.857;AC=8;AF=1.00;AN=8;DP=111;Dels=0.00... GT:AD:DP:GQ:PL 1/1:4,25:29:66:609,66,0 1/1:2,25:27:57:528,57,0 1/1:6,23:29:66:609,66,0 1/1:4,22:26:57:526,57,0
300029 GL000192.1 272061 . C CT 125.59 PASS AC=3;AF=0.375;AN=8;BaseQRankSum=-0.058;DP=33;F... GT:AD:DP:GQ:PL 0/0:5,0:4:9:0,9,93 0/1:5,4:7:12:107,0,12 0/1:9,2:3:18:53,0,18 0/1:6,1:4:17:17,0,69
300030 GL000192.1 311575 . C T 248.50 PASS ABHet=0.333;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/0:2,0:2:3:0,3,39 0/1:3,6:9:93:212,0,93 1/1:0,2:2:6:80,6,0 0/0:5,0:5:15:0,15,181
300031 GL000192.1 313293 . G A 174.79 VQSRTrancheSNP99.90to100.00 ABHet=0.640;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/1:42,16:58:55:55,0,587 0/0:34,0:34:42:0,42,447 0/1:26,10:36:57:57,0,367 0/1:19,21:40:99:106,0,367
300032 GL000192.1 313701 . T C 120.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:39,3,0 1/1:0,2:2:6:77,6,0
300033 GL000192.1 337826 . TA T 489.51 PASS AC=2;AF=0.250;AN=8;BaseQRankSum=-3.725;DP=62;F... GT:AD:DP:GQ:PL 0/0:18,0:18:51:0,51,855 0/0:13,0:13:39:0,39,695 0/1:5,10:15:99:438,0,228 0/1:13,3:16:99:102,0,647
300034 GL000192.1 394140 . A G 45.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.7... GT:AD:DP:GQ:PL ./. 0/0:1,0:1:3:0,3,39 ./. 1/1:0,2:2:6:80,6,0
300035 10 123256215 . T G 100.00 PASS GENE=FGFR2;INHERITANCE=AD;MIM=101600 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39 1/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39

300036 rows × 13 columns

In [4]:
df['ALT'].value_counts()
Out[4]:
A                                               74449
T                                               73785
G                                               72981
C                                               72857
TA                                                509
CT                                                493
AT                                                479
GA                                                424
CA                                                395
GT                                                359
AG                                                256
TG                                                233
GC                                                219
TC                                                157
AC                                                152
CG                                                 63
A,T                                                56
CTT                                                50
CAT                                                45
TAA                                                42
C,CT                                               41
A,C                                                41
CAA                                                40
TAC                                                37
A,G                                                37
CAG                                                37
C,T                                                36
C,G                                                36
CCT                                                35
G,T                                                35
                                                ...  
ATTC                                                1
C,CGT                                               1
CAACA                                               1
TAAA,TA                                             1
GTGTT                                               1
GATAA                                               1
CAGAGAG,C                                           1
AGGG                                                1
GGAGGAA                                             1
GCCTT                                               1
G,GTT                                               1
GGGT                                                1
CTCAT                                               1
TACTG                                               1
CGGT                                                1
GGTTT                                               1
CACCT                                               1
GA,G                                                1
GGGGCTGGTACACACAGGTCAGCACGGCCAGGTTCCCACTCCCG        1
GT,G                                                1
G,GAAA                                              1
CATAT                                               1
TTGTG,T                                             1
CTTAT                                               1
GACAC,G                                             1
ACCAT                                               1
CAAA,C                                              1
ACTAT                                               1
GTAAA                                               1
CCTCTCT,C                                           1
Name: ALT, dtype: int64
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300036 entries, 0 to 300035
Data columns (total 13 columns):
#CHROM         300036 non-null object
POS            300036 non-null int64
ID             300036 non-null object
REF            300036 non-null object
ALT            300036 non-null object
QUAL           300036 non-null float64
FILTER         300036 non-null object
INFO           300036 non-null object
FORMAT         300036 non-null object
ISDBM322015    300036 non-null object
ISDBM322016    300036 non-null object
ISDBM322017    300036 non-null object
ISDBM322018    300036 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 29.8+ MB

FILTER가 PASS인 레코드만 취합

In [6]:
dfc = df[df['FILTER'] == 'PASS']
dfc.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 268101 entries, 6 to 300035
Data columns (total 13 columns):
#CHROM         268101 non-null object
POS            268101 non-null int64
ID             268101 non-null object
REF            268101 non-null object
ALT            268101 non-null object
QUAL           268101 non-null float64
FILTER         268101 non-null object
INFO           268101 non-null object
FORMAT         268101 non-null object
ISDBM322015    268101 non-null object
ISDBM322016    268101 non-null object
ISDBM322017    268101 non-null object
ISDBM322018    268101 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 28.6+ MB
In [7]:
dfc = dfc.reset_index()
In [8]:
sample_ids = ['ISDBM322015', 'ISDBM322016', 'ISDBM322017', 'ISDBM322018']
sample_ids_genotype = ['{} genotype'.format(s) for s in sample_ids]

def call_genotype(row):
    alts = row['ALT'].split(',')
    m = {
        '0': row['REF'],
        '1': alts[0],
        '2': alts[1] if len(alts) == 2 else alts[0],
        '3': alts[2] if len(alts) == 3 else alts[0],
        '.': '',
    }
    return pd.Series(['{}-{}'.format(
                m[row[s][0]], m[row[s][2]]) for s in sample_ids if s], 
                index=sample_ids_genotype)

genotypes = dfc.apply(call_genotype, axis=1)
genotypes
Out[8]:
ISDBM322015 genotype ISDBM322016 genotype ISDBM322017 genotype ISDBM322018 genotype
0 - C-C CCTA-C CCTA-CCTA
1 TA-T TA-T TA-T TA-T
2 - A-A A-A A-A
3 - G-G G-G G-G
4 C-C - - -
5 T-T - T-T -
6 A-A A-A A-A A-A
7 G-G G-G G-G G-G
8 G-G G-G G-G G-G
9 - A-A - G-G
10 - C-C - -
11 G-G - - -
12 G-G G-G - -
13 G-G - - G-G
14 T-T C-C - -
15 A-A A-A A-A A-A
16 CCCCT-CCCCT CCCCT-CCCCT C-C CCCCT-CCCCT
17 - - G-G G-G
18 - - T-T T-T
19 - - C-C C-C
20 C-C C-C - C-C
21 - T-T G-G G-T
22 T-T T-T - -
23 G-G G-G - -
24 C-C C-G C-G C-C
25 C-T C-C C-C C-T
26 G-C G-G G-G G-C
27 A-A A-A A-A A-A
28 C-C C-C C-C C-C
29 G-G G-G G-G G-G
... ... ... ... ...
268071 G-G - G-G G-G
268072 GT-G G-G - G-G
268073 G-G - - A-G
268074 G-G - - A-G
268075 C-C - - C-C
268076 T-T - T-T C-T
268077 - - T-T A-T
268078 - - G-G A-G
268079 G-G - - -
268080 C-C C-C C-C -
268081 G-G G-G G-G G-G
268082 G-G - - -
268083 - C-C C-C -
268084 T-T T-T T-T T-T
268085 G-G G-G - -
268086 TC-TC TC-TC TC-TC T-TC
268087 C-C - - -
268088 G-G - - -
268089 - T-T - -
268090 - G-G - -
268091 - - - T-T
268092 T-T T-T - -
268093 C-C CTG-C CTG-C CTG-CTG
268094 C-T C-T T-T T-T
268095 C-C C-CT C-CT C-CT
268096 C-C C-T T-T C-C
268097 - C-C C-C C-C
268098 TA-TA TA-TA TA-T TA-T
268099 - A-A - G-G
268100 T-T T-T G-T T-T

268101 rows × 4 columns

In [9]:
dfc[sample_ids_genotype] = genotypes
dfc
Out[9]:
index #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ISDBM322015 ISDBM322016 ISDBM322017 ISDBM322018 ISDBM322015 genotype ISDBM322016 genotype ISDBM322017 genotype ISDBM322018 genotype
0 6 1 63735 . CCTA C 193.23 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-0.960;DP=42;F... GT:AD:DP:GQ:PL ./. 1/1:6,2:4:6:116,6,0 0/1:8,4:8:99:128,0,301 0/0:18,2:15:30:0,30,666 - C-C CCTA-C CCTA-CCTA
1 12 1 745370 rs146246821 TA T 510.54 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=1.905;DB;DP=64... GT:AD:DP:GQ:PL 0/1:4,1:5:32:32,0,185 0/1:17,4:18:99:132,0,623 0/1:9,7:16:99:273,0,395 0/1:16,4:19:99:125,0,661 TA-T TA-T TA-T TA-T
2 13 1 752566 rs3094315 G A 118.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:79,6,0 1/1:0,1:1:3:39,3,0 1/1:0,1:1:3:36,3,0 - A-A A-A A-A
3 14 1 752721 rs3131972 A G 242.70 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=7;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:80,6,0 1/1:0,4:4:12:159,12,0 1/1:0,1:1:3:40,3,0 - G-G G-G G-G
4 17 1 758324 rs3131955 T C 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:77,6,0 ./. ./. ./. C-C - - -
5 18 1 780027 rs2977613 G T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 ./. 1/1:0,1:1:3:40,3,0 ./. T-T - T-T -
6 19 1 808631 rs11240779 G A 1853.99 PASS ABHom=1.000;AC=8;AF=1.00;AN=8;DB;DP=52;Dels=0.... GT:AD:DP:GQ:PL 1/1:0,7:7:21:223,21,0 1/1:0,16:16:48:602,48,0 1/1:0,13:13:36:457,36,0 1/1:0,16:16:48:611,48,0 A-A A-A A-A A-A
7 25 1 812267 rs7541694 A G 385.67 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,3:3:9:95,9,0 1/1:0,5:5:15:175,15,0 1/1:0,2:2:6:55,6,0 1/1:0,3:3:9:99,9,0 G-G G-G G-G G-G
8 26 1 812284 rs7545373 C G 428.75 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,5:5:15:185,15,0 1/1:0,4:4:12:148,12,0 1/1:0,2:2:6:55,6,0 1/1:0,2:2:6:79,6,0 G-G G-G G-G G-G
9 27 1 823790 rs143626389 G A 38.82 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.3... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:75,6,0 ./. 0/0:3,0:3:9:0,9,99 - A-A - G-G
10 28 1 834832 rs4411087 G C 48.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:80,6,0 ./. ./. - C-C - -
11 29 1 849998 rs13303222 A G 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:77,6,0 ./. ./. ./. G-G - - -
12 30 1 851757 rs62677860 A G 63.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:69,6,0 1/1:0,2:2:3:28,3,0 ./. ./. G-G G-G - -
13 31 1 861808 rs13302982 A G 66.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:64,6,0 ./. ./. 1/1:0,1:1:3:36,3,0 G-G - - G-G
14 32 1 862866 rs3892970 C T 31.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=-0.... GT:AD:DP:GQ:PL 1/1:0,2:2:6:66,6,0 0/0:1,0:1:3:0,3,40 ./. ./. T-T C-C - -
15 33 1 866319 rs9988021 G A 825.39 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=24;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,11:11:33:427,33,0 1/1:0,7:7:18:228,18,0 1/1:0,4:4:12:133,12,0 1/1:0,2:2:6:76,6,0 A-A A-A A-A A-A
16 34 1 866511 rs60722469 C CCCCT 325.15 PASS AC=6;AF=0.750;AN=8;BaseQRankSum=0.747;DB;DP=15... GT:AD:DP:GQ:PL 1/1:5,5:10:15:278,15,0 1/1:3,0:3:3:32,3,0 0/0:1,0:1:3:0,3,65 1/1:0,1:1:3:67,3,0 CCCCT-CCCCT CCCCT-CCCCT C-C CCCCT-CCCCT
17 35 1 866920 rs2341361 A G 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ./. ./. 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - G-G G-G
18 36 1 867584 rs2341360 A T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ./. ./. 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - T-T T-T
19 37 1 869323 rs13303207 T C 69.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ./. ./. 1/1:0,2:2:6:73,6,0 1/1:0,1:1:3:30,3,0 - - C-C C-C
20 40 1 870903 rs13303094 T C 183.66 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,2:2:6:62,6,0 1/1:0,2:2:6:79,6,0 ./. 1/1:0,2:2:6:79,6,0 C-C C-C - C-C
21 41 1 871334 rs4072383 G T 74.01 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.500;AN=6;Base... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:6:63,6,0 0/0:3,0:3:9:0,9,120 0/1:2,2:4:52:52,0,68 - T-T G-G G-T
22 42 1 873558 rs1110052 G T 39.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,1:1:3:39,3,0 1/1:0,1:1:3:33,3,0 ./. ./. T-T T-T - -
23 43 1 876499 rs4372192 A G 200.89 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL 1/1:0,4:4:12:160,12,0 1/1:0,2:2:6:76,6,0 ./. ./. G-G G-G - -
24 44 1 878784 rs142929357 C G 125.24 PASS ABHet=0.583;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/0:16,0:16:45:0,45,553 0/1:4,4:8:99:136,0,113 0/1:2,1:3:30:30,0,57 0/0:2,0:2:6:0,6,66 C-C C-G C-G C-C
25 45 1 879317 rs7523549 C T 388.57 PASS ABHet=0.493;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/1:15,8:23:99:243,0,369 0/0:7,0:7:21:0,21,252 0/0:12,0:12:36:0,36,405 0/1:3,6:9:64:187,0,64 C-T C-C C-C C-T
26 46 1 879482 rs149880798 G C 799.55 PASS ABHet=0.592;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL 0/1:27,20:47:99:584,0,841 0/0:33,0:33:93:0,93,1155 0/0:19,0:19:51:0,51,650 0/1:14,9:23:99:257,0,389 G-C G-G G-G G-C
27 47 1 879676 rs6605067 G A 432.72 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=12;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,2:2:6:73,6,0 1/1:0,3:3:9:120,9,0 1/1:0,3:3:9:120,9,0 1/1:0,4:4:12:158,12,0 A-A A-A A-A A-A
28 48 1 879687 rs2839 T C 335.22 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=10;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 1/1:0,3:3:9:114,9,0 1/1:0,2:2:6:80,6,0 1/1:0,4:4:12:139,12,0 C-C C-C C-C C-C
29 49 1 880238 rs3748592 A G 516.84 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=14;Dels=0.0... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 1/1:0,5:5:15:199,15,0 1/1:0,3:3:9:120,9,0 1/1:0,5:5:15:196,15,0 G-G G-G G-G G-G
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
268071 299643 GL000225.1 64237 . A G 152.14 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,2:2:6:70,6,0 ./. 1/1:0,1:1:3:40,3,0 1/1:0,2:2:6:78,6,0 G-G - G-G G-G
268072 299706 GL000225.1 67508 . GT G 229.23 PASS AC=5;AF=0.833;AN=6;BaseQRankSum=-0.742;DP=13;F... GT:AD:DP:GQ:PL 0/1:1,6:4:40:136,0,40 1/1:1,1:2:3:47,3,0 ./. 1/1:0,4:2:6:94,6,0 GT-G G-G - G-G
268073 299713 GL000225.1 71934 . A G 62.01 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL 1/1:0,2:2:6:70,6,0 ./. ./. 0/1:1,1:2:28:28,0,33 G-G - - A-G
268074 299714 GL000225.1 71966 . A G 138.21 PASS ABHet=0.333;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL 1/1:0,3:3:9:113,9,0 ./. ./. 0/1:1,2:3:30:62,0,30 G-G - - A-G
268075 299715 GL000225.1 72003 . T C 119.76 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,4:4:9:114,9,0 ./. ./. 1/1:0,1:1:3:40,3,0 C-C - - C-C
268076 299716 GL000225.1 72057 . C T 75.26 PASS ABHet=0.500;ABHom=1.00;AC=5;AF=0.833;AN=6;Base... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 ./. 1/1:0,1:1:3:40,3,0 0/1:1,1:2:26:33,0,26 T-T - T-T C-T
268077 299717 GL000225.1 72073 . A T 37.25 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ./. ./. 1/1:0,1:1:3:39,3,0 0/1:1,1:2:30:33,0,30 - - T-T A-T
268078 299718 GL000225.1 72104 . A G 74.02 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ./. ./. 1/1:0,2:2:6:77,6,0 0/1:1,1:2:24:33,0,24 - - G-G A-G
268079 299723 GL000225.1 72249 . A G 37.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,2:2:6:69,6,0 ./. ./. ./. G-G - - -
268080 299734 GL000225.1 72555 . T C 123.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 1/1:0,2:2:6:79,6,0 1/1:0,1:1:3:40,3,0 ./. C-C C-C C-C -
268081 299735 GL000225.1 72582 . A G 202.89 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DP=8;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,4:4:9:111,9,0 1/1:0,2:2:6:61,6,0 1/1:0,1:1:3:35,3,0 1/1:0,1:1:3:33,3,0 G-G G-G G-G G-G
268082 299778 GL000225.1 78642 . A G 32.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,2:2:6:64,6,0 ./. ./. ./. G-G - - -
268083 299860 GL000225.1 89058 . T C 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,2:2:3:40,3,0 1/1:0,1:1:3:40,3,0 ./. - C-C C-C -
268084 299876 GL000225.1 89930 . A T 139.54 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,1:1:3:39,3,0 1/1:0,2:2:6:72,6,0 1/1:0,1:1:3:30,3,0 1/1:0,1:1:3:35,3,0 T-T T-T T-T T-T
268085 299909 GL000225.1 96661 . A G 40.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:33,3,0 ./. ./. G-G G-G - -
268086 299927 GL000225.1 114739 . T TC 679.82 PASS AC=7;AF=0.875;AN=8;BaseQRankSum=3.317;DP=23;FS... GT:AD:DP:GQ:PL 1/1:0,5:5:15:209,15,0 1/1:0,6:6:18:260,18,0 1/1:0,4:4:12:158,12,0 0/1:5,3:8:99:106,0,182 TC-TC TC-TC TC-TC T-TC
268087 299992 GL000225.1 143836 . T C 117.52 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,4:4:12:150,12,0 ./. ./. ./. C-C - - -
268088 299993 GL000225.1 143849 . A G 117.52 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,4:4:12:150,12,0 ./. ./. ./. G-G - - -
268089 300006 GL000225.1 198637 . C T 80.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,3:3:9:113,9,0 ./. ./. - T-T - -
268090 300007 GL000225.1 198643 . A G 86.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,3:3:9:119,9,0 ./. ./. - G-G - -
268091 300008 GL000225.1 202533 . C T 46.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. ./. ./. 1/1:0,2:2:6:78,6,0 - - - T-T
268092 300009 GL000225.1 203673 . C T 78.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL 1/1:0,2:2:6:80,6,0 1/1:0,1:1:3:32,3,0 ./. ./. T-T T-T - -
268093 300022 GL000192.1 139953 . CTG C 1104.53 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=-3.358;DP=66;F... GT:AD:DP:GQ:PL 1/1:7,18:20:51:677,51,0 0/1:9,6:10:33:204,0,33 0/1:8,7:10:19:280,0,19 0/0:6,0:6:9:0,9,141 C-C CTG-C CTG-C CTG-CTG
268094 300023 GL000192.1 160087 . C T 157.22 PASS ABHet=0.625;ABHom=1.00;AC=6;AF=0.750;AN=8;Base... GT:AD:DP:GQ:PL 0/1:2,2:4:52:60,0,52 0/1:3,1:4:25:25,0,105 1/1:0,2:2:6:78,6,0 1/1:0,1:1:3:37,3,0 C-T C-T T-T T-T
268095 300029 GL000192.1 272061 . C CT 125.59 PASS AC=3;AF=0.375;AN=8;BaseQRankSum=-0.058;DP=33;F... GT:AD:DP:GQ:PL 0/0:5,0:4:9:0,9,93 0/1:5,4:7:12:107,0,12 0/1:9,2:3:18:53,0,18 0/1:6,1:4:17:17,0,69 C-C C-CT C-CT C-CT
268096 300030 GL000192.1 311575 . C T 248.50 PASS ABHet=0.333;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL 0/0:2,0:2:3:0,3,39 0/1:3,6:9:93:212,0,93 1/1:0,2:2:6:80,6,0 0/0:5,0:5:15:0,15,181 C-C C-T T-T C-C
268097 300032 GL000192.1 313701 . T C 120.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ./. 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:39,3,0 1/1:0,2:2:6:77,6,0 - C-C C-C C-C
268098 300033 GL000192.1 337826 . TA T 489.51 PASS AC=2;AF=0.250;AN=8;BaseQRankSum=-3.725;DP=62;F... GT:AD:DP:GQ:PL 0/0:18,0:18:51:0,51,855 0/0:13,0:13:39:0,39,695 0/1:5,10:15:99:438,0,228 0/1:13,3:16:99:102,0,647 TA-TA TA-TA TA-T TA-T
268099 300034 GL000192.1 394140 . A G 45.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.7... GT:AD:DP:GQ:PL ./. 0/0:1,0:1:3:0,3,39 ./. 1/1:0,2:2:6:80,6,0 - A-A - G-G
268100 300035 10 123256215 . T G 100.00 PASS GENE=FGFR2;INHERITANCE=AD;MIM=101600 GT:AD:DP:GQ:PL 0/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39 1/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39 T-T T-T G-T T-T

268101 rows × 18 columns

유전자형을 숫자로 추가하기 (AA: 0, AB: 1, BB:2)

In [10]:
sample_ids_number = ['{} num'.format(s) for s in sample_ids]

def get_number(ref, alt):
    if ',' in alt or '.' in ref or '.' in alt:
        return np.nan
    return sum(map(int, [ref, alt]))

def call_genotype_number(row):
    return pd.Series(
            [get_number(row[s][0], row[s][2]) for s in sample_ids if s], 
                index=sample_ids_number)

dfc[sample_ids_number] = dfc.apply(call_genotype_number, axis=1)
dfc
Out[10]:
index #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... ISDBM322017 ISDBM322018 ISDBM322015 genotype ISDBM322016 genotype ISDBM322017 genotype ISDBM322018 genotype ISDBM322015 num ISDBM322016 num ISDBM322017 num ISDBM322018 num
0 6 1 63735 . CCTA C 193.23 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-0.960;DP=42;F... GT:AD:DP:GQ:PL ... 0/1:8,4:8:99:128,0,301 0/0:18,2:15:30:0,30,666 - C-C CCTA-C CCTA-CCTA NaN 2.0 1.0 0.0
1 12 1 745370 rs146246821 TA T 510.54 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=1.905;DB;DP=64... GT:AD:DP:GQ:PL ... 0/1:9,7:16:99:273,0,395 0/1:16,4:19:99:125,0,661 TA-T TA-T TA-T TA-T 1.0 1.0 1.0 1.0
2 13 1 752566 rs3094315 G A 118.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:39,3,0 1/1:0,1:1:3:36,3,0 - A-A A-A A-A NaN 2.0 2.0 2.0
3 14 1 752721 rs3131972 A G 242.70 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=7;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:159,12,0 1/1:0,1:1:3:40,3,0 - G-G G-G G-G NaN 2.0 2.0 2.0
4 17 1 758324 rs3131955 T C 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. C-C - - - 2.0 NaN NaN NaN
5 18 1 780027 rs2977613 G T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 ./. T-T - T-T - 2.0 NaN 2.0 NaN
6 19 1 808631 rs11240779 G A 1853.99 PASS ABHom=1.000;AC=8;AF=1.00;AN=8;DB;DP=52;Dels=0.... GT:AD:DP:GQ:PL ... 1/1:0,13:13:36:457,36,0 1/1:0,16:16:48:611,48,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
7 25 1 812267 rs7541694 A G 385.67 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:55,6,0 1/1:0,3:3:9:99,9,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
8 26 1 812284 rs7545373 C G 428.75 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:55,6,0 1/1:0,2:2:6:79,6,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
9 27 1 823790 rs143626389 G A 38.82 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.3... GT:AD:DP:GQ:PL ... ./. 0/0:3,0:3:9:0,9,99 - A-A - G-G NaN 2.0 NaN 0.0
10 28 1 834832 rs4411087 G C 48.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. - C-C - - NaN 2.0 NaN NaN
11 29 1 849998 rs13303222 A G 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G - - - 2.0 NaN NaN NaN
12 30 1 851757 rs62677860 A G 63.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G G-G - - 2.0 2.0 NaN NaN
13 31 1 861808 rs13302982 A G 66.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,1:1:3:36,3,0 G-G - - G-G 2.0 NaN NaN 2.0
14 32 1 862866 rs3892970 C T 31.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=-0.... GT:AD:DP:GQ:PL ... ./. ./. T-T C-C - - 2.0 0.0 NaN NaN
15 33 1 866319 rs9988021 G A 825.39 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=24;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:133,12,0 1/1:0,2:2:6:76,6,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
16 34 1 866511 rs60722469 C CCCCT 325.15 PASS AC=6;AF=0.750;AN=8;BaseQRankSum=0.747;DB;DP=15... GT:AD:DP:GQ:PL ... 0/0:1,0:1:3:0,3,65 1/1:0,1:1:3:67,3,0 CCCCT-CCCCT CCCCT-CCCCT C-C CCCCT-CCCCT 2.0 2.0 0.0 2.0
17 35 1 866920 rs2341361 A G 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - G-G G-G NaN NaN 2.0 2.0
18 36 1 867584 rs2341360 A T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - T-T T-T NaN NaN 2.0 2.0
19 37 1 869323 rs13303207 T C 69.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:73,6,0 1/1:0,1:1:3:30,3,0 - - C-C C-C NaN NaN 2.0 2.0
20 40 1 870903 rs13303094 T C 183.66 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:79,6,0 C-C C-C - C-C 2.0 2.0 NaN 2.0
21 41 1 871334 rs4072383 G T 74.01 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.500;AN=6;Base... GT:AD:DP:GQ:PL ... 0/0:3,0:3:9:0,9,120 0/1:2,2:4:52:52,0,68 - T-T G-G G-T NaN 2.0 0.0 1.0
22 42 1 873558 rs1110052 G T 39.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. T-T T-T - - 2.0 2.0 NaN NaN
23 43 1 876499 rs4372192 A G 200.89 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G G-G - - 2.0 2.0 NaN NaN
24 44 1 878784 rs142929357 C G 125.24 PASS ABHet=0.583;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/1:2,1:3:30:30,0,57 0/0:2,0:2:6:0,6,66 C-C C-G C-G C-C 0.0 1.0 1.0 0.0
25 45 1 879317 rs7523549 C T 388.57 PASS ABHet=0.493;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/0:12,0:12:36:0,36,405 0/1:3,6:9:64:187,0,64 C-T C-C C-C C-T 1.0 0.0 0.0 1.0
26 46 1 879482 rs149880798 G C 799.55 PASS ABHet=0.592;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/0:19,0:19:51:0,51,650 0/1:14,9:23:99:257,0,389 G-C G-G G-G G-C 1.0 0.0 0.0 1.0
27 47 1 879676 rs6605067 G A 432.72 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=12;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,3:3:9:120,9,0 1/1:0,4:4:12:158,12,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
28 48 1 879687 rs2839 T C 335.22 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=10;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:80,6,0 1/1:0,4:4:12:139,12,0 C-C C-C C-C C-C 2.0 2.0 2.0 2.0
29 49 1 880238 rs3748592 A G 516.84 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=14;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,3:3:9:120,9,0 1/1:0,5:5:15:196,15,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
268071 299643 GL000225.1 64237 . A G 152.14 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,2:2:6:78,6,0 G-G - G-G G-G 2.0 NaN 2.0 2.0
268072 299706 GL000225.1 67508 . GT G 229.23 PASS AC=5;AF=0.833;AN=6;BaseQRankSum=-0.742;DP=13;F... GT:AD:DP:GQ:PL ... ./. 1/1:0,4:2:6:94,6,0 GT-G G-G - G-G 1.0 2.0 NaN 2.0
268073 299713 GL000225.1 71934 . A G 62.01 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ... ./. 0/1:1,1:2:28:28,0,33 G-G - - A-G 2.0 NaN NaN 1.0
268074 299714 GL000225.1 71966 . A G 138.21 PASS ABHet=0.333;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ... ./. 0/1:1,2:3:30:62,0,30 G-G - - A-G 2.0 NaN NaN 1.0
268075 299715 GL000225.1 72003 . T C 119.76 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. 1/1:0,1:1:3:40,3,0 C-C - - C-C 2.0 NaN NaN 2.0
268076 299716 GL000225.1 72057 . C T 75.26 PASS ABHet=0.500;ABHom=1.00;AC=5;AF=0.833;AN=6;Base... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 0/1:1,1:2:26:33,0,26 T-T - T-T C-T 2.0 NaN 2.0 1.0
268077 299717 GL000225.1 72073 . A T 37.25 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:39,3,0 0/1:1,1:2:30:33,0,30 - - T-T A-T NaN NaN 2.0 1.0
268078 299718 GL000225.1 72104 . A G 74.02 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.750;AN=4;Base... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:77,6,0 0/1:1,1:2:24:33,0,24 - - G-G A-G NaN NaN 2.0 1.0
268079 299723 GL000225.1 72249 . A G 37.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. G-G - - - 2.0 NaN NaN NaN
268080 299734 GL000225.1 72555 . T C 123.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 ./. C-C C-C C-C - 2.0 2.0 2.0 NaN
268081 299735 GL000225.1 72582 . A G 202.89 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DP=8;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:35,3,0 1/1:0,1:1:3:33,3,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
268082 299778 GL000225.1 78642 . A G 32.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. G-G - - - 2.0 NaN NaN NaN
268083 299860 GL000225.1 89058 . T C 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 ./. - C-C C-C - NaN 2.0 2.0 NaN
268084 299876 GL000225.1 89930 . A T 139.54 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DP=5;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:30,3,0 1/1:0,1:1:3:35,3,0 T-T T-T T-T T-T 2.0 2.0 2.0 2.0
268085 299909 GL000225.1 96661 . A G 40.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. G-G G-G - - 2.0 2.0 NaN NaN
268086 299927 GL000225.1 114739 . T TC 679.82 PASS AC=7;AF=0.875;AN=8;BaseQRankSum=3.317;DP=23;FS... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:158,12,0 0/1:5,3:8:99:106,0,182 TC-TC TC-TC TC-TC T-TC 2.0 2.0 2.0 1.0
268087 299992 GL000225.1 143836 . T C 117.52 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. C-C - - - 2.0 NaN NaN NaN
268088 299993 GL000225.1 143849 . A G 117.52 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. G-G - - - 2.0 NaN NaN NaN
268089 300006 GL000225.1 198637 . C T 80.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. - T-T - - NaN 2.0 NaN NaN
268090 300007 GL000225.1 198643 . A G 86.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. - G-G - - NaN 2.0 NaN NaN
268091 300008 GL000225.1 202533 . C T 46.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DP=2;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:78,6,0 - - - T-T NaN NaN NaN 2.0
268092 300009 GL000225.1 203673 . C T 78.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DP=3;Dels=0.00;FS... GT:AD:DP:GQ:PL ... ./. ./. T-T T-T - - 2.0 2.0 NaN NaN
268093 300022 GL000192.1 139953 . CTG C 1104.53 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=-3.358;DP=66;F... GT:AD:DP:GQ:PL ... 0/1:8,7:10:19:280,0,19 0/0:6,0:6:9:0,9,141 C-C CTG-C CTG-C CTG-CTG 2.0 1.0 1.0 0.0
268094 300023 GL000192.1 160087 . C T 157.22 PASS ABHet=0.625;ABHom=1.00;AC=6;AF=0.750;AN=8;Base... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:78,6,0 1/1:0,1:1:3:37,3,0 C-T C-T T-T T-T 1.0 1.0 2.0 2.0
268095 300029 GL000192.1 272061 . C CT 125.59 PASS AC=3;AF=0.375;AN=8;BaseQRankSum=-0.058;DP=33;F... GT:AD:DP:GQ:PL ... 0/1:9,2:3:18:53,0,18 0/1:6,1:4:17:17,0,69 C-C C-CT C-CT C-CT 0.0 1.0 1.0 1.0
268096 300030 GL000192.1 311575 . C T 248.50 PASS ABHet=0.333;ABHom=1.00;AC=3;AF=0.375;AN=8;Base... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:80,6,0 0/0:5,0:5:15:0,15,181 C-C C-T T-T C-C 0.0 1.0 2.0 0.0
268097 300032 GL000192.1 313701 . T C 120.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DP=4;Dels=0.00;FS... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:39,3,0 1/1:0,2:2:6:77,6,0 - C-C C-C C-C NaN 2.0 2.0 2.0
268098 300033 GL000192.1 337826 . TA T 489.51 PASS AC=2;AF=0.250;AN=8;BaseQRankSum=-3.725;DP=62;F... GT:AD:DP:GQ:PL ... 0/1:5,10:15:99:438,0,228 0/1:13,3:16:99:102,0,647 TA-TA TA-TA TA-T TA-T 0.0 0.0 1.0 1.0
268099 300034 GL000192.1 394140 . A G 45.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.7... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:80,6,0 - A-A - G-G NaN 0.0 NaN 2.0
268100 300035 10 123256215 . T G 100.00 PASS GENE=FGFR2;INHERITANCE=AD;MIM=101600 GT:AD:DP:GQ:PL ... 1/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39 T-T T-T G-T T-T 0.0 0.0 1.0 0.0

268101 rows × 22 columns

In [11]:
dfc[[not x.startswith('G') for x in dfc['#CHROM']]]
Out[11]:
index #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... ISDBM322017 ISDBM322018 ISDBM322015 genotype ISDBM322016 genotype ISDBM322017 genotype ISDBM322018 genotype ISDBM322015 num ISDBM322016 num ISDBM322017 num ISDBM322018 num
0 6 1 63735 . CCTA C 193.23 PASS AC=3;AF=0.500;AN=6;BaseQRankSum=-0.960;DP=42;F... GT:AD:DP:GQ:PL ... 0/1:8,4:8:99:128,0,301 0/0:18,2:15:30:0,30,666 - C-C CCTA-C CCTA-CCTA NaN 2.0 1.0 0.0
1 12 1 745370 rs146246821 TA T 510.54 PASS AC=4;AF=0.500;AN=8;BaseQRankSum=1.905;DB;DP=64... GT:AD:DP:GQ:PL ... 0/1:9,7:16:99:273,0,395 0/1:16,4:19:99:125,0,661 TA-T TA-T TA-T TA-T 1.0 1.0 1.0 1.0
2 13 1 752566 rs3094315 G A 118.67 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:39,3,0 1/1:0,1:1:3:36,3,0 - A-A A-A A-A NaN 2.0 2.0 2.0
3 14 1 752721 rs3131972 A G 242.70 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=7;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:159,12,0 1/1:0,1:1:3:40,3,0 - G-G G-G G-G NaN 2.0 2.0 2.0
4 17 1 758324 rs3131955 T C 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. C-C - - - 2.0 NaN NaN NaN
5 18 1 780027 rs2977613 G T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 ./. T-T - T-T - 2.0 NaN 2.0 NaN
6 19 1 808631 rs11240779 G A 1853.99 PASS ABHom=1.000;AC=8;AF=1.00;AN=8;DB;DP=52;Dels=0.... GT:AD:DP:GQ:PL ... 1/1:0,13:13:36:457,36,0 1/1:0,16:16:48:611,48,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
7 25 1 812267 rs7541694 A G 385.67 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:55,6,0 1/1:0,3:3:9:99,9,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
8 26 1 812284 rs7545373 C G 428.75 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:55,6,0 1/1:0,2:2:6:79,6,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
9 27 1 823790 rs143626389 G A 38.82 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=0.3... GT:AD:DP:GQ:PL ... ./. 0/0:3,0:3:9:0,9,99 - A-A - G-G NaN 2.0 NaN 0.0
10 28 1 834832 rs4411087 G C 48.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. - C-C - - NaN 2.0 NaN NaN
11 29 1 849998 rs13303222 A G 45.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G - - - 2.0 NaN NaN NaN
12 30 1 851757 rs62677860 A G 63.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=4;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G G-G - - 2.0 2.0 NaN NaN
13 31 1 861808 rs13302982 A G 66.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,1:1:3:36,3,0 G-G - - G-G 2.0 NaN NaN 2.0
14 32 1 862866 rs3892970 C T 31.26 PASS ABHom=1.00;AC=2;AF=0.500;AN=4;BaseQRankSum=-0.... GT:AD:DP:GQ:PL ... ./. ./. T-T C-C - - 2.0 0.0 NaN NaN
15 33 1 866319 rs9988021 G A 825.39 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=24;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:133,12,0 1/1:0,2:2:6:76,6,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
16 34 1 866511 rs60722469 C CCCCT 325.15 PASS AC=6;AF=0.750;AN=8;BaseQRankSum=0.747;DB;DP=15... GT:AD:DP:GQ:PL ... 0/0:1,0:1:3:0,3,65 1/1:0,1:1:3:67,3,0 CCCCT-CCCCT CCCCT-CCCCT C-C CCCCT-CCCCT 2.0 2.0 0.0 2.0
17 35 1 866920 rs2341361 A G 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - G-G G-G NaN NaN 2.0 2.0
18 36 1 867584 rs2341360 A T 47.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,1:1:3:40,3,0 - - T-T T-T NaN NaN 2.0 2.0
19 37 1 869323 rs13303207 T C 69.22 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:73,6,0 1/1:0,1:1:3:30,3,0 - - C-C C-C NaN NaN 2.0 2.0
20 40 1 870903 rs13303094 T C 183.66 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:79,6,0 C-C C-C - C-C 2.0 2.0 NaN 2.0
21 41 1 871334 rs4072383 G T 74.01 PASS ABHet=0.500;ABHom=1.00;AC=3;AF=0.500;AN=6;Base... GT:AD:DP:GQ:PL ... 0/0:3,0:3:9:0,9,120 0/1:2,2:4:52:52,0,68 - T-T G-G G-T NaN 2.0 0.0 1.0
22 42 1 873558 rs1110052 G T 39.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. T-T T-T - - 2.0 2.0 NaN NaN
23 43 1 876499 rs4372192 A G 200.89 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=6;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. G-G G-G - - 2.0 2.0 NaN NaN
24 44 1 878784 rs142929357 C G 125.24 PASS ABHet=0.583;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/1:2,1:3:30:30,0,57 0/0:2,0:2:6:0,6,66 C-C C-G C-G C-C 0.0 1.0 1.0 0.0
25 45 1 879317 rs7523549 C T 388.57 PASS ABHet=0.493;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/0:12,0:12:36:0,36,405 0/1:3,6:9:64:187,0,64 C-T C-C C-C C-T 1.0 0.0 0.0 1.0
26 46 1 879482 rs149880798 G C 799.55 PASS ABHet=0.592;ABHom=1.00;AC=2;AF=0.250;AN=8;Base... GT:AD:DP:GQ:PL ... 0/0:19,0:19:51:0,51,650 0/1:14,9:23:99:257,0,389 G-C G-G G-G G-C 1.0 0.0 0.0 1.0
27 47 1 879676 rs6605067 G A 432.72 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=12;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,3:3:9:120,9,0 1/1:0,4:4:12:158,12,0 A-A A-A A-A A-A 2.0 2.0 2.0 2.0
28 48 1 879687 rs2839 T C 335.22 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=10;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:80,6,0 1/1:0,4:4:12:139,12,0 C-C C-C C-C C-C 2.0 2.0 2.0 2.0
29 49 1 880238 rs3748592 A G 516.84 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=14;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,3:3:9:120,9,0 1/1:0,5:5:15:196,15,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
267146 296313 Y 59026513 rs1826575 A G 42.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:74,6,0 - - - G-G NaN NaN NaN 2.0
267147 296320 Y 59033110 rs28628009 A T 36.01 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,1:1:3:39,3,0 - T-T - T-T NaN 2.0 NaN 2.0
267148 296321 Y 59033139 rs55686319 T C 37.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:69,6,0 - - - C-C NaN NaN NaN 2.0
267149 296322 MT 73 rs3087742 A G 327.99 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=10;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,3:3:9:94,9,0 1/1:0,2:2:6:79,6,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
267150 296323 MT 195 rs2857291 T C 47.77 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=2;Dels=0.00... GT:AD:DP:GQ:PL ... ./. ./. - C-C - - NaN 2.0 NaN NaN
267151 296324 MT 263 rs2853515 A G 229.77 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=8;Dels=0.00... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:35,3,0 1/1:0,1:1:3:32,3,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
267152 296325 MT 709 rs2853517 G A 346.40 PASS ABHom=1.00;AC=4;AF=0.667;AN=6;BaseQRankSum=1.4... GT:AD:DP:GQ:PL ... ./. 1/1:0,5:5:15:193,15,0 A-A G-G - A-A 2.0 0.0 NaN 2.0
267153 296326 MT 750 rs2853518 A G 476.08 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=15;Dels=0.0... GT:AD:DP:GQ:PL ... ./. 1/1:0,7:7:21:273,21,0 G-G G-G - G-G 2.0 2.0 NaN 2.0
267154 296327 MT 1243 rs28358572 T C 422.79 PASS ABHom=1.00;AC=6;AF=0.750;AN=8;BaseQRankSum=0.6... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,5:5:9:119,9,0 C-C T-T C-C C-C 2.0 0.0 2.0 2.0
267155 296329 MT 1719 rs3928305 G A 35.00 PASS ABHom=1.00;AC=2;AF=0.250;AN=8;BaseQRankSum=-0.... GT:AD:DP:GQ:PL ... 0/0:1,0:1:3:0,3,32 0/0:4,0:4:12:0,12,159 G-G A-A G-G G-G 0.0 2.0 0.0 0.0
267156 296330 MT 2706 rs2854128 A G 300.19 PASS ABHom=0.899;AC=6;AF=1.00;AN=6;DB;DP=10;Dels=0.... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:3:6:73,6,0 G-G G-G - G-G 2.0 2.0 NaN 2.0
267157 296331 MT 3505 rs28358585 A G 219.20 PASS ABHom=1.00;AC=6;AF=0.750;AN=8;BaseQRankSum=1.6... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:40,3,0 1/1:0,2:2:6:67,6,0 G-G A-A G-G G-G 2.0 0.0 2.0 2.0
267158 296333 MT 5460 rs3021088 G A 515.25 PASS ABHom=1.00;AC=6;AF=0.750;AN=8;BaseQRankSum=0.1... GT:AD:DP:GQ:PL ... 1/1:0,4:4:12:144,12,0 1/1:0,12:12:33:380,33,0 A-A G-G A-A A-A 2.0 0.0 2.0 2.0
267159 296335 MT 6371 rs41366755 C T 116.32 PASS ABHom=1.00;AC=2;AF=0.250;AN=8;BaseQRankSum=0.5... GT:AD:DP:GQ:PL ... 0/0:1,0:1:3:0,3,40 0/0:8,0:8:24:0,24,297 C-C T-T C-C C-C 0.0 2.0 0.0 0.0
267160 296336 MT 8697 rs28358886 G A 280.28 PASS ABHom=1.00;AC=4;AF=1.00;AN=4;DB;DP=8;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,7:7:21:275,21,0 A-A - - A-A 2.0 NaN NaN 2.0
267161 296340 MT 11947 rs28359168 A G 529.29 PASS ABHom=1.00;AC=4;AF=0.667;AN=6;BaseQRankSum=-1.... GT:AD:DP:GQ:PL ... ./. 1/1:0,7:7:21:243,21,0 G-G A-A - G-G 2.0 0.0 NaN 2.0
267162 296341 MT 12414 . T C 547.46 PASS ABHom=1.00;AC=4;AF=0.667;AN=6;BaseQRankSum=-0.... GT:AD:DP:GQ:PL ... ./. 1/1:0,10:10:30:392,30,0 C-C T-T - C-C 2.0 0.0 NaN 2.0
267163 296342 MT 12705 rs2854122 C T 272.26 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=9;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,2:2:6:80,6,0 T-T T-T - T-T 2.0 2.0 NaN 2.0
267164 296343 MT 13819 . T C 77.31 PASS ABHom=1.00;AC=2;AF=0.250;AN=8;BaseQRankSum=1.4... GT:AD:DP:GQ:PL ... 0/0:4,0:4:12:0,12,159 0/0:7,0:7:21:0,21,268 T-T C-C T-T T-T 0.0 2.0 0.0 0.0
267165 296344 MT 13966 rs41535848 A G 188.02 PASS ABHom=1.00;AC=2;AF=0.250;AN=8;BaseQRankSum=3.3... GT:AD:DP:GQ:PL ... 0/0:2,0:2:6:0,6,75 0/0:10,0:10:27:0,27,316 A-A G-G A-A A-A 0.0 2.0 0.0 0.0
267166 296345 MT 14371 . T C 634.48 PASS ABHom=1.00;AC=6;AF=0.750;AN=8;BaseQRankSum=2.3... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:29,3,0 1/1:0,9:9:27:336,27,0 C-C T-T C-C C-C 2.0 0.0 2.0 2.0
267167 296346 MT 14470 rs3135030 T C 288.43 PASS ABHom=1.00;AC=2;AF=0.333;AN=6;BaseQRankSum=3.4... GT:AD:DP:GQ:PL ... ./. 0/0:7,0:7:21:0,21,243 T-T C-C - T-T 0.0 2.0 NaN 0.0
267168 296347 MT 14766 rs3135031 C T 526.56 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=15;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:77,6,0 1/1:0,3:3:9:116,9,0 T-T T-T T-T T-T 2.0 2.0 2.0 2.0
267169 296349 MT 15326 rs2853508 A G 557.29 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=16;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:37,3,0 1/1:0,9:9:27:333,27,0 G-G G-G G-G G-G 2.0 2.0 2.0 2.0
267170 296350 MT 15884 rs28617642 G C 83.98 PASS ABHom=1.00;AC=2;AF=1.00;AN=2;DB;DP=3;Dels=0.00... GT:AD:DP:GQ:PL ... ./. 1/1:0,3:3:9:116,9,0 - - - C-C NaN NaN NaN 2.0
267171 296351 MT 16223 rs2853513 C T 396.83 PASS ABHom=1.00;AC=6;AF=1.00;AN=6;DB;DP=11;Dels=0.0... GT:AD:DP:GQ:PL ... ./. 1/1:0,4:4:12:155,12,0 T-T T-T - T-T 2.0 2.0 NaN 2.0
267172 296352 MT 16278 rs41458645 C T 109.32 PASS ABHom=1.00;AC=2;AF=0.250;AN=8;BaseQRankSum=-2.... GT:AD:DP:GQ:PL ... 0/0:1,0:1:3:0,3,40 0/0:5,0:5:15:0,15,180 C-C T-T C-C C-C 0.0 2.0 0.0 0.0
267173 296353 MT 16290 rs34524463 C T 583.33 PASS ABHom=1.00;AC=6;AF=0.750;AN=8;BaseQRankSum=-0.... GT:AD:DP:GQ:PL ... 1/1:0,1:1:3:39,3,0 1/1:0,5:5:15:181,15,0 T-T C-C T-T T-T 2.0 0.0 2.0 2.0
267174 296356 MT 16519 rs3937033 T C 455.12 PASS ABHom=1.00;AC=8;AF=1.00;AN=8;DB;DP=13;Dels=0.0... GT:AD:DP:GQ:PL ... 1/1:0,2:2:6:80,6,0 1/1:0,3:3:9:116,9,0 C-C C-C C-C C-C 2.0 2.0 2.0 2.0
268100 300035 10 123256215 . T G 100.00 PASS GENE=FGFR2;INHERITANCE=AD;MIM=101600 GT:AD:DP:GQ:PL ... 1/0:1,0:1:3:0,3,39 0/0:1,0:1:3:0,3,39 T-T T-T G-T T-T 0.0 0.0 1.0 0.0

267176 rows × 22 columns

QUAL, 변이 갯수를 염색체별로

In [12]:
dfc[[not x.startswith('G') for x in dfc['#CHROM']]].boxplot(
    column='QUAL', by='#CHROM', figsize=(15,5))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d838198>
In [13]:
dfc[[not x.startswith('G') for x in dfc['#CHROM']]]['#CHROM'].value_counts().plot(kind='bar')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x10891ebe0>
In [34]:
dfc = dfc[[not x.startswith('G') for x in dfc['#CHROM']]]
sample_ids_snps = ['{} snp'.format(s) for s in sample_ids]

def _call_snp(row, sample_id):
    if ',' in row['ALT']:
        return np.nan
    m = {
        '0': row['REF'],
        '1': row['ALT'],
        '.': ''
    }
    genotype = '{} {}'.format(m[row[sample_id][0]], m[row[sample_id][2]])
    if len(genotype) != 3:
        return np.nan
    return genotype

def call_snp(row):
    return pd.Series([_call_snp(row, s) for s in sample_ids if s], 
                index=sample_ids_genotype)

snps = dfc.apply(call_snp, axis=1)
snps
Out[34]:
ISDBM322015 genotype ISDBM322016 genotype ISDBM322017 genotype ISDBM322018 genotype
0 NaN C C NaN NaN
1 NaN NaN NaN NaN
2 NaN A A A A A A
3 NaN G G G G G G
4 C C NaN NaN NaN
5 T T NaN T T NaN
6 A A A A A A A A
7 G G G G G G G G
8 G G G G G G G G
9 NaN A A NaN G G
10 NaN C C NaN NaN
11 G G NaN NaN NaN
12 G G G G NaN NaN
13 G G NaN NaN G G
14 T T C C NaN NaN
15 A A A A A A A A
16 NaN NaN C C NaN
17 NaN NaN G G G G
18 NaN NaN T T T T
19 NaN NaN C C C C
20 C C C C NaN C C
21 NaN T T G G G T
22 T T T T NaN NaN
23 G G G G NaN NaN
24 C C C G C G C C
25 C T C C C C C T
26 G C G G G G G C
27 A A A A A A A A
28 C C C C C C C C
29 G G G G G G G G
... ... ... ... ...
267146 NaN NaN NaN G G
267147 NaN T T NaN T T
267148 NaN NaN NaN C C
267149 G G G G G G G G
267150 NaN C C NaN NaN
267151 G G G G G G G G
267152 A A G G NaN A A
267153 G G G G NaN G G
267154 C C T T C C C C
267155 G G A A G G G G
267156 G G G G NaN G G
267157 G G A A G G G G
267158 A A G G A A A A
267159 C C T T C C C C
267160 A A NaN NaN A A
267161 G G A A NaN G G
267162 C C T T NaN C C
267163 T T T T NaN T T
267164 T T C C T T T T
267165 A A G G A A A A
267166 C C T T C C C C
267167 T T C C NaN T T
267168 T T T T T T T T
267169 G G G G G G G G
267170 NaN NaN NaN C C
267171 T T T T NaN T T
267172 C C T T C C C C
267173 T T C C T T T T
267174 C C C C C C C C
268100 T T T T G T T T

267176 rows × 4 columns

In [17]:
def make_plink_map(df):
    with open('pfeiffer.map', 'w') as mapfile:
        for i, r in df.iterrows():
            id_ = r['ID']
            if id_ == '.':
                id_ = '{}-{}'.format(r['#CHROM'], r['POS'])
            mapfile.write(
                '{} {} 0 {}\n'.format(r['#CHROM'], r['POS'], id_)
            )

make_plink_map(dfc)
In [18]:
!head -n 100 pfeiffer.map
1 63735 0 1-63735
1 745370 0 rs146246821
1 752566 0 rs3094315
1 752721 0 rs3131972
1 758324 0 rs3131955
1 780027 0 rs2977613
1 808631 0 rs11240779
1 812267 0 rs7541694
1 812284 0 rs7545373
1 823790 0 rs143626389
1 834832 0 rs4411087
1 849998 0 rs13303222
1 851757 0 rs62677860
1 861808 0 rs13302982
1 862866 0 rs3892970
1 866319 0 rs9988021
1 866511 0 rs60722469
1 866920 0 rs2341361
1 867584 0 rs2341360
1 869323 0 rs13303207
1 870903 0 rs13303094
1 871334 0 rs4072383
1 873558 0 rs1110052
1 876499 0 rs4372192
1 878784 0 rs142929357
1 879317 0 rs7523549
1 879482 0 rs149880798
1 879676 0 rs6605067
1 879687 0 rs2839
1 880238 0 rs3748592
1 880390 0 rs3748593
1 881627 0 rs2272757
1 883625 0 rs4970378
1 884091 0 rs7522415
1 887560 0 rs3748595
1 887801 0 rs3828047
1 888639 0 rs3748596
1 888659 0 rs3748597
1 889158 0 rs56262069
1 889159 0 rs13302945
1 889638 0 rs13303206
1 892460 0 rs41285802
1 892745 0 rs13303227
1 894573 0 rs13303010
1 895706 0 rs13303327
1 896333 0 rs144174542
1 897325 0 rs4970441
1 897564 0 rs13303229
1 897730 0 rs7549631
1 898323 0 rs6605071
1 900285 0 rs4970435
1 900286 0 rs4970434
1 900505 0 rs28705211
1 902069 0 rs116147894
1 908823 0 rs28687780
1 909073 0 rs3892467
1 909238 0 rs3829740
1 909309 0 rs3829738
1 909555 0 rs2340594
1 909768 0 rs2340593
1 911916 0 rs74045046
1 914876 0 rs13302983
1 916549 0 rs6660139
1 928520 0 rs35002855
1 936848 0 rs149671836
1 941539 0 rs9778087
1 943468 0 rs3121567
1 943687 0 rs2465140
1 948846 0 rs3841266
1 948870 0 rs4615788
1 948921 0 rs15842
1 949235 0 rs2465124
1 949608 0 rs1921
1 949654 0 rs8997
1 949925 0 rs2799070
1 957898 0 rs2799064
1 962210 0 rs3128126
1 963249 0 rs2710870
1 971224 0 rs2799055
1 974662 0 rs2465135
1 977203 0 rs3121552
1 977330 0 rs2799066
1 977570 0 rs2710876
1 980460 0 rs3128097
1 981087 0 rs3128098
1 981931 0 rs2465128
1 982444 0 rs3128099
1 982462 0 rs3128100
1 982513 0 rs3128101
1 982941 0 rs3128102
1 982994 0 rs10267
1 986443 0 rs2710887
1 987200 0 rs9803031
1 988932 0 rs2710871
1 990806 0 rs2799073
1 1001177 0 rs4970401
1 1001178 0 1-1001178
1 1017587 0 rs3766191
1 1019175 0 rs2298215
1 1021346 0 rs10907177
In [36]:
def make_plink_ped(snps):
    with open('pfeiffer.ped', 'w') as pedfile:
        for i, r in snps.T.iterrows():
            s = ' '.join(list(r))
            pedfile.write(
                'FAM1 {} 0 0 0 {}'.format(i.split()[0], s)
            )

make_plink_ped(snps.replace(np.nan, '0 0'))
In [ ]: