ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/
import os
import gzip
import csv
import pandas
import vcf
# Compute vcf file names
vcf_files = [
'ALL.chr{}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz'.format(x)
for x in range(1, 23)]
vcf_files += [
'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz',
'ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz',
]
vcf_files
['ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz', 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz', 'ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz']
binary = '/home/dhimmels/bin/bin/bcftools'
download_dir = 'download'
subset_dir = 'data/common-snps/vcf/'
# Maximum major allele frequency to allow
major_af_max = 0.95
bcftools
¶for vcf_file in vcf_files:
! {binary} view --exclude-types indels,mnps,other \
--apply-filters PASS --max-af {major_af_max}:'major' \
--drop-genotypes --output-type z \
--output-file {subset_dir}/{vcf_file} {download_dir}/{vcf_file}
Warning: The index file is older than the data file: download/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr5.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr6.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr7.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr8.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr9.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr10.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr11.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr12.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr13.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr14.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr15.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr16.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr18.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr19.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz.tbi Warning: The index file is older than the data file: download/ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz.tbi
def get_majar_af(aafs):
"""
Returns the major allele frequency from
the list of alternate allele frequencies.
"""
allele_freqs = aafs + [1 - sum(aafs)]
return max(allele_freqs)
write_file = gzip.open('data/common-snps/common-snps.tsv.gz', 'wt')
writer = csv.writer(write_file, delimiter='\t')
writer.writerow(('chromosome', 'position', 'major_allele_frequency', 'rsid'))
for vcf_file in vcf_files:
path = os.path.join(subset_dir, vcf_file)
for r in vcf.Reader(filename=path):
# Quality control
if r.FILTER:
print(r.FILTER)
continue
# Exclude non-SNPs
if not r.is_snp:
print(r.var_type)
continue
# Major allele frequency check
major_af = get_majar_af(r.INFO['AF'])
if major_af > major_af_max:
print(major_af)
continue
# Write SNP to tsv
row = r.CHROM, r.POS, round(major_af, 6), r.ID
writer.writerow(row)
write_file.close()
bed_df = pandas.read_table('data/common-snps/common-snps.tsv.gz', low_memory=False)
bed_df['chrom'] = 'chr' + bed_df['chromosome']
bed_df['chromStart'] = bed_df['position'] - 1
bed_df['chromEnd'] = bed_df['position']
bed_df = bed_df[['chrom', 'chromStart', 'chromEnd']]
bed_df.head(2)
chrom | chromStart | chromEnd | |
---|---|---|---|
0 | chr1 | 11007 | 11008 |
1 | chr1 | 11011 | 11012 |
with gzip.open('data/common-snps/common-snps.bed.gz', 'wt') as write_file:
bed_df.to_csv(write_file, index=False, sep='\t', header=False)