#!/usr/bin/env python3
"""Generate an artificial set of CFTR variants and write them as a VCF.

Pipeline:
  1. Read exon coordinates for CFTR transcripts from ./EnsamblCFTR.tsv.
  2. Pad each exon by EXON_PADDING bp on both sides and collapse the
     padded intervals into a minimal non-overlapping set.
  3. At every position in the collapsed intervals, enumerate all SNPs,
     a sample of 1-3 bp insertions, and 1-3 bp deletions.
  4. Write the variants to a VCF file and an HDF5 store.
"""
import datetime
import itertools
import os
import random

import numpy as np  # noqa: F401  (kept from the original script)
import pandas as pd

BASES = ['A', 'G', 'C', 'T']
TWOBASES = [''.join(combo) for combo in itertools.permutations(BASES, 2)]
THREEBASES = [''.join(combo) for combo in itertools.permutations(BASES, 3)]

# bp of flanking sequence retained around each exon
EXON_PADDING = 100

VCF_COLUMNS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
               'INFO', 'FORMAT', 'SAMPLE']


def load_exon_intervals(tsv_path):
    """Return {transcript name: [(exon start, exon stop), ...]} from the TSV.

    In general converting a DataFrame to pure-python structures is poor
    practice, but this table is very small and a dict is more convenient
    downstream.
    """
    df = pd.read_csv(tsv_path, sep="\t")
    tx_exon_dict = {}
    for _, row in df.iterrows():
        starts = map(int, row["Exon Starts"].split(","))
        stops = map(int, row["Exon Stops"].split(","))
        # Materialize with list(): in python 3 a bare zip object would be
        # exhausted after a single pass.
        tx_exon_dict[row["Transcript Name"]] = list(zip(starts, stops))
    return tx_exon_dict


def expand_intervals(tx_exon_dict, padding=EXON_PADDING):
    """Widen every (start, stop) exon by `padding` bp on each side."""
    return {
        tx_name: [(start - padding, stop + padding) for start, stop in exons]
        for tx_name, exons in tx_exon_dict.items()
    }


def collapse_intervals(intervals):
    """Yield the minimal set of non-overlapping intervals covering `intervals`.

    `intervals` must be sorted by start position; empty input yields
    nothing.  BUGFIX vs. the original: when an interval is fully
    contained in the current one (e.g. (2, 3) inside (1, 10)) we keep
    the larger stop instead of truncating to the contained interval's.
    """
    iterator = iter(intervals)
    try:
        cur_start, cur_stop = next(iterator)
    except StopIteration:
        return
    for next_start, next_stop in iterator:
        if cur_stop < next_start:
            # gap between intervals: emit the current one and start fresh
            yield cur_start, cur_stop
            cur_start, cur_stop = next_start, next_stop
        else:
            cur_stop = max(cur_stop, next_stop)
    yield cur_start, cur_stop


def get_snps(chrom, pos, ref):
    """All three SNPs at `pos` (every base other than the reference base)."""
    ref = ref[0]
    return [{'#CHROM': chrom, 'POS': pos, 'REF': ref, 'ALT': base}
            for base in BASES if base != ref]


def get_insertions(chrom, pos, ref):
    """All four 1 bp insertions plus two random 2 bp and two random 3 bp ones."""
    ref = ref[0]
    inserted = itertools.chain(BASES,
                               random.sample(TWOBASES, 2),
                               random.sample(THREEBASES, 2))
    return [{'#CHROM': chrom, 'POS': pos, 'REF': ref, 'ALT': ref + base}
            for base in inserted]


def get_deletions(chrom, pos, ref):
    """Deletions of 1, 2 and 3 bp anchored at `pos` (VCF left-anchored style)."""
    return [
        {'#CHROM': chrom, 'POS': pos, 'REF': ref[0:2], 'ALT': ref[0]},
        {'#CHROM': chrom, 'POS': pos, 'REF': ref[0:3], 'ALT': ref[0]},
        {'#CHROM': chrom, 'POS': pos, 'REF': ref, 'ALT': ref[0]},
    ]


def get_variants(chrom, pos, chr_seq):
    """All artificial variants (SNPs, insertions, deletions) at `pos`."""
    # -1 because python sequences are 0-indexed; +3 (slicing is exclusive
    # of the upper index) so we have the 4 reference bases needed for the
    # longest, 3 bp, deletion.
    ref = chr_seq[pos - 1:pos + 3].seq.upper()
    return (get_snps(chrom, pos, ref)
            + get_insertions(chrom, pos, ref)
            + get_deletions(chrom, pos, ref))


def main():
    # HTSeq (and the large chr7 FASTA) are only needed when actually
    # running the pipeline, so import lazily here.
    from HTSeq import FastaReader

    tx_exon_dict = load_exon_intervals("./EnsamblCFTR.tsv")
    expanded = expand_intervals(tx_exon_dict)

    interval_list = [iv for exons in expanded.values() for iv in exons]
    print(interval_list)

    # sort by the start position, as collapse_intervals requires
    sorted_by_start = sorted(interval_list, key=lambda iv: iv[0])

    # quick sanity checks of the collapser
    test_intervals = [(1, 4), (2, 5), (5, 7), (10, 13)]  # min set [(1,7), (10,13)]
    test_intervals2 = [(1, 4), (2, 5), (6, 7), (10, 13), (12, 15)]  # min set [(1,5), (6,7), (10,15)]
    print(list(collapse_intervals(test_intervals)) == [(1, 7), (10, 13)])
    print(list(collapse_intervals(test_intervals2)) == [(1, 5), (6, 7), (10, 15)])

    collapsed_interval_list = list(collapse_intervals(sorted_by_start))
    print("Before collapsing we have: " + str(len(interval_list)) + " intervals")
    print("After collapsing we have: " + str(len(collapsed_interval_list)) + " intervals")
    print("\nOur new interval set:\n")
    print(collapsed_interval_list)

    print(BASES)
    print(TWOBASES)
    print(THREEBASES)

    # first record in the chr7 FASTA
    chr7_seq = next(iter(FastaReader("./chr7.fa.gz")))

    # Collect rows in a plain list and build the frame once:
    # DataFrame.append in a loop is O(n^2) and was removed in pandas 2.
    rows = []
    for start, stop in collapsed_interval_list:
        for pos in range(start, stop):  # half-open, matching the original while loop
            rows.extend(get_variants(7, pos, chr7_seq))
    variant_df = pd.DataFrame(rows, columns=VCF_COLUMNS)

    # hard code in our genotype information for the single artificial sample
    variant_df['FORMAT'] = 'GT'
    variant_df['SAMPLE'] = '0/1'
    print(variant_df.head())

    # The original header's ##FORMAT line was truncated; this is the
    # well-formed VCF 4.1 declaration for the GT field.
    vcf_header = (
        "##fileformat=VCFv4.1\n"
        "##fileDate={0}\n"
        "##source=CFTR_Variant_Project\n"
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
    ).format(datetime.datetime.utcnow().strftime("%Y%m%d"))

    output_file = "CFTR_artifical_variant.vcf"
    try:
        os.remove(output_file)
    except OSError:
        pass
    with open(output_file, 'a') as f:
        f.write(vcf_header)
        # to_csv emits the column-name row, which doubles as the VCF
        # #CHROM header line.
        variant_df.to_csv(f, sep='\t', na_rep='.', index=False)

    store_file = "store.h5"
    try:
        os.remove(store_file)
    except OSError:
        pass
    store = pd.HDFStore(store_file)
    try:
        store['variant_df'] = variant_df
    finally:
        store.close()  # the original leaked the open store handle


if __name__ == "__main__":
    main()