"""Analysis code for Dr. Granger's project.

Contains helpers for analyzing DNA sequences (cleaning a sequence and
computing its GC content) together with the pipeline that classifies
individuals by ear length and averages GC content per size class.

The DNA helpers were originally written to a separate ``dna_analysis.py``
through the IPython ``%%file`` cell magic; they are defined directly in this
module so the file is importable and runnable as plain Python.
"""

import csv
import math
import sys
from collections import defaultdict
from contextlib import closing

import numpy as np

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen


DEFAULT_DATA_URL = ('http://programmingforbiologists.org/sites/'
                    'programmingforbiologists.org/files/'
                    'houseelf_earlength_dna_data.csv')
OUTPUT_FILENAME = 'grangers_output.csv'


def get_clean_seq(seq):
    """Get a consistently formatted sequence without extra characters.

    All whitespace (spaces, newlines, carriage returns, tabs) is removed and
    the remaining characters are upper-cased.
    """
    # split()/join drops every kind of whitespace in one pass, so sequences
    # with Windows line endings or tabs are cleaned as well.
    return ''.join(seq.split()).upper()


def get_gc_content(seq):
    """Determine the GC content of a sequence as a percentage (0-100).

    Raises:
        ValueError: if the sequence is empty once cleaned, because the GC
            content is undefined in that case.
    """
    seq = get_clean_seq(seq)
    if not seq:
        raise ValueError("Cannot compute GC content of an empty sequence")
    # The float literal forces true division on Python 2 as well.
    return 100.0 * (seq.count('G') + seq.count('C')) / len(seq)


def get_size_class(earlength):
    """Determine the size class of earlength per Dr. Granger's specification.

    Classes: ``'extralarge'`` (>= 15), ``'large'`` (>= 10), ``'medium'``
    (>= 8) and ``'small'`` (anything below 8).

    Raises:
        ValueError: if ``earlength`` is not a number (including NaN).
        TypeError: if ``earlength`` cannot be converted to ``float`` at all.
    """
    # Convert instead of asserting: an assert is stripped under ``-O`` and
    # ``assert float(x)`` wrongly rejects a perfectly valid length of 0.
    earlength = float(earlength)
    if math.isnan(earlength):
        raise ValueError("Function requires numeric input")
    if earlength >= 15:
        return 'extralarge'
    if earlength >= 10:
        return 'large'
    if earlength >= 8:
        return 'medium'
    return 'small'


def import_data(datafile, headerrow=False):
    """Retrieve CSV data from the file and store it in a list of lists.

    Args:
        datafile: any iterable yielding lines of CSV text (an open file or a
            list of strings).
        headerrow: when true, the first row is discarded.
    """
    datareader = csv.reader(datafile)
    if headerrow:
        # next() with a default works on Python 2 and 3 and tolerates an
        # empty input instead of raising StopIteration.
        next(datareader, None)
    return list(datareader)


def export_to_csv(data, filename):
    """Export a list of lists to a CSV file."""
    # The csv module handles line endings itself: it wants newline='' on
    # Python 3 and binary mode on Python 2, otherwise rows are separated by
    # '\r\r\n' on Windows.
    if sys.version_info[0] >= 3:
        output_file = open(filename, 'w', newline='')
    else:
        output_file = open(filename, 'wb')
    with output_file:  # closed even if writing raises
        csv.writer(output_file).writerows(data)


def get_indiv_earlength_gc_data(elves_data):
    """Determine individual-level ear length category and GC content values.

    Args:
        elves_data: iterable of ``(indiv_id, earlength, dna)`` rows.

    Returns:
        A list of ``(indiv_id, size_class, gc_content)`` tuples.
    """
    return [(indiv_id,
             get_size_class(float(earlength)),
             get_gc_content(dna))
            for indiv_id, earlength, dna in elves_data]


def get_avg_gc_by_sizeclass(indiv_data):
    """Get average values of GC content for each size class.

    Args:
        indiv_data: iterable of ``(indiv_id, size_class, gc_content)`` rows.

    Returns:
        A list of ``[size_class, average_gc_content]`` rows, sorted by size
        class name so the output order is deterministic.
    """
    # Group in plain Python rather than a fixed-width bytes structured array,
    # which turns labels into b'...' on Python 3 and stores float32 values.
    gc_by_class = defaultdict(list)
    for _indiv_id, size_class, gc_content in indiv_data:
        gc_by_class[size_class].append(gc_content)
    return [[size_class, float(np.mean(gc_by_class[size_class]))]
            for size_class in sorted(gc_by_class)]


def _read_url_lines(url):
    """Download ``url`` and return its content as a list of text lines."""
    with closing(urlopen(url)) as response:
        payload = response.read()
    if not isinstance(payload, str):
        # Python 3 returns bytes; csv.reader needs text.
        # NOTE(review): assumes the data file is UTF-8 encoded - confirm.
        payload = payload.decode('utf-8')
    # Keep the line endings so quoted multi-line fields survive parsing.
    return payload.splitlines(True)


def main(argv=None):
    """Run the analysis and write the per-size-class averages to CSV.

    Args:
        argv: optional list of command-line arguments (defaults to
            ``sys.argv[1:]``). A non-empty first argument is used as the
            data URL; otherwise the default data URL is used.
    """
    if argv is None:
        argv = sys.argv[1:]
    url = argv[0] if argv and argv[0] else DEFAULT_DATA_URL
    datafile = _read_url_lines(url)
    elves_data = import_data(datafile, headerrow=True)
    indiv_data = get_indiv_earlength_gc_data(elves_data)
    results = get_avg_gc_by_sizeclass(indiv_data)
    export_to_csv(results, OUTPUT_FILENAME)


if __name__ == '__main__':
    main()