%%file calc_gc.py def calc_gc(sequence): sequence = sequence.upper() # make all chars uppercase n = sequence.count('T') + sequence.count('A') # count only A, T, m = sequence.count('G') + sequence.count('C') # C, and G -- nothing else (no Ns, Rs, Ws, etc.) if n + m == 0: return 0. # avoid divide-by-zero return float(m) / float(n + m) def test_1(): result = round(calc_gc('ATGGCAT'), 2) print 'hello, this is a test; the value of result is', result assert result == 0.43 def test_2(): # test handling N result = round(calc_gc('NATGC'), 2) assert result == 0.5, result def test_3(): # test handling lowercase result = round(calc_gc('natgc'), 2) assert result == 0.5, result !nosetests calc_gc.py !nosetests -v calc_gc.py %%file gc-of-seqs.py import sys import screed import calc_gc filename = sys.argv[1] # take the sequence filename in from the command line total_gc = [] for record in screed.open(filename): gc = calc_gc.calc_gc(record.sequence) total_gc.append(gc) print sum(total_gc) / float(len(total_gc)) # run the script and look at the output -- then write that output into the following file. !python gc-of-seqs.py 25k.fq.gz %%file test_gc_script.py import subprocess correct_output = "0.607911191366\n" # this is taken from the previous exec'd cell # the following function checks to see if running this script at the command line # returns the right result. make sure you're running this from *within* the python/ subdirectory # of the 2012-11-scripps/ repository. def test_run(): p = subprocess.Popen('python gc-of-seqs.py 25k.fq.gz', shell=True, stdout=subprocess.PIPE) (stdout, stderr) = p.communicate() assert stdout == correct_output !nosetests test_gc_script.py