The most basic goal of program design is to break code into manageable pieces, store them in such a way that it is easy to find the piece we want, and make it easy to reuse pieces of our code in other projects.
The basic elements of design for my scientific projects include:
"""module docstring"""
# imports
# constants
# classes
# functions
def main(...):
...
if __name__ == '__main__':
main()
I've simplified this slightly to focus on the things that most scientific programmers use. If you want to see the full structure check out the original source.
Let's take the answer to the refactoring problem in Assignment 5 as an example.
We don't have a lot of general functions yet, but the couple we have should be placed in a module.
%%file dna_analysis.py
"""Code for analyzing DNA sequences"""
from __future__ import division
def get_clean_seq(seq):
"""Get a consistently formated sequence without extra characters for analysis"""
seq = seq.replace('\n', '').replace(' ', '').upper()
seq = seq.upper()
return seq
def get_gc_content(seq):
"""Determine the GC content of a sequence"""
seq = get_clean_seq(seq)
gc_content = 100 * (seq.count('G') + seq.count('C')) / len(seq)
return gc_content
We then put our project specific code into another module following Python structure.
"""Analysis code for Dr. Granger's project"""
from __future__ import division
import urllib
import csv
import numpy as np
import dna_analysis
def get_size_class(earlength):
"""Determine the size class of earlength based on Dr. Grangers specification"""
assert float(earlength), "Function requires numeric input"
if earlength >= 15:
size_class = 'extralarge'
elif 10 <= earlength < 15:
size_class = 'large'
elif 8 <= earlength < 10:
size_class = 'medium'
elif earlength < 8:
size_class = 'small'
return size_class
def import_data(datafile, headerrow=False):
"""Retrieve CSV data from the file and store it in a list of lists"""
datareader = csv.reader(datafile)
if headerrow == True:
datareader.next()
data = []
for row in datareader:
data.append(row)
return data
def export_to_csv(data, filename):
"""Export a list of lists to a CSV file"""
output_file = open(filename, 'w')
datawriter = csv.writer(output_file)
datawriter.writerows(data)
output_file.close()
def get_indiv_earlength_gc_data(elves_data):
"""Determine individual level earth length category and gc content values"""
results = []
for indiv_id, earlength, dna in elves_data:
gc_content = dna_analysis.get_gc_content(dna)
earlength_size_class = get_size_class(float(earlength))
results.append((indiv_id, earlength_size_class, gc_content))
return results
def get_avg_gc_by_sizeclass(indiv_data):
"""Get average values of gc content for each size class"""
indiv_data = np.array(indiv_data, dtype={'names': ['ID', 'SizeClass', 'GCcontent'],
'formats': ['a10', 'a10', 'f4']})
size_classes = set(indiv_data['SizeClass'])
results = []
for size_class in size_classes:
avg_gc_content = np.mean(indiv_data['GCcontent'][indiv_data['SizeClass']
== size_class])
results.append([size_class, avg_gc_content])
return results
def main():
url = 'http://programmingforbiologists.org/sites/programmingforbiologists.org/files/houseelf_earlength_dna_data.csv'
datafile = urllib.urlopen(url)
elves_data = import_data(datafile, headerrow=True)
indiv_data = get_indiv_earlength_gc_data(elves_data)
results = get_avg_gc_by_sizeclass(indiv_data)
export_to_csv(results, 'grangers_output.csv')
if __name__ == '__main__':
main()
To add some command line control we can have main()
check to see if there are arguments and use them.
def main():
if sys.argv[1]:
url = sys.argv[1]
else:
url = 'http://programmingforbiologists.org/sites/programmingforbiologists.org/files/houseelf_earlength_dna_data.csv'
datafile = urllib.urlopen(url)
elves_data = import_data(datafile, headerrow=True)
indiv_data = get_indiv_earlength_gc_data(elves_data)
results = get_avg_gc_by_sizeclass(indiv_data)
export_to_csv(results, 'grangers_output.csv')