# Warm-up cell: compute one small and one very large integer, then print both
# in a single space-separated line.
y = 4 + 3
x = 8 ** 23 + 5
print(x, "is big", y, "is small", sep=" ")
590295810358705651717 is big 7 is small
This is just bold text
$\frac{1}{2}$ is a fraction
# Load the genome into a single string, stripping surrounding whitespace
# (including the newline) from every line of the file.
# The `with` block closes the file as soon as reading finishes, and the
# pieces are joined once instead of growing the string with repeated `+=`.
with open("data/ecoli_uti89.txt") as f:
    dna = "".join(line.strip() for line in f)
len(dna)
5065741
def count_goadrich1(dna):
    """Return the number of C and G characters in ``dna`` using ``str.count``."""
    c_total = dna.count("C")
    g_total = dna.count("G")
    return c_total + g_total
count_goadrich1(dna) / len(dna)
0.5060418604109448
def count_goadrich2(dna):
    """Return the number of C and G characters in ``dna`` by scanning it once."""
    return sum(1 for bp in dna if bp == "C" or bp == "G")
count_goadrich2(dna) / len(dna)
0.5060418604109448
########
# Use this page to paste in your GC counting functions, so I can easily add them to my
# notebook on Friday. You will need to write two functions. Follow the format shown below,
# where each function name begins with “count_” followed by your email login and a number.
# Your functions will have one parameter which is a string, and will return an integer for the
# count of GC in the given string.
########
def count_goadrich1(dna):
    """Return how many characters of ``dna`` are C or G (two ``str.count`` passes)."""
    return sum(dna.count(base) for base in ("C", "G"))
def count_goadrich2(dna):
    """Return how many characters of ``dna`` are C or G (explicit loop)."""
    total = 0
    for base in dna:
        # Anything that is neither C nor G contributes nothing.
        if base != "C" and base != "G":
            continue
        total += 1
    return total
######### ADD YOUR FUNCTIONS HERE ###########
def count_rosert1(dna):
    """Return the number of G and C characters in ``dna``."""
    hits = 0
    for symbol in dna:
        if symbol == "G" or symbol == "C":
            hits += 1
    return hits
import random
def count_rosert2(dna):
    """Return the number of C and G characters in ``dna``, visiting indices at random.

    Indices are drawn with ``random.randrange`` until every position of
    ``dna`` has been seen once; repeated draws of a seen index are ignored.
    """
    total = 0
    visited = set()
    size = len(dna)
    while len(visited) != size:
        pick = random.randrange(0, size)
        if pick in visited:
            continue
        visited.add(pick)
        if dna[pick] == "C" or dna[pick] == "G":
            total += 1
    return total
#############################################
def count_lesliejd1(dna):
    """Return the C/G count of ``dna`` by summing a per-base score table.

    Raises:
        KeyError: if ``dna`` contains a character other than A, C, G or T.
    """
    score = {"C": 1, "G": 1, "T": 0, "A": 0}
    return sum(score[bp] for bp in dna)
def count_lesliejd2(dna):
    """Return the number of C and G characters in ``dna``.

    The earlier version recursed once per character on ``dna[1:]``. That
    exceeds Python's recursion limit on inputs longer than about a thousand
    characters and copies the remaining string at every step. This version
    inspects the characters front to back in a loop, so it works for any
    length and returns the same count.
    """
    count = 0
    for bp in dna:
        if bp == "C" or bp == "G":
            count += 1
    return count
##############################
def count_mershonrb1(genomes):
    """Return the number of G and C characters in ``genomes``, indexing each position."""
    gc_total = 0
    for position in range(len(genomes)):
        symbol = genomes[position]
        if symbol == "G" or symbol == "C":
            gc_total += 1
    return gc_total
def count_mershonrb2(genomes):
    """Return the length of ``genomes`` minus its number of A and T characters."""
    at_total = sum(genomes.count(base) for base in ('A', 'T'))
    return len(genomes) - at_total
##############################
def count_khoojj1(dna):
    """Return the number of characters left in ``dna`` after removing every A and T.

    For a sequence made only of A, C, G and T this is the G/C count.
    """
    # str.replace returns a new string and leaves the original untouched, so
    # the result has to be kept; discarding it made this return len(dna).
    remaining = dna.replace('A', '').replace('T', '')
    return len(remaining)
def count_khoojj2(dna):
    """Return the number of uppercase characters in ``dna`` other than A and T.

    For an uppercase sequence made only of A, C, G and T this is the G/C
    count.
    """
    count = 0
    for bp in dna:
        if bp == 'A' or bp == 'T':
            # str.lower returns a new string; rebinding is required for the
            # lower-casing to have any effect on the check below.
            bp = bp.lower()
        if bp.isupper():
            count += 1
    return count
##############################
## can we import re outside of the function?? this will slow it down!
# YES! -MHG
import re
def count_huynhem1(dna):
    """Return the number of regex matches of a single C or G in ``dna``."""
    return sum(1 for _ in re.finditer("[CG]", dna))
def count_huynhem2(dna):
    """Return the length of ``dna`` once every A and T has been removed by regex."""
    at_pattern = re.compile("[AT]")
    without_at = at_pattern.sub("", dna)
    return len(without_at)
###########################################
import re
def count_hendersondd1(dna):
    """Return how many characters of ``dna`` remain after deleting each A and T."""
    stripped, _removed = re.subn("[AT]", "", dna)
    return len(stripped)
from collections import Counter
def count_hendersondd2(dna):
    """Return the total tally of every character in ``dna`` except A and T."""
    tallies = Counter(dna)
    return sum(
        occurrences
        for base, occurrences in tallies.items()
        if base != 'A' and base != 'T'
    )
#####################################
def count_garimellakv1(dna):
    """Return the length of the string formed by the C and G characters of ``dna``."""
    # Collect-then-join approach, following
    # https://wiki.python.org/moin/PythonSpeed/PerformanceTips#String_Concatenation
    kept = []
    for char in dna:
        if char == 'C' or char == 'G':
            kept.append(char)
    joined = ''.join(kept)
    return len(joined)
def count_garimellakv2(dna):
    """Return the C/G count of ``dna`` by splitting it in half recursively.

    Pieces of length two or less are counted directly; longer pieces are
    split at the midpoint and the two halves' counts are added.
    """
    if len(dna) > 2:
        mid = len(dna) // 2
        left_half, right_half = dna[0:mid], dna[mid:]
        return count_garimellakv2(left_half) + count_garimellakv2(right_half)
    return sum(1 for char in dna if char == 'C' or char == 'G')
###################################
def count_kweejj1(dna):
    """Tally A, C, G and T in ``dna`` and return the C tally plus the G tally.

    Characters outside those four are ignored.
    """
    tallies = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for c in dna:
        if c not in tallies:
            continue
        tallies[c] += 1
    return tallies['C'] + tallies['G']
def count_kweejj2(dna):
    """Return the number of C and G characters in ``dna`` by sorting it first.

    After sorting, equal characters form contiguous runs; the length of each
    run is found with a binary search for its two ends.

    The earlier version sliced from the first C to the first T using
    ``list.index``. That raised ValueError when either letter was absent and
    also counted any other character that sorts between C and T.
    """
    from bisect import bisect_left, bisect_right

    ordered = sorted(dna)
    total = 0
    for base in ('C', 'G'):
        # Both searches return the same position when `base` is absent,
        # so a missing letter contributes zero.
        total += bisect_right(ordered, base) - bisect_left(ordered, base)
    return total
#####################################
def count_holmesaa1(dna):
    """Return the combined number of G and C characters in ``dna``."""
    tallies = {'G': 0, 'C': 0}
    for c in dna:
        # Only G and C have an entry; everything else is skipped.
        if c in tallies:
            tallies[c] += 1
    return tallies['G'] + tallies['C']
def count_holmesaa2(dna):
    """Return the length of ``dna`` after every A and T has been removed."""
    for unwanted in ("A", "T"):
        dna = dna.replace(unwanted, "")
    return len(dna)
####################################
def count_spurlockee1(dna):
    """Return a count based on each character's code-point offset from 65.

    A character adds 1 when ``ord(char) - 65`` is even and non-zero, and 0
    otherwise. Among A, C, G and T that selects exactly C (offset 2) and
    G (offset 6).
    """
    total = 0
    for offset in (ord(bp) - 65 for bp in dna):
        if offset != 0 and offset % 2 == 0:
            total += 1
    return total
def count_spurlockee2(dna):
    """Return the number of G and C characters in the string ``dna``."""
    total = 0
    for symbol in dna:
        if symbol == "G":
            total += 1
        elif symbol == "C":
            total += 1
    return total
####################################
def count_falleurjd1(dna):
    """Return the number of characters in ``dna`` that are neither A nor T.

    For a sequence made only of A, C, G and T this is the G/C count.

    The earlier version divided by ``len(dna)``, returning a fraction rather
    than the integer count and raising ZeroDivisionError on an empty string.
    """
    x = len(dna)
    y = dna.count("A") + dna.count("T")
    return x - y
def count_falleurjd2(dna):
    """Return the length of ``dna`` after deleting every A and T.

    For a sequence made only of A, C, G and T this is the G/C count.
    """
    # Strings are immutable: str.replace hands back a new string, so each
    # result must be rebound. Ignoring it made this return len(dna).
    dna = dna.replace('A', '')
    dna = dna.replace('T', '')
    return len(dna)
#########################################
###http://stackoverflow.com/questions/15046242/how-to-sort-the-letters-in-a-string-alphabetically-in-python
def count_matsonjr1(dna):
    """Return the number of C and G characters in ``dna`` using a sorted copy.

    Sorting groups equal characters into contiguous runs, so the size of a
    run is the distance between its first and last position plus one.

    The earlier version returned ``find("T") - find("C")``. ``str.find``
    yields -1 for a missing letter, so inputs without a C or without a T
    gave wrong (even negative) results, and any character sorting between
    C and T was counted too.
    """
    ordered = "".join(sorted(dna))
    total = 0
    for base in ("C", "G"):
        first = ordered.find(base)
        if first == -1:
            continue  # this base does not occur at all
        total += ordered.rfind(base) - first + 1
    return total
def count_matsonjr2(dna):
    """Return how much longer ``dna`` gets when each G and each C is doubled.

    Every G becomes "gg" and every C becomes "cc", so the growth in length
    equals the number of G and C characters.
    """
    doubled = dna.replace("G", "gg").replace("C", "cc")
    return len(doubled) - len(dna)
############################################
def count_shaddoxac1(dna):
    """Return the length of ``dna`` once every A and T has been removed.

    For a sequence made only of A, C, G and T this is the G/C count.

    The earlier version divided the result by ``len(dna)``, which returned a
    fraction instead of the integer count and raised ZeroDivisionError for
    an empty string.
    """
    newdna = dna.replace("A", "").replace("T", "")
    return len(newdna)
def count_shaddoxac2(dna):
    """Return the number of regex matches of C or G in ``dna``.

    The earlier version divided by ``len(dna)``, returning a fraction rather
    than the integer count and failing on an empty string.
    """
    return len(re.findall('C|G', dna))
############################################
import re
def count_bentonjt2(dna):
    """Return the number of C and G characters in ``dna``."""
    is_gc = lambda c: c == "C" or c == "G"
    return len(list(filter(is_gc, dna)))
# from http://stackoverflow.com/questions/9957081/counting-the-number-of-occurrences-of-a-character-in-multiple-files-with-unix-sh
# and http://stackoverflow.com/questions/29801975/why-is-the-subprocess-popen-argument-length-limit-smaller-than-what-the-os-repor
import subprocess
def count_bentonjt1(dna):
    """Return the number of G and C characters in ``dna`` by running ``grep -o``.

    The sequence is fed to grep on standard input and grep prints each match
    on its own line, so the count is the number of newlines in its output.

    The earlier version built a shell command by concatenating ``dna`` into
    an ``echo "..."`` string. Any quote, backtick or ``$`` in the input
    could break or hijack the command, and the input had to be chunked to
    stay under the argument-length limit. Passing an argument list (no
    shell) and sending the data through stdin removes both problems.

    Raises:
        subprocess.CalledProcessError: if grep exits with a status other
            than 0 (matches found) or 1 (no matches).
    """
    proc = subprocess.run(
        ["grep", "-o", "[GC]"],
        input=dna,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    # grep uses exit status 1 for "no match", which is a valid zero count.
    if proc.returncode > 1:
        raise subprocess.CalledProcessError(proc.returncode, proc.args)
    return proc.stdout.count("\n")
############################################################################
import re
def count_ndemeyemm1(dna):
    """Return the number of C and G characters in ``dna``.

    The earlier version recursed through the names ``counter`` and
    ``string``, neither of which is defined in this function, so any input
    longer than one character raised NameError. It also indexed ``dna[0]``
    on an empty string. Walking the characters in a loop fixes both and
    avoids one recursion level per character.
    """
    total = 0
    for base in dna:
        if base == 'C' or base == 'G':
            total += 1
    return total
def count_ndemeyemm2(dna):
    """Return the number of regex matches of a single C or G in ``dna``."""
    # The raw-string prefix belongs outside the quotes. Written inside them,
    # the pattern required a literal r' before each base and never matched.
    return len(re.findall(r"[CG]", dna))
# Print the two expected function names ("count_<login>1," and
# "count_<login>2,") for every address listed in data/logins.txt.
# The `with` block closes the file when the loop finishes.
with open("data/logins.txt") as logins:
    for name in logins:
        # The login is the lower-cased text before "@". Stripping first keeps
        # the trailing newline out of lines that have no "@".
        login = name.strip().lower().partition("@")[0]
        if not login:
            continue  # blank line or nothing before "@": no name to print
        for i in range(1, 3):
            print("count_" + login + str(i) + ",")
count_goadrich1, count_goadrich2, count_bentonjt1, count_bentonjt2, count_falleurjd1, count_falleurjd2, count_garimellakv1, count_garimellakv2, count_hendersondd1, count_hendersondd2, count_holmesaa1, count_holmesaa2, count_horanni1, count_horanni2, count_huynhem1, count_huynhem2, count_kangasee1, count_kangasee2, count_khoojj1, count_khoojj2, count_kouss1, count_kouss2, count_kweejj1, count_kweejj2, count_lesliejd1, count_lesliejd2, count_matsonjr1, count_matsonjr2, count_mershonrb1, count_mershonrb2, count_ndemeyemm1, count_ndemeyemm2, count_nicklejm1, count_nicklejm2, count_rosert1, count_rosert2, count_shaddoxac1, count_shaddoxac2, count_spurlockee1, count_spurlockee2,
# Functions to benchmark, two per login, called one after another by the
# timing loop below. Entries written as comments are left out of the run.
# NOTE(review): the disabled entries were presumably too slow or failed on
# the full genome — confirm before re-enabling any of them.
funcs = [
count_goadrich1,
count_goadrich2,
count_bentonjt1,
count_bentonjt2,
count_falleurjd1,
count_falleurjd2,
count_garimellakv1,
count_garimellakv2,
count_hendersondd1,
count_hendersondd2,
count_holmesaa1,
count_holmesaa2,
count_huynhem1,
count_huynhem2,
count_khoojj1,
count_khoojj2,
count_kweejj1,
count_kweejj2,
count_lesliejd1,
#count_lesliejd2,
count_matsonjr1,
count_matsonjr2,
count_mershonrb1,
count_mershonrb2,
#count_ndemeyemm1,
count_ndemeyemm2,
count_rosert1,
#count_rosert2,
count_shaddoxac1,
count_shaddoxac2,
count_spurlockee1,
count_spurlockee2
]
dir(count_goadrich1)
['__annotations__', '__call__', '__class__', '__closure__', '__code__', '__defaults__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__get__', '__getattribute__', '__globals__', '__gt__', '__hash__', '__init__', '__kwdefaults__', '__le__', '__lt__', '__module__', '__name__', '__ne__', '__new__', '__qualname__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']
# Run every function in `funcs` once on the full genome and record, in
# matching order, its name, its return value and the CPU time it used.
names = []
timings = []
results = []
import time
for f in funcs:
    # Printed before the call so a slow or failing function is identifiable.
    print(f.__name__)
    # time.clock() was removed in Python 3.8; process_time() is the
    # replacement for measuring the CPU time of the current process.
    t0 = time.process_time()
    results.append(f(dna))
    t1 = time.process_time()
    cpu_time = t1 - t0
    timings.append(cpu_time)
    names.append(f.__name__)
count_goadrich1 count_goadrich2 count_bentonjt1 count_bentonjt2 count_falleurjd1 count_falleurjd2 count_garimellakv1 count_garimellakv2 count_hendersondd1 count_hendersondd2 count_holmesaa1 count_holmesaa2 count_huynhem1 count_huynhem2 count_khoojj1 count_khoojj2 count_kweejj1 count_kweejj2 count_lesliejd1 count_matsonjr1 count_matsonjr2 count_mershonrb1 count_mershonrb2 count_ndemeyemm2 count_rosert1 count_shaddoxac1 count_shaddoxac2 count_spurlockee1 count_spurlockee2
timings
[0.030991999999969266, 0.44987099999997326, 0.16952399999996715, 0.3722349999999892, 0.02913499999999658, 0.09618500000004815, 0.38488600000005135, 2.8516349999999875, 0.4915210000000343, 0.2830910000000131, 0.47906000000000404, 0.09136200000000372, 0.4088459999999827, 0.557578000000035, 0.10607899999996562, 1.1772740000000113, 1.0186610000000087, 0.9976370000000543, 0.3721259999999802, 0.9697639999999978, 0.10881499999999278, 1.4399839999999813, 0.02943700000002991, 0.0026950000000169894, 0.5653489999999692, 0.09178399999996145, 0.4111180000000445, 1.283881000000008, 1.0567050000000222]
results
[2563477, 2563477, 2563477, 2563477, 0.5060418604109448, 5065741, 2563477, 2563477, 2563477, 2563477, 2563477, 2563477, 2563477, 2563477, 5065741, 5065741, 2563477, 2563477, 2563477, 2563477, 2563477, 2563477, 2563477, 0, 2563477, 0.5060418604109448, 0.5060418604109448, 2563477, 2563477]
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Bar chart of the recorded timings, one red bar per benchmarked function,
# labelled with the function names written vertically.
fig, ax = plt.subplots()
ind = np.arange(len(funcs))  # x position of each bar: 0, 1, 2, ...
width = 0.75  # bar width handed to ax.bar
rects = ax.bar(ind, timings, width, color="r")
# Each tick sits half a bar width to the right of its bar's x position.
# NOTE(review): this centres the label only if bars are drawn from their
# left edge — confirm against the matplotlib version in use.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(names, rotation="vertical")
[<matplotlib.text.Text at 0x1159fa358>, <matplotlib.text.Text at 0x115a0d6d8>, <matplotlib.text.Text at 0x115dac0b8>, <matplotlib.text.Text at 0x115dacac8>, <matplotlib.text.Text at 0x115db1518>, <matplotlib.text.Text at 0x115a12320>, <matplotlib.text.Text at 0x115a0a438>, <matplotlib.text.Text at 0x107c04ac8>, <matplotlib.text.Text at 0x115db42b0>, <matplotlib.text.Text at 0x115db4c50>, <matplotlib.text.Text at 0x115db66a0>, <matplotlib.text.Text at 0x115dba0f0>, <matplotlib.text.Text at 0x115dbab00>, <matplotlib.text.Text at 0x115dbf550>, <matplotlib.text.Text at 0x115dbff60>, <matplotlib.text.Text at 0x115dc29b0>, <matplotlib.text.Text at 0x115dc6400>, <matplotlib.text.Text at 0x115dc6e10>, <matplotlib.text.Text at 0x115dc9860>, <matplotlib.text.Text at 0x115f022b0>, <matplotlib.text.Text at 0x115f02cc0>, <matplotlib.text.Text at 0x115f05710>, <matplotlib.text.Text at 0x115f08160>, <matplotlib.text.Text at 0x115f08b70>, <matplotlib.text.Text at 0x115f0d5c0>, <matplotlib.text.Text at 0x115f0dfd0>, <matplotlib.text.Text at 0x115f11a20>, <matplotlib.text.Text at 0x115f17470>, <matplotlib.text.Text at 0x115f17e80>]
# Occurrences of each base in the genome, in the same order as `letters`.
letters = ["A", "T", "G", "C"]
counts = list(map(dna.count, letters))
counts
[1250197, 1252067, 1279155, 1284322]
# Pie chart of the per-base counts, each wedge labelled with its letter.
plt.pie(counts, labels=letters)
([<matplotlib.patches.Wedge at 0x11607f630>, <matplotlib.patches.Wedge at 0x116085278>, <matplotlib.patches.Wedge at 0x116085eb8>, <matplotlib.patches.Wedge at 0x11608bb38>], [<matplotlib.text.Text at 0x11607fe48>, <matplotlib.text.Text at 0x116085a58>, <matplotlib.text.Text at 0x11608b6d8>, <matplotlib.text.Text at 0x115fd6358>])
%time count_goadrich1(dna)
CPU times: user 31.5 ms, sys: 478 µs, total: 32 ms Wall time: 31.6 ms
2563477
timings[0]
0.03824299999996583