Notebook

In [3]:

y = 3 + 4

In [4]:

x = 5 + 8 ** 23
print(x, "is big", y, "is small")

590295810358705651717 is big 7 is small

This is just bold text

$\frac{1}{2}$ is a fraction

test one
another item

In [19]:

f = open("data/ecoli_uti89.txt")

In [20]:

dna = ""
for line in f:
    dna += line.strip()
    

In [21]:

len(dna)

Out[21]:

In [56]:

def count_goadrich1(dna):
    return dna.count("C") + dna.count("G")

In [57]:

count_goadrich1(dna) / len(dna)

Out[57]:

0.5060418604109448

In [58]:

def count_goadrich2(dna):
    count = 0
    for bp in dna:
        if bp == "C" or bp == "G":
            count += 1
    return count

In [59]:

count_goadrich2(dna) / len(dna)

Out[59]:

0.5060418604109448

In [89]:

########
# Use this page to paste in your GC counting functions, so I can easily add them to my 
# notebook on Friday. You will need to write two functions. Follow the format shown below, 
# where each function name begins with “count_” followed by your email login and a number. 
# Your functions will have one parameter which is a string, and will return an integer for the 
# count of GC in the given string.
########

def count_goadrich1(dna):
    return dna.count("C") + dna.count("G")

def count_goadrich2(dna):
    count = 0
    for bp in dna:
            if bp == "C" or bp == "G":
                    count += 1
    return count

######### ADD YOUR FUNCTIONS HERE ###########

def count_rosert1(dna):
    return sum([x == "G" or x == "C" for x in dna])

import random
def count_rosert2(dna):
    count = 0
    symbol_indices = set()
    while len(symbol_indices) != len(dna):
        t = random.randrange(0, len(dna))
        if t not in symbol_indices:
            symbol_indices.add(t)
            if dna[t] == "C" or dna[t] == "G":
                count += 1
    return count

#############################################

def count_lesliejd1(dna):
     d = {"C": 1, "G": 1, "T": 0, "A": 0}
     c = 0
     for bp in dna:
          c += d[bp]
     return c

def count_lesliejd2(dna):
    if len(dna) > 1:
        if dna[0] == "C" or dna[0] == "G":
            return 1 + count_lesliejd2(dna[1:])
        else:
            return count_lesliejd2(dna[1:])
    else:
        if dna == "C" or dna == "G":
            return 1
        else:
            return 0

##############################

def count_mershonrb1(genomes):
    gcCount = 0
    count = 0
    while count != len(genomes):
        if     genomes[count] == "G" or genomes[count] =="C":
            gcCount+=1
        count += 1
    return gcCount

def count_mershonrb2(genomes):
    
    aCount = genomes.count('A')
    tCount = genomes.count('T')
    
    return (len(genomes) - (aCount + tCount))


##############################
def count_khoojj1(dna):
    dna.replace('A','')
    dna.replace('T','')
    return len(dna)

def count_khoojj2(dna):
    count = 0
    for bp in dna:
        if bp == 'A' or bp == 'T':
            bp.lower()
    for bp in dna:
        if bp.isupper():
            count += 1
    return count
##############################
## can we import re outside of the function?? this will slow it down!
# YES! -MHG
import re

def count_huynhem1(dna):
    d = re.findall("[CG]",dna)
    return len(d)

def count_huynhem2(dna):
    d = re.sub("[AT]","",dna)
    return len(d)


###########################################

import re
def count_hendersondd1(dna):
    c = re.sub("[AT]", "", dna)
    return len(c)
from collections import Counter
def count_hendersondd2(dna):
    counter = Counter(dna)
    del counter['A']
    del counter['T']
    return sum(counter.values())

#####################################

def count_garimellakv1(dna):
    #https://wiki.python.org/moin/PythonSpeed/PerformanceTips#String_Concatenation
    gc_list = [char for char in dna if char == 'C' or char == 'G']
    return len(''.join(gc_list))

def count_garimellakv2(dna):
    if len(dna) <= 2:
        count = 0
        for char in dna:
            if char == 'C' or char == 'G':
                count += 1
        return count
    else:
        mid = len(dna) // 2
        return (count_garimellakv2(dna[0:mid]) + count_garimellakv2(dna[mid:]))

###################################
def count_kweejj1(dna):
    d= {'A':0, 'C':0, 'G':0, 'T':0}
    for c in dna:
        if c in d.keys():
            d[c] += 1
    return d['C'] + d['G']

def count_kweejj2(dna):
    l = list(dna)
    l.sort()
    up = l.index('C')
    low = l.index('T')
    return(len(l[up:low]))

#####################################

def count_holmesaa1(dna):
    d = {'G':0,'C':0}
    for c in dna:
        if not c in d:
            None
        else:
            d[c] += 1
    count = (d['G'] + d['C'])
    return count

def count_holmesaa2(dna):
    dna = dna.replace("A","")
    dna = dna.replace("T","")
    count = len(dna)
    return count

####################################
def count_spurlockee1(dna):
    count=0
    for bp in dna:
        temp = ord(bp)-65
        if(temp==0):
            continue
        count+=abs(((temp)%2)-1)
        
    return count

def count_spurlockee2(dna):
    count=0
    iteratedBP = iter(dna)
    for bp in dna:
        temp = next(iteratedBP)
        if(temp=="G"):
            count+=1
        if(temp=="C"):
            count+=1
    return count



####################################

def count_falleurjd1(dna):
    x = len(dna)
    y = (dna.count("A") + dna.count("T"))
    return ((x - y) / x)

def count_falleurjd2(dna):
    dna.replace('A','')
    dna.replace('T','')
    return len(dna)

#########################################

###http://stackoverflow.com/questions/15046242/how-to-sort-the-letters-in-a-string-alphabetically-in-python
def count_matsonjr1(dna):
    sort = "".join(sorted(dna))
    A = sort.find("C")
    T = sort.find("T")
    return (T - A)

def count_matsonjr2(dna):
    string = dna
    length1 = len(dna)
    string = string.replace("G","gg")
    string = string.replace("C","cc")
    return len(string) - length1

############################################

def count_shaddoxac1(dna):
    newdna = dna
    newdna = newdna.replace("A","")
    newdna = newdna.replace("T","")
    return len(newdna) / len(dna)

def count_shaddoxac2(dna):
    return len(re.findall('C|G', dna)) / len(dna)

############################################

import re
def count_bentonjt2(dna):
    return len([c for c in dna if c == "C" or c == "G"])

# from http://stackoverflow.com/questions/9957081/counting-the-number-of-occurrences-of-a-character-in-multiple-files-with-unix-sh
# and http://stackoverflow.com/questions/29801975/why-is-the-subprocess-popen-argument-length-limit-smaller-than-what-the-os-repor
import subprocess
def count_bentonjt1(dna):
    argsize = 100000  # not the max, just to be safe
    arglist = [dna[i:i + argsize] for i in range(0, len(dna), argsize)]
    total = 0
    for a in arglist:
        cmd = "echo \"" + a + "\" | grep -o \"[GC]\" | wc -w"
        total += int(subprocess.check_output(cmd, shell=True))
    return total

############################################################################
import re
def count_ndemeyemm1(dna):
    #base case
    if (dna == 'C' or dna == 'G') and len(dna) == 1:
        return 1
    elif len(dna) == 1:
        return 0
    else:
        if dna[0] == 'C' or dna[0] == 'G':
            return 1 + counter(string[1:])
        else:
            return counter(string[1:])

def count_ndemeyemm2(dna):
    return len(re.findall("r'[CG]", dna))

    

In [72]:

logins = open("data/logins.txt")
for name in logins:
    for i in range(1, 3):
        print("count_" + name.lower()[:name.find("@")] + str(i) + ",")

count_goadrich1,
count_goadrich2,
count_bentonjt1,
count_bentonjt2,
count_falleurjd1,
count_falleurjd2,
count_garimellakv1,
count_garimellakv2,
count_hendersondd1,
count_hendersondd2,
count_holmesaa1,
count_holmesaa2,
count_horanni1,
count_horanni2,
count_huynhem1,
count_huynhem2,
count_kangasee1,
count_kangasee2,
count_khoojj1,
count_khoojj2,
count_kouss1,
count_kouss2,
count_kweejj1,
count_kweejj2,
count_lesliejd1,
count_lesliejd2,
count_matsonjr1,
count_matsonjr2,
count_mershonrb1,
count_mershonrb2,
count_ndemeyemm1,
count_ndemeyemm2,
count_nicklejm1,
count_nicklejm2,
count_rosert1,
count_rosert2,
count_shaddoxac1,
count_shaddoxac2,
count_spurlockee1,
count_spurlockee2,

In [97]:

funcs = [
    count_goadrich1,
count_goadrich2,
count_bentonjt1,
count_bentonjt2,
count_falleurjd1,
count_falleurjd2,
count_garimellakv1,
count_garimellakv2,
count_hendersondd1,
count_hendersondd2,
count_holmesaa1,
count_holmesaa2,
count_huynhem1,
count_huynhem2,
count_khoojj1,
count_khoojj2,
count_kweejj1,
count_kweejj2,
count_lesliejd1,
#count_lesliejd2,
count_matsonjr1,
count_matsonjr2,
count_mershonrb1,
count_mershonrb2,
#count_ndemeyemm1,
count_ndemeyemm2,
count_rosert1,
#count_rosert2,
count_shaddoxac1,
count_shaddoxac2,
count_spurlockee1,
count_spurlockee2
]

In [84]:

dir(count_goadrich1)

Out[84]:

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [102]:

names = []
timings = []
results = []
import time

for f in funcs:
    print(f.__name__)
    t0 = time.clock()
    results.append(f(dna))
    t1 = time.clock()
    cpu_time = t1 - t0
    timings.append(cpu_time)
    names.append(f.__name__)

count_goadrich1
count_goadrich2
count_bentonjt1
count_bentonjt2
count_falleurjd1
count_falleurjd2
count_garimellakv1
count_garimellakv2
count_hendersondd1
count_hendersondd2
count_holmesaa1
count_holmesaa2
count_huynhem1
count_huynhem2
count_khoojj1
count_khoojj2
count_kweejj1
count_kweejj2
count_lesliejd1
count_matsonjr1
count_matsonjr2
count_mershonrb1
count_mershonrb2
count_ndemeyemm2
count_rosert1
count_shaddoxac1
count_shaddoxac2
count_spurlockee1
count_spurlockee2

In [99]:

timings

Out[99]:

[0.030991999999969266,
 0.44987099999997326,
 0.16952399999996715,
 0.3722349999999892,
 0.02913499999999658,
 0.09618500000004815,
 0.38488600000005135,
 2.8516349999999875,
 0.4915210000000343,
 0.2830910000000131,
 0.47906000000000404,
 0.09136200000000372,
 0.4088459999999827,
 0.557578000000035,
 0.10607899999996562,
 1.1772740000000113,
 1.0186610000000087,
 0.9976370000000543,
 0.3721259999999802,
 0.9697639999999978,
 0.10881499999999278,
 1.4399839999999813,
 0.02943700000002991,
 0.0026950000000169894,
 0.5653489999999692,
 0.09178399999996145,
 0.4111180000000445,
 1.283881000000008,
 1.0567050000000222]

In [100]:

results

Out[100]:

[2563477,
 2563477,
 2563477,
 2563477,
 0.5060418604109448,
 5065741,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 5065741,
 5065741,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 2563477,
 0,
 2563477,
 0.5060418604109448,
 0.5060418604109448,
 2563477,
 2563477]

In [106]:

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots()

ind = np.arange(len(funcs))
width = 0.75

rects = ax.bar(ind, timings, width, color="r")

ax.set_xticks(ind + width / 2)
ax.set_xticklabels(names, rotation="vertical")

Out[106]:

[<matplotlib.text.Text at 0x1159fa358>,
 <matplotlib.text.Text at 0x115a0d6d8>,
 <matplotlib.text.Text at 0x115dac0b8>,
 <matplotlib.text.Text at 0x115dacac8>,
 <matplotlib.text.Text at 0x115db1518>,
 <matplotlib.text.Text at 0x115a12320>,
 <matplotlib.text.Text at 0x115a0a438>,
 <matplotlib.text.Text at 0x107c04ac8>,
 <matplotlib.text.Text at 0x115db42b0>,
 <matplotlib.text.Text at 0x115db4c50>,
 <matplotlib.text.Text at 0x115db66a0>,
 <matplotlib.text.Text at 0x115dba0f0>,
 <matplotlib.text.Text at 0x115dbab00>,
 <matplotlib.text.Text at 0x115dbf550>,
 <matplotlib.text.Text at 0x115dbff60>,
 <matplotlib.text.Text at 0x115dc29b0>,
 <matplotlib.text.Text at 0x115dc6400>,
 <matplotlib.text.Text at 0x115dc6e10>,
 <matplotlib.text.Text at 0x115dc9860>,
 <matplotlib.text.Text at 0x115f022b0>,
 <matplotlib.text.Text at 0x115f02cc0>,
 <matplotlib.text.Text at 0x115f05710>,
 <matplotlib.text.Text at 0x115f08160>,
 <matplotlib.text.Text at 0x115f08b70>,
 <matplotlib.text.Text at 0x115f0d5c0>,
 <matplotlib.text.Text at 0x115f0dfd0>,
 <matplotlib.text.Text at 0x115f11a20>,
 <matplotlib.text.Text at 0x115f17470>,
 <matplotlib.text.Text at 0x115f17e80>]

In [107]:

letters = ["A", "T", "G", "C"]

In [108]:

counts = [dna.count(x) for x in letters]

In [109]:

counts

Out[109]:

[1250197, 1252067, 1279155, 1284322]

In [110]:

plt.pie(counts, labels=letters)

Out[110]:

([<matplotlib.patches.Wedge at 0x11607f630>,
  <matplotlib.patches.Wedge at 0x116085278>,
  <matplotlib.patches.Wedge at 0x116085eb8>,
  <matplotlib.patches.Wedge at 0x11608bb38>],
 [<matplotlib.text.Text at 0x11607fe48>,
  <matplotlib.text.Text at 0x116085a58>,
  <matplotlib.text.Text at 0x11608b6d8>,
  <matplotlib.text.Text at 0x115fd6358>])

In [114]:

%time count_goadrich1(dna)

CPU times: user 31.5 ms, sys: 478 µs, total: 32 ms
Wall time: 31.6 ms

Out[114]:

In [112]:

timings[0]

Out[112]:

0.03824299999996583

In [ ]: