Python Programming for Biologists, Tel-Aviv University / 0411-3122 / Spring 2015 ¶

Class-exercise solutions¶

1A¶

In [1]:

# Warm-up: three messages; the middle one also shows a computed sum
print("I'm a genius!")
total = 7 + 6
print("7 + 6 =", total)
print("My name is Inigo Montoya")

I'm a genius!
7 + 6 = 13
My name is Inigo Montoya

1B¶

In [2]:

# Lengths of the two legs of the right triangle
a, b = 10, 4.7

# Pythagorean theorem: hypotenuse = square root of the sum of the squared legs
squares_sum = a**2 + b**2
c = squares_sum**0.5

# Show the hypotenuse
print(c)

11.04943437466371

1C¶

In [6]:

# Year to classify
year = 1492

# A year is NOT a leap year when it is not divisible by 4, or when it is a
# century year (divisible by 100) that is not divisible by 400.
if year % 4 != 0 or (year % 100 == 0 and year % 400 != 0):
    print(year,"is not a leap year")
else:
    print(year,"is a leap year")

1492 is a leap year

1D¶

In [8]:

m = 555  # starting integer for the conjecture check

n = m
while n != 1:
    print(n, end=", ")
    # even values are halved, odd values become 3n + 1
    n = n // 2 if n % 2 == 0 else 3 * n + 1
print(n)  # the loop stops at 1 before printing it, so print it here
print(m, "is OK")

555, 1666, 833, 2500, 1250, 625, 1876, 938, 469, 1408, 704, 352, 176, 88, 44, 22, 11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1
555 is OK

2A¶

In [1]:

seq = "CAAGTAATGGCAGCCATTAA"

# Single nucleotides: the 2nd, the 7th, and the 2nd from the tail
for position in (1, 6, -2):
    print(seq[position])

# Slices: first 10 nucleotides, the rest, and positions 5..14
first_half, second_half, middle = seq[:10], seq[10:], seq[5:15]
for part in (first_half, second_half, middle):
    print(part)

A
A
A
CAAGTAATGG
CAGCCATTAA
AATGGCAGCC

2B¶

In [ ]:

# Concatenate the two lists into a new one
# NOTE(review): `birds` and `snakes` are not defined in this notebook view - they must already exist in the kernel
animals = birds + snakes
# Add 'Mus musculus' at the end, then drop the first 'Corvus corone' entry
animals += ['Mus musculus']
animals.remove('Corvus corone')
# Print elements 1..4 in sorted order
selection = animals[1:5]
selection.sort()
print(selection)

2C¶

In [11]:

# Protein sequence in one-letter amino-acid codes; the next cell computes its ratio of charged residues
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'

In [12]:

# One-letter codes of the amino acids counted as charged
charged = ['R','H','K','D','E']

# Number of residues in the sequence that belong to the charged set
charged_count = sum(1 for aa in insulin if aa in charged)

# Ratio of charged residues to the full sequence length
insulin_length = len(insulin)
charged_ratio = charged_count/insulin_length
print("Ratio of charged amino acids is:",charged_ratio)

Ratio of charged amino acids is: 0.17272727272727273

2D¶

In [1]:

# DNA sequence used by the sub-exercises below (cutting at GAATTC, building the complement)
seq = "ACTGATCGATTACGTATAGTAGAATTCTATCATACATATATATCGATGCGTTCAT"

1)¶

In [13]:

# Split at the GAATTC site; split() removes the site itself from both pieces
site = 'GAATTC'
fragments = seq.split(site)
upstream, downstream = fragments[0], fragments[1]
# The 'G' of the site is counted with the first fragment, 'AATTC' with the second
f1_length = len(upstream) + len('G')
f2_length = len(downstream) + len('AATTC')
print('Fragment lengths of',f1_length,'and',f2_length,'will be produced.')

Fragment lengths of 22 and 33 will be produced.

2)¶

In [2]:

# Complementary base for every valid nucleotide
pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

complement = ''
for base in seq:
    if base in pairing:
        complement += pairing[base]
    else:
        # unknown characters are reported and left out of the complement
        print("Bad base:", base)
print("Complement:", complement)

Complement: TGACTAGCTAATGCATATCATCTTAAGATAGTATGTATATATAGCTACGCAAGTA

3)¶

In [ ]:

# Print each entry of `apes`, followed by the image at the same index of `ape_pics`
# NOTE(review): `apes`, `ape_pics`, `display` and `Image` are not defined in this notebook view
for i, ape in enumerate(apes):
    print(ape)
    picture = Image(url=ape_pics[i])
    display(picture)

3A¶

1)¶

In [1]:

# Personal details keyed by field name
details_dict = dict(Name='James Watson', Address='Cambridge', Phone='12345678')

# Pull out the three fields, then print them as one sentence
name = details_dict['Name']
address = details_dict['Address']
phone = details_dict['Phone']
print("My name is",name,"I live in",address,"My phone number is",phone)

My name is James Watson I live in Cambridge My phone number is 12345678

2)¶

In [2]:

# Build all 64 codons in t, c, a, g order (first base varies slowest)
bases = ['t', 'c', 'a', 'g']
codons = []
for first in bases:
    for second in bases:
        for third in bases:
            codons.append(first + second + third)

# The i-th letter of amino_acids is the translation of the i-th codon
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
codon_table = {codon: residue for codon, residue in zip(codons, amino_acids)}

# Codons to translate
seq_list = ["atg","caa","ggc","ata","tca","tgg","cga","agg","cct","taa"]

# Translate every codon and print the result on one line, without a trailing newline
protein = ''.join(codon_table[codon] for codon in seq_list)
print(protein, end='')

MQGISWRRP*

3B¶

1)¶

In [3]:

# define function
def first_5_longer_sequence(seq1,seq2):
    """
    Return the first 5 characters of the longer of the two sequences.
    When both have the same length, seq2 is used.
    """
    longer = seq1 if len(seq1) > len(seq2) else seq2
    return longer[:5]
    
# Try the function on two sequences of different lengths
sequence1, sequence2 = "aggtctcggatataggcgcgatattta", "ttaagccacgcttcggatta"
first_5 = first_5_longer_sequence(seq1=sequence1, seq2=sequence2)
print(first_5)

aggtc

2)¶

In [6]:

# define function
def odd_bases(seq):
    """
    Return a list with the 1st, 3rd, 5th, ... elements of seq
    (i.e. those at even 0-based indices).
    """
    return [seq[index] for index in range(0, len(seq), 2)]

# Try the function on an example sequence
test_sequence = "aggtctcggatataggcgcgatattta"
odd_bases_list = odd_bases(test_sequence)
print(odd_bases_list)

['a', 'g', 'c', 'c', 'g', 't', 't', 'g', 'c', 'c', 'a', 'a', 't', 'a']

An alternative solution, using string slicing:

In [7]:

# define function
def odd_bases(seq):
    """
    Return a list with every second element of seq, starting from the
    first one, obtained with a step-2 slice.
    """
    every_second = seq[::2]
    return [base for base in every_second]

# Try the slicing-based version on the same example sequence
test_sequence = "aggtctcggatataggcgcgatattta"
odd_bases_list = odd_bases(test_sequence)
print(odd_bases_list)

['a', 'g', 'c', 'c', 'g', 't', 't', 'g', 'c', 'c', 'a', 'a', 't', 'a']

4A¶

1)¶

In [1]:

# Read the whole file into one string; the file is closed when the block ends
with open('lec4_files/crops.txt','r') as crops_file:
    entire_file = crops_file.read()

# Split on newlines and print the last piece (the text after the final '\n')
lines_list = entire_file.split('\n')
print(lines_list[-1])

Ziziphus zizyphus |

2)¶

In [2]:

# Count the lines of the file that begin with 'Triticum'
with open('lec4_files/crops.txt','r') as crops_file:
    triticum_count = sum(1 for line in crops_file if line.startswith('Triticum'))
print(triticum_count)

4B¶

In [6]:

# new function
def parse_fasta_30_nuc(file_name):
    """
    Receives a path to a fasta file, and returns a dictionary where the keys
    are the sequence gb accession numbers and the values are the first 30
    nucleotides of the sequences.

    The accession is the 4th '|'-separated field of each header line.
    A sequence may span several lines: lines are concatenated until 30
    nucleotides have been collected, and blank lines are ignored.
    A record whose header is not followed by any sequence line gets no entry.

    Raises ValueError if sequence data appears before the first header line.
    """
    prefix_length = 30
    # create an empty dictionary to store the sequences
    sequences = {}
    gb = None            # accession of the record being read
    new_record = False   # True until the first sequence line of a record is seen
    # open fasta file for reading
    with open(file_name,'r') as f:
        # Loop over file lines
        for line in f:
            # if header line
            if line.startswith('>'):
                # strip first so a trailing newline never ends up in the accession
                gb = line.strip().split('|')[3]
                new_record = True
                continue
            chunk = line.strip()
            # blank lines carry no nucleotides and must not touch the record
            if not chunk:
                continue
            if gb is None:
                raise ValueError('sequence data found before the first fasta header')
            if new_record:
                # first sequence line of this record (replaces an earlier
                # record with the same accession, if any)
                sequences[gb] = chunk[:prefix_length]
                new_record = False
            elif len(sequences[gb]) < prefix_length:
                # continuation line: top up only what is still missing
                missing = prefix_length - len(sequences[gb])
                sequences[gb] += chunk[:missing]
    return sequences

# Parse the fasta file into {accession: first 30 nucleotides}
camelus_seq = parse_fasta_30_nuc('lec4_files/camelus.fasta')

# Write one "accession: sequence" line per record to the output file
with open('lec4_files/4b_output.txt','w') as of:
    for gb_id, first_30 in camelus_seq.items():
        print(gb_id + ':', first_30, file=of)

4C¶

In [11]:

def mean_of_string_values(lst):
    """
    Convert every string in lst to a float and return the mean of the
    converted values (computed with `mean`).
    """
    return mean([float(text) for text in lst])

experiments_file = 'lec4_files/electrolyte_leakage.csv'
# Open the input first, then the output; both are closed when the block ends
with open(experiments_file, 'r') as f, open('lec4_files/4c_output.csv','w', newline='') as fo:
    csv_writer = csv.writer(fo)
    # header line of the output table
    csv_writer.writerow(['Accession','control mean','test mean'])
    csv_reader = csv.reader(f)
    # skip the first line of the input
    next(csv_reader)
    for row in csv_reader:
        # column 0: accession, columns 1-3: control values, columns 4+: test values
        acc, control, test = row[0], row[1:4], row[4:]
        control_mean = mean_of_string_values(control)
        test_mean = mean_of_string_values(test)
        to_write = [acc, control_mean, test_mean]
        csv_writer.writerow(to_write)

4D¶

In [23]:

genes = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp','map492ty']

# 1. a 'd' or an 'e' anywhere in the name
regex1 = re.compile(r'[de]')
# 2. a 'd' followed, at any distance, by an 'e'
regex2 = re.compile(r'd[^e]*e')
# 3. at least three consecutive digits
regex3 = re.compile(r'\d{3,}')

# Each task: the heading to print and the pattern a gene name must contain
tasks = [
    ('Gene names containing d or e:', regex1),
    ('Gene names containing d and e, in that order:', regex2),
    ('Gene names containing three digits in a row:', regex3),
]

for task_index, (heading, pattern) in enumerate(tasks):
    # separator line between consecutive tasks
    if task_index > 0:
        print('------------------------')
    print(heading)
    for gene in genes:
        if pattern.search(gene):
            print(gene)

Gene names containing d or e:
yhdck2
eihd39d9
chdsye847
hedle3455
xjhd53e
45da
de37dp
------------------------
Gene names containing d and e, in that order:
chdsye847
hedle3455
xjhd53e
de37dp
------------------------
Gene names containing three digits in a row:
xkn59438
chdsye847
hedle3455
map492ty

4E¶

1)¶

In [ ]:

def count_promoters_with_motif(promoters_dictionary):
    """
    Receives a dictionary representing a promoters fasta file and returns
    the number of its values (promoter sequences) for which
    check_for_GATA4 reports a GATA-4 motif.
    """
    return sum(
        1
        for promoter_seq in promoters_dictionary.values()
        if check_for_GATA4(promoter_seq)
    )

2)¶

In [ ]:

def get_positions_statistics(promoters_dictionary):
    """
    Receives a dictionary representing a promoters fasta file and tallies,
    for each variable motif position (D, M, R, S), how many times each
    allowed nucleotide occurs among the promoters that include the GATA-4
    motif.

    Returns the four count dictionaries in the order D, M, R, S.
    """
    # one counter per variable position, holding the nucleotides allowed there
    D_dict = dict.fromkeys('AGT', 0)
    M_dict = dict.fromkeys('AC', 0)
    R_dict = dict.fromkeys('AG', 0)
    S_dict = dict.fromkeys('CG', 0)

    for promoter_seq in promoters_dictionary.values():
        # promoters without the motif contribute nothing
        if not check_for_GATA4(promoter_seq):
            continue
        # nucleotides found at the four variable positions of this promoter
        D, M, R, S = extract_ambiguous_for_GATA4(promoter_seq)
        for counts, nucleotide in ((D_dict, D), (M_dict, M), (R_dict, R), (S_dict, S)):
            counts[nucleotide] += 1

    return D_dict, M_dict, R_dict, S_dict

3)¶

In [ ]:

def summarize_results(D_dict, M_dict, R_dict, S_dict, output_file):
    """
    Write the nucleotide counts of the four variable positions to a CSV file.

    D_dict, M_dict, R_dict, S_dict -- count dictionaries for the D, M, R and
        S positions, keyed by nucleotide ('A','G','T' / 'A','C' / 'A','G' /
        'C','G' respectively).
    output_file -- path of the CSV file to create (overwritten if present).

    The file has a header line 'Position,A,G,C,T' followed by one row per
    position; nucleotides that a position does not allow are written as 0.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an extra blank line on Windows
    with open(output_file, 'w', newline='') as fo:
        csv_writer = csv.writer(fo)
        # write headers line
        csv_writer.writerow(['Position','A','G','C','T'])
        # summarize D position
        csv_writer.writerow(['D',D_dict['A'],D_dict['G'],0,D_dict['T']])
        # summarize M position
        csv_writer.writerow(['M',M_dict['A'],0,M_dict['C'],0])
        # summarize R position
        csv_writer.writerow(['R',R_dict['A'],R_dict['G'],0,0])
        # summarize S position
        csv_writer.writerow(['S',0,S_dict['G'],S_dict['C'],0])

4)¶

In [ ]:

# Input fasta file and output CSV file
promoters_file = "lec4_files/GATA4_promoters.fasta"
output_file = "lec4_files/promoters_stats.csv"

# Load the promoters into a dictionary
promoters_dict = parse_promoters_fasta(promoters_file)

# Split the promoters into those with and those without the GATA-4 motif
promoters_with_motif = count_promoters_with_motif(promoters_dict)
promoters_without_motif = len(promoters_dict) - promoters_with_motif
total_promoters = promoters_with_motif + promoters_without_motif
print('Total promoters:',total_promoters)
print('Promoters with GATA-4 motif:',promoters_with_motif)
print('Promoters without GATA-4 motif:',promoters_without_motif)

# Nucleotide counts at the variable positions, saved as a CSV table
position_stats = get_positions_statistics(promoters_dict)
D_dict, M_dict, R_dict, S_dict = position_stats
summarize_results(D_dict, M_dict, R_dict, S_dict, output_file)

6A¶

In [20]:

from Bio.Seq import Seq
def antisense_string_to_protein_seq(DNA_string):
    """
    Wrap DNA_string (the antisense strand) in a Seq, take its reverse
    complement to obtain the sense strand, and return the translation of
    that strand.
    """
    return Seq(DNA_string).reverse_complement().translate()

# Translate an example antisense strand and check the result
antisense_DNA = "TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC"
protein = antisense_string_to_protein_seq(antisense_DNA)
print(protein)
# the translation must have the expected residues and still be a Seq object
protein_text = str(protein)
assert protein_text == 'DSPWESRRVMLPV'
assert isinstance(protein,Seq)

6B¶

In [24]:

from Bio import SeqIO
def get_unique_species(gb_file):
    """
    Parse gb_file as a GenBank file and return the set of distinct species
    names, where the species name is the second word of each record's
    'organism' annotation.
    """
    return {
        seq_record.annotations['organism'].split()[1]
        for seq_record in SeqIO.parse(gb_file,'genbank')
    }

# Collect the distinct species of the orchids file and check how many there are
orchids_species = get_unique_species('lec6_files/Orchids.gbk')
print(orchids_species)
species_count = len(orchids_species)
assert species_count == 92

{'segawai', 'kaiteurum', 'caricinum', 'mastersianum', 'formosanum', 'hookerae', 'victoria', 'lichiangense', 'passerinum', 'charlesworthii', 'bougainvilleanum', 'venustum', 'stonei', 'sanderianum', 'hirsutissimum', 'urbanianum', 'superbiens', 'supardii', 'appletonianum', 'armeniacum', 'papuanum', 'ciliolare', 'philippinense', 'haynaldianum', 'lindleyanum', 'bullenianum', 'sargentianum', 'pearcei', 'barbigerum', 'calceolus', 'purpuratum', 'xerophyticum', 'dayanum', 'henryanum', 'callosum', 'fasciculatum', 'gratrixianum', 'insigne', 'caudatum', 'macranthon', 'delenatii', 'primulinum', 'longifolium', 'californicum', 'rothschildianum', 'parviflorum', 'warszewiczianum', 'parishii', 'lawrenceanum', 'acmodontum', 'glanduliferum', 'godefroyae', 'wallisii', 'wardii', 'czerwiakowianum', 'fowliei', 'hennisianum', 'argus', 'margaritaceum', 'kolopakingii', 'adductum', 'niveum', 'exstaminodium', 'glaucophyllum', 'besseae', 'malipoense', 'dianthum', 'micranthum', 'concolor', 'tigrinum', 'boissierianum', 'tonsum', 'emersonii', 'fairrieanum', 'villosum', 'reginae', 'schlimii', 'druryi', 'barbatum', 'schoseri', 'guttatum', 'lindenii', 'flavum', 'acaule', 'lowii', 'himalaicum', 'exul', 'sukhakulii', 'yatabeanum', 'irapeanum', 'bellatulum', 'javanicum'}

6C¶

In [ ]:

from Bio.Seq import Seq
from Bio.Blast import NCBIWWW, NCBIXML
def longest_blast_hit(seq):
    """
    Run an online blastn search of seq against the "nt" database and return
    the title of the alignment with the largest `length` attribute.

    When several alignments share the largest length, the first one in the
    result wins. Returns None when the search yields no alignment with a
    positive length (previously this case raised UnboundLocalError).
    """
    result_handle = NCBIWWW.qblast("blastn", "nt", seq)
    blast_record = NCBIXML.read(result_handle)
    longest = 0
    name = None   # stays None if no alignment beats the initial length of 0
    for hit in blast_record.alignments:
        # strict '>' keeps the earliest alignment among equally long ones
        if hit.length > longest:
            longest = hit.length
            name = hit.title
    return name

# The second '|'-separated field of the returned hit title must be the expected identifier
hit_title = longest_blast_hit(seq)
hit_fields = hit_title.split('|')
assert hit_fields[1] == '47776119'

7A¶

In [ ]:

# Element-wise square root of data
sqrt_data = np.sqrt(data)
# Preview: the first 3 rows and first 5 columns
corner = sqrt_data[:3,:5]
print(corner)

7B¶

In [ ]:

# For every row of data, the column index at which its maximum occurs
max_positions = data.argmax(axis=1)
print(max_positions)

7C¶

In [ ]:

# x: one index per row of data; y: column index of that row's maximum
patients = range(data.shape[0])
peak_days = data.argmax(axis=1)
scatter(patients, peak_days)
xlabel('patient')
ylabel('day of max inflammation')

8A¶

In [1]:

# Number of samples, and the binomial parameters (trials per sample, success probability)
size, n, p = 1000, 100, 0.5

# Draw the samples and plot their histogram
data = np.random.binomial(n=n, p=p, size=size)
plt.hist(data);

8B¶

In [5]:

# Simulation parameters read by simulate(): populations are updated only while
# strictly between 0 and N-1, and n * p scales the Poisson mean per individual
N, n, p = 1000, 3, 0.34
# Number of independent populations, and the time steps to show
ntrials = 100
nsteps_list = [10, 100, 1000]
# 25 evenly spaced histogram bin edges covering 0..N
bins = np.linspace(0, N, num=25)

In [6]:

def simulate(x, nsteps):
    """
    Advance the population sizes in x, in place, by nsteps - 1 updates.

    In every update, each entry that is strictly between 0 and N-1 is
    replaced by a Poisson draw with mean entry * n * p; all other entries
    are left unchanged. N, n and p are read from the global scope.
    Nothing is returned.
    """
    for _step in range(1, nsteps):
        # populations that are neither extinct nor at the upper bound
        active = (x > 0) & (x < N - 1)
        # draw the next population size for the active trials only
        x[active] = poisson(x[active] * n * p)

In [7]:

# Random initial population size for every trial
x_initial = randint(size=ntrials, low=0, high=N)
fig, ax = subplots(1, 3, figsize=(15,5), sharex=True, sharey=False)
for i, nsteps in enumerate(nsteps_list):
    # simulate() changes its argument in place, so every panel restarts from
    # a fresh copy of the initial state. Reusing one array would accumulate
    # the steps of the earlier panels and the titles would no longer match
    # the number of time steps actually simulated.
    x = x_initial.copy()
    simulate(x, nsteps)
    ax[i].hist(x, bins=bins)
    ax[i].set_title("%d time steps" % nsteps)
    # label the y axis on the first panel and the x axis on the middle one
    if i == 0:
        ax[i].set_ylabel("Frequency")
    if i == 1:
        ax[i].set_xlabel("# males")

8C¶

In [8]:

import seaborn as sns
def simulate(x0, n, p, ntrials=100):
    """
    Run ntrials independent populations, all starting at size x0, until
    none of them is strictly between 0 and N-1 any more, and return the
    array of final population sizes.

    In every round, each still-active population of size k is replaced by
    a binomial draw with k * n trials and success probability p. N is read
    from the global scope.
    """
    x = x0 * ones(ntrials, dtype=int)
    while True:
        # populations that are neither extinct nor at the upper bound
        active = (x > 0) & (x < N - 1)
        if not active.any():
            break
        # draw the next population size for the active trials only
        x[active] = binomial(x[active] * n, p)
    return x

In [9]:

# Upper bound on population size (read by simulate), initial size, and the
# factor that multiplies the population size to give the binomial trial count
N  = 1000
x0 = 1
n  = 3

# Probabilities to scan
p_range = arange(0.25,0.4,0.005)
# prob[i, j] is True when trial j ended extinct (size 0) for p_range[i]
prob = array([simulate(x0,n,p)==0 for p in p_range])

# Extinction frequency over the trials (axis 1) for every p
mean_prob = prob.mean(axis=1)
# Standard error of that mean: the sample size is the number of trials,
# i.e. prob.shape[1], not the number of scanned p values (prob.shape[0])
sem_prob = prob.std(axis=1) / sqrt(prob.shape[1])

In [10]:

# Mean extinction probability for every p, with its error bars
errorbar(x=p_range, y=mean_prob, yerr=sem_prob)
# Dashed black vertical reference line at p = 1/n
threshold_p = 1./n
axvline(x=threshold_p, color='k', ls='--')
xlabel('probability for reproductive son')
ylabel('extinction probability')
sns.despine();

In [ ]:

Python Programming for Biologists, Tel-Aviv University / 0411-3122 / Spring 2015¶

Class-exercise solutions¶

1A¶

1B¶

1C¶

1D¶

2A¶

2B¶

2C¶

2D¶

1)¶

2)¶

3)¶

3A¶

1)¶

2)¶

3B¶

1)¶

2)¶

4A¶

1)¶

2)¶

4B¶

4C¶

4D¶

4E¶

1)¶

2)¶

3)¶

4)¶

6A¶

6B¶

6C¶

7A¶

7B¶

7C¶

8A¶

8B¶

8C¶

Python Programming for Biologists, Tel-Aviv University / 0411-3122 / Spring 2015 ¶