# emit the three output lines, one print call per line
total = 7 + 6
for message in (("I'm a genius!",), ("7 + 6 =", total), ("My name is Inigo Montoya",)):
    print(*message)
# Output:
# I'm a genius!
# 7 + 6 = 13
# My name is Inigo Montoya
# the two legs of the right triangle
a, b = 10, 4.7
# Pythagoras: square root of the sum of the squared legs
sum_of_squares = a**2 + b**2
c = sum_of_squares**0.5
# show the hypotenuse
print(c)
# Output:
# 11.04943437466371
# year to classify
year = 1492
# leap years are divisible by 400, or by 4 but not by 100
is_leap = year % 400 == 0 or (year % 4 == 0 and year % 100 != 0)
verdict = "is a leap year" if is_leap else "is not a leap year"
print(year, verdict)
# Output:
# 1492 is a leap year
m = 555  # starting integer for the conjecture
n = m
while n != 1:
    print(n, end=", ")
    # even values are halved, odd values become 3n + 1
    n = n // 2 if n % 2 == 0 else 3 * n + 1
print(1)  # the loop exits before the final 1 is printed
print(m, "is OK")
# Output:
# 555, 1666, 833, 2500, 1250, 625, 1876, 938, 469, 1408, 704, 352, 176, 88, 44, 22, 11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1
# 555 is OK
seq = "CAAGTAATGGCAGCCATTAA"
# 2nd nucleotide, 7th nucleotide, and 2nd nucleotide from the tail
for position in (1, 6, -2):
    print(seq[position])
# split the sequence at index 10
first_half, second_half = seq[:10], seq[10:]
print(first_half)
print(second_half)
# ten nucleotides starting at index 5
middle = seq[5:15]
print(middle)
# Output:
# A
# A
# A
# CAAGTAATGG
# CAGCCATTAA
# AATGGCAGCC
# combine the two lists (birds and snakes are defined outside this snippet)
animals = birds + snakes
# add Mus musculus, then drop the Corvus corone element
animals.append('Mus musculus')
animals.remove('Corvus corone')
# print elements 1 through 4, sorted
selected = animals[1:5]
print(sorted(selected))
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'
charged = ['R','H','K','D','E']
# count the residues that belong to the charged list
charged_count = sum(1 for residue in insulin if residue in charged)
insulin_length = len(insulin)
# fraction of the whole sequence that is charged
charged_ratio = charged_count / insulin_length
print("Ratio of charged amino acids is:", charged_ratio)
# Output:
# Ratio of charged amino acids is: 0.17272727272727273
seq = "ACTGATCGATTACGTATAGTAGAATTCTATCATACATATATATCGATGCGTTCAT"
# split around the recognition site; the site itself is dropped by split()
site = 'GAATTC'
fragments = seq.split(site)
upstream, downstream = fragments[0], fragments[1]
# the cut falls after the site's leading 'G': the first fragment keeps
# the 'G', the second keeps the remaining 'AATTC'
f1_length = len(upstream) + 1
f2_length = len(downstream) + 5
print('Fragment lengths of', f1_length, 'and', f2_length, 'will be produced.')
# Output:
# Fragment lengths of 22 and 33 will be produced.
# complementary partner of each valid base
pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
complement = ''
for base in seq:
    if base in pairing:
        complement += pairing[base]
    else:
        # unknown characters are reported and left out of the complement
        print("Bad base:", base)
print("Complement:", complement)
# Output:
# Complement: TGACTAGCTAATGCATATCATCTTAAGATAGTATGTATATATAGCTACGCAAGTA
# print each ape followed by the picture stored at the same index
for i, ape in enumerate(apes):
    print(ape)
    display(Image(url=ape_pics[i]))
# personal details keyed by field name
details_dict = {
    'Name': 'James Watson',
    'Address': 'Cambridge',
    'Phone': '12345678',
}
# look the fields up once, then print the sentence
name = details_dict['Name']
address = details_dict['Address']
phone = details_dict['Phone']
print("My name is", name, "I live in", address, "My phone number is", phone)
# Output:
# My name is James Watson I live in Cambridge My phone number is 12345678
# build all 64 codons; the first base varies slowest, the third fastest
bases = ['t', 'c', 'a', 'g']
codons = []
for first in bases:
    for second in bases:
        for third in bases:
            codons.append(first + second + third)
# one amino acid letter per codon, in the same order as codons
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
codon_table = {codon: amino_acid for codon, amino_acid in zip(codons, amino_acids)}
# codons to translate
seq_list = ["atg","caa","ggc","ata","tca","tgg","cga","agg","cct","taa"]
# translate every codon and print the result without a trailing newline
translated = ''.join(codon_table[triplet] for triplet in seq_list)
print(translated, end='')
# Output:
# MQGISWRRP*
# define function
def first_5_longer_sequence(seq1,seq2):
    """
    Returns the first 5 characters of the longer of the two sequences.
    When both have the same length, seq2 is used.
    """
    longer = seq1 if len(seq1) > len(seq2) else seq2
    return longer[:5]


# Test function
sequence1 = "aggtctcggatataggcgcgatattta"
sequence2 = "ttaagccacgcttcggatta"
first_5 = first_5_longer_sequence(sequence1, sequence2)
print(first_5)
# Output:
# aggtc
# define function
def odd_bases(seq):
    """
    Returns a list with the 1st, 3rd, 5th, ... characters of seq
    (those at even 0-based indices).
    """
    return [base for index, base in enumerate(seq) if index % 2 == 0]


# Test function
odd_bases_list = odd_bases("aggtctcggatataggcgcgatattta")
print(odd_bases_list)
# Output:
# ['a', 'g', 'c', 'c', 'g', 't', 't', 'g', 'c', 'c', 'a', 'a', 't', 'a']

# Or another alternative, using string slicing:
# define function
def odd_bases(seq):
    """
    Returns a list with every second character of seq, starting from
    the first one.
    """
    every_second = seq[::2]
    return list(every_second)


# Test function
odd_bases_list = odd_bases("aggtctcggatataggcgcgatattta")
print(odd_bases_list)
# Output:
# ['a', 'g', 'c', 'c', 'g', 't', 't', 'g', 'c', 'c', 'a', 'a', 't', 'a']
# read the whole crops file into a single string
with open('lec4_files/crops.txt','r') as crops_file:
    entire_file = crops_file.read()
# break it on newline characters and print the last piece
lines_list = entire_file.split('\n')
last_line = lines_list[-1]
print(last_line)
# Output:
# Ziziphus zizyphus |
# count the lines of the crops file that begin with 'Triticum'
with open('lec4_files/crops.txt','r') as crops_file:
    triticum_count = sum(1 for line in crops_file if line.startswith('Triticum'))
print(triticum_count)
# Output:
# 6
# new function
def parse_fasta_30_nuc(file_name):
    """
    Receives a path to a fasta file, and returns a dictionary where the keys
    are the sequence gb accession numbers and the values are the first 30
    nucleotides of the sequences.

    The accession is taken from the 4th '|'-separated field of each header
    line. Sequences that span several lines are handled: lines are
    accumulated until 30 nucleotides have been collected. Blank lines and
    sequence lines that appear before the first header are ignored.
    Records with a header but no sequence lines are not included.
    """
    prefix_length = 30
    # create an empty dictionary to store the sequences
    sequences = {}
    gb = None        # accession of the record currently being read
    started = False  # True once the current record has stored some sequence
    # open fasta file for reading
    with open(file_name,'r') as f:
        # Loop over file lines
        for line in f:
            # if header line
            if line.startswith('>'):
                gb = line.split('|')[3]
                # a repeated accession starts over instead of extending
                # the sequence stored for its earlier occurrence
                started = False
                continue
            # if sequence line
            chunk = line.strip()
            if gb is None or not chunk:
                continue
            collected = sequences[gb] if started else ''
            # keep extending only until the prefix is complete
            if len(collected) < prefix_length:
                sequences[gb] = (collected + chunk)[:prefix_length]
            started = True
    return sequences
# parse the camelus fasta file into {accession: sequence prefix}
camelus_seq = parse_fasta_30_nuc('lec4_files/camelus.fasta')
# write one "<accession>: <sequence>" line per record to the output file
with open('lec4_files/4b_output.txt','w') as of:
    for gb_id, nucleotides in camelus_seq.items():
        print(gb_id + ':', nucleotides, file=of)
def mean_of_string_values(lst):
    """
    Converts every string in lst to a float and returns the mean of
    the resulting numbers.
    """
    return mean([float(value) for value in lst])
experiments_file = 'lec4_files/electrolyte_leakage.csv'
# read the experiments table and write one row of means per accession
with open(experiments_file, 'r') as f, open('lec4_files/4c_output.csv','w', newline='') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(['Accession','control mean','test mean'])
    csv_reader = csv.reader(f)
    # skip the header line of the input
    next(csv_reader)
    for row in csv_reader:
        acc = row[0]
        # columns 1-3 are averaged as control, the remaining ones as test
        control_mean = mean_of_string_values(row[1:4])
        test_mean = mean_of_string_values(row[4:])
        csv_writer.writerow([acc, control_mean, test_mean])
genes = ['xkn59438', 'yhdck2', 'eihd39d9', 'chdsye847', 'hedle3455', 'xjhd53e', '45da', 'de37dp','map492ty']
# 1. contains d or e
regex1 = re.compile(r'[de]')
# 2. a d followed, possibly after other characters, by an e
regex2 = re.compile(r'd[^e]*e')
# 3. at least three consecutive digits
regex3 = re.compile(r'\d{3,}')
# (heading, pattern, whether a separator line follows the matches)
reports = [
    ('Gene names containing d or e:', regex1, True),
    ('Gene names containing d and e, in that order:', regex2, True),
    ('Gene names containing three digits in a row:', regex3, False),
]
for heading, pattern, add_separator in reports:
    print(heading)
    for gene in genes:
        if pattern.search(gene):
            print(gene)
    if add_separator:
        print('------------------------')
# Output:
# Gene names containing d or e:
# yhdck2
# eihd39d9
# chdsye847
# hedle3455
# xjhd53e
# 45da
# de37dp
# ------------------------
# Gene names containing d and e, in that order:
# chdsye847
# hedle3455
# xjhd53e
# de37dp
# ------------------------
# Gene names containing three digits in a row:
# xkn59438
# chdsye847
# hedle3455
# map492ty
def count_promoters_with_motif(promoters_dictionary):
    """
    Receives a dictionary representing a promoters fasta file,
    and returns the number of its values for which check_for_GATA4
    gives a true result (promoters that include a GATA-4 motif).
    """
    return sum(
        1
        for promoter_seq in promoters_dictionary.values()
        if check_for_GATA4(promoter_seq)
    )
def get_positions_statistics(promoters_dictionary):
    """
    Receives a dictionary representing a promoters fasta file,
    and returns four dictionaries (for the D, M, R and S variable
    positions, in that order) mapping each possible nucleotide to the
    number of motif-containing promoters in which it was found.
    """
    # one counter per variable position, in D, M, R, S order
    position_counts = (
        {'A': 0, 'G': 0, 'T': 0},  # D position
        {'A': 0, 'C': 0},          # M position
        {'A': 0, 'G': 0},          # R position
        {'C': 0, 'G': 0},          # S position
    )
    # iterate over promoter sequences
    for promoter_seq in promoters_dictionary.values():
        # promoters without the GATA-4 motif do not contribute
        if not check_for_GATA4(promoter_seq):
            continue
        # get the nucleotide found at each variable position
        d_nuc, m_nuc, r_nuc, s_nuc = extract_ambiguous_for_GATA4(promoter_seq)
        # update the matching counter of every position
        for counts, nucleotide in zip(position_counts, (d_nuc, m_nuc, r_nuc, s_nuc)):
            counts[nucleotide] += 1
    return position_counts
def summarize_results(D_dict, M_dict, R_dict, S_dict, output_file):
    """
    Writes the nucleotide counts of the four variable positions to a CSV
    file. The file has a header line followed by one row per position
    (D, M, R, S) with the columns A, G, C, T. Nucleotides that are not
    tracked for a position are written as 0.

    D_dict needs the keys 'A', 'G', 'T'; M_dict 'A', 'C'; R_dict 'A', 'G';
    S_dict 'C', 'G'. output_file is the path of the CSV file to create
    (an existing file is overwritten).
    """
    # newline='' lets the csv module control line endings; without it
    # extra blank rows appear on platforms that translate '\n'
    with open(output_file, 'w', newline='') as fo:
        csv_writer = csv.writer(fo)
        # write headers line
        csv_writer.writerow(['Position','A','G','C','T'])
        # summarize D position
        csv_writer.writerow(['D',D_dict['A'],D_dict['G'],0,D_dict['T']])
        # summarize M position
        csv_writer.writerow(['M',M_dict['A'],0,M_dict['C'],0])
        # summarize R position
        csv_writer.writerow(['R',R_dict['A'],R_dict['G'],0,0])
        # summarize S position
        csv_writer.writerow(['S',0,S_dict['G'],S_dict['C'],0])
# input fasta file and output CSV file
promoters_file = "lec4_files/GATA4_promoters.fasta"
output_file = "lec4_files/promoters_stats.csv"

# parse fasta file
promoters_dict = parse_promoters_fasta(promoters_file)

# Count promoters with/without GATA-4 motif
promoters_with_motif = count_promoters_with_motif(promoters_dict)
promoters_without_motif = len(promoters_dict) - promoters_with_motif
total_promoters = promoters_with_motif + promoters_without_motif
print('Total promoters:', total_promoters)
print('Promoters with GATA-4 motif:', promoters_with_motif)
print('Promoters without GATA-4 motif:', promoters_without_motif)

# Get statistics and write them to CSV
D_dict, M_dict, R_dict, S_dict = get_positions_statistics(promoters_dict)
summarize_results(D_dict, M_dict, R_dict, S_dict, output_file)
from Bio.Seq import Seq
def antisense_string_to_protein_seq(DNA_string):
    """
    Receives an antisense DNA string, builds a Seq from it, takes its
    reverse complement and returns the translation of the result.
    """
    return Seq(DNA_string).reverse_complement().translate()


antisense_DNA = "TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC"
protein = antisense_string_to_protein_seq(antisense_DNA)
print(protein)
# the result should be a Seq with the expected amino acid sequence
assert isinstance(protein, Seq)
assert str(protein) == 'DSPWESRRVMLPV'
from Bio import SeqIO
def get_unique_species(gb_file):
    """
    Receives a path to a genbank file and returns the set of species
    found in it, where the species is the second word of each record's
    'organism' annotation.
    """
    return {
        seq_record.annotations['organism'].split()[1]
        for seq_record in SeqIO.parse(gb_file, 'genbank')
    }


orchids_species = get_unique_species('lec6_files/Orchids.gbk')
print(orchids_species)
assert len(orchids_species) == 92
{'segawai', 'kaiteurum', 'caricinum', 'mastersianum', 'formosanum', 'hookerae', 'victoria', 'lichiangense', 'passerinum', 'charlesworthii', 'bougainvilleanum', 'venustum', 'stonei', 'sanderianum', 'hirsutissimum', 'urbanianum', 'superbiens', 'supardii', 'appletonianum', 'armeniacum', 'papuanum', 'ciliolare', 'philippinense', 'haynaldianum', 'lindleyanum', 'bullenianum', 'sargentianum', 'pearcei', 'barbigerum', 'calceolus', 'purpuratum', 'xerophyticum', 'dayanum', 'henryanum', 'callosum', 'fasciculatum', 'gratrixianum', 'insigne', 'caudatum', 'macranthon', 'delenatii', 'primulinum', 'longifolium', 'californicum', 'rothschildianum', 'parviflorum', 'warszewiczianum', 'parishii', 'lawrenceanum', 'acmodontum', 'glanduliferum', 'godefroyae', 'wallisii', 'wardii', 'czerwiakowianum', 'fowliei', 'hennisianum', 'argus', 'margaritaceum', 'kolopakingii', 'adductum', 'niveum', 'exstaminodium', 'glaucophyllum', 'besseae', 'malipoense', 'dianthum', 'micranthum', 'concolor', 'tigrinum', 'boissierianum', 'tonsum', 'emersonii', 'fairrieanum', 'villosum', 'reginae', 'schlimii', 'druryi', 'barbatum', 'schoseri', 'guttatum', 'lindenii', 'flavum', 'acaule', 'lowii', 'himalaicum', 'exul', 'sukhakulii', 'yatabeanum', 'irapeanum', 'bellatulum', 'javanicum'}
from Bio.Seq import Seq
from Bio.Blast import NCBIWWW, NCBIXML
def longest_blast_hit(seq):
    """
    Runs an online blastn search of seq against the nt database and
    returns the title of the alignment with the largest length attribute.
    When several alignments share the largest length, the first one wins.

    Raises ValueError if no alignment with a positive length is returned.
    """
    result_handle = NCBIWWW.qblast("blastn", "nt", seq)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        # release the handle even when parsing fails
        result_handle.close()
    name = None
    longest = 0
    for hit in blast_record.alignments:
        if hit.length > longest:
            longest = hit.length
            name = hit.title
    # fail with a clear message instead of an unbound-variable error
    if name is None:
        raise ValueError("BLAST search returned no hits")
    return name


# NOTE(review): seq is not defined in this cell; it must already exist
assert longest_blast_hit(seq).split('|')[1] == '47776119'
# element-wise square root; show the top-left 3x5 corner
sqrt_data = np.sqrt(data)
print(sqrt_data[:3, :5])
# column index of the maximum of every row
peak_days = data.argmax(axis=1)
print(peak_days)
# one point per row: row index against the position of its maximum
patients = range(data.shape[0])
scatter(patients, peak_days)
xlabel('patient')
ylabel('day of max inflammation')
# draw `size` binomial samples with n trials and success probability p
size, n, p = 1000, 100, 0.5
data = np.random.binomial(n, p, size)
# histogram of the sampled values
plt.hist(data);
# simulation parameters
N, n, p = 1000, 3, 0.34
ntrials = 100
nsteps_list = [10, 100, 1000]
# 25 evenly spaced histogram bin edges between 0 and N
bins = np.linspace(0, N, 25)


def simulate(x, nsteps):
    """Run the simulation, modifying x in place for nsteps - 1 steps."""
    for _ in range(nsteps - 1):
        # only trials strictly between 0 and N-1 are still updated
        active = (x > 0) & (x < N - 1)
        # draw the new size of every active trial from a Poisson
        # distribution with mean x * n * p
        x[active] = poisson(x[active] * n * p)
# random starting value in [0, N) for every trial
x = randint(size=ntrials, low=0, high=N)
fig, ax = subplots(1, 3, figsize=(15,5), sharex=True, sharey=False)
# x is updated in place, so each panel continues from the previous one
for i, nsteps in enumerate(nsteps_list):
    simulate(x, nsteps)
    panel = ax[i]
    panel.hist(x, bins=bins)
    panel.set_title("%d time steps" % nsteps)
    # y label on the first panel only, x label on the second only
    if i == 0:
        panel.set_ylabel("Frequency")
    elif i == 1:
        panel.set_xlabel("# males")
import seaborn as sns
def simulate(x0, n, p, ntrials=100):
    """
    Start ntrials trials at x0 and keep redrawing every trial whose
    value is strictly between 0 and N-1 from a binomial distribution
    with x * n attempts and success probability p. Returns the array of
    final values once no trial is left in that range.
    """
    x = x0 * ones(ntrials, dtype=int)
    while True:
        # which trials are still running?
        active = (0 < x) & (x < N - 1)
        if not active.any():
            break
        # draw the next value of every running trial
        x[active] = binomial(x[active] * n, p)
    return x
# simulation parameters
N = 1000
x0 = 1
n = 3
p_range = arange(0.25,0.4,0.005)
# one row per p value, one column per trial; True where the trial ended at 0
prob = array([simulate(x0,n,p)==0 for p in p_range])
# extinction probability for every p: mean over the trials (axis 1)
mean_prob = prob.mean(axis=1)
# standard error of the mean: the sample size is the number of trials
# (prob.shape[1]), not the number of p values (prob.shape[0])
sem_prob = prob.std(axis=1) / sqrt(prob.shape[1])
errorbar(x=p_range, y=mean_prob, yerr=sem_prob)
# mark p = 1/n
axvline(x=1./n, color='k', ls='--')
xlabel('probability for reproductive son')
ylabel('extinction probability')
sns.despine();