# Open the reference file (fasta) and loop through it, removing the first line (starts with >)
# Sum the length of the lines, removing the newline character
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
length = 0
for line in f:
if not line.startswith(">"):
length += len(line.strip()) # add the length of the line, without newline character
print(length)
f.close()
159345973
# Open the GTF file and loop throught it.
# To find the genes, check the 3rd element of each line and add 1 if the feature is "gene"
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
genes = 0
for line in g:
if not line.startswith("#"):
feature = line.split("\t")[2]
if feature == "gene":
genes +=1
print(genes)
g.close()
58395
All following tasks are related to the CFTR gene.
# Open the GTF file and loop throught it.
# To find the number of transcripts, check the 3rd element for "transcript" while the specific gene exists
# Can also be solved with bash using:
# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep "\ttranscript\t" | wc -l
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
# Find number of transcripts
transcripts = 0
for line in g:
if not line.startswith("#"):
entries = line.split("\t")
if "ENSG00000001626" in line and entries[2] == "transcript":
transcripts += 1
print("Number of transcripts: ", transcripts)
g.close()
Number of transcripts: 11
# Open the GTF file and loop throught it.
# Find all the instances of transcripts for specific gene
# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)
# If longer than previous longest, store it and continue
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
# Find longest transcript
longest_transcript = (0, '') # update the list with length and transcript ID with a for loop
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line and entries[2] == "transcript":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
length = int(entries[4]) - int(entries[3]) + 1
if length > longest_transcript[0]:
longest_transcript = (length, transcriptID)
print("Longest transcript: ", longest_transcript[1], "\nLength: ", longest_transcript[0])
g.close()
Longest transcript: "ENST00000003084" Length: 188703
# Get the longest transcript from the previous step
# and extract the start and end position from the columns 3,4
# Save the whole sequence from the fasta file to a variable
# and get the start-1 to end from this variable
# (incl. its start and end positions)
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line and entries[2] == "transcript":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start = int(entries[3])
end = int(entries[4])
print(transcriptID, start, end)
g.close()
# Extract the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
# Store the DNA sequence of chromosome 7 in a variable
seqList = []
for line in f:
if not line.startswith(">"):
seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
f.close()
# Store DNA sequence of longest transcript in variable
cftr = chr7[start-1:end]
print(cftr[:10], "...")
d = open("CFTR_longest_transcript.txt", 'w')
d.write(cftr)
d.close()
"ENST00000003084" 117479963 117668665 AATTGGAAGC ...
Check the sequence with the package utils.check_answers (note that I moved the directory "utils" to the directory where the Jupyter notebook is)
from utils import check_answers
check_answers.ex3("CFTR_longest_transcript.txt")
The sequences are matching!
# Open the GTF file and loop throught it. Get the same as in the previous question
# but store all of the transcripts in a set
# Create a variable from the other file, like in the previous question
# Loop through the set in the other file and pick the positions of the exons
# Get the longest transcript and its exons
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
exons = []
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line and entries[2] == "exon":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
g.close()
# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
if not line.startswith(">"):
seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
f.close()
mRNA = "" # add exon sequences to string
for exon in exons:
mRNA += chr7[exon[0]-1:exon[1]]
print(mRNA[:10], "...")
d = open("CFTR_longest_transcript_exons.txt", 'w')
d.write(mRNA)
d.close()
AATTGGAAGC ...
Check the sequence with the package utils.check_answers
# no need to import again as it was imported in a cell above
from utils import check_answers
check_answers.ex4("CFTR_longest_transcript_exons.txt")
The sequences are matching!
# Get the longest transcript incl. its start and stop codon positions
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line and entries[2] == 'start_codon' or entries[2] == 'stop_codon': # check for start or stop codons
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
if entries[2] == 'start_codon':
start_codon = int(entries[3]) # save start codon
elif entries[2] == 'stop_codon':
stop_codon = int(entries[3]) # save stop codon
g.close()
# Extract all start and stop codon of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
if not line.startswith(">"):
seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
f.close()
if chr7[start_codon-1:start_codon+2] == "ATG":
print("The start codon is", chr7[start_codon-1:start_codon+2], "and starts at position", start_codon)
else:
print("The start codon is not ATG")
if chr7[stop_codon-1:stop_codon+2] == "TAG" or chr7[stop_codon-1:stop_codon+2] == "TAA" or chr7[stop_codon-1:stop_codon+2] == "TGA":
print("The stop codon is", chr7[stop_codon-1:stop_codon+2], "and starts at position", stop_codon)
else:
print("The stop codon does not correspond to a proper stop codon")
The start codon is ATG and starts at position 117480095 The stop codon is TAG and starts at position 117667106
# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
exons = []
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line:
if entries[2] == "exon":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif entries[2] == 'start_codon':
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start_codon = int(entries[3])
print("The first exon has the positions", exons[0])
print("The start codon has the position", start_codon)
g.close()
# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
if not line.startswith(">"):
seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
f.close()
mRNA = "" # add exon sequences to string
for exon in exons:
mRNA += chr7[exon[0]-1:exon[1]]
# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
print("The start codon is not ATG")
else:
start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon
from utils.rna import translate_dna
aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on
print(aminoacids[:10], "...")
a = open("CFTR_longest_transcript_aminoacids.txt", 'w')
a.write(aminoacids)
a.close()
The first exon has the positions (117479963, 117480147) The start codon has the position 117480095 MQRSPLEKAS ...
Check the sequence with the package utils.check_answers
check_answers.ex6("CFTR_longest_transcript_aminoacids.txt")
The sequences are matching!
# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
exons = []
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line:
if entries[2] == "exon":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif entries[2] == 'start_codon':
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start_codon = int(entries[3])
g.close()
# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
if not line.startswith(">"):
seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
f.close()
mRNA = "" # add exon sequences to string
for exon in exons:
mRNA += chr7[exon[0]-1:exon[1]]
# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
print("The start codon is not ATG")
else:
start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon
from utils.rna import translate_dna
aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on
i = 1 # index for loop through patient fasta files
while i < 6:
p = open("Patient" + str(i) + ".fa", 'r')
patientSeqList = [] # store the DNA sequence of the patient in a list
for line in p:
if not line.startswith(">"):
patientSeqList.append(line.strip())
patientSeq = "".join(patientSeqList) # construct one sequence for the currently analyzed patient
patientmRNA = "" # append exon sequences to string. Works only for small strings.
for exon in exons:
patientmRNA += patientSeq[exon[0]-1:exon[1]]
# Translate to aminoacids
patient_aminoacids = translate_dna(patientmRNA[start:]) # translate the mRNA sequence from the start codon on
if not len(patient_aminoacids) == len(aminoacids):
print("The sequence of patient ", str(i), " has a different length than the reference genome sequence.")
p.close()
i += 1
The sequence of patient 3 has a different length than the reference genome sequence.
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
exons = []
for line in g:
if not line.startswith("#"):
entries = line.strip().split("\t")
if "ENSG00000001626" in line:
if entries[2] == "exon":
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif entries[2] == 'start_codon':
attribute = entries[8].split("; ")
transcriptID = attribute[2].strip().split(' ')[1]
if '"ENST00000003084"' == transcriptID:
start_codon = int(entries[3])
g.close()
# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
records = list(SeqIO.parse(f, "fasta")) # BioPython
if records[0].id == "7": # BioPython
chr7 = records[0].seq # BioPython
f.close()
mRNA = "" # add exon sequences to string
for exon in exons:
mRNA += chr7[exon[0]-1:exon[1]]
# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
print("The start codon is not ATG")
else:
start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon
aminoacids = mRNA[start:].translate(to_stop=True) # BioPython
i = 1 # index for loop through patient fasta files
while i < 6:
p = "Patient" + str(i) + ".fa"
patientRecords = list(SeqIO.parse(p, "fasta")) # BioPython
if patientRecords[0].id == "7": # BioPython
patientSeq = patientRecords[0].seq # BioPython
patientmRNA = "" # append exon sequences to string. Works only for small strings.
for exon in exons:
patientmRNA += patientSeq[exon[0]-1:exon[1]]
# Translate to aminoacids
patient_aminoacids = patientmRNA[start:].translate(to_stop=True) # BioPython
if not len(patient_aminoacids) == len(aminoacids):
print("The sequence of patient ", str(i), " has a different length than the reference genome sequence.")
i += 1
The sequence of patient 3 has a different length than the reference genome sequence.