Notebook

Warmup tasks¶

3. What is the length of chromosome 7 on the reference sequence?¶

In [2]:

# Open the reference file (fasta) and loop through it, removing the first line (starts with >)
# Sum the length of the lines, removing the newline character
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')
length = 0
for line in f:
    if not line.startswith(">"):
        length += len(line.strip()) # add the length of the line, without newline character
print(length)
f.close()

159345973

4. How many genes are annotated in the GTF file?¶

In [3]:

# Open the GTF file and loop throught it.
# To find the genes, check the 3rd element of each line and add 1 if the feature is "gene"
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')
genes = 0
for line in g:
    if not line.startswith("#"):
        feature = line.split("\t")[2]
        if feature == "gene":
            genes +=1
print(genes)
g.close()

Architect a method¶

All following tasks are related to the CFTR gene.

1. How many transcripts can the CFTR gene generate?¶

In [11]:

# Open the GTF file and loop throught it.
# To find the number of transcripts, check the 3rd element for "transcript" while the specific gene exists
# Can also be solved with bash using: 
# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep "\ttranscript\t" | wc -l
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

# Find number of transcripts
transcripts = 0
for line in g:
    if not line.startswith("#"):
        entries = line.split("\t")
        if "ENSG00000001626" in line and entries[2] == "transcript":
            transcripts += 1
print("Number of transcripts: ", transcripts)
g.close()

Number of transcripts:  11

2. Which of these transcripts is the longest transcript in nucleotides?¶

In [3]:

# Open the GTF file and loop throught it.
# Find all the instances of transcripts for specific gene
# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)
# If longer than previous longest, store it and continue
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

# Find longest transcript
longest_transcript = (0, '') # update the list with length and transcript ID with a for loop
for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line and entries[2] == "transcript":
            attribute = entries[8].split("; ")
            transcriptID = attribute[2].strip().split(' ')[1]
            length = int(entries[4]) - int(entries[3]) + 1
            if length > longest_transcript[0]:
                longest_transcript = (length, transcriptID)
                                
print("Longest transcript: ", longest_transcript[1], "\nLength: ", longest_transcript[0])

g.close()

Longest transcript:  "ENST00000003084" 
Length:  188703

3. Fetch the DNA sequence for that transcript.¶

In [4]:

# Get the longest transcript from the previous step 
# and extract the start and end position from the columns 3,4
# Save the whole sequence from the fasta file to a variable
# and get the start-1 to end from this variable 
# (incl. its start and end positions)
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line and entries[2] == "transcript":
            attribute = entries[8].split("; ")
            transcriptID = attribute[2].strip().split(' ')[1]
            if '"ENST00000003084"' == transcriptID:
                start = int(entries[3])
                end = int(entries[4])
                print(transcriptID, start, end)

g.close()

# Extract the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

# Store the DNA sequence of chromosome 7 in a variable
seqList = []
for line in f:
    if not line.startswith(">"):
        seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

f.close()

# Store DNA sequence of longest transcript in variable
cftr = chr7[start-1:end] 
print(cftr[:10], "...")

d = open("CFTR_longest_transcript.txt", 'w')
d.write(cftr)
d.close()

"ENST00000003084" 117479963 117668665
AATTGGAAGC ...

Check the sequence with the package utils.check_answers (note that I moved the directory "utils" to the directory where the Jupyter notebook is)

In [50]:

from utils import check_answers
check_answers.ex3("CFTR_longest_transcript.txt")

The sequences are matching!

4. Fetch the DNA sequences of all exons for that transcript, spliced together to one sequence.¶

In [4]:

# Open the GTF file and loop throught it. Get the same as in the previous question
# but store all of the transcripts in a set
# Create a variable from the other file, like in the previous question
# Loop through the set in the other file and pick the positions of the exons
# Get the longest transcript and its exons
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

exons = []

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line and entries[2] == "exon":
            attribute = entries[8].split("; ")
            transcriptID = attribute[2].strip().split(' ')[1]
            if '"ENST00000003084"' == transcriptID:
                start = int(entries[3])
                end = int(entries[4])
                exons.append((start, end))

g.close()

# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
    if not line.startswith(">"):
        seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

f.close()

mRNA = "" # add exon sequences to string
for exon in exons:
    mRNA += chr7[exon[0]-1:exon[1]]

print(mRNA[:10], "...")

d = open("CFTR_longest_transcript_exons.txt", 'w')
d.write(mRNA)
d.close()

AATTGGAAGC ...

Check the sequence with the package utils.check_answers

In [6]:

# no need to import again as it was imported in a cell above
from utils import check_answers
check_answers.ex4("CFTR_longest_transcript_exons.txt")

The sequences are matching!

5. What are the start positions and sequences of the start_codon and stop_codon for that transcript?¶

In [3]:

# Get the longest transcript incl. its start and stop codon positions
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line and entries[2] == 'start_codon' or entries[2] == 'stop_codon': # check for start or stop codons
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    if entries[2] == 'start_codon':
                        start_codon = int(entries[3]) # save start codon
                    elif entries[2] == 'stop_codon':
                        stop_codon = int(entries[3]) # save stop codon

g.close()

# Extract all start and stop codon of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
    if not line.startswith(">"):
        seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

f.close()

if chr7[start_codon-1:start_codon+2] == "ATG":
    print("The start codon is", chr7[start_codon-1:start_codon+2], "and starts at position", start_codon)
else:
    print("The start codon is not ATG")

if chr7[stop_codon-1:stop_codon+2] == "TAG" or chr7[stop_codon-1:stop_codon+2] == "TAA" or chr7[stop_codon-1:stop_codon+2] == "TGA":
    print("The stop codon is", chr7[stop_codon-1:stop_codon+2], "and starts at position", stop_codon)
else:
    print("The stop codon does not correspond to a proper stop codon")

The start codon is ATG and starts at position 117480095
The stop codon is TAG and starts at position 117667106

6. Translate the above sequence of all exons into amino acids, using an implementation of the translation table from the utils.rna package¶

In [77]:

# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

exons = []

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line:
            if entries[2] == "exon":
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
            elif entries[2] == 'start_codon':
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start_codon = int(entries[3])

print("The first exon has the positions", exons[0])
print("The start codon has the position", start_codon)
g.close()

# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
    if not line.startswith(">"):
        seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

f.close()

mRNA = "" # add exon sequences to string
for exon in exons:
    mRNA += chr7[exon[0]-1:exon[1]]

# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
    print("The start codon is not ATG")
else:
    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon

from utils.rna import translate_dna
aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on
print(aminoacids[:10], "...")

a = open("CFTR_longest_transcript_aminoacids.txt", 'w')
a.write(aminoacids)
a.close()

The first exon has the positions (117479963, 117480147)
The start codon has the position 117480095
MQRSPLEKAS ...

Check the sequence with the package utils.check_answers

In [78]:

check_answers.ex6("CFTR_longest_transcript_aminoacids.txt")

The sequences are matching!

Use the python code you have written to solve the tasks above to determine which of five patients are carrying a mutation in the CFTR gene that causes a truncated protein.¶

In [81]:

# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

exons = []

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line:
            if entries[2] == "exon":
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
            elif entries[2] == 'start_codon':
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start_codon = int(entries[3])

g.close()

# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in f:
    if not line.startswith(">"):
        seqList.append(line.strip())
chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

f.close()

mRNA = "" # add exon sequences to string
for exon in exons:
    mRNA += chr7[exon[0]-1:exon[1]]

# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
    print("The start codon is not ATG")
else:
    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon

from utils.rna import translate_dna
aminoacids = translate_dna(mRNA[start:]) # translate the mRNA sequence from the start codon on

i = 1 # index for loop through patient fasta files
while i < 6:
    p = open("Patient" + str(i) + ".fa", 'r')
    patientSeqList = [] # store the DNA sequence of the patient in a list
    for line in p:
        if not line.startswith(">"):
            patientSeqList.append(line.strip())
    patientSeq = "".join(patientSeqList) # construct one sequence for the currently analyzed patient

    patientmRNA = "" # append exon sequences to string. Works only for small strings.
    for exon in exons:
        patientmRNA += patientSeq[exon[0]-1:exon[1]]

    # Translate to aminoacids
    patient_aminoacids = translate_dna(patientmRNA[start:]) # translate the mRNA sequence from the start codon on
    if not len(patient_aminoacids) == len(aminoacids):
        print("The sequence of patient ", str(i), " has a different length than the reference genome sequence.")

    p.close()
    i += 1

The sequence of patient  3  has a different length than the reference genome sequence.

Extra task: Use BioPython to parse the fasta file and to translate DNA nucleotides into amino acids.¶

In [95]:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Get the longest transcript and its exons, as well as the start codon
g = open("Homo_sapiens.GRCh38.93.gtf", 'r')

exons = []

for line in g:
    if not line.startswith("#"):
        entries = line.strip().split("\t")
        if "ENSG00000001626" in line:
            if entries[2] == "exon":
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
            elif entries[2] == 'start_codon':
                attribute = entries[8].split("; ")
                transcriptID = attribute[2].strip().split(' ')[1]
                if '"ENST00000003084"' == transcriptID:
                    start_codon = int(entries[3])

g.close()


# Extract all exons of the longest transcript sequence from the genome fasta file
f = open("Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r')

records = list(SeqIO.parse(f, "fasta")) # BioPython
if records[0].id == "7":  # BioPython
    chr7 = records[0].seq # BioPython

f.close()


mRNA = "" # add exon sequences to string
for exon in exons:
    mRNA += chr7[exon[0]-1:exon[1]]

# Translate to aminoacids, from the correct start position on
if chr7[start_codon-1:start_codon+2] != "ATG":
    print("The start codon is not ATG")
else:
    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon

aminoacids = mRNA[start:].translate(to_stop=True) # BioPython

i = 1 # index for loop through patient fasta files
while i < 6:
    p = "Patient" + str(i) + ".fa"
    patientRecords = list(SeqIO.parse(p, "fasta")) # BioPython
    if patientRecords[0].id == "7":  # BioPython
        patientSeq = patientRecords[0].seq # BioPython

    patientmRNA = "" # append exon sequences to string. Works only for small strings.
    for exon in exons:
        patientmRNA += patientSeq[exon[0]-1:exon[1]]

    # Translate to aminoacids
    patient_aminoacids = patientmRNA[start:].translate(to_stop=True) # BioPython
    
    if not len(patient_aminoacids) == len(aminoacids):
        print("The sequence of patient ", str(i), " has a different length than the reference genome sequence.")

    i += 1

The sequence of patient  3  has a different length than the reference genome sequence.