Notebook

Warmup tasks¶

Download the data.zip file from http://pcons1.scilifelab.se/misc/download/python2024/data.zip and unzip it. Then rename the folder as data

3. What is the length of chromosome 7 on the reference sequence?¶

In [26]:

# Open the reference file (fasta) and loop through it, removing the first line (starts with >)
# Sum the length of the lines, removing the newline character
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    length = 0
    for line in fh:
        if not line.startswith(">"):
            length += len(line.strip()) # add the length of the line, without newline character
    print(length)

159345973

4. How many genes are annotated in the GTF file?¶

In [27]:

# Open the GTF file and loop through it.
# To find the genes, check the 3rd element of each line and add 1 if the feature is "gene"
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    num_genes = 0
    for line in fh:
        if not line.startswith("#"):
            feature = line.split("\t")[2]
            if feature == "gene":
                num_genes +=1
    print(num_genes)

Architect a method¶

In [33]:

# helper function to parse attribute
# A more robust way to parse the attribute by using a dictionary. 
# In this case, attribute items will be parsed correctly even when the order of the items are changed.
def parse_attribute(attribute):
    """
    Parse the attribute field from the GTF file
    Input: string of the attribute field 
    Output: a dictionary
    """
    li_attr = [x.strip() for x in attribute.split(';') if x.strip()]
    d_attr = {}
    for attr in li_attr:
        key_and_value = attr.split(' ', maxsplit=1)
        if (len(key_and_value) < 2):
            print(f"Error: bad data {attr} in attribute {attribute}")
        key = key_and_value[0]
        value = key_and_value[1].strip().strip('"')
        d_attr[key] = value
    return d_attr

# Example usage of the function parse_att
attribute = 'gene_id "unkown gene"; gene_version "14"; transcript_id "ENST00000546407"; transcript_version "1"; gene_name "CFTR"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "CFTR-207"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "1";'
d_attr = parse_attribute(attribute)
print(d_attr)
gene_id = d_attr.get("gene_id", "")
print(f"gene_id: {gene_id}")

transcript_id = d_attr.get("transcript_id", "")
print(f"transcript_id: {transcript_id}")

{'gene_id': 'unkown gene', 'gene_version': '14', 'transcript_id': 'ENST00000546407', 'transcript_version': '1', 'gene_name': 'CFTR', 'gene_source': 'ensembl_havana', 'gene_biotype': 'protein_coding', 'transcript_name': 'CFTR-207', 'transcript_source': 'havana', 'transcript_biotype': 'processed_transcript', 'transcript_support_level': '1'}
gene_id: unkown gene
transcript_id: ENST00000546407

All following tasks are related to the CFTR gene.

1. How many transcripts can the CFTR gene generate?¶

In [34]:

# Open the GTF file and loop through it.
# To find the number of transcripts, check the 3rd element for "transcript" while the specific gene exists
# Can also be solved with bash using: 
# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep "\ttranscript\t" | wc -l
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    # Find number of transcripts
    num_transcripts = 0
    for line in fh:
        if not line.startswith("#"):
            entries = line.split("\t")
            feature = entries[2]
            attribute = entries[8]
            if feature == 'transcript':
                li_attr = attribute.split(";")
                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"') # remember to strip the surrounding quote
                if gene_id == "ENSG00000001626":
                    num_transcripts += 1
                    
    print("Number of transcripts: ", num_transcripts)

Number of transcripts:  11

2. Which of these transcripts is the longest transcript in nucleotides?¶

In [55]:

# Open the GTF file and loop throught it.
# Find all the instances of transcripts for specific gene
# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)
# If longer than previous longest, store it and continue
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    # Find longest transcript
    longest_transcript = (0, '') # update the list with length and transcript ID with a for loop
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            feature = entries[2]
            attribute = entries[8]
            if feature == "transcript":
                li_attr = attribute.split(";")
                gene_id = li_attr[0].strip().split()[1].strip('"')
                if gene_id == "ENSG00000001626":
                    transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
                    length = int(entries[4]) - int(entries[3]) + 1
                    #print(line)
                    if length > longest_transcript[0]:
                        longest_transcript = (length, transcript_id)

    print(f"Longest transcript: {longest_transcript[1]}")
    print(f"Length: {longest_transcript[0]}")

Longest transcript: ENST00000003084
Length: 188703

3. Fetch the DNA sequence for that transcript.¶

In [57]:

# Get the longest transcript from the previous step 
# and extract the start and end position from the columns 4 and 5
# Save the whole sequence from the fasta file to a variable
# and get the start-1 to end from this variable 
# (including its start and end positions)
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            feature = entries[2]
            attribute = entries[8]
            if feature == "transcript":
                li_attr = attribute.split(";")
                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
                if gene_id == "ENSG00000001626":
                    transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
                    if  transcript_id == "ENST00000003084":
                        start = int(entries[3])
                        end = int(entries[4])
                        print(transcript_id, start, end)


# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    # Store the DNA sequence of chromosome 7 in a list when reading lines
    seqList = []
    for line in fh:
        if not line.startswith(">"):
            seqList.append(line.strip())
    seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

# Store DNA sequence of the longest transcript in a variable
seq_CFTR = seq_chr7[start-1:end] 
print(f"First 10 nucleotides of the longest transcript: {seq_CFTR[:10]}")

dna_seqfile = "CFTR_longest_transcript.txt"
with open(dna_seqfile, 'w') as fh:
    fh.write(seq_CFTR)
    print(f"The DNA sequence of the longest transcript has been output to the file '{dna_seqfile}'")

ENST00000003084 117479963 117668665
First 10 nucleotides of the longest transcript: AATTGGAAGC
The DNA sequence of the longest transcript has been output to the file 'CFTR_longest_transcript.txt'

Check the sequence with the package utils.check_answers (note that I moved the directory "utils" to the directory where the Jupyter notebook is)

In [58]:

from utils import check_answers
check_answers.ex3("CFTR_longest_transcript.txt")

The sequences are matching!

4. Fetch the DNA sequences of all exons for that transcript, concatenate into one sequence.¶

In [59]:

# Open the GTF file and loop through it. Get the same as in the previous question
# but store all of the transcripts in a set
# Create a variable from the other file, like in the previous question
# Loop through the set in the other file and pick the positions of the exons
# Get the longest transcript and its exons
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    exons = []
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            feature = entries[2]
            attribute = entries[8]
            if feature == "exon":
                li_attr = attribute.split(";")
                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
                transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
                if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))


# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
    for line in fh:
        if not line.startswith(">"):
            seqList.append(line.strip())
    seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
    coding_seq += seq_chr7[exon[0]-1 : exon[1]]

print(f"The first 10 nucleotides for the coding sequence: {coding_seq[:10]}")

coding_seqfile = "CFTR_longest_transcript_coding_seq.txt"
with open(coding_seqfile, 'w') as fh:
    fh.write(coding_seq)
    print(f"The coding sequence has been output to the file '{coding_seqfile}'")

The first 10 nucleotides for the coding sequence: AATTGGAAGC
The coding sequence has been output to the file 'CFTR_longest_transcript_coding_seq.txt'

Check the sequence with the package utils.check_answers

In [60]:

# no need to import again as it was imported in a cell above
from utils import check_answers
check_answers.ex4("CFTR_longest_transcript_coding_seq.txt")

The sequences are matching!

5. What are the start positions and sequences of the start_codon and stop_codon for that transcript?¶

In [48]:

# Get the longest transcript including its start and stop codon positions
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            feature = entries[2]
            attribute = entries[8]
            if feature in ['start_codon', 'stop_codon']: # check for start or stop codons
                li_attr = attribute.split(";")
                gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
                transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
                if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
                    if feature == 'start_codon':
                        pos_start_codon = int(entries[3]) # save position for the start codon 
                    elif feature == 'stop_codon':
                        pos_stop_codon = int(entries[3]) # save position for the stop codon



# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
    for line in fh:
        if not line.startswith(">"):
            seqList.append(line.strip())
    seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string


if seq_chr7[pos_start_codon-1:pos_start_codon+2] == "ATG":
    print("The start codon is", chr7[pos_start_codon-1:pos_start_codon+2], "and starts at position", pos_start_codon)
else:
    print("Warning! The start codon is not ATG")

if seq_chr7[pos_stop_codon-1:pos_stop_codon+2] in ["TAG", "TAA", "TGA"]:
    print("The stop codon is", chr7[pos_stop_codon-1:pos_stop_codon+2], "and starts at position", pos_stop_codon)
else:
    print("Warning! The stop codon does not correspond to a proper stop codon")

The start codon is ATG and starts at position 117480095
The stop codon is TAG and starts at position 117667106

6. Translate the above sequence of all exons into amino acids, using an implementation of the translation table from the utils.rna package¶

strand is all positive for this transcript. For more robust code, you also need to think about the translation direction.

In [51]:

# Get the longest transcript and its exons, as well as the the position of the start codon
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    exons = []
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            strand = entries[6]
            feature = entries[2]
            attribute = entries[8]
            li_attr = attribute.split(";")
            gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
            transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
            
            if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
                # print('strand=',strand)
                if feature == "exon":
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
                elif feature == 'start_codon':
                    pos_start_codon = int(entries[3])
                    
print("Number of exons for this transcript: ", len(exons))
print("The first exon has the positions", exons[0])
print("The start codon has the position", pos_start_codon)


# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
    for line in fh:
        if not line.startswith(">"):
            seqList.append(line.strip())
    seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string

# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
    coding_seq += seq_chr7[exon[0]-1:exon[1]]

# Translate to aminoacids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
    print("The start codon is not ATG")
else:
    start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start
                                      # codon position and the start position of the first exon

from utils.rna import translate_dna
protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on
print(f"First 10 amino acids of the protein CFTR: {protein_CFTR[:10]}")

with open("CFTR_longest_transcript_aminoacids.txt", 'w') as fh:
    fh.write(protein_CFTR)

Number of exons for this transcript:  27
The first exon has the positions (117479963, 117480147)
The start codon has the position 117480095
First 10 amino acids of the protein CFTR: MQRSPLEKAS

Check the sequence with the package utils.check_answers

In [52]:

check_answers.ex6("CFTR_longest_transcript_aminoacids.txt")

The sequences are matching!

Use the python code you have written to solve the tasks above to determine which of five patients are carrying a mutation in the CFTR gene that causes a truncated protein.¶

In [53]:

# Get the longest transcript and its exons, as well as the start codon
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    exons = []
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            strand = entries[6]
            feature = entries[2]
            attribute = entries[8]
            li_attr = attribute.split(";")
            gene_id = li_attr[0].strip().split()[1].strip('"')
            transcript_id = li_attr[2].strip().split()[1].strip('"')
            
            if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
                # print('strand=',strand)
                if feature == "exon":
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
                elif feature == 'start_codon':
                    pos_start_codon = int(entries[3])


# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    seqList = [] # store the DNA sequence of chromosome 7 in a list
    for line in fh:
        if not line.startswith(">"):
            seqList.append(line.strip())
    seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string


# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
    coding_seq += seq_chr7[exon[0]-1:exon[1]]

# Translate to aminoacids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
    print("The start codon is not ATG")
else:
    start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon

from utils.rna import translate_dna
protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on

i = 1 # counter for looping through patient fasta files
while i < 6:
    patient_seqfile = f"data/Patient{i}.fa"
    with open(patient_seqfile, "r") as pf:
        patientSeqList = [] # store the DNA sequence of the patient in a list
        for line in pf:
            if not line.startswith(">"):
                patientSeqList.append(line.strip())
        patientSeq = "".join(patientSeqList) # construct one sequence for the currently analyzed patient

        patient_coding_seq = "" # append exon sequences to string. Works only for small strings.
        for exon in exons:
            patient_coding_seq += patientSeq[exon[0]-1:exon[1]]

        # Translate to amino acids
        patient_protein_CFTR = translate_dna(patient_coding_seq[start:]) # translate the coding sequence from the start codon on
        if not len(patient_protein_CFTR) == len(protein_CFTR):
            print(f"The sequence of patient {i} has a different length than the reference genome sequence.")
            print(f"patient gene length :{len(patient_protein_CFTR)}")
            print(f"reference gene length: {len(protein_CFTR)}")
            
    i += 1 # remember to increase the counter by 1

The sequence of patient 1 has a different length than the reference genome sequence.
patient gene length :113
reference gene length: 1480

Extra task: Use BioPython to parse the fasta file and to translate DNA nucleotides into amino acids.¶

In [61]:

from Bio import SeqIO
from Bio.Seq import Seq

# Get the longest transcript and its exons, as well as the start codon
# In this example, we use the function parse_attribute to parse the field
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
    exons = []
    for line in fh:
        if not line.startswith("#"):
            entries = line.strip().split("\t")
            strand = entries[6]
            feature = entries[2]
            attribute = entries[8]
            d_attr = parse_attribute(attribute)
            
            gene_id = d_attr.get("gene_id", "")
            transcript_id = d_attr.get("transcript_id", "")
                
            if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
                # print('strand=',strand)
                if feature == "exon":
                    start = int(entries[3])
                    end = int(entries[4])
                    exons.append((start, end))
                elif feature == 'start_codon':
                    pos_start_codon = int(entries[3])

# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
    for record in SeqIO.parse(fh, "fasta"): # BioPython
        if record.id == "7":  # BioPython
            seq_chr7 = record.seq # BioPython


coding_seq = Seq("") # add exon sequences to string
for exon in exons:
    coding_seq += seq_chr7[exon[0]-1:exon[1]]

# Translate to amino acids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
    print("Warning! The start codon is not ATG")
else:
    start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon

# Use the method Seq.translate() to translate the coding sequence to amino acids
protein_CFTR = coding_seq[start:].translate(to_stop=True)

i = 1 # counter for looping through patient fasta files
while i < 6:
    patient_seqfile = f"data/Patient{i}.fa"
    
    for record in SeqIO.parse(patient_seqfile, "fasta"): # BioPython
        if record.id == "7":  # BioPython
            patient_seq = record.seq # BioPython

    patient_coding_seq = "" # append exon sequences to string. Works only for small strings.
    for exon in exons:
        patient_coding_seq += patient_seq[exon[0]-1:exon[1]]

    # Translate to aminoacids
    patient_protein_CFTR = patient_coding_seq[start:].translate(to_stop=True) # BioPython
    
    if not len(patient_protein_CFTR) == len(protein_CFTR):
        print(f"The sequence of patient {i} has a different length than the reference genome sequence.")
        print(f"patient gene length :{len(patient_protein_CFTR)}")
        print(f"reference gene length: {len(protein_CFTR)}")
        
    i += 1 # remember to increase the counter by 1

The sequence of patient 1 has a different length than the reference genome sequence.
patient gene length :113
reference gene length: 1480

In [ ]: