Download the data.zip file from http://pcons1.scilifelab.se/misc/download/python2024/data.zip and unzip it. Then rename the folder as data
# Open the reference file (fasta) and loop through it, removing the first line (starts with >)
# Sum the length of the lines, removing the newline character
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
length = 0
for line in fh:
if not line.startswith(">"):
length += len(line.strip()) # add the length of the line, without newline character
print(length)
159345973
# Open the GTF file and loop through it.
# To find the genes, check the 3rd element of each line and add 1 if the feature is "gene"
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
num_genes = 0
for line in fh:
if not line.startswith("#"):
feature = line.split("\t")[2]
if feature == "gene":
num_genes +=1
print(num_genes)
58395
# helper function to parse attribute
# A more robust way to parse the attribute by using a dictionary.
# In this case, attribute items will be parsed correctly even when the order of the items are changed.
def parse_attribute(attribute):
"""
Parse the attribute field from the GTF file
Input: string of the attribute field
Output: a dictionary
"""
li_attr = [x.strip() for x in attribute.split(';') if x.strip()]
d_attr = {}
for attr in li_attr:
key_and_value = attr.split(' ', maxsplit=1)
if (len(key_and_value) < 2):
print(f"Error: bad data {attr} in attribute {attribute}")
key = key_and_value[0]
value = key_and_value[1].strip().strip('"')
d_attr[key] = value
return d_attr
# Example usage of the function parse_att
attribute = 'gene_id "unkown gene"; gene_version "14"; transcript_id "ENST00000546407"; transcript_version "1"; gene_name "CFTR"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "CFTR-207"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "1";'
d_attr = parse_attribute(attribute)
print(d_attr)
gene_id = d_attr.get("gene_id", "")
print(f"gene_id: {gene_id}")
transcript_id = d_attr.get("transcript_id", "")
print(f"transcript_id: {transcript_id}")
{'gene_id': 'unkown gene', 'gene_version': '14', 'transcript_id': 'ENST00000546407', 'transcript_version': '1', 'gene_name': 'CFTR', 'gene_source': 'ensembl_havana', 'gene_biotype': 'protein_coding', 'transcript_name': 'CFTR-207', 'transcript_source': 'havana', 'transcript_biotype': 'processed_transcript', 'transcript_support_level': '1'} gene_id: unkown gene transcript_id: ENST00000546407
All following tasks are related to the CFTR gene.
# Open the GTF file and loop through it.
# To find the number of transcripts, check the 3rd element for "transcript" while the specific gene exists
# Can also be solved with bash using:
# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep "\ttranscript\t" | wc -l
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
# Find number of transcripts
num_transcripts = 0
for line in fh:
if not line.startswith("#"):
entries = line.split("\t")
feature = entries[2]
attribute = entries[8]
if feature == 'transcript':
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"') # remember to strip the surrounding quote
if gene_id == "ENSG00000001626":
num_transcripts += 1
print("Number of transcripts: ", num_transcripts)
Number of transcripts: 11
# Open the GTF file and loop throught it.
# Find all the instances of transcripts for specific gene
# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)
# If longer than previous longest, store it and continue
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
# Find longest transcript
longest_transcript = (0, '') # update the list with length and transcript ID with a for loop
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
feature = entries[2]
attribute = entries[8]
if feature == "transcript":
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split()[1].strip('"')
if gene_id == "ENSG00000001626":
transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
length = int(entries[4]) - int(entries[3]) + 1
#print(line)
if length > longest_transcript[0]:
longest_transcript = (length, transcript_id)
print(f"Longest transcript: {longest_transcript[1]}")
print(f"Length: {longest_transcript[0]}")
Longest transcript: ENST00000003084 Length: 188703
# Get the longest transcript from the previous step
# and extract the start and end position from the columns 4 and 5
# Save the whole sequence from the fasta file to a variable
# and get the start-1 to end from this variable
# (including its start and end positions)
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
feature = entries[2]
attribute = entries[8]
if feature == "transcript":
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
if gene_id == "ENSG00000001626":
transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
if transcript_id == "ENST00000003084":
start = int(entries[3])
end = int(entries[4])
print(transcript_id, start, end)
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
# Store the DNA sequence of chromosome 7 in a list when reading lines
seqList = []
for line in fh:
if not line.startswith(">"):
seqList.append(line.strip())
seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
# Store DNA sequence of the longest transcript in a variable
seq_CFTR = seq_chr7[start-1:end]
print(f"First 10 nucleotides of the longest transcript: {seq_CFTR[:10]}")
dna_seqfile = "CFTR_longest_transcript.txt"
with open(dna_seqfile, 'w') as fh:
fh.write(seq_CFTR)
print(f"The DNA sequence of the longest transcript has been output to the file '{dna_seqfile}'")
ENST00000003084 117479963 117668665 First 10 nucleotides of the longest transcript: AATTGGAAGC The DNA sequence of the longest transcript has been output to the file 'CFTR_longest_transcript.txt'
Check the sequence with the package utils.check_answers (note that I moved the directory "utils" to the directory where the Jupyter notebook is)
from utils import check_answers
check_answers.ex3("CFTR_longest_transcript.txt")
The sequences are matching!
# Open the GTF file and loop through it. Get the same as in the previous question
# but store all of the transcripts in a set
# Create a variable from the other file, like in the previous question
# Loop through the set in the other file and pick the positions of the exons
# Get the longest transcript and its exons
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
exons = []
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
feature = entries[2]
attribute = entries[8]
if feature == "exon":
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
for line in fh:
if not line.startswith(">"):
seqList.append(line.strip())
seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
coding_seq += seq_chr7[exon[0]-1 : exon[1]]
print(f"The first 10 nucleotides for the coding sequence: {coding_seq[:10]}")
coding_seqfile = "CFTR_longest_transcript_coding_seq.txt"
with open(coding_seqfile, 'w') as fh:
fh.write(coding_seq)
print(f"The coding sequence has been output to the file '{coding_seqfile}'")
The first 10 nucleotides for the coding sequence: AATTGGAAGC The coding sequence has been output to the file 'CFTR_longest_transcript_coding_seq.txt'
Check the sequence with the package utils.check_answers
# no need to import again as it was imported in a cell above
from utils import check_answers
check_answers.ex4("CFTR_longest_transcript_coding_seq.txt")
The sequences are matching!
# Get the longest transcript including its start and stop codon positions
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
feature = entries[2]
attribute = entries[8]
if feature in ['start_codon', 'stop_codon']: # check for start or stop codons
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
if feature == 'start_codon':
pos_start_codon = int(entries[3]) # save position for the start codon
elif feature == 'stop_codon':
pos_stop_codon = int(entries[3]) # save position for the stop codon
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
for line in fh:
if not line.startswith(">"):
seqList.append(line.strip())
seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
if seq_chr7[pos_start_codon-1:pos_start_codon+2] == "ATG":
print("The start codon is", chr7[pos_start_codon-1:pos_start_codon+2], "and starts at position", pos_start_codon)
else:
print("Warning! The start codon is not ATG")
if seq_chr7[pos_stop_codon-1:pos_stop_codon+2] in ["TAG", "TAA", "TGA"]:
print("The stop codon is", chr7[pos_stop_codon-1:pos_stop_codon+2], "and starts at position", pos_stop_codon)
else:
print("Warning! The stop codon does not correspond to a proper stop codon")
The start codon is ATG and starts at position 117480095 The stop codon is TAG and starts at position 117667106
strand is all positive for this transcript. For more robust code, you also need to think about the translation direction.
# Get the longest transcript and its exons, as well as the the position of the start codon
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
exons = []
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
strand = entries[6]
feature = entries[2]
attribute = entries[8]
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split(' ', maxsplit=1)[1].strip('"')
transcript_id = li_attr[2].strip().split(' ', maxsplit=1)[1].strip('"')
if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
# print('strand=',strand)
if feature == "exon":
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif feature == 'start_codon':
pos_start_codon = int(entries[3])
print("Number of exons for this transcript: ", len(exons))
print("The first exon has the positions", exons[0])
print("The start codon has the position", pos_start_codon)
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
seqList = [] # Store the DNA sequence of chromosome 7 in a list when reading lines
for line in fh:
if not line.startswith(">"):
seqList.append(line.strip())
seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
coding_seq += seq_chr7[exon[0]-1:exon[1]]
# Translate to aminoacids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
print("The start codon is not ATG")
else:
start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start
# codon position and the start position of the first exon
from utils.rna import translate_dna
protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on
print(f"First 10 amino acids of the protein CFTR: {protein_CFTR[:10]}")
with open("CFTR_longest_transcript_aminoacids.txt", 'w') as fh:
fh.write(protein_CFTR)
Number of exons for this transcript: 27 The first exon has the positions (117479963, 117480147) The start codon has the position 117480095 First 10 amino acids of the protein CFTR: MQRSPLEKAS
Check the sequence with the package utils.check_answers
check_answers.ex6("CFTR_longest_transcript_aminoacids.txt")
The sequences are matching!
# Get the longest transcript and its exons, as well as the start codon
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
exons = []
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
strand = entries[6]
feature = entries[2]
attribute = entries[8]
li_attr = attribute.split(";")
gene_id = li_attr[0].strip().split()[1].strip('"')
transcript_id = li_attr[2].strip().split()[1].strip('"')
if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
# print('strand=',strand)
if feature == "exon":
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif feature == 'start_codon':
pos_start_codon = int(entries[3])
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
seqList = [] # store the DNA sequence of chromosome 7 in a list
for line in fh:
if not line.startswith(">"):
seqList.append(line.strip())
seq_chr7 = "".join(seqList) # join all DNA fragments from the seqList to one string
# Get the coding sequence by joining all exon sequences
coding_seq = "" # add exon sequences to string
for exon in exons:
coding_seq += seq_chr7[exon[0]-1:exon[1]]
# Translate to aminoacids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
print("The start codon is not ATG")
else:
start = pos_start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon
from utils.rna import translate_dna
protein_CFTR = translate_dna(coding_seq[start:]) # translate the mRNA sequence from the start codon on
i = 1 # counter for looping through patient fasta files
while i < 6:
patient_seqfile = f"data/Patient{i}.fa"
with open(patient_seqfile, "r") as pf:
patientSeqList = [] # store the DNA sequence of the patient in a list
for line in pf:
if not line.startswith(">"):
patientSeqList.append(line.strip())
patientSeq = "".join(patientSeqList) # construct one sequence for the currently analyzed patient
patient_coding_seq = "" # append exon sequences to string. Works only for small strings.
for exon in exons:
patient_coding_seq += patientSeq[exon[0]-1:exon[1]]
# Translate to amino acids
patient_protein_CFTR = translate_dna(patient_coding_seq[start:]) # translate the coding sequence from the start codon on
if not len(patient_protein_CFTR) == len(protein_CFTR):
print(f"The sequence of patient {i} has a different length than the reference genome sequence.")
print(f"patient gene length :{len(patient_protein_CFTR)}")
print(f"reference gene length: {len(protein_CFTR)}")
i += 1 # remember to increase the counter by 1
The sequence of patient 1 has a different length than the reference genome sequence. patient gene length :113 reference gene length: 1480
from Bio import SeqIO
from Bio.Seq import Seq
# Get the longest transcript and its exons, as well as the start codon
# In this example, we use the function parse_attribute to parse the field
with open("data/Homo_sapiens.GRCh38.93.gtf", 'r') as fh:
exons = []
for line in fh:
if not line.startswith("#"):
entries = line.strip().split("\t")
strand = entries[6]
feature = entries[2]
attribute = entries[8]
d_attr = parse_attribute(attribute)
gene_id = d_attr.get("gene_id", "")
transcript_id = d_attr.get("transcript_id", "")
if gene_id == "ENSG00000001626" and transcript_id == "ENST00000003084":
# print('strand=',strand)
if feature == "exon":
start = int(entries[3])
end = int(entries[4])
exons.append((start, end))
elif feature == 'start_codon':
pos_start_codon = int(entries[3])
# Get the full DNA sequence of chromosome 7
with open("data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa", 'r') as fh:
for record in SeqIO.parse(fh, "fasta"): # BioPython
if record.id == "7": # BioPython
seq_chr7 = record.seq # BioPython
coding_seq = Seq("") # add exon sequences to string
for exon in exons:
coding_seq += seq_chr7[exon[0]-1:exon[1]]
# Translate to amino acids, from the correct start position on
if seq_chr7[pos_start_codon-1:pos_start_codon+2] != "ATG":
print("Warning! The start codon is not ATG")
else:
start = start_codon - exons[0][0] # get position of start codon in mRNA by subtracting the start codon position and the start position of the first exon
# Use the method Seq.translate() to translate the coding sequence to amino acids
protein_CFTR = coding_seq[start:].translate(to_stop=True)
i = 1 # counter for looping through patient fasta files
while i < 6:
patient_seqfile = f"data/Patient{i}.fa"
for record in SeqIO.parse(patient_seqfile, "fasta"): # BioPython
if record.id == "7": # BioPython
patient_seq = record.seq # BioPython
patient_coding_seq = "" # append exon sequences to string. Works only for small strings.
for exon in exons:
patient_coding_seq += patient_seq[exon[0]-1:exon[1]]
# Translate to aminoacids
patient_protein_CFTR = patient_coding_seq[start:].translate(to_stop=True) # BioPython
if not len(patient_protein_CFTR) == len(protein_CFTR):
print(f"The sequence of patient {i} has a different length than the reference genome sequence.")
print(f"patient gene length :{len(patient_protein_CFTR)}")
print(f"reference gene length: {len(protein_CFTR)}")
i += 1 # remember to increase the counter by 1
The sequence of patient 1 has a different length than the reference genome sequence. patient gene length :113 reference gene length: 1480