#!/usr/bin/env python
# coding: utf-8
# KEGG
# ====
#
# KEGG (https://www.kegg.jp/) is a database resource for understanding
# high-level functions and utilities of the biological system, such as the
# cell, the organism and the ecosystem, from molecular-level information,
# especially large-scale molecular datasets generated by genome sequencing
# and other high-throughput experimental technologies.
#
# Please note that the KEGG parser implementation in Biopython is
# incomplete. While the KEGG website indicates many flat file formats,
# only parsers and writers for compound, enzyme, and map are currently
# implemented. However, a generic parser is implemented to handle the
# other formats.
#
# Parsing KEGG records
# --------------------
#
# Parsing a KEGG record is as simple as using any other file format parser
# in Biopython. (Before running the following code, please open
# http://rest.kegg.jp/get/ec:5.4.2.2 with your web browser and save it as
# ec_5.4.2.2.txt.)
# In[2]:
# Shell out to wget (through IPython's get_ipython(), so this line needs an
# IPython/Jupyter session) and save the ec:5.4.2.2 record as ec_5.4.2.2.txt.
get_ipython().system('wget http://rest.kegg.jp/get/ec:5.4.2.2 -O ec_5.4.2.2.txt')
# In[3]:
from Bio.KEGG import Enzyme
# Parse inside a `with` block so the file handle is closed as soon as the
# records have been read, instead of whenever the interpreter happens to
# garbage-collect it.
with open("ec_5.4.2.2.txt") as handle:
    records = Enzyme.parse(handle)
    record = list(records)[0]  # keep the first parsed record
record.classname
# In[4]:
record.entry
# The following section shows how to download the above enzyme using
# the KEGG API as well as how to use the generic parser with data that
# does not have a custom parser implemented.
#
# Querying the KEGG API
# ---------------------
#
# Biopython has full support for querying the KEGG API. All KEGG
# endpoints are supported, as are all methods documented by KEGG
# (https://www.kegg.jp/kegg/rest/keggapi.html). The
# interface has some validation of queries which follow rules defined on
# the KEGG site. However, invalid queries which return a 400 or 404 must
# be handled by the user.
#
# First, here is how to extend the above example by downloading the
# relevant enzyme and passing it through the Enzyme parser.
# In[5]:
from Bio.KEGG import REST
from Bio.KEGG import Enzyme
request = REST.kegg_get("ec:5.4.2.2")
# Write through a context manager so the output file is flushed and closed
# before the next cell reopens it for parsing.
with open("ec_5.4.2.2.txt", "w") as handle:
    handle.write(request.read().decode("utf-8"))
# In[6]:
# Parse inside a `with` block so the file handle is closed as soon as the
# records have been read, instead of whenever the interpreter happens to
# garbage-collect it.
with open("ec_5.4.2.2.txt") as handle:
    records = Enzyme.parse(handle)
    record = list(records)[0]  # keep the first parsed record
record.classname
# In[7]:
record.entry
# Now, here's a more realistic example which combines several queries to
# the KEGG API. This will demonstrate how to extract a unique set
# of all human pathway gene symbols which relate to DNA repair. The steps
# that need to be taken to do so are as follows. First, we need to get a
# list of all human pathways. Secondly, we need to filter those for ones
# which relate to “repair”. Lastly, we need to get a list of all the gene
# symbols in all repair pathways.
# In[8]:
from Bio.KEGG import REST
# Fetch the raw listing of every human ("hsa") pathway. It is kept undecoded
# here because the cells that use it call .decode("utf-8") themselves.
human_pathways = REST.kegg_list("pathway", "hsa").read()
# Preview the first five lines of the listing
human_pathways.decode("utf-8").split("\n")[:5]
# In[9]:
# Filter all human pathways for repair pathways
repair_pathways = []
for line in human_pathways.decode("utf-8").rstrip().split("\n"):
    # Each listing line is split at its first tab into entry and description.
    # partition() (unlike unpacking split("\t")) does not raise on a blank
    # line or on a line carrying additional tab-separated columns.
    entry, _, description = line.partition("\t")
    if "repair" in description:
        repair_pathways.append(entry)
repair_pathways
# In[10]:
# Get the genes for pathways and add them to a list
repair_genes = []
seen_symbols = set()  # O(1) duplicate check; the list keeps first-seen order
for pathway in repair_pathways:
    pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.decode("utf-8").rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue

        # The text after column 12 is read as "<gene_id> <gene_symbol>; <description>".
        # Split only at the first "; " so a description that itself contains
        # "; " does not break the parse.
        gene_identifiers, separator, _ = line[12:].partition("; ")
        identifier_fields = gene_identifiers.split()
        if not separator or len(identifier_fields) != 2:
            # No "<id> <symbol>; " prefix on this line: skip it rather than
            # abort the whole run with a ValueError.
            continue

        gene_symbol = identifier_fields[1]
        if gene_symbol not in seen_symbols:
            seen_symbols.add(gene_symbol)
            repair_genes.append(gene_symbol)

print(
    "There are %d repair pathways and %d repair genes. The genes are:"
    % (len(repair_pathways), len(repair_genes))
)
print(", ".join(repair_genes))
# The KEGG API wrapper is compatible with all endpoints. Usage is
# essentially replacing all slashes in the URL with commas and using that
# list as arguments to the corresponding method in the KEGG module. Here
# are a few examples from the API documentation
# (https://www.kegg.jp/kegg/rest/keggapi.html).
#
# /list/hsa:10458+ece:Z5100 -> REST.kegg_list(["hsa:10458", "ece:Z5100"])
# /find/compound/300-310/mol_weight -> REST.kegg_find("compound", "300-310", "mol_weight")
# /get/hsa:10458+ece:Z5100/aaseq -> REST.kegg_get(["hsa:10458", "ece:Z5100"], "aaseq")
# In[ ]: