#!/usr/bin/env python
# coding: utf-8

# KEGG
# ====
#
# KEGG (<https://www.kegg.jp/>) is a database resource for understanding
# high-level functions and utilities of the biological system, such as the
# cell, the organism and the ecosystem, from molecular-level information,
# especially large-scale molecular datasets generated by genome sequencing
# and other high-throughput experimental technologies.
#
# Please note that the KEGG parser implementation in Biopython is
# incomplete. While the KEGG website indicates many flat file formats,
# only parsers and writers for compound, enzyme, and map are currently
# implemented. However, a generic parser is implemented to handle the
# other formats.
#
# Parsing KEGG records
# --------------------
#
# Parsing a KEGG record is as simple as using any other file format parser
# in Biopython. The next cell downloads the record for enzyme 5.4.2.2 and
# saves it as ec_5.4.2.2.txt. (Alternatively, open
# https://rest.kegg.jp/get/ec:5.4.2.2 with your web browser and save the
# page under that name.)

# In[2]:

import urllib.request

# Fetch the flat file with the standard library so that this cell runs under
# a plain Python interpreter as well as inside IPython/Jupyter, and does not
# depend on an external ``wget`` executable being installed.
urllib.request.urlretrieve(
    "https://rest.kegg.jp/get/ec:5.4.2.2", "ec_5.4.2.2.txt"
)

# In[3]:

from Bio.KEGG import Enzyme

# Take the first record while the file is still open, then let the ``with``
# block close the handle. Only the first record is needed, so the remaining
# records are never materialised.
with open("ec_5.4.2.2.txt", encoding="utf-8") as handle:
    records = Enzyme.parse(handle)
    record = next(iter(records))

# In a notebook the bare expression below displays the enzyme class names.
record.classname

# In[4]:

# In a notebook the bare expression below displays the entry identifier.
record.entry

# The following section shows how to download the above enzyme using
# the KEGG API as well as how to use the generic parser with data that
# does not have a custom parser implemented.
#
# Querying the KEGG API
# ---------------------
#
# Biopython has full support for querying the KEGG API. All KEGG
# endpoints are supported; all methods documented by KEGG
# (<https://www.kegg.jp/kegg/rest/keggapi.html>) are supported. The
# interface has some validation of queries which follow rules defined on
# the KEGG site. However, invalid queries which return a 400 or 404 must
# be handled by the user.
#
# First, here is how to extend the above example by downloading the
# relevant enzyme and passing it through the Enzyme parser.
# In[5]:

from Bio.KEGG import REST
from Bio.KEGG import Enzyme


def _as_text(data):
    """Return *data* as ``str``, decoding it as UTF-8 when it is ``bytes``.

    The payload read from a ``Bio.KEGG.REST`` handle may be bytes or text
    (this appears to depend on the installed Biopython version -- confirm
    against the one in use). Normalising once here lets the rest of the
    script work on ``str`` without calling ``.decode`` on a value that is
    already text.
    """
    return data.decode("utf-8") if isinstance(data, bytes) else data


request = REST.kegg_get("ec:5.4.2.2")
# The context manager guarantees the data is flushed and the file closed
# before it is reopened for parsing in the next cell.
with open("ec_5.4.2.2.txt", "w", encoding="utf-8") as handle:
    handle.write(_as_text(request.read()))

# In[6]:

# Take the first record while the file is open; the handle is closed on exit.
with open("ec_5.4.2.2.txt", encoding="utf-8") as handle:
    records = Enzyme.parse(handle)
    record = next(iter(records))

# In a notebook the bare expression below displays the enzyme class names.
record.classname

# In[7]:

# In a notebook the bare expression below displays the entry identifier.
record.entry

# Now, here's a more realistic example which shows a combination of
# queries to the KEGG API. This will demonstrate how to extract a unique set
# of all human pathway gene symbols which relate to DNA repair. The steps
# that need to be taken to do so are as follows. First, we need to get a
# list of all human pathways. Secondly, we need to filter those for ones
# which relate to "repair". Lastly, we need to get a list of all the gene
# symbols in all repair pathways.

# In[8]:

from Bio.KEGG import REST

# Decode once at the I/O boundary; everything below works on ``str``.
human_pathways = _as_text(REST.kegg_list("pathway", "hsa").read())
human_pathways.split("\n")[0:5]

# In[9]:

# Filter all human pathways for repair pathways
repair_pathways = []
for line in human_pathways.rstrip().split("\n"):
    # Each line is "<entry>\t<description>". ``partition`` leaves the
    # description empty instead of raising when a line has no tab (for
    # example, when the response is empty).
    entry, _, description = line.partition("\t")
    if "repair" in description:
        repair_pathways.append(entry)
repair_pathways

# In[10]:

# Get the genes for pathways and add them to a list
repair_genes = []
seen_symbols = set()  # O(1) duplicate check; the list keeps first-seen order
for pathway in repair_pathways:
    pathway_file = _as_text(REST.kegg_get(pathway).read())  # query and read each pathway

    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue

        # A gene line reads "<gene id>  <symbol>; <description>". Lines that
        # do not fit that shape (no "; " separator, or no symbol after the
        # id) are skipped rather than aborting the whole run.
        gene_identifiers, separator, _ = line[12:].partition("; ")
        identifier_fields = gene_identifiers.split()
        if not separator or len(identifier_fields) != 2:
            continue

        gene_symbol = identifier_fields[1]
        if gene_symbol not in seen_symbols:
            seen_symbols.add(gene_symbol)
            repair_genes.append(gene_symbol)

print("There are %d repair pathways and %d repair genes. The genes are:" %
      (len(repair_pathways), len(repair_genes)))
print(", ".join(repair_genes))

# The KEGG API wrapper is compatible with all endpoints. Usage is
# essentially replacing all slashes in the url with commas and using that
# list as arguments to the corresponding method in the KEGG module. Here
# are a few examples from the API documentation
# (<https://www.kegg.jp/kegg/rest/keggapi.html>).
#
#     /list/hsa:10458+ece:Z5100          -> REST.kegg_list(["hsa:10458", "ece:Z5100"])
#     /find/compound/300-310/mol_weight  -> REST.kegg_find("compound", "300-310", "mol_weight")
#     /get/hsa:10458+ece:Z5100/aaseq     -> REST.kegg_get(["hsa:10458", "ece:Z5100"], "aaseq")

# In[ ]: