#!/usr/bin/env python
# coding: utf-8

# KEGG
# ====
# 
# KEGG (<http://www.kegg.jp/>) is a database resource for understanding
# high-level functions and utilities of the biological system, such as the
# cell, the organism and the ecosystem, from molecular-level information,
# especially large-scale molecular datasets generated by genome sequencing
# and other high-throughput experimental technologies.
# 
# Please note that the KEGG parser implementation in Biopython is
# incomplete. While the KEGG website indicates many flat file formats,
# only parsers and writers for compound, enzyme, and map are currently
# implemented. However, a generic parser is implemented to handle the
# other formats.
# 
# Parsing KEGG records
# --------------------
# 
# Parsing a KEGG record is as simple as using any other file format parser
# in Biopython. (Before running the following code, please open
# http://rest.kegg.jp/get/ec:5.4.2.2 with your web browser and save it as
# ec\_5.4.2.2.txt.)

# In[2]:


# Download the KEGG flat file for enzyme EC 5.4.2.2 with the external `wget`
# tool and save it as ec_5.4.2.2.txt in the current directory.
# NOTE: get_ipython() is only defined inside an IPython/Jupyter session, so
# this line raises NameError when the file is run as a plain Python script.
get_ipython().system('wget http://rest.kegg.jp/get/ec:5.4.2.2 -O ec_5.4.2.2.txt')


# In[3]:


from Bio.KEGG import Enzyme

# Parse the saved flat file. The ``with`` block guarantees the file handle is
# closed once parsing is done instead of leaking it until garbage collection.
with open("ec_5.4.2.2.txt") as handle:
    records = Enzyme.parse(handle)

    # Consume the parser's output while the file is still open; the file
    # holds a single enzyme entry, so keep the first (only) record.
    record = list(records)[0]

# Last expression of the notebook cell: displays the record's class names.
record.classname


# In[4]:


# Display the `entry` attribute of the record parsed in the previous cell
# (as the last expression of a notebook cell, its value is shown as output).
record.entry


# The following section shows how to download the above enzyme using
# the KEGG API as well as how to use the generic parser with data that
# does not have a custom parser implemented.
# 
# Querying the KEGG API
# ---------------------
# 
# Biopython has full support for querying the KEGG API. Querying
# all KEGG endpoints is supported; all methods documented by KEGG
# (<http://www.kegg.jp/kegg/rest/keggapi.html>) are supported. The
# interface has some validation of queries which follow rules defined on
# the KEGG site. However, invalid queries which return a 400 or 404 must
# be handled by the user.
# 
# First, here is how to extend the above example by downloading the
# relevant enzyme and passing it through the Enzyme parser.

# In[5]:


from Bio.KEGG import REST

from Bio.KEGG import Enzyme

# Fetch the flat-file entry for enzyme EC 5.4.2.2 through the KEGG REST API.
request = REST.kegg_get("ec:5.4.2.2")

# Depending on the Biopython release, the returned handle yields either raw
# bytes or already-decoded text; normalise to ``str`` so both cases work.
enzyme_text = request.read()
if isinstance(enzyme_text, bytes):
    enzyme_text = enzyme_text.decode("utf-8")

# Write through a context manager so the file is flushed and closed before
# the next cell re-opens it for parsing.
with open("ec_5.4.2.2.txt", "w") as handle:
    handle.write(enzyme_text)


# In[6]:


# Parse the file written by the download step. Using ``with`` closes the file
# handle as soon as the record has been read instead of leaking it.
with open("ec_5.4.2.2.txt") as handle:
    records = Enzyme.parse(handle)

    # Consume the parser's output while the file is still open and keep the
    # first (only) enzyme record.
    record = list(records)[0]

# Last expression of the notebook cell: displays the record's class names.
record.classname


# In[7]:


# Display the `entry` attribute of the record parsed from the downloaded file
# (as the last expression of a notebook cell, its value is shown as output).
record.entry


# Now, here’s a more realistic example which combines several queries to
# the KEGG API. This will demonstrate how to extract a unique set
# of all human pathway gene symbols which relate to DNA repair. The steps
# that need to be taken to do so are as follows. First, we need to get a
# list of all human pathways. Secondly, we need to filter those for ones
# which relate to “repair”. Lastly, we need to get a list of all the gene
# symbols in all repair pathways.

# In[8]:


from Bio.KEGG import REST

# Ask KEGG for the list of all pathways of organism "hsa" (human) and keep the
# raw response body; later cells read it from ``human_pathways``.
human_pathways = REST.kegg_list("pathway", "hsa").read()

# Preview the first five rows. The response body is bytes or str depending on
# the Biopython release, so decode only when needed; ``human_pathways`` itself
# is deliberately left untouched for the cells that follow.
preview_text = human_pathways
if isinstance(preview_text, bytes):
    preview_text = preview_text.decode("utf-8")
preview_text.split("\n")[:5]


# In[9]:


# Filter all human pathways for repair pathways.
# The listing is bytes or str depending on the Biopython release, so decode
# only when necessary.
pathway_listing = human_pathways
if isinstance(pathway_listing, bytes):
    pathway_listing = pathway_listing.decode("utf-8")

# Each row is "<pathway entry>\t<description>"; keep the entries whose
# description mentions "repair".
repair_pathways = [
    entry
    for entry, description in (
        row.split("\t") for row in pathway_listing.rstrip().split("\n")
    )
    if "repair" in description
]

repair_pathways


# In[10]:


# Get the genes for pathways and add them to a list.
# ``repair_genes`` keeps the symbols in first-seen order; ``seen_symbols``
# mirrors it as a set so the duplicate check is O(1) rather than a list scan.
repair_genes = []
seen_symbols = set()
for pathway in repair_pathways:
    pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway
    # The body is bytes or str depending on the Biopython release.
    if isinstance(pathway_file, bytes):
        pathway_file = pathway_file.decode("utf-8")

    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            # Continuation lines have a blank label and inherit the section.
            current_section = section

        if current_section != "GENE":
            continue

        # Expected layout after the label column: "<id> <symbol>; <description>".
        # partition() splits on the first "; " only, so a description that
        # itself contains "; " no longer breaks the unpacking.
        gene_identifiers, separator, gene_description = line[12:].partition("; ")
        if not separator:
            # No "; " means the line carries no symbol to collect; skip it
            # instead of aborting the whole run.
            continue
        gene_id, gene_symbol = gene_identifiers.split()

        if gene_symbol not in seen_symbols:
            seen_symbols.add(gene_symbol)
            repair_genes.append(gene_symbol)

print("There are %d repair pathways and %d repair genes. The genes are:"
      % (len(repair_pathways), len(repair_genes)))
print(", ".join(repair_genes))


# The KEGG API wrapper is compatible with all endpoints. Usage is
# essentially replacing all slashes in the url with commas and using that
# list as arguments to the corresponding method in the KEGG module. Here
# are a few examples from the api documentation
# (<http://www.kegg.jp/kegg/docs/keggapi.html>).
# 
#     /list/hsa:10458+ece:Z5100            -> REST.kegg_list(["hsa:10458", "ece:Z5100"])
#     /find/compound/300-310/mol_weight    -> REST.kegg_find("compound", "300-310", "mol_weight")
#     /get/hsa:10458+ece:Z5100/aaseq      -> REST.kegg_get(["hsa:10458", "ece:Z5100"], "aaseq")

# In[ ]: