#!/usr/bin/env python
# coding: utf-8
# # X2K API Tutorial Notebook
# April 9th, 2018
#
# This Jupyter Notebook contains an interactive tutorial for **running the Expression2Kinases (X2K) API** using Python 3.
#
# ### Table of Contents
# The notebook contains the following sections:
# 1. **Using the X2K API** - shows how to programmatically analyze your gene list in Python.
# 2. **X2K API Documentation** - overview of the input parameters and output of the API.
# 3. **Interpreting the results** - gives an overview of the structure and meaning of the analysis results.
# * **Transcription Factor Enrichment Analysis** (ChEA)
# * **Protein-Protein Interaction Expansion** (G2N)
# * **Kinase Enrichment Analysis** (KEA)
# * **Expression2Kinases** (X2K)
# ## 1. Using the X2K API
# The X2K API allows for programmatic analysis of an input gene list.
#
# The `run_X2K()` function displayed below can be used to analyze a gene list and load the results in a Python dictionary by performing a **POST request**.
#
# The function requires only one input, `input_genes`, **a list of gene symbols** to be analyzed. Additional optional parameters can be specified with the `options` parameter.
# In[1]:
# Import modules
import http.client
import json
##### Function to run X2K
### Input: a Python list of gene symbols
### Output: a dictionary containing the results of X2K, ChEA, G2N, KEA.
def _encode_multipart_form(fields, boundary):
    """Encode *fields* as a multipart/form-data body delimited by *boundary*."""
    parts = [
        '--{}\r\nContent-Disposition: form-data; name="{}"\r\n\r\n{}\r\n'.format(boundary, name, value)
        for name, value in fields.items()
    ]
    # The closing delimiter is the boundary wrapped in leading and trailing dashes.
    return ''.join(parts) + '--' + boundary + '--'


def run_X2K(input_genes, options=None):
    """Submit a gene list to the X2K API and return the parsed results.

    Args:
        input_genes: Iterable of gene symbol strings. They are joined with
            newlines and sent as the ``text-genes`` form field.
        options: Optional mapping of overrides for the default request
            options. Only keys that already exist in the defaults are
            applied; unknown keys and ``'text-genes'`` are ignored.

    Returns:
        dict: The decoded JSON response. Every value except ``'input'`` is
        JSON-decoded a second time, and the ``'ChEA'``, ``'G2N'``, ``'KEA'``
        and ``'X2K'`` entries are replaced by their ``'tfs'``, ``'network'``,
        ``'kinases'`` and ``'network'`` members respectively.

    Raises:
        json.JSONDecodeError: If the response body (or one of its values) is
            not valid JSON.
        KeyError: If the response lacks one of the expected sections.
    """
    # Default request options; insertion order determines the order of the
    # form fields in the request body.
    default_options = {
        'text-genes': '\n'.join(input_genes),
        'included_organisms': 'both',
        'included_database': 'ChEA 2015',
        'path_length': 2,
        'minimum network size': 50,
        'min_number_of_articles_supporting_interaction': 2,
        'max_number_of_interactions_per_protein': 200,
        'max_number_of_interactions_per_article': 100,
        'biocarta': True,
        'biogrid': True,
        'dip': True,
        'innatedb': True,
        'intact': True,
        'kegg': True,
        'mint': True,
        'ppid': True,
        'snavi': True,
        'number_of_results': 50,
        'sort_tfs_by': 'combined score',
        'sort_kinases_by': 'combined score',
    }

    # Apply caller overrides. `options` defaults to None rather than a shared
    # mutable dict; the gene list itself can only be set via `input_genes`.
    for key, value in (options or {}).items():
        if key in default_options and key != 'text-genes':
            default_options[key] = value

    # Build the multipart/form-data request
    boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
    payload = _encode_multipart_form(default_options, boundary)
    headers = {
        'content-type': "multipart/form-data; boundary=" + boundary,
        'cache-control': "no-cache",
    }

    # Send the request and read the whole response, always releasing the
    # connection afterwards (even if the request or read fails).
    conn = http.client.HTTPConnection("amp.pharm.mssm.edu")
    try:
        conn.request("POST", "/X2K/api", payload, headers)
        res = conn.getresponse()
        data = res.read().decode('utf-8')
    finally:
        conn.close()

    # The values of the response object are themselves JSON strings, except
    # for 'input', which is kept as received.
    x2k_results = {
        key: json.loads(value) if key != 'input' else value
        for key, value in json.loads(data).items()
    }

    # Unwrap each section so callers get the payload directly
    for section, member in (('ChEA', 'tfs'), ('G2N', 'network'), ('KEA', 'kinases'), ('X2K', 'network')):
        x2k_results[section] = x2k_results[section][member]

    return x2k_results
# In[2]:
# Example input: the list of gene symbols submitted to the X2K API below.
# Besides ordinary symbols it contains clone-style identifiers ('...Rik'),
# 'LOC...' identifiers and one purely numeric entry ('39509').
# NOTE(review): '39509' looks like a spreadsheet-mangled gene name — confirm it is intended.
input_genes = ['Nsun3', 'Polrmt', 'Nlrx1', 'Sfxn5', 'Zc3h12c', 'Slc25a39', 'Arsg', 'Defb29', 'Ndufb6', 'Zfand1', 'Tmem77', '5730403B10Rik', 'Tlcd1', 'Psmc6', 'Slc30a6', 'LOC100047292', 'Lrrc40', 'Orc5l', 'Mpp7', 'Unc119b', 'Prkaca', 'Tcn2', 'Psmc3ip', 'Pcmtd2', 'Acaa1a', 'Lrrc1', '2810432D09Rik', 'Sephs2', 'Sac3d1', 'Tmlhe', 'LOC623451', 'Tsr2', 'Plekha7', 'Gys2', 'Arhgef12', 'Hibch', 'Lyrm2', 'Zbtb44', 'Entpd5', 'Rab11fip2', 'Lipt1', 'Intu', 'Anxa13', 'Klf12', 'Sat2', 'Gal3st2', 'Vamp8', 'Fkbpl', 'Aqp11', 'Trap1', 'Pmpcb', 'Tm7sf3', 'Rbm39', 'Bri3', 'Kdr', 'Zfp748', 'Nap1l1', 'Dhrs1', 'Lrrc56', 'Wdr20a', 'Stxbp2', 'Klf1', 'Ufc1', 'Ccdc16', '9230114K14Rik', 'Rwdd3', '2610528K11Rik', 'Aco1', 'Cables1', 'LOC100047214', 'Yars2', 'Lypla1', 'Kalrn', 'Gyk', 'Zfp787', 'Zfp655', 'Rabepk', 'Zfp650', '4732466D17Rik', 'Exosc4', 'Wdr42a', 'Gphn', '2610528J11Rik', '1110003E01Rik', 'Mdh1', '1200014M14Rik', 'AW209491', 'Mut', '1700123L14Rik', '2610036D13Rik', 'Cox15', 'Tmem30a', 'Nsmce4a', 'Tm2d2', 'Rhbdd3', 'Atxn2', 'Nfs1', '3110001I20Rik', 'BC038156', 'LOC100047782', '2410012H22Rik', 'Rilp', 'A230062G08Rik', 'Pttg1ip', 'Rab1', 'Afap1l1', 'Lyrm5', '2310026E23Rik', 'C330002I19Rik', 'Zfyve20', 'Poli', 'Tomm70a', 'Slc7a6os', 'Mat2b', '4932438A13Rik', 'Lrrc8a', 'Smo', 'Nupl2', 'Trpc2', 'Arsk', 'D630023B12Rik', 'Mtfr1', '5730414N17Rik', 'Scp2', 'Zrsr1', 'Nol7', 'C330018D20Rik', 'Ift122', 'LOC100046168', 'D730039F16Rik', 'Scyl1', '1700023B02Rik', '1700034H14Rik', 'Fbxo8', 'Paip1', 'Tmem186', 'Atpaf1', 'LOC100046254', 'LOC100047604', 'Coq10a', 'Fn3k', 'Sipa1l1', 'Slc25a16', 'Slc25a40', 'Rps6ka5', 'Trim37', 'Lrrc61', 'Abhd3', 'Gbe1', 'Parp16', 'Hsd3b2', 'Esm1', 'Dnajc18', 'Dolpp1', 'Lass2', 'Wdr34', 'Rfesd', 'Cacnb4', '2310042D19Rik', 'Srr', 'Bpnt1', '6530415H11Rik', 'Clcc1', 'Tfb1m', '4632404H12Rik', 'D4Bwg0951e', 'Med14', 'Adhfe1', 'Thtpa', 'Cat', 'Ell3', 'Akr7a5', 'Mtmr14', 'Timm44', 'Sf1', 'Ipp', 'Iah1', 'Trim23', 'Wdr89', 'Gstz1', 'Cradd', '2510006D16Rik', 'Fbxl6', 'LOC100044400', 
'Zfp106', 'Cd55', '0610013E23Rik', 'Afmid', 'Tmem86a', 'Aldh6a1', 'Dalrd3', 'Smyd4', 'Nme7', 'Fars2', 'Tasp1', 'Cldn10', 'A930005H10Rik', 'Slc9a6', 'Adk', 'Rbks', '2210016F16Rik', 'Vwce', '4732435N03Rik', 'Zfp11', 'Vldlr', '9630013D21Rik', '4933407N01Rik', 'Fahd1', 'Mipol1', '1810019D21Rik', '1810049H13Rik', 'Tfam', 'Paics', '1110032A03Rik', 'LOC100044139', 'Dnajc19', 'BC016495', 'A930041I02Rik', 'Rqcd1', 'Usp34', 'Zcchc3', 'H2afj', 'Phf7', '4921508D12Rik', 'Kmo', 'Prpf18', 'Mcat', 'Txndc4', '4921530L18Rik', 'Vps13b', 'Scrn3', 'Tor1a', 'AI316807', 'Acbd4', 'Fah', 'Apool', 'Col4a4', 'Lrrc19', 'Gnmt', 'Nr3c1', 'Sip1', 'Ascc1', 'Fech', 'Abhd14a', 'Arhgap18', '2700046G09Rik', 'Yme1l1', 'Gk5', 'Glo1', 'Sbk1', 'Cisd1', '2210011C24Rik', 'Nxt2', 'Notum', 'Ankrd42', 'Ube2e1', 'Ndufv1', 'Slc33a1', 'Cep68', 'Rps6kb1', 'Hyi', 'Aldh1a3', 'Mynn', '3110048L19Rik', 'Rdh14', 'Proz', 'Gorasp1', 'LOC674449', 'Zfp775', '5430437P03Rik', 'Npy', 'Adh5', 'Sybl1', '4930432O21Rik', 'Nat9', 'LOC100048387', 'Mettl8', 'Eny2', '2410018G20Rik', 'Pgm2', 'Fgfr4', 'Mobkl2b', 'Atad3a', '4932432K03Rik', 'Dhtkd1', 'Ubox5', 'A530050D06Rik', 'Zdhhc5', 'Mgat1', 'Nudt6', 'Tpmt', 'Wbscr18', 'LOC100041586', 'Cdk5rap1', '4833426J09Rik', 'Myo6', 'Cpt1a', 'Gadd45gip1', 'Tmbim4', '2010309E21Rik', 'Asb9', '2610019F03Rik', '7530414M10Rik', 'Atp6v1b2', '2310068J16Rik', 'Ddt', 'Klhdc4', 'Hpn', 'Lifr', 'Ovol1', 'Nudt12', 'Cdan1', 'Fbxo9', 'Fbxl3', 'Hoxa7', 'Aldh8a1', '3110057O12Rik', 'Abhd11', 'Psmb1', 'ENSMUSG00000074286', 'Chpt1', 'Oxsm', '2310009A05Rik', '1700001L05Rik', 'Zfp148', '39509', 'Mrpl9', 'Tmem80', '9030420J04Rik', 'Naglu', 'Plscr2', 'Agbl3', 'Pex1', 'Cno', 'Neo1', 'Asf1a', 'Tnfsf5ip1', 'Pkig', 'AI931714', 'D130020L05Rik', 'Cntd1', 'Clec2h', 'Zkscan1', '1810044D09Rik', 'Mettl7a', 'Siae', 'Fbxo3', 'Fzd5', 'Tmem166', 'Tmed4', 'Gpr155', 'Rnf167', 'Sptlc1', 'Riok2', 'Tgds', 'Pms1', 'Pitpnc1', 'Pcsk7', '4933403G14Rik', 'Ei24', 'Crebl2', 'Tln1', 'Mrpl35', '2700038C09Rik', 'Ubie', 'Osgepl1', '2410166I05Rik', 
'Wdr24', 'Ap4s1', 'Lrrc44', 'B3bp', 'Itfg1', 'Dmxl1', 'C1d']
# Submit the gene list to the X2K API; no `options` argument is passed, so
# run_X2K() uses its default options.
x2k_results = run_X2K(input_genes)
# Top-level keys of the result. A notebook displays this value as the cell
# output; run as a plain script, the expression's value is discarded.
x2k_results.keys()
# ## 2. X2K API Documentation
#
# ### 2.1 API Inputs
# A **full list of the input parameters** for the `run_X2K()` function is available below.
#
# The optional parameters can be provided to the function in the `options` dictionary.
#
#
# Parameter |
# Step |
# Description |
# Notes |
#
#
#
# **input_genes** (required) |
# X2K |
# Contains the input gene set for the X2K analysis. |
# A list of strings representing the input gene symbols. |
#
#
# *included_organisms* (optional) |
# ChEA |
# The organism from which TF-target interaction data should be integrated. |
# One of `('human_only', 'mouse_only', 'both')`. Default `'both'`. |
#
#
# *included_database* (optional) |
# ChEA |
# The database from which TF-target interaction data should be integrated. |
# One of `('ChEA 2015', 'ENCODE 2015', 'ChEA & ENCODE Consensus', 'Transfac & Jaspar')`. Default `'ChEA 2015'`. |
#
#
# *sort_tfs_by* (optional)
# | ChEA |
# The method used to sort the top Transcription Factors identified by ChEA. |
# One of `('p-value', 'rank', 'combined score')`. Default `'combined score'`. |
#
#
# *path_length* (optional) |
# G2N |
# The maximum Protein-Protein Interaction path length for the network expansion. |
# Integer, default `2`. |
#
#
# *minimum network size* (optional; note that the key is spelled with spaces)
# | G2N |
# The minimum size of the Protein-Protein interaction network generated using Genes2Networks. |
# Integer, default `50`. |
#
#
# *min_number_of_articles_supporting_interaction* (optional)
# | G2N |
# The minimum number of published articles supporting a Protein-Protein Interaction for the expanded subnetwork. |
# Integer, default `2`. |
#
#
# *max_number_of_interactions_per_protein* (optional)
# | G2N |
# The maximum number of physical interactions allowed for the proteins in the expanded subnetwork. |
# Integer, default `200`. |
#
#
# *max_number_of_interactions_per_article* (optional)
# | G2N |
# The maximum number of physical interactions reported in each published article |
# Integer, default `100`. |
#
#
# *biocarta*, *biogrid*, *dip*, *innatedb*, *intact*, *kegg*, *mint*, *ppid*, *snavi* (optional)
# | G2N |
# The Protein-Protein Interaction databases to integrate for generation of the expanded subnetwork. |
# One boolean key per database, passed in the `options` dictionary. Every key defaults to `True`, so all databases are used by default. |
#
#
# *number_of_results* (optional)
# | G2N |
# The maximum network size of the expanded network generated using Genes2Networks. |
# Integer, default `50`. |
#
#
# *sort_kinases_by* (optional)
# | KEA |
# The method used to sort the top kinases identified by KEA. |
# One of `('p-value', 'rank', 'combined score')`. Default `'combined score'`. |
#
#
#
# ### 2.2 API Output
# The `run_X2K()` function returns results as `dict` containing **four keys**, whose contents are described below.
#
#
# Key |
# Notes |
# Contents |
#
#
#
# **ChEA** |
# Contains the results of the **Transcription Factor Enrichment Analysis**, generated using ChEA. |
# A `list` of `dict`s containing information on the top TFs predicted to regulate the input genes. |
#
#
#
# **G2N** |
# Contains the results of the **Protein-Protein Interaction Expansion**, generated using Genes2Networks (G2N). |
# A `dict` containing two keys:
#
# - nodes: A `list` containing information on the nodes of the expanded subnetwork.
# - interactions: A `list` containing information on the edges of the expanded subnetwork.
#
# |
#
#
#
# **KEA** |
# Contains the results of the **Kinase Enrichment Analysis**, generated using KEA. |
# A `list` of `dict`s containing information on the top kinases predicted to regulate the subnetwork identified by G2N. |
#
#
#
# **X2K** |
# Contains the **Expression2Kinases network**, generated by integrating the results of ChEA, G2N and KEA. |
# A `dict` containing two keys:
#
# - nodes: A `list` containing information on the nodes of the final X2K network.
# - interactions: A `list` containing information on the edges of the final X2K network.
#
# |
#
#
#
#
# ## 3. Interpreting the Results
#
# ### 3.1 ChEA results
# The results for the ChEA analysis can be accessed in `x2k_results['ChEA']`. Here, the results are converted to a pandas DataFrame for easier interpretation.
# In[3]:
# Import pandas
import pandas as pd
# Load the ChEA results into a DataFrame
chea_dataframe = pd.DataFrame(data=x2k_results['ChEA'])
# Preview the first five rows (shown as the cell output in a notebook)
chea_dataframe.head(n=5)
# **Table 1 | Results of the ChEA analysis.** Each row represents a transcription factor predicted to regulate the input gene list.
#
# ### 3.2 G2N Results
# The results for the G2N analysis can be accessed in `x2k_results['G2N']`.
#
# The results are stored in a dictionary containing two keys:
# * `nodes`
# * `interactions`
# In[4]:
# Build the G2N node table, leaving out its 'pvalue' column
g2n_nodes_dataframe = pd.DataFrame(data=x2k_results['G2N']['nodes']).drop(columns='pvalue')
# Preview the first five rows (shown as the cell output in a notebook)
g2n_nodes_dataframe.head(n=5)
# **Table 2 | Nodes of the Genes2Networks expanded subnetwork.** Each row represents a node in the expanded subnetwork. The type column indicates whether the node is a Transcription Factor identified by ChEA, or an intermediate protein.
# In[5]:
# Build the G2N edge table from the network's 'interactions' entry
g2n_edges_dataframe = pd.DataFrame(data=x2k_results['G2N']['interactions'])
# Preview the first five rows (shown as the cell output in a notebook)
g2n_edges_dataframe.head(n=5)
# **Table 3 | Edges of the Genes2Networks expanded subnetwork.** Each row represents an edge in the expanded subnetwork generated by G2N on the top transcription factors identified by ChEA.
#
# ### 3.3 KEA Results
# The results for the KEA analysis can be accessed in `x2k_results['KEA']`.
# In[6]:
# Load the KEA results into a DataFrame
kea_dataframe = pd.DataFrame(data=x2k_results['KEA'])
# Preview the first five rows (shown as the cell output in a notebook)
kea_dataframe.head(n=5)
# **Table 4 | Results of the KEA analysis.** Each row represents a protein kinase predicted to regulate the expanded subnetwork generated by G2N.
#
# ### 3.4 X2K Results
# The results for the X2K analysis can be accessed in `x2k_results['X2K']`.
#
# The results are stored in a dictionary containing two keys:
# * `nodes`
# * `interactions`
# In[7]:
# Build the X2K node table, leaving out its 'pvalue' column
x2k_nodes_dataframe = pd.DataFrame(data=x2k_results['X2K']['nodes']).drop(columns='pvalue')
# Preview the first five rows (shown as the cell output in a notebook)
x2k_nodes_dataframe.head(n=5)
# **Table 5 | Nodes of the final Expression2Kinases network.** Each row represents a node in the final X2K network. The type column indicates whether the node is a Transcription Factor identified by ChEA, an intermediate protein identified by G2N, or a protein kinase identified by KEA.
# In[8]:
# Build the X2K edge table from the network's 'interactions' entry
x2k_edges_dataframe = pd.DataFrame(data=x2k_results['X2K']['interactions'])
# Preview the first five rows (shown as the cell output in a notebook)
x2k_edges_dataframe.head(n=5)
# **Table 6 | Edges of the final Expression2Kinases subnetwork.** Each row represents an edge in the final network identified by integrating the results of ChEA, G2N, and KEA.