#!/usr/bin/env python # coding: utf-8 # # X2K API Tutorial Notebook # April 9th, 2018 # # This Jupyter Notebook contains an interactive tutorial for **running the Expression2Kinases (X2K) API** using Python 3. # # ### Table of Contents # The notebook contains the following sections: # 1. **API Documentation** - shows how to programmatically analyze your gene list in Python. # 2. **Using the X2K API** - overview of the input parameters and output of the API. # 3. **Interpreting the results** - gives an overview of the structure and meaning of the analysis results. # * **Transcription Factor Enrichment Analysis** (ChEA) # * **Protein-Protein Interaction Expansion** (G2N) # * **Kinase Enrichment Analysis** (KEA) # * **Expression2Kinases** (X2K) # ## 1. Using the X2K API # The X2K API allows for programmatic analysis of an input gene list. # # The `run_X2K()` function displayed below can be used to analyze a gene list and load the results in a Python dictionary by performing a **POST request**. # # The function requires only one input, `input_genes`, **a list of gene symbols ** to be analyzed. Additional optional parameters can be specified with the `options` parameters. # In[1]: # Import modules import http.client import json ##### Function to run X2K ### Input: a Python list of gene symbols ### Output: a dictionary containing the results of X2K, ChEA, G2N, KEA. def run_X2K(input_genes, options={}): # Open HTTP connection conn = http.client.HTTPConnection("amp.pharm.mssm.edu") # # Get default options default_options = {'text-genes': '\n'.join(input_genes), 'included_organisms': 'both', 'included_database': 'ChEA 2015', 'path_length': 2, 'minimum network size': 50, 'min_number_of_articles_supporting_interaction': 2, 'max_number_of_interactions_per_protein': 200, 'max_number_of_interactions_per_article': 100, 'biocarta': True, 'biogrid': True, 'dip': True, 'innatedb': True, 'intact': True, 'kegg': True, 'mint': True, 'ppid': True, 'snavi': True, 'number_of_results': 50, 'sort_tfs_by': 'combined score', 'sort_kinases_by': 'combined score'} # Update options for key, value in options.items(): if key in default_options.keys() and key != 'text-genes': default_options.update({key: value}) # Get payload boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW" payload = ''.join(['--'+boundary+'\r\nContent-Disposition: form-data; name=\"{key}\"\r\n\r\n{value}\r\n'.format(**locals()) for key, value in default_options.items()])+'--'+boundary+'--' # Get Headers headers = { 'content-type': "multipart/form-data; boundary="+boundary, 'cache-control': "no-cache", } # Initialize connection conn.request("POST", "/X2K/api", payload, headers) # Get response res = conn.getresponse() # Read response data = res.read().decode('utf-8') # Convert to dictionary x2k_results = {key: json.loads(value) if key != 'input' else value for key, value in json.loads(data).items()} # Clean results x2k_results['ChEA'] = x2k_results['ChEA']['tfs'] x2k_results['G2N'] = x2k_results['G2N']['network'] x2k_results['KEA'] = x2k_results['KEA']['kinases'] x2k_results['X2K'] = x2k_results['X2K']['network'] # Return results return x2k_results # In[2]: # Get input genes input_genes = ['Nsun3', 'Polrmt', 'Nlrx1', 'Sfxn5', 'Zc3h12c', 'Slc25a39', 'Arsg', 'Defb29', 'Ndufb6', 'Zfand1', 'Tmem77', '5730403B10Rik', 'Tlcd1', 'Psmc6', 'Slc30a6', 'LOC100047292', 'Lrrc40', 'Orc5l', 'Mpp7', 'Unc119b', 'Prkaca', 'Tcn2', 'Psmc3ip', 'Pcmtd2', 'Acaa1a', 'Lrrc1', '2810432D09Rik', 'Sephs2', 'Sac3d1', 'Tmlhe', 'LOC623451', 'Tsr2', 'Plekha7', 'Gys2', 'Arhgef12', 'Hibch', 'Lyrm2', 'Zbtb44', 'Entpd5', 'Rab11fip2', 'Lipt1', 'Intu', 'Anxa13', 'Klf12', 'Sat2', 'Gal3st2', 'Vamp8', 'Fkbpl', 'Aqp11', 'Trap1', 'Pmpcb', 'Tm7sf3', 'Rbm39', 'Bri3', 'Kdr', 'Zfp748', 'Nap1l1', 'Dhrs1', 'Lrrc56', 'Wdr20a', 'Stxbp2', 'Klf1', 'Ufc1', 'Ccdc16', '9230114K14Rik', 'Rwdd3', '2610528K11Rik', 'Aco1', 'Cables1', 'LOC100047214', 'Yars2', 'Lypla1', 'Kalrn', 'Gyk', 'Zfp787', 'Zfp655', 'Rabepk', 'Zfp650', '4732466D17Rik', 'Exosc4', 'Wdr42a', 'Gphn', '2610528J11Rik', '1110003E01Rik', 'Mdh1', '1200014M14Rik', 'AW209491', 'Mut', '1700123L14Rik', '2610036D13Rik', 'Cox15', 'Tmem30a', 'Nsmce4a', 'Tm2d2', 'Rhbdd3', 'Atxn2', 'Nfs1', '3110001I20Rik', 'BC038156', 'LOC100047782', '2410012H22Rik', 'Rilp', 'A230062G08Rik', 'Pttg1ip', 'Rab1', 'Afap1l1', 'Lyrm5', '2310026E23Rik', 'C330002I19Rik', 'Zfyve20', 'Poli', 'Tomm70a', 'Slc7a6os', 'Mat2b', '4932438A13Rik', 'Lrrc8a', 'Smo', 'Nupl2', 'Trpc2', 'Arsk', 'D630023B12Rik', 'Mtfr1', '5730414N17Rik', 'Scp2', 'Zrsr1', 'Nol7', 'C330018D20Rik', 'Ift122', 'LOC100046168', 'D730039F16Rik', 'Scyl1', '1700023B02Rik', '1700034H14Rik', 'Fbxo8', 'Paip1', 'Tmem186', 'Atpaf1', 'LOC100046254', 'LOC100047604', 'Coq10a', 'Fn3k', 'Sipa1l1', 'Slc25a16', 'Slc25a40', 'Rps6ka5', 'Trim37', 'Lrrc61', 'Abhd3', 'Gbe1', 'Parp16', 'Hsd3b2', 'Esm1', 'Dnajc18', 'Dolpp1', 'Lass2', 'Wdr34', 'Rfesd', 'Cacnb4', '2310042D19Rik', 'Srr', 'Bpnt1', '6530415H11Rik', 'Clcc1', 'Tfb1m', '4632404H12Rik', 'D4Bwg0951e', 'Med14', 'Adhfe1', 'Thtpa', 'Cat', 'Ell3', 'Akr7a5', 'Mtmr14', 'Timm44', 'Sf1', 'Ipp', 'Iah1', 'Trim23', 'Wdr89', 'Gstz1', 'Cradd', '2510006D16Rik', 'Fbxl6', 'LOC100044400', 'Zfp106', 'Cd55', '0610013E23Rik', 'Afmid', 'Tmem86a', 'Aldh6a1', 'Dalrd3', 'Smyd4', 'Nme7', 'Fars2', 'Tasp1', 'Cldn10', 'A930005H10Rik', 'Slc9a6', 'Adk', 'Rbks', '2210016F16Rik', 'Vwce', '4732435N03Rik', 'Zfp11', 'Vldlr', '9630013D21Rik', '4933407N01Rik', 'Fahd1', 'Mipol1', '1810019D21Rik', '1810049H13Rik', 'Tfam', 'Paics', '1110032A03Rik', 'LOC100044139', 'Dnajc19', 'BC016495', 'A930041I02Rik', 'Rqcd1', 'Usp34', 'Zcchc3', 'H2afj', 'Phf7', '4921508D12Rik', 'Kmo', 'Prpf18', 'Mcat', 'Txndc4', '4921530L18Rik', 'Vps13b', 'Scrn3', 'Tor1a', 'AI316807', 'Acbd4', 'Fah', 'Apool', 'Col4a4', 'Lrrc19', 'Gnmt', 'Nr3c1', 'Sip1', 'Ascc1', 'Fech', 'Abhd14a', 'Arhgap18', '2700046G09Rik', 'Yme1l1', 'Gk5', 'Glo1', 'Sbk1', 'Cisd1', '2210011C24Rik', 'Nxt2', 'Notum', 'Ankrd42', 'Ube2e1', 'Ndufv1', 'Slc33a1', 'Cep68', 'Rps6kb1', 'Hyi', 'Aldh1a3', 'Mynn', '3110048L19Rik', 'Rdh14', 'Proz', 'Gorasp1', 'LOC674449', 'Zfp775', '5430437P03Rik', 'Npy', 'Adh5', 'Sybl1', '4930432O21Rik', 'Nat9', 'LOC100048387', 'Mettl8', 'Eny2', '2410018G20Rik', 'Pgm2', 'Fgfr4', 'Mobkl2b', 'Atad3a', '4932432K03Rik', 'Dhtkd1', 'Ubox5', 'A530050D06Rik', 'Zdhhc5', 'Mgat1', 'Nudt6', 'Tpmt', 'Wbscr18', 'LOC100041586', 'Cdk5rap1', '4833426J09Rik', 'Myo6', 'Cpt1a', 'Gadd45gip1', 'Tmbim4', '2010309E21Rik', 'Asb9', '2610019F03Rik', '7530414M10Rik', 'Atp6v1b2', '2310068J16Rik', 'Ddt', 'Klhdc4', 'Hpn', 'Lifr', 'Ovol1', 'Nudt12', 'Cdan1', 'Fbxo9', 'Fbxl3', 'Hoxa7', 'Aldh8a1', '3110057O12Rik', 'Abhd11', 'Psmb1', 'ENSMUSG00000074286', 'Chpt1', 'Oxsm', '2310009A05Rik', '1700001L05Rik', 'Zfp148', '39509', 'Mrpl9', 'Tmem80', '9030420J04Rik', 'Naglu', 'Plscr2', 'Agbl3', 'Pex1', 'Cno', 'Neo1', 'Asf1a', 'Tnfsf5ip1', 'Pkig', 'AI931714', 'D130020L05Rik', 'Cntd1', 'Clec2h', 'Zkscan1', '1810044D09Rik', 'Mettl7a', 'Siae', 'Fbxo3', 'Fzd5', 'Tmem166', 'Tmed4', 'Gpr155', 'Rnf167', 'Sptlc1', 'Riok2', 'Tgds', 'Pms1', 'Pitpnc1', 'Pcsk7', '4933403G14Rik', 'Ei24', 'Crebl2', 'Tln1', 'Mrpl35', '2700038C09Rik', 'Ubie', 'Osgepl1', '2410166I05Rik', 'Wdr24', 'Ap4s1', 'Lrrc44', 'B3bp', 'Itfg1', 'Dmxl1', 'C1d'] # Run X2K results x2k_results = run_X2K(input_genes) x2k_results.keys() # ## 2. X2K API Documentation # # ### 2.1 API Inputs # A **full list of the input parameters** for the `run_X2K()` function is available below. # # The optional parameters can provided to the function in the `options` dictionary. # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
ParameterStepDescriptionNotes
**input_genes** (required)X2KContains the input gene set for the X2K analysis.A list of strings representing the input gene symbols.
*organism* (optional)ChEAThe organism from which TF-target interaction data should be integrated.One of `('human_only', 'mouse_only', 'both')`. Default `'both'`.
*included_database* (optional)ChEAThe database from which TF-target interaction data should be integrated,One of `('ChEA 2015', 'ENCODE 2015', 'ChEA & ENCODE Consensus', 'Transfac & Jaspar')` Default `'ENCODE 2015'`.
*sort_tfs_by* (optional) # ChEAThe method used to sort the top Transcription Factors identified by ChEA.One of `('p-value', 'rank', 'combined score')`. Default `'p-value'`.
*path_length* (optional)G2NThe maximum Protein-Protein Interaction path length for the network expansion.Integer, default `2`.
*minimum_network_size* (optional) # G2NThe minimum size of the Protein-Protein interaction network generated using Genes2Networks.Integer, default `50`.
*min_number_of_articles_supporting_interaction* (optional) # G2NThe minimum number of published articles supporting a Protein-Protein Interaction for the expanded subnetwork.Integer, default `2`.
*max_number_of_interactions_per_protein* (optional) # G2NThe maximum number of physical interactions allowed for the proteins in the expanded subnetwork.Integer, default `200`.
*max_number_of_interactions_per_article* (optional) # G2NThe maximum number of physical interactions reported in each published articleInteger, default `100`.
*ppi_networks* (optional) # G2NThe Protein-Protein Interaction databases to integrate for generation of the expanded subnetwork.Either `'all'`, or a list containing one or more of `('biocarta', 'biogrid', 'dip', 'innatedb', 'intact', 'kegg', 'mint', 'ppid', 'snavi')`. Default `'all'`.
*number_of_results* (optional) # G2NThe maximum network size of the expanded network generated using Genes2Networks.Integer, default `50`.
*sort_kinases_by* (optional) # KEAThe method used to sort the top Transcription Factors identified by KEA.One of `('p-value', 'rank', 'combined score')`. Default `'p-value'`.
# # ### 2.2 API Output # The `run_X2K()` function returns results as `dict` containing **four keys**, whose contents are described below. # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
KeyNotesContents
**ChEA**Contains the results of the **Transcription Factor Enrichment Analysis**, generated using ChEA.A `list` of `dict`s containing information on the top TFs predicted to regulate the input genes.
**G2N**Contains the results of the **Protein-Protein Interaction Expansion**, generated using Genes2Networks (G2N).A `dict` containing two keys: #
    #
  • nodes: A `list` containing information on the nodes of the expanded subnetwork.
  • #
  • interactions: A `list` containing information on the edges of the expanded subnetwork.
  • #
#
**KEA**Contains the results of the **Kinase Enrichment Analysis**, generated using KEA.A `list` of `dict`s containing information on the top kinases predicted to regulate the subnetwork identified by G2N.
**X2K**Contains the **Expression2Kinases network**, generated by integrating the results of ChEA, G2N and KEA.A `dict` containing two keys: #
    #
  • nodes: A `list` containing information on the nodes of the final X2K network.
  • #
  • interactions: A `list` containing information on the edges of the final X2K network.
  • #
#
# # ## 3. Interpreting the Results # # ### 3.1 ChEA results # The results for the ChEA analysis can be accessed in x2k_results['ChEA']. Here, the results are converted to a pandas DataFrame for easier interpretation. # In[3]: # Import pandas import pandas as pd # Read results chea_dataframe = pd.DataFrame(x2k_results['ChEA']) chea_dataframe.head() # ** Table 1 | Results of the ChEA analysis. ** Each row represents a transcription factor predicted to regulate the input gene list. # # ### 3.2 G2N Results # The results for the G2N analysis can be accessed in x2k_results['G2N']. # # The results are stored in a dictionary containing two keys: # * `edges` # * `interactions` # In[4]: # G2N nodes dataframe g2n_nodes_dataframe = pd.DataFrame(x2k_results['G2N']['nodes']).drop('pvalue', axis=1) g2n_nodes_dataframe.head() # ** Table 2 | Nodes of the Genes2Networks expanded subnetwork. ** Each row represents a node in the expanded subnetwork. The type column indicates whether the node is a Transcription Factor identified by ChEA, or an intermediate protein. # In[5]: # G2N edges dataframe g2n_edges_dataframe = pd.DataFrame(x2k_results['G2N']['interactions']) g2n_edges_dataframe.head() # ** Table 3 | Edges of the Genes2Networks expanded subnetwork. ** Each row represents an edge in the expanded subnetwork generated by G2N on the top transcription factors identified by ChEA. # # ### 3.3 KEA Results # The results for the KEA analysis can be accessed in x2k_results['KEA']. # In[6]: # KEA Results kea_dataframe = pd.DataFrame(x2k_results['KEA']) kea_dataframe.head() # ** Table 4 | Results of the KEA analysis. ** Each row represents a protein kinase predicted to regulate the expanded subnetwork generated by G2N. # # ### 3.4 X2K Results # The results for the X2K analysis can be accessed in x2k_results['X2K']. # # The results are stored in a dictionary containing two keys: # * `nodes` # * `interactions` # In[7]: # X2K nodes dataframe x2k_nodes_dataframe = pd.DataFrame(x2k_results['X2K']['nodes']).drop('pvalue', axis=1) x2k_nodes_dataframe.head() # ** Table 5 | Nodes of the final Expression2Kinases network. ** Each row represents a node in the final X2K network network. The type column indicates whether the node is a Transcription Factor identified by ChEA, an intermediate protein identified by G2N, or a protein kinase identified by KEA. # In[8]: # X2K edges dataframe x2k_edges_dataframe = pd.DataFrame(x2k_results['X2K']['interactions']) x2k_edges_dataframe.head() # ** Table 6 | Edges of the final Expression2Kinases subnetwork. ** Each row represents an edge in the final network identified by integrating the results of ChEA, G2N, and KEA.