#!/usr/bin/env python # coding: utf-8 # # Building a gene network from the IDR and STRINGdb # # This notebook exemplifies how to build Figure 1 of the paper using the IDR API. # ### Dependencies # # * [Matplotlib](http://matplotlib.org/) # * [NumPy](http://www.numpy.org/) # * [Pandas](http://pandas.pydata.org/) # * [NetworkX](https://networkx.github.io/) # * [Py2Cytoscape](https://pypi.python.org/pypi/py2cytoscape) # * [Requests](http://docs.python-requests.org/) # In[1]: import omero from idr import connection import requests from pandas import DataFrame from pandas import read_csv from pandas import concat from io import StringIO import numpy as np from IPython.display import Image, display import networkx as nx import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') from bokeh.models import ColumnDataSource from bokeh.plotting import figure, output_notebook, show, save from bokeh.models import HoverTool from bokeh.palettes import brewer output_notebook() # ### Method definitions # In[2]: def getBulkAnnotationAsDf(screenID, conn): # ofId=8118685 sc = conn.getObject('Screen', screenID) for ann in sc.listAnnotations(): if isinstance(ann, omero.gateway.FileAnnotationWrapper): if (ann.getFile().getName() == 'bulk_annotations'): if (ann.getFile().getSize() > 147625090): print "that's a big file..." return None ofId = ann.getFile().getId() break print ofId original_file = omero.model.OriginalFileI(ofId, False) table = conn.c.sf.sharedResources().openTable(original_file) try: rowCount = table.getNumberOfRows() column_names = [col.name for col in table.getHeaders()] black_list = [] column_indices = [] for column_name in column_names: if column_name in black_list: continue column_indices.append(column_names.index(column_name)) table_data = table.slice(column_indices, None) finally: table.close() data = [] for index in range(rowCount): row_values = [column.values[index] for column in table_data.columns] data.append(row_values) dfAnn = DataFrame(data) dfAnn.columns=column_names return dfAnn # In[3]: def getGenesFromPhenotype(df,phTerm): # all gene from bulk_annotation 'df' annotated with CMPO term 'phTerm' colElong = [] for col in df.columns: if 'Term Accession' in col: if phTerm in df[col].unique(): colElong.append(col) dfElong = concat([df[df[col] != ''] for col in colElong]) return dfElong['Gene Identifier'].unique() # ### Connect to the IDR server # In[4]: conn = connection() # ## Querying the IDR # # We will download the annotations for the three screens under study as panda DataFrames, and sub-select the genes from each which are annotated with the phenotype we are looking for, CMPO_0000077 a.k.a. 'elongated cell phenotype'. # # The next step is to build, from that list, a list of IDs we can query the STRING database with. The translation table was built offline using biomart and pombase. # In[5]: # CMPO term to look for phTerm = 'CMPO_0000077' # ids of screens: # scId = 3 # Graml et al. # scId = 206 # Rohn et al., B # scId = 1202 # Fuchs et al., B screens = [3, 206, 1202] genes = [] for scId in screens: print 'loading ' + str(scId) # loading bulk_annotations of screens as dataframes df = getBulkAnnotationAsDf(scId, conn) # unique genes with CPMO term cur = getGenesFromPhenotype(df, phTerm) print 'got ' + str(len(cur)) + ' genes' genes.extend(cur) # ### Disconnect when done loading data # In[6]: conn.close() # ### Translation # In[7]: # The table was built offline using biomart dfTrans = read_csv('./includes/TableOfGenesWithElongatedCellPhenotype.csv') # extract IDs genesE84 = concat([dfTrans[dfTrans['Screen GeneID']==x]['Human Ortholog Ensembl 84'] for x in genes]) genesE84 = genesE84[genesE84!='(null)'] print genes[:10] print genesE84.head(10) # ### REST query of STRINGdb # # We use the STRINGdb REST API to get all the known interactions of all of our genes # # In[8]: # Building STRINGdb REST api query url = 'http://string-db.org/api/psi-mi-tab/interactionsList?species=Human%209606&identifiers=' # genesE84 = genesE84[:10] for g in genesE84: url = url + g + '%0d' Res = requests.get(url) # In[9]: df = read_csv(StringIO(Res.text), sep='\t', header=None) df.head(10) # ### Some displays # #### A. NetworkX # In[11]: import warnings warnings.filterwarnings('ignore') g = nx.from_pandas_edgelist(df, 2, 3) plt.figure(figsize = (12, 12)) nx.draw_spring(g, with_labels=True) # #### B. Bokeh # In[12]: # Use spring layout from networkx for display pts = nx.spring_layout(g) TOOLS = "tap,pan,wheel_zoom,reset" p = figure(plot_width=800, plot_height=800, title = "My chart", tools=TOOLS) # color certain genes of interest # comment out this section if not needed colors = np.ones(len(g.nodes())) colors[[x=='ASH2L' for x in pts.keys()]] = 4 colors[[x=='SETD1A' for x in pts.keys()]] = 2 colors[[x=='SETD1B' for x in pts.keys()]] = 2 cm = brewer['Set1'][5] colors = [cm[int(x)] for x in colors] names = pts.keys() cntr = 0 for x in pts.itervalues(): cir = p.circle(x=x[0], y=x[1], radius=.01, color=colors[cntr], name=names[cntr]) cntr = cntr+1 # uncomment next line if not # coloring genes of interest # cir=p.circle('x','y',source=sourceNode,radius=.01) sourceEdges = ColumnDataSource( data=dict( x1=[pts[x[0]][0] for x in g.edges()], y1=[pts[x[0]][1] for x in g.edges()], x2=[pts[x[1]][0] for x in g.edges()], y2=[pts[x[1]][1] for x in g.edges()] )) seg = p.segment('x1','y1','x2','y2',source=sourceEdges) hover = HoverTool( tooltips=[ ("Name", "@names"), # ("Screens names", "@screens"), ] ) p.add_tools(hover) show(p) # ### License # # Copyright (C) 2016 University of Dundee. All Rights Reserved. # # This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.