This notebook exemplifies how to build Figure 1 of the paper using the IDR API.
import omero
from idr import connection
import requests
from pandas import DataFrame
from pandas import read_csv
from pandas import concat
from io import StringIO
import numpy as np
from IPython.display import Image, display
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.models import HoverTool
from bokeh.palettes import brewer
output_notebook()
def getBulkAnnotationAsDf(screenID, conn):
    """Load the 'bulk_annotations' table of a screen into a pandas DataFrame.

    The first file annotation named 'bulk_annotations' found on the screen
    is opened as a table through the connection's shared resources and all
    of its columns are read.

    Args:
        screenID: ID of the 'Screen' object to look up through `conn`.
        conn: connection object exposing `getObject` and
            `c.sf.sharedResources()` (presumably the gateway returned by
            `idr.connection()` -- confirm against the caller).

    Returns:
        A DataFrame with one column per table column, named after the table
        headers, or None when the annotation file is larger than the
        hard-coded size limit.

    Raises:
        ValueError: if the screen carries no 'bulk_annotations' file
            annotation.
    """
    sc = conn.getObject('Screen', screenID)

    ofId = None
    for ann in sc.listAnnotations():
        if not isinstance(ann, omero.gateway.FileAnnotationWrapper):
            continue
        annFile = ann.getFile()
        if annFile.getName() != 'bulk_annotations':
            continue
        # Refuse very large tables instead of pulling them over the wire
        # (limit is compared against getSize(); presumably bytes).
        if annFile.getSize() > 147625090:
            print("that's a big file...")
            return None
        ofId = annFile.getId()
        break

    if ofId is None:
        # Without this guard the code below would fail with a NameError.
        raise ValueError(
            "no 'bulk_annotations' file annotation on screen %s" % screenID)

    print(ofId)

    original_file = omero.model.OriginalFileI(ofId, False)
    table = conn.c.sf.sharedResources().openTable(original_file)
    try:
        column_names = [col.name for col in table.getHeaders()]
        # Request every column; passing None for the rows is what the
        # original code used to read the whole table.
        column_indices = list(range(len(column_names)))
        table_data = table.slice(column_indices, None)
    finally:
        # Always release the table handle, even if reading failed.
        table.close()

    # The table comes back column-wise; transpose it into row records.
    data = [list(row)
            for row in zip(*[column.values for column in table_data.columns])]

    # Passing the names here (rather than assigning them afterwards) also
    # works when the table has no rows.
    return DataFrame(data, columns=column_names)
def getGenesFromPhenotype(df, phTerm):
    """Return the unique genes annotated with a given phenotype term.

    Every column of `df` whose name contains 'Term Accession' is searched
    for rows whose value equals `phTerm`; the 'Gene Identifier' values of
    those rows are collected.

    Args:
        df: bulk annotation DataFrame; must have a 'Gene Identifier' column.
        phTerm: phenotype term accession to look for (e.g. a CMPO term).

    Returns:
        Array of unique gene identifiers, ordered by matching column first
        and row second. Empty when no column contains `phTerm`.
    """
    matches = []
    for col in df.columns:
        if 'Term Accession' not in col:
            continue
        # Select only the rows carrying this exact term, so that other
        # terms stored in the same column are not picked up.
        hits = df[col] == phTerm
        if hits.any():
            matches.append(df.loc[hits, 'Gene Identifier'])

    if not matches:
        # concat() refuses an empty list; return an empty result instead.
        return df['Gene Identifier'].iloc[:0].unique()

    return concat(matches).unique()
# Open a connection with the `idr` helper; it is closed further down once
# the screen annotations have been loaded.
conn = connection()
Connected to IDR...
We will download the annotations for the three screens under study as pandas DataFrames, and sub-select from each the genes annotated with the phenotype we are looking for: CMPO_0000077, a.k.a. 'elongated cell phenotype'.
The next step is to build, from that list, a list of IDs we can query the STRING database with. The translation table was built offline using biomart and pombase.
# CMPO term to look for
phTerm = 'CMPO_0000077'

# ids of screens:
#   3     Graml et al.
#   206   Rohn et al., B
#   1202  Fuchs et al., B
screens = [3, 206, 1202]

# Gene identifiers annotated with phTerm, accumulated over all screens.
genes = []
for scId in screens:
    print('loading ' + str(scId))
    # loading bulk_annotations of screens as dataframes
    df = getBulkAnnotationAsDf(scId, conn)
    if df is None:
        # getBulkAnnotationAsDf returns None when it refuses to load an
        # oversized annotation file; skip the screen instead of crashing.
        print('skipping ' + str(scId) + ': annotations not loaded')
        continue
    # unique genes with CMPO term
    cur = getGenesFromPhenotype(df, phTerm)
    print('got ' + str(len(cur)) + ' genes')
    genes.extend(cur)
loading 3 14209154 got 39 genes loading 206 15026339 got 72 genes loading 1202 14516716 got 99 genes
# The annotations are in memory now, so the connection can be released.
conn.close()

# Translation table from screen gene IDs to human Ensembl 84 orthologs.
# The table was built offline using biomart.
dfTrans = read_csv('./includes/TableOfGenesWithElongatedCellPhenotype.csv')

# extract IDs: one lookup per screen gene, keeping the order of `genes`
genesE84 = concat([
    dfTrans.loc[dfTrans['Screen GeneID'] == x, 'Human Ortholog Ensembl 84']
    for x in genes
])
# Drop the '(null)' placeholder entries (presumably genes without a mapped
# human ortholog -- confirm against the CSV).
genesE84 = genesE84[genesE84 != '(null)']

# Single-argument print() behaves the same on Python 2 and Python 3.
print(genes[:10])
print(genesE84.head(10))
['SPAC3G9.08', 'SPCC970.10c', 'SPCC553.08c', 'SPAC16C9.05', 'SPAC30.02c', 'SPAC3H8.05c', 'SPCC306.04c', 'SPCC338.16', 'SPCP1E11.06', 'SPAC1556.01c'] 15 ENSG00000071243 16 ENSG00000111653 17 ENSG00000168395 18 ENSG00000168556 47 ENSG00000103549 48 ENSG00000155827 44 ENSG00000140598 2 ENSG00000109118 11 ENSG00000198841 40 ENSG00000099381 Name: Human Ortholog Ensembl 84, dtype: object
We use the STRINGdb REST API to get all the known interactions of all of our genes.
# Building STRINGdb REST api query
url = 'http://string-db.org/api/psi-mi-tab/interactionsList?species=Human%209606&identifiers='
# For a quick trial run, restrict the query first, e.g.:
#   genesE84 = genesE84[:10]
# Each identifier is followed by the '%0d' separator, exactly as before.
url = url + ''.join(gene_id + '%0d' for gene_id in genesE84)

Res = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page as TSV.
Res.raise_for_status()

# The response is tab-separated and has no header row.
df = read_csv(StringIO(Res.text), sep='\t', header=None)
df.head(10)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | string:9606.ENSP00000340896 | string:9606.ENSP00000267197 | ASH2L | SETD1B | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.999|ascore:0.088|escore:0.962|dscore:0... |
1 | string:9606.ENSP00000340896 | string:9606.ENSP00000264515 | ASH2L | RBBP5 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.999|ascore:0.201|escore:0.975|dscore:0... |
2 | string:9606.ENSP00000369810 | string:9606.ENSP00000220509 | VPS16 | VPS18 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.999|ascore:0.18|escore:0.943|dscore:0.... |
3 | string:9606.ENSP00000267197 | string:9606.ENSP00000264515 | SETD1B | RBBP5 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.999|ascore:0.067|escore:0.962|dscore:0... |
4 | string:9606.ENSP00000325863 | string:9606.ENSP00000265335 | MRE11A | RAD50 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.999|nscore:0.309|ascore:0.46|escore:0.... |
5 | string:9606.ENSP00000390475 | string:9606.ENSP00000264515 | CXXC1 | RBBP5 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.998|ascore:0.054|escore:0.931|dscore:0... |
6 | string:9606.ENSP00000390475 | string:9606.ENSP00000340896 | CXXC1 | ASH2L | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.998|ascore:0.058|escore:0.953|dscore:0... |
7 | string:9606.ENSP00000390475 | string:9606.ENSP00000262519 | CXXC1 | SETD1A | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.998|ascore:0.181|escore:0.933|dscore:0... |
8 | string:9606.ENSP00000264515 | string:9606.ENSP00000262519 | RBBP5 | SETD1A | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.998|ascore:0.067|escore:0.916|dscore:0... |
9 | string:9606.ENSP00000362948 | string:9606.ENSP00000265350 | MED18 | MED20 | - | - | - | - | - | taxid:9606 | taxid:9606 | - | - | - | score:0.998|ascore:0.074|escore:0.833|dscore:0... |
import warnings

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings(action='ignore')

# Build an undirected graph whose edges connect the values of columns 2
# and 3 of the STRING result table, then draw it with a spring layout.
g = nx.from_pandas_edgelist(df, source=2, target=3)
plt.figure(figsize=(12, 12))
nx.draw_spring(g, with_labels=True)
# Use spring layout from networkx for display
pts = nx.spring_layout(g)

TOOLS = "tap,pan,wheel_zoom,reset"
p = figure(plot_width=800, plot_height=800, title="My chart", tools=TOOLS)

# Fix one node order and derive every per-node column from it, so names,
# coordinates and colors stay aligned.
names = list(pts.keys())

# color certain genes of interest: gene name -> index into the palette.
# Empty this mapping if no highlighting is needed; every other node uses
# palette index 1.
highlighted = {'ASH2L': 4, 'SETD1A': 2, 'SETD1B': 2}
cm = brewer['Set1'][5]
colors = [cm[highlighted.get(name, 1)] for name in names]

# Nodes are drawn from a data source so that the hover tool can resolve
# "@names"; drawing them one by one with scalar coordinates leaves no
# 'names' column for the tooltip to read.
sourceNode = ColumnDataSource(
    data=dict(
        x=[pts[name][0] for name in names],
        y=[pts[name][1] for name in names],
        names=names,
        colors=colors,
    ))
cir = p.circle('x', 'y', source=sourceNode, radius=.01, color='colors')

# One segment per graph edge, from the layout position of its first node
# to that of its second node.
edges = list(g.edges())
sourceEdges = ColumnDataSource(
    data=dict(
        x1=[pts[a][0] for a, b in edges],
        y1=[pts[a][1] for a, b in edges],
        x2=[pts[b][0] for a, b in edges],
        y2=[pts[b][1] for a, b in edges],
    ))
seg = p.segment('x1', 'y1', 'x2', 'y2', source=sourceEdges)

# Only the node glyphs carry a 'names' column, so limit hovering to them.
hover = HoverTool(
    tooltips=[
        ("Name", "@names"),
    ],
    renderers=[cir],
)
p.add_tools(hover)
show(p)
Copyright (C) 2016 University of Dundee. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.