Building a gene network from the IDR and STRINGdb¶

This notebook exemplifies how to build Figure 1 of the paper using the IDR API.

Dependencies¶

In [1]:

import omero
from idr import connection

import requests
from pandas import DataFrame
from pandas import read_csv
from pandas import concat
from io import StringIO

import numpy as np
from IPython.display import Image, display

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.models import HoverTool
from bokeh.palettes import brewer

output_notebook()

Loading BokehJS ...

Method definitions¶

In [2]:

def getBulkAnnotationAsDf(screenID, conn):
    """Load the 'bulk_annotations' table attached to a screen as a DataFrame.

    Parameters
    ----------
    screenID : id passed to ``conn.getObject('Screen', ...)``.
    conn : connected gateway object (in this notebook, the value returned
        by ``idr.connection()``).

    Returns
    -------
    pandas.DataFrame or None
        One column per table header and one row per table row.  ``None`` is
        returned (after printing a notice) when the annotation file is
        larger than the size limit below.

    Raises
    ------
    ValueError
        If the screen cannot be found or carries no file annotation named
        'bulk_annotations'.
    """
    # Files above this size (in the units reported by getSize()) are skipped
    # rather than downloaded.
    max_size = 147625090

    sc = conn.getObject('Screen', screenID)
    if sc is None:
        raise ValueError('Screen %s not found' % screenID)

    # Locate the original-file id of the first 'bulk_annotations' attachment.
    ofId = None
    for ann in sc.listAnnotations():
        if not isinstance(ann, omero.gateway.FileAnnotationWrapper):
            continue
        annFile = ann.getFile()
        if annFile.getName() != 'bulk_annotations':
            continue
        if annFile.getSize() > max_size:
            print("that's a big file...")
            return None
        ofId = annFile.getId()
        break

    if ofId is None:
        # Previously this fell through and failed with an unbound 'ofId'.
        raise ValueError(
            "Screen %s has no 'bulk_annotations' file annotation" % screenID)

    print(ofId)
    original_file = omero.model.OriginalFileI(ofId, False)

    table = conn.c.sf.sharedResources().openTable(original_file)
    try:
        rowCount = table.getNumberOfRows()

        column_names = [col.name for col in table.getHeaders()]

        # Columns listed here are left out of the result (none by default).
        black_list = []
        kept = [(position, name)
                for position, name in enumerate(column_names)
                if name not in black_list]
        column_indices = [position for position, _ in kept]
        kept_names = [name for _, name in kept]

        # Passing None as the row selection reads every row.
        table_data = table.slice(column_indices, None)
    finally:
        # Always release the server-side table handle.
        table.close()

    # The table is column-oriented; transpose it into a list of rows.
    data = [[column.values[index] for column in table_data.columns]
            for index in range(rowCount)]

    # Supplying the names at construction also works for an empty table.
    return DataFrame(data, columns=kept_names)

In [3]:

def getGenesFromPhenotype(df,phTerm):
    """Return the unique genes annotated with a given phenotype term.

    Parameters
    ----------
    df : DataFrame of bulk annotations.  It must have a 'Gene Identifier'
        column; every column whose name contains 'Term Accession' is
        searched for the term.
    phTerm : term accession to look for (e.g. 'CMPO_0000077').

    Returns
    -------
    numpy array of unique 'Gene Identifier' values, ordered by first match
    (column by column, then row by row).  Empty when the term is not found.
    """
    matches = []
    for col in df.columns:
        if 'Term Accession' not in col:
            continue
        # Keep only the rows carrying this exact term.  The earlier filter
        # (value != '') also picked up rows holding a different term in
        # the same column.
        hits = df[df[col] == phTerm]
        if len(hits) > 0:
            matches.append(hits)

    if not matches:
        # concat() raises on an empty list; return an empty result instead.
        return df['Gene Identifier'].iloc[0:0].unique()

    return concat(matches)['Gene Identifier'].unique()
    

Connect to the IDR server¶

In [4]:

# Open a session using the `connection` helper imported from the `idr` package;
# `conn` is used by the loading cells below and closed once loading is done.
conn = connection()

Connected to IDR...

Querying the IDR¶

We will download the annotations for the three screens under study as pandas DataFrames and, from each, select the genes annotated with the phenotype we are looking for: CMPO_0000077, a.k.a. 'elongated cell phenotype'.

The next step is to build, from that list of genes, a list of IDs with which we can query the STRING database. The translation table was built offline using BioMart and PomBase.

In [5]:

# CMPO term to look for
phTerm = 'CMPO_0000077'

# ids of screens:
# scId = 3  # Graml et al.
# scId = 206  # Rohn et al., B
# scId = 1202  # Fuchs et al., B

screens = [3, 206, 1202]

# Gene identifiers collected across all screens, in screen order
genes = []
for scId in screens:
    print('loading ' + str(scId))

    # loading bulk_annotations of screens as dataframes
    df = getBulkAnnotationAsDf(scId, conn)
    if df is None:
        # The loader returns None when the annotation file is too large;
        # skip the screen instead of failing in the gene lookup below.
        print('skipping ' + str(scId) + ': annotations not loaded')
        continue

    # unique genes with CMPO term
    cur = getGenesFromPhenotype(df, phTerm)
    print('got ' + str(len(cur)) + ' genes')
    genes.extend(cur)

loading 3
14209154
got 39 genes
loading 206
15026339
got 72 genes
loading 1202
14516716
got 99 genes

Disconnect when done loading data¶

In [6]:

# All annotations are now held in local variables, so the session is no longer needed.
conn.close()

Translation¶

In [7]:

# The table was built offline using biomart
dfTrans = read_csv('./includes/TableOfGenesWithElongatedCellPhenotype.csv')

# extract IDs
# For each screen gene (kept in `genes` order), pick the matching rows'
# human ortholog column; .loc avoids chained indexing.
genesE84 = concat([
    dfTrans.loc[dfTrans['Screen GeneID'] == x, 'Human Ortholog Ensembl 84']
    for x in genes
])
# Drop genes without a human ortholog: the '(null)' placeholder and any
# missing values, which would otherwise break building the query URL.
genesE84 = genesE84[genesE84 != '(null)'].dropna()

# Single-argument print() calls behave the same on Python 2 and 3
print(genes[:10])
print(genesE84.head(10))

['SPAC3G9.08', 'SPCC970.10c', 'SPCC553.08c', 'SPAC16C9.05', 'SPAC30.02c', 'SPAC3H8.05c', 'SPCC306.04c', 'SPCC338.16', 'SPCP1E11.06', 'SPAC1556.01c']
15    ENSG00000071243
16    ENSG00000111653
17    ENSG00000168395
18    ENSG00000168556
47    ENSG00000103549
48    ENSG00000155827
44    ENSG00000140598
2     ENSG00000109118
11    ENSG00000198841
40    ENSG00000099381
Name: Human Ortholog Ensembl 84, dtype: object

REST query of STRINGdb¶

We use the STRINGdb REST API to retrieve all known interactions for our genes.

In [8]:

# Building STRINGdb REST api query
url = 'http://string-db.org/api/psi-mi-tab/interactionsList?species=Human%209606&identifiers='
# Uncomment to test with a short query:
# genesE84 = genesE84[:10]

# Every identifier is followed by '%0d' (a URL-encoded carriage return),
# including the last one, exactly as the loop-based version produced.
url = url + ''.join(geneId + '%0d' for geneId in genesE84)
Res = requests.get(url)
# Fail here on an HTTP error instead of parsing an error page as TSV later
Res.raise_for_status()

In [9]:

# Parse the response body as tab-separated text. There is no header row,
# so the columns get integer labels (0, 1, 2, ...).
df = read_csv(StringIO(Res.text), sep='\t', header=None)
# Preview the first ten interactions
df.head(10)

Out[9]:

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14
0	string:9606.ENSP00000340896	string:9606.ENSP00000267197	ASH2L	SETD1B	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.999\|ascore:0.088\|escore:0.962\|dscore:0...
1	string:9606.ENSP00000340896	string:9606.ENSP00000264515	ASH2L	RBBP5	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.999\|ascore:0.201\|escore:0.975\|dscore:0...
2	string:9606.ENSP00000369810	string:9606.ENSP00000220509	VPS16	VPS18	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.999\|ascore:0.18\|escore:0.943\|dscore:0....
3	string:9606.ENSP00000267197	string:9606.ENSP00000264515	SETD1B	RBBP5	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.999\|ascore:0.067\|escore:0.962\|dscore:0...
4	string:9606.ENSP00000325863	string:9606.ENSP00000265335	MRE11A	RAD50	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.999\|nscore:0.309\|ascore:0.46\|escore:0....
5	string:9606.ENSP00000390475	string:9606.ENSP00000264515	CXXC1	RBBP5	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.998\|ascore:0.054\|escore:0.931\|dscore:0...
6	string:9606.ENSP00000390475	string:9606.ENSP00000340896	CXXC1	ASH2L	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.998\|ascore:0.058\|escore:0.953\|dscore:0...
7	string:9606.ENSP00000390475	string:9606.ENSP00000262519	CXXC1	SETD1A	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.998\|ascore:0.181\|escore:0.933\|dscore:0...
8	string:9606.ENSP00000264515	string:9606.ENSP00000262519	RBBP5	SETD1A	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.998\|ascore:0.067\|escore:0.916\|dscore:0...
9	string:9606.ENSP00000362948	string:9606.ENSP00000265350	MED18	MED20	-	-	-	-	-	taxid:9606	taxid:9606	-	-	-	score:0.998\|ascore:0.074\|escore:0.833\|dscore:0...

Some displays¶

A. NetworkX¶

In [11]:

import warnings
# Silence all warnings from here on (process-wide setting)
warnings.filterwarnings('ignore')
# Build the graph with one edge per row of `df`, using columns 2 and 3
# (the two interactor names in the preview above) as the edge endpoints
g = nx.from_pandas_edgelist(df, 2, 3)
plt.figure(figsize = (12, 12))
# Draw the graph with a spring layout, labelling each node
nx.draw_spring(g, with_labels=True)

B. Bokeh¶

In [12]:

# Use spring layout from networkx for display
# (maps every node of `g` to its (x, y) position)
pts = nx.spring_layout(g)

TOOLS = "tap,pan,wheel_zoom,reset"
p = figure(plot_width=800, plot_height=800, title = "My chart", tools=TOOLS)

# Fix the node order once so that positions, colours and names stay aligned
names = list(pts.keys())

# color certain genes of interest
# (values are indices into the palette; empty the dict for a single colour)
cm = brewer['Set1'][5]
highlight = {'ASH2L': 4, 'SETD1A': 2, 'SETD1B': 2}
colors = [cm[highlight.get(name, 1)] for name in names]

# One data source for all nodes. The hover tool reads its 'names' column;
# drawing each circle separately left '@names' with nothing to resolve.
sourceNodes = ColumnDataSource(
    data=dict(
        x=[pts[name][0] for name in names],
        y=[pts[name][1] for name in names],
        names=names,
        node_color=colors,
    ))
cir = p.circle('x', 'y', source=sourceNodes, radius=.01, color='node_color')

# One segment per edge, from the first endpoint's position to the second's
sourceEdges = ColumnDataSource(
    data=dict(
        x1=[pts[x[0]][0] for x in g.edges()],
        y1=[pts[x[0]][1] for x in g.edges()],
        x2=[pts[x[1]][0] for x in g.edges()],
        y2=[pts[x[1]][1] for x in g.edges()]
    ))
seg = p.segment('x1','y1','x2','y2',source=sourceEdges)

# Show the gene name when hovering over a node; edges are excluded
hover = HoverTool(
        renderers=[cir],
        tooltips=[
            ("Name", "@names"),
#            ("Screens names", "@screens"),
        ]
    )
p.add_tools(hover)

show(p)

License¶

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.