#!/usr/bin/env python
# coding: utf-8

# # Building a gene network from the IDR and STRINGdb
# 
# This notebook exemplifies how to build Figure 1 of the paper using the IDR API. 

# ### Dependencies
# 
#  * [Matplotlib](http://matplotlib.org/)
#  * [NumPy](http://www.numpy.org/)
#  * [Pandas](http://pandas.pydata.org/)
#  * [NetworkX](https://networkx.github.io/)
#  * [Py2Cytoscape](https://pypi.python.org/pypi/py2cytoscape)
#  * [Requests](http://docs.python-requests.org/)

# In[1]:


import omero
from idr import connection

import requests
from pandas import DataFrame
from pandas import read_csv
from pandas import concat
from io import StringIO

import numpy as np
from IPython.display import Image, display

import networkx as nx
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show, save
from bokeh.models import HoverTool
from bokeh.palettes import brewer

output_notebook()


# ### Method definitions

# In[2]:


def getBulkAnnotationAsDf(screenID, conn):
#    ofId=8118685
    sc = conn.getObject('Screen', screenID)
    for ann in sc.listAnnotations():
        if isinstance(ann, omero.gateway.FileAnnotationWrapper):
            if (ann.getFile().getName() == 'bulk_annotations'):
                if (ann.getFile().getSize() > 147625090):
                    print "that's a big file..."
                    return None
                ofId = ann.getFile().getId()
                break

    print ofId
    original_file = omero.model.OriginalFileI(ofId, False)

    table = conn.c.sf.sharedResources().openTable(original_file)
    try:
        rowCount = table.getNumberOfRows()

        column_names = [col.name for col in table.getHeaders()]

        black_list = []
        column_indices = []
        for column_name in column_names:
            if column_name in black_list:
                continue
            column_indices.append(column_names.index(column_name))

        table_data = table.slice(column_indices, None)
    finally:
        table.close()

    data = []
    for index in range(rowCount):
        row_values = [column.values[index] for column in table_data.columns]
        data.append(row_values)

    dfAnn = DataFrame(data)
    dfAnn.columns=column_names
    return dfAnn


# In[3]:


def getGenesFromPhenotype(df,phTerm):
# all gene from bulk_annotation 'df' annotated with CMPO term 'phTerm'
    colElong = []
    for col in df.columns:
        if 'Term Accession' in col:
            if phTerm in df[col].unique():
                colElong.append(col)

    dfElong = concat([df[df[col] != ''] for col in colElong])
    return dfElong['Gene Identifier'].unique()
    

# ### Connect to the IDR server

# In[4]:


conn = connection()


# ## Querying the IDR
# 
# We will download the annotations for the three screens under study as panda DataFrames, and sub-select the genes from each which are annotated with the phenotype we are looking for, CMPO_0000077 a.k.a. 'elongated cell phenotype'.
# 
# The next step is to build, from that list, a list of IDs we can query the STRING database with. The translation table was built offline using biomart and pombase. 

# In[5]:


# CMPO term to look for
phTerm = 'CMPO_0000077'

# ids of screens:
# scId = 3  # Graml et al.
# scId = 206  # Rohn et al., B
# scId = 1202  # Fuchs et al., B

screens = [3, 206, 1202]

genes = []
for scId in screens:
    print 'loading ' + str(scId)

    # loading bulk_annotations of screens as dataframes
    df = getBulkAnnotationAsDf(scId, conn)
    
    # unique genes with CPMO term
    cur = getGenesFromPhenotype(df, phTerm)
    print 'got ' + str(len(cur)) + ' genes'
    genes.extend(cur)


# ### Disconnect when done loading data

# In[6]:


conn.close()


# ### Translation

# In[7]:


# The table was built offline using biomart
dfTrans = read_csv('./includes/TableOfGenesWithElongatedCellPhenotype.csv')

# extract IDs 
genesE84 = concat([dfTrans[dfTrans['Screen GeneID']==x]['Human Ortholog Ensembl 84'] for x in genes])
genesE84 = genesE84[genesE84!='(null)']

print genes[:10]
print genesE84.head(10)


# ### REST query of STRINGdb
# 
# We use the STRINGdb REST API to get all the known interactions of all of our genes
# 

# In[8]:


# Building STRINGdb REST api query
url = 'http://string-db.org/api/psi-mi-tab/interactionsList?species=Human%209606&identifiers='
# genesE84 = genesE84[:10]
for g in genesE84:
    url = url + g + '%0d'
Res = requests.get(url)


# In[9]:


df = read_csv(StringIO(Res.text), sep='\t', header=None)
df.head(10)


# ### Some displays

# #### A. NetworkX

# In[11]:


import warnings
warnings.filterwarnings('ignore')
g = nx.from_pandas_edgelist(df, 2, 3)
plt.figure(figsize = (12, 12))
nx.draw_spring(g, with_labels=True)


# #### B. Bokeh

# In[12]:


# Use spring layout from networkx for display
pts = nx.spring_layout(g)

TOOLS = "tap,pan,wheel_zoom,reset"
p = figure(plot_width=800, plot_height=800, title = "My chart", tools=TOOLS)

# color certain genes of interest
# comment out this section if not needed
colors = np.ones(len(g.nodes()))
colors[[x=='ASH2L' for x in pts.keys()]] = 4
colors[[x=='SETD1A' for x in pts.keys()]] = 2
colors[[x=='SETD1B' for x in pts.keys()]] = 2
cm = brewer['Set1'][5]
colors = [cm[int(x)] for x in colors]

names = pts.keys()
cntr = 0
for x in pts.itervalues():
    cir = p.circle(x=x[0], y=x[1], radius=.01, color=colors[cntr], name=names[cntr])
    cntr = cntr+1

# uncomment next line if not
# coloring genes of interest
# cir=p.circle('x','y',source=sourceNode,radius=.01)
sourceEdges = ColumnDataSource(
    data=dict(
        x1=[pts[x[0]][0] for x in g.edges()],
        y1=[pts[x[0]][1] for x in g.edges()],
        x2=[pts[x[1]][0] for x in g.edges()],
        y2=[pts[x[1]][1] for x in g.edges()]
    ))
seg = p.segment('x1','y1','x2','y2',source=sourceEdges)

hover = HoverTool(
        tooltips=[
            ("Name", "@names"),
#            ("Screens names", "@screens"),
        ]
    )
p.add_tools(hover)

show(p)


# ### License
# 
# Copyright (C) 2016 University of Dundee. All Rights Reserved.
# 
# This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.