#!/usr/bin/env python
# coding: utf-8

# In[49]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[48]:


get_ipython().run_line_magic('pylab', 'inline')


# In[3]:


import dammit


# In[186]:


from dammit.model import CRBL


# In[187]:


reload(dammit)


# In[4]:


import seaborn as sns


# In[7]:


import pandas as pd


# In[100]:


from sklearn import svm
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import numpy as np


# In[6]:


sns.set_style('ticks')
blue_ct_cmap = sns.cubehelix_palette(10, start=.5, rot=-.3, light=1, dark=.4, as_cmap=True)
red_ct_cmap = sns.cubehelix_palette(10, start=1.5, rot=-.5, light=1, dark=.4, as_cmap=True)

sns.palplot(blue_ct_cmap.colors[np.linspace(0, 255, 10, dtype=int)])
sns.palplot(red_ct_cmap.colors[np.linspace(0, 255, 10, dtype=int)])


# In[88]:


def get_grid(mins, maxs, clf):
    xx, yy = np.meshgrid(np.linspace(mins, maxs, 1000),
                         np.linspace(mins, maxs, 1000))
    '''
    if hasattr(clf, 'decision_function'):
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    '''
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return xx, yy, Z


# In[22]:


def scale_evalue(col):
    col.ix[col == 0.0] = 1e-256
    return -np.log10(col)


# In[238]:


test_aln_A = pd.DataFrame([[0.0, 0.0, 200, 2000, 'tr_Aa', 200, '+', 70, 70, 'pr_Ba', 0, '+', 200, 200.0],
                           [1e-100, 1e-100, 200, 2000, 'tr_Aa', 200, '+', 70, 70, 'pr_Bb', 0, '+', 200, 200.0],
                           [0.0, 0.0, 200, 2000, 'tr_Ab', 200, '+', 70, 70, 'pr_Ba', 0, '+', 200, 200.0],
                           [0.0, 0.0, 200, 2000, 'tr_Ac', 200, '+', 70, 70, 'pr_Bc', 0, '+', 200, 200.0]],
                          columns=[u'E', u'EG2', u'q_aln_len', u'q_len', u'q_name', u'q_start',
                                   u'q_strand', u's_aln_len', u's_len', u's_name', u's_start', u's_strand',
                                   u'score', u'bitscore'])


# In[239]:


test_aln_B = pd.DataFrame([[0.0, 0.0, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Aa', 0, '+', 200, 200.0],
                           [0.0, 0.0, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Ac', 0, '+', 200, 200.0],
                           [1e-100, 1e-100, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Ab', 0, '+', 200, 200.0],
                           [0.0, 0.0, 200, 2000, 'pr_Bb', 200, '+', 70, 70, 'tr_Aa', 0, '+', 200, 200.0]],
                          columns=[u'E', u'EG2', u'q_aln_len', u'q_len', u'q_name', u'q_start',
                                   u'q_strand', u's_aln_len', u's_len', u's_name', u's_start', u's_strand',
                                   u'score', u'bitscore'])5


# In[240]:


CRBL.reciprocal_best_hits(test_aln_A, test_aln_B)


# In[228]:


bh_df.columns


# In[191]:


rbh_df = pd.read_csv('pom.500.fa.dammit/pom.500.fa.transdecoder.pep.rbhx.pep.fa.csv')
rbh_df['E'] = scale_evalue(rbh_df['E'].copy())


# In[192]:


bh_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pom.500.fa.x.pep.fa.maf')])
bh_df['E'] = scale_evalue(bh_df['E'].copy())


# In[193]:


bh_df['RBH'] = False
bh_df.ix[rbh_df.index, 'RBH'] = True


# In[200]:


bh_df.sort_values(by=['q_name', 'E'], ascending=False).drop_duplicates(subset='q_name').query('RBH == True')


# In[208]:


pep_x_db_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pom.500.fa.transdecoder.pep.x.pep.fa.maf')])


# In[209]:


db_x_pep_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pep.fa.x.pom.500.fa.transdecoder.pep.maf')])


# In[210]:


rbh_pep_df = CRBL.reciprocal_best_hits(pep_x_db_df, db_x_pep_df)


# In[223]:


rbh_pep_df


# In[227]:


for key, row in rbh_df.iterrows():
    print row.q_name, '\t', row.s_name, '\n'


# In[206]:


db_x_pep_df.s_name[0]


# In[183]:


len(bh_df.query('RBH == True'))


# In[116]:


scaler = preprocessing.StandardScaler()
scaler.fit(bh_df.query('RBH == True')[['E', 'q_len']])


# In[117]:


rbh_df[['E', 'q_len']] = scaler.transform(rbh_df[['E', 'q_len']])


# In[118]:


bh_df[['E', 'q_len']] = scaler.transform(bh_df[['E', 'q_len']])


# In[190]:


len(bh_df)


# In[188]:


CRBL.best_hits(bh_df)


# In[189]:


len(bh_df.query('RBH == True'))


# In[120]:


sns.distplot(rbh_df['E'], label='E')
sns.distplot(rbh_df['q_len'], label='Length')


# In[148]:


figsize(12,8)
scatter(bh_df.q_len, bh_df.E, c=bh_df.RBH, cmap=plt.cm.Paired, alpha=0.8)
xlabel('Query Length (scaled)')
ylabel('$\log10 Evalue$ (scaled)')
title('LAST hits, RBH and regular', fontsize=14)
savefig('sigh.svg')


# In[168]:


g = sns.lmplot('q_len', 'E', bh_df,  size=10, 
                   hue='RBH', order=2, palette=sns.color_palette('Set1'))
g = (g.set_axis_labels("Query Length (scaled)", "$\log10 Evalue$ (scaled)"))
title(r'$S. pombe$ LAST hits, RBH and regular', fontsize=14)
axis(ymax=2, ymin=-2)
savefig('sigh2.svg')


# In[139]:


# In[133]:


np.linspace(bh_df.q_len.min(), bh_df.q_len.max(), 500)


# In[136]:


pd.rolling_mean(bh_df.set_index('q_len')['E'], 10)


# In[95]:


svc_clf = svm.SVC()


# In[101]:


gnb = GaussianNB()


# In[97]:


svc_clf.fit(bh_df[['q_len', 'E']], bh_df.RBH.astype(int))


# In[106]:


bh_df


# In[105]:


pred = gnb.fit(bh_df[['q_len', 'E']], bh_df.RBH.astype(int)).predict(bh_df[['q_len', 'E']])


# In[98]:


xx, yy, zz = get_grid(-5, 15, svc_clf)


# In[99]:


figsize(12,8)
plt.contourf(xx, yy, zz, cmap=plt.cm.Paired, alpha=0.8)
# Plot also the training points
plt.scatter(bh_df['q_len'], bh_df['E'], c=bh_df.RBH.astype(int), cmap=plt.cm.Paired)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())


# In[ ]: