#!/usr/bin/env python # coding: utf-8 # In[49]: get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # In[48]: get_ipython().run_line_magic('pylab', 'inline') # In[3]: import dammit # In[186]: from dammit.model import CRBL # In[187]: reload(dammit) # In[4]: import seaborn as sns # In[7]: import pandas as pd # In[100]: from sklearn import svm from sklearn import preprocessing from sklearn.naive_bayes import GaussianNB import numpy as np # In[6]: sns.set_style('ticks') blue_ct_cmap = sns.cubehelix_palette(10, start=.5, rot=-.3, light=1, dark=.4, as_cmap=True) red_ct_cmap = sns.cubehelix_palette(10, start=1.5, rot=-.5, light=1, dark=.4, as_cmap=True) sns.palplot(blue_ct_cmap.colors[np.linspace(0, 255, 10, dtype=int)]) sns.palplot(red_ct_cmap.colors[np.linspace(0, 255, 10, dtype=int)]) # In[88]: def get_grid(mins, maxs, clf): xx, yy = np.meshgrid(np.linspace(mins, maxs, 1000), np.linspace(mins, maxs, 1000)) ''' if hasattr(clf, 'decision_function'): Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) else: Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) ''' Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) return xx, yy, Z # In[22]: def scale_evalue(col): col.ix[col == 0.0] = 1e-256 return -np.log10(col) # In[238]: test_aln_A = pd.DataFrame([[0.0, 0.0, 200, 2000, 'tr_Aa', 200, '+', 70, 70, 'pr_Ba', 0, '+', 200, 200.0], [1e-100, 1e-100, 200, 2000, 'tr_Aa', 200, '+', 70, 70, 'pr_Bb', 0, '+', 200, 200.0], [0.0, 0.0, 200, 2000, 'tr_Ab', 200, '+', 70, 70, 'pr_Ba', 0, '+', 200, 200.0], [0.0, 0.0, 200, 2000, 'tr_Ac', 200, '+', 70, 70, 'pr_Bc', 0, '+', 200, 200.0]], columns=[u'E', u'EG2', u'q_aln_len', u'q_len', u'q_name', u'q_start', u'q_strand', u's_aln_len', u's_len', u's_name', u's_start', u's_strand', u'score', u'bitscore']) # In[239]: test_aln_B = pd.DataFrame([[0.0, 0.0, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Aa', 0, '+', 200, 200.0], [0.0, 0.0, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Ac', 0, '+', 200, 200.0], [1e-100, 1e-100, 200, 2000, 'pr_Ba', 200, '+', 70, 70, 'tr_Ab', 0, '+', 200, 200.0], [0.0, 0.0, 200, 2000, 'pr_Bb', 200, '+', 70, 70, 'tr_Aa', 0, '+', 200, 200.0]], columns=[u'E', u'EG2', u'q_aln_len', u'q_len', u'q_name', u'q_start', u'q_strand', u's_aln_len', u's_len', u's_name', u's_start', u's_strand', u'score', u'bitscore'])5 # In[240]: CRBL.reciprocal_best_hits(test_aln_A, test_aln_B) # In[228]: bh_df.columns # In[191]: rbh_df = pd.read_csv('pom.500.fa.dammit/pom.500.fa.transdecoder.pep.rbhx.pep.fa.csv') rbh_df['E'] = scale_evalue(rbh_df['E'].copy()) # In[192]: bh_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pom.500.fa.x.pep.fa.maf')]) bh_df['E'] = scale_evalue(bh_df['E'].copy()) # In[193]: bh_df['RBH'] = False bh_df.ix[rbh_df.index, 'RBH'] = True # In[200]: bh_df.sort_values(by=['q_name', 'E'], ascending=False).drop_duplicates(subset='q_name').query('RBH == True') # In[208]: pep_x_db_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pom.500.fa.transdecoder.pep.x.pep.fa.maf')]) # In[209]: db_x_pep_df = pd.concat([group for group in dammit.parsers.maf_to_df_iter('pom.500.fa.dammit/pep.fa.x.pom.500.fa.transdecoder.pep.maf')]) # In[210]: rbh_pep_df = CRBL.reciprocal_best_hits(pep_x_db_df, db_x_pep_df) # In[223]: rbh_pep_df # In[227]: for key, row in rbh_df.iterrows(): print row.q_name, '\t', row.s_name, '\n' # In[206]: db_x_pep_df.s_name[0] # In[183]: len(bh_df.query('RBH == True')) # In[116]: scaler = preprocessing.StandardScaler() scaler.fit(bh_df.query('RBH == True')[['E', 'q_len']]) # In[117]: rbh_df[['E', 'q_len']] = scaler.transform(rbh_df[['E', 'q_len']]) # In[118]: bh_df[['E', 'q_len']] = scaler.transform(bh_df[['E', 'q_len']]) # In[190]: len(bh_df) # In[188]: CRBL.best_hits(bh_df) # In[189]: len(bh_df.query('RBH == True')) # In[120]: sns.distplot(rbh_df['E'], label='E') sns.distplot(rbh_df['q_len'], label='Length') # In[148]: figsize(12,8) scatter(bh_df.q_len, bh_df.E, c=bh_df.RBH, cmap=plt.cm.Paired, alpha=0.8) xlabel('Query Length (scaled)') ylabel('$\log10 Evalue$ (scaled)') title('LAST hits, RBH and regular', fontsize=14) savefig('sigh.svg') # In[168]: g = sns.lmplot('q_len', 'E', bh_df, size=10, hue='RBH', order=2, palette=sns.color_palette('Set1')) g = (g.set_axis_labels("Query Length (scaled)", "$\log10 Evalue$ (scaled)")) title(r'$S. pombe$ LAST hits, RBH and regular', fontsize=14) axis(ymax=2, ymin=-2) savefig('sigh2.svg') # In[139]: # In[133]: np.linspace(bh_df.q_len.min(), bh_df.q_len.max(), 500) # In[136]: pd.rolling_mean(bh_df.set_index('q_len')['E'], 10) # In[95]: svc_clf = svm.SVC() # In[101]: gnb = GaussianNB() # In[97]: svc_clf.fit(bh_df[['q_len', 'E']], bh_df.RBH.astype(int)) # In[106]: bh_df # In[105]: pred = gnb.fit(bh_df[['q_len', 'E']], bh_df.RBH.astype(int)).predict(bh_df[['q_len', 'E']]) # In[98]: xx, yy, zz = get_grid(-5, 15, svc_clf) # In[99]: figsize(12,8) plt.contourf(xx, yy, zz, cmap=plt.cm.Paired, alpha=0.8) # Plot also the training points plt.scatter(bh_df['q_len'], bh_df['E'], c=bh_df.RBH.astype(int), cmap=plt.cm.Paired) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), yy.max()) # In[ ]: