Notebook

PLS VIP value.¶

In [1]:

!git clone https://github.com/HIPS/neural-fingerprint.git
import pandas as pd
df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv')

!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
import rdkit

fatal: destination path 'neural-fingerprint' already exists and is not an empty directory.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  2415  100  2415    0     0   3659      0 --:--:-- --:--:-- --:--:--  3659

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
rdkit is already installed

CPU times: user 1.94 ms, sys: 0 ns, total: 1.94 ms
Wall time: 2.01 ms

In [0]:

import numpy as np

from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_decomposition import PLSRegression

import matplotlib as mpl
import matplotlib.pyplot as plt

from rdkit.Chem import rdMolDescriptors, Descriptors, MolFromSmiles
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
mpl.rcParams['figure.figsize'] = [12, 8]
mpl.rcParams['font.size'] = 30.

In [0]:

df['mol'] = df.smiles.apply(MolFromSmiles)

In [0]:

count_frags =lambda moleobj: np.array(list(map(lambda m: getattr(Descriptors, m)( moleobj ),
    [method for method in dir(Descriptors) if method.startswith('fr_')])))

In [0]:

methods = [method for method in dir(Descriptors) if method.startswith('fr_')]
n_descs = len(methods)
n_samples = df.shape[0]
X = np.zeros((n_samples,n_descs))
for ix, mol in enumerate(df.mol):
  X[ix,:] = count_frags(mol)

In [0]:

TARGET = df.columns[8]
M = pd.DataFrame(X, columns = methods)
A = pd.concat((df,M),axis=1)

In [7]:

train, test = train_test_split(A, test_size=0.33, random_state = 42)
params = {'n_components': np.arange(1, 16)}
model = GridSearchCV(PLSRegression(), params, cv=3)
model.fit(train[methods], train[TARGET])

Out[7]:

GridSearchCV(cv=3, error_score=nan,
             estimator=PLSRegression(copy=True, max_iter=500, n_components=2,
                                     scale=True, tol=1e-06),
             iid='deprecated', n_jobs=None,
             param_grid={'n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [8]:

with mpl.style.context('seaborn'):
  fig = plt.figure()
  ax = fig.gca()
  plt.plot(train[TARGET], model.predict(train[methods]), 'o')
  plt.plot(test[TARGET], model.predict(test[methods]), 'o')
  xlim = ax.get_xlim()
  ax.set_ylim(xlim)
  plt.plot(xlim,xlim, '-k')
  plt.axis('square')
  plt.xlabel('Measured log solubility')
  plt.ylabel('Estimated log solubility')
  plt.legend({'train', 'test'})

In [9]:

print(np.sqrt(metrics.mean_squared_error(train[TARGET], model.predict(train[methods]))),\
  np.sqrt(metrics.mean_squared_error(test[TARGET], model.predict(test[methods]))))
print(metrics.r2_score(train[TARGET], model.predict(train[methods])),\
  metrics.r2_score(test[TARGET], model.predict(test[methods])))

1.1153743787284136 1.1615487203232076
0.710121766066296 0.7032509664048506

$VIP_j = \sqrt{ \frac{d \sum_{k=1}^hSS_k(w_{k,j})^2}{\sum_{k=1}^hSS_k} } \\ \textrm{such that} \\ SS_k = q_k^2 {\bf t}_k^T{\bf t}_k$

where $d$ denotes the number of the variables in $X$ , ${\bf t}_k$ denotes the x_scores of the k-th component, $q_k$ denotes the y_loading of the k-th component.

Let $W$ $d$ -by- $A$ matrix, $SS$ 1-by- $A$ matrix. Then,

$\sqrt{ \frac{d \times W\times SS'}{\textrm{sum}(SS)} }$

would be the VIP of the whole variables ( $j=1, \ldots, d$ ).

In [0]:

def vip(pls_model):
  n_dims = pls_model.x_weights_.shape[0]
  n_components = pls_model.n_components
  t_a = pls_model.x_scores_
  w_a = pls_model.x_weights_# d x a
  q_a = pls_model.y_loadings_
  denom = list(map(lambda ix:(q_a[:,ix] **2.)*t_a[:,ix].T@t_a[:,ix], range(n_components)))
  denom = np.matrix(denom)
  numer = ((w_a **2.) * denom.T).T
  return (n_dims*numer)/denom.sum()

In [11]:

vipval =vip( model.best_estimator_ )
ixshow = np.where(vipval>1)[1]
print(len(ixshow))
plt.figure()
plt.plot(vipval.T, '-o')
plt.plot(ixshow, vipval[:,ixshow].T, 'o')
plt.xlabel('variable index')
plt.ylabel('VIP value')
plt.show()