PLS $T^2$ score and SPE value.¶

In [0]:

!git clone https://github.com/HIPS/neural-fingerprint.git
import pandas as pd
df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv')

!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
import rdkit

fatal: destination path 'neural-fingerprint' already exists and is not an empty directory.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  2415  100  2415    0     0   4573      0 --:--:-- --:--:-- --:--:--  485k

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
rdkit is already installed

CPU times: user 2.38 ms, sys: 99 µs, total: 2.48 ms
Wall time: 2.3 ms

In [0]:

import numpy as np

from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_decomposition import PLSRegression

import matplotlib as mpl
import matplotlib.pyplot as plt

from rdkit.Chem import PandasTools, rdMolDescriptors, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

In [0]:

PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')

In [0]:

count_frags =lambda moleobj: np.array(list(map(lambda m: getattr(Descriptors, m)( moleobj ),
    [method for method in dir(Descriptors) if method.startswith('fr_')])))

In [0]:

methods = [method for method in dir(Descriptors) if method.startswith('fr_')]
n_descs = len(methods)
n_samples = df.shape[0]
X = np.zeros((n_samples,n_descs))
for ix, mol in enumerate(df.ROMol):
  X[ix,:] = count_frags(mol)

In [0]:

TARGET = df.columns[8]
M = pd.DataFrame(X, columns = methods)
A = pd.concat((df,M),axis=1)

In [0]:

train, test = train_test_split(A, test_size=0.33, random_state = 42)
params = {'n_components': np.arange(1, 16)}
model = GridSearchCV(PLSRegression(), params, cv=3)
model.fit(train[methods], train[TARGET])

Out[0]:

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=PLSRegression(copy=True, max_iter=500, n_components=2,
                                     scale=True, tol=1e-06),
             iid='warn', n_jobs=None,
             param_grid={'n_components': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [0]:

with mpl.style.context('seaborn'):
  fig = plt.figure()
  ax = fig.gca()
  plt.plot(train[TARGET], model.predict(train[methods]), 'o')
  plt.plot(test[TARGET], model.predict(test[methods]), 'o')
  xlim = ax.get_xlim()
  ax.set_ylim(xlim)
  plt.plot(xlim,xlim, '-k')
  plt.axis('square')
  plt.xlabel('Measured log solubility')
  plt.ylabel('Estimated log solubility')
  plt.legend({'train', 'test'})

In [0]:

print(np.sqrt(metrics.mean_squared_error(train[TARGET], model.predict(train[methods]))),\
  np.sqrt(metrics.mean_squared_error(test[TARGET], model.predict(test[methods]))))
print(metrics.r2_score(train[TARGET], model.predict(train[methods])),\
  metrics.r2_score(test[TARGET], model.predict(test[methods])))

1.1153743787284136 1.1615487203232076
0.710121766066296 0.7032509664048506

In [0]:

def t2_score(data, model):
  assert type(data)==pd.DataFrame or type(data)==np.ndarray, "input must be pandas.DataFrame or np.array"
  explained_std_ = np.sqrt(model.best_estimator_.x_scores_.var(axis=0))
  scores_whiten = model.transform(data) / explained_std_
  return (scores_whiten ** 2.).sum(axis=1)
def q_value(data, model):
  assert type(data)==pd.DataFrame or type(data)==np.ndarray, "input must be pandas.DataFrame or np.array"
  x_reproduced_ = model.transform(data) \
              @ model.best_estimator_.x_loadings_.T \
              * model.best_estimator_.x_std_ + model.best_estimator_.x_mean_
  return ((data - x_reproduced_)**2.).sum(axis=1)

In [0]:

x_reproduced_ = model.transform(train[methods]) \
              @ model.best_estimator_.x_loadings_.T \
              * model.best_estimator_.x_std_ + model.best_estimator_.x_mean_
((train[methods]-x_reproduced_)**2.).sum(axis=1)
df_t2q=pd.DataFrame([t2_score(train[methods], model),q_value(train[methods], model)]).T

df_t2q.rename(columns={ix:name for ix,name in enumerate(['t2','q'])},
              index={ix: idx for ix,idx in enumerate(train.index)},
             inplace=True)
df_t2q.head(2)

Out[0]:

	t2	q
84	1.195129	4.235145
432	35.129500	79.107809

In [0]:

# help(np.quantile)
for x in [df_t2q['t2'], df_t2q['q']]:
  print('%.4f' % np.quantile(x, 0.997),end=' ')

103.2511 80.0971

In [0]:

with mpl.style.context('seaborn'):
  fig = plt.figure()
  for data in [train, test]:
    plt.plot(t2_score(data[methods], model),
             q_value(data[methods], model),
             'o')
  plt.vlines(np.quantile(df_t2q['t2'], 0.997), 0, 250)
  plt.hlines(np.quantile(df_t2q['q'], 0.997), 0, 175)
  plt.legend(['train','test'])

In [0]:

from copy import deepcopy
pls = deepcopy(model.best_estimator_)

PLS T2T^2 score and SPE value.¶

PLS $T^2$ score and SPE value.¶