Notebook

Kernel Functions Comparison¶

Tanimoto vs. Dice coefficients as kernel function.

In [1]:

!wget https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6
!mv delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6 delaney-processed.csv
!ls delaney-processed.csv
from pandas import read_csv
df_logS = read_csv('delaney-processed.csv')
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()

--2020-07-03 05:46:58--  https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 96699 (94K) [text/plain]
Saving to: ‘delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6’

delaney-processed.c 100%[===================>]  94.43K  --.-KB/s    in 0.03s   

2020-07-03 05:46:58 (3.67 MB/s) - ‘delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6’ saved [96699/96699]

delaney-processed.csv
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  2415  100  2415    0     0   4236      0 --:--:-- --:--:-- --:--:--  4236

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.03.3 installation finished!

CPU times: user 319 ms, sys: 190 ms, total: 510 ms
Wall time: 56.9 s

In [2]:

from rdkit.Chem import MolFromSmiles
df_logS['mol'] = df_logS.smiles.apply(MolFromSmiles)
mols = df_logS.mol.tolist()

In [3]:

import numpy as np 
from sklearn.metrics import pairwise as pw

In [4]:

# jaccard, dice, simpson 
def jaccard(u,v,gamma=1.0):
  return gamma*np.sum(np.logical_and(np.array(u), np.array(v)))/np.sum(np.logical_or(np.array(u), np.array(v)))
jaccard([1, 0, 1], [1, 0, 1], 1.0), jaccard([1, 0, 1], [1, 0, 0], 1.0)

Out[4]:

(1.0, 0.5)

In [5]:

sum(map(sum, [[1, 0, 1], [1, 0, 0]]))

Out[5]:

【技術解説】集合の類似度(Jaccard係数，Dice係数，Simpson係数)

In [6]:

def dice(u,v,gamma=1.0):
  return gamma*2.*np.sum(np.logical_and(np.array(u), np.array(v)))/ sum(map(sum, [u, v]))
print('dice', dice([1, 0, 1], [1, 0, 1], 1.0), dice([1, 0, 1], [1, 0, 0], 1.0)  )
def simpson(u,v,gamma=1.0):
  return gamma* np.sum(np.logical_and(np.array(u), np.array(v)))/ min(map(sum, [u, v]))

print('simpson', simpson([1, 0, 1], [1, 0, 1], 1.0), simpson([1, 0, 1], [1, 0, 0], 1.0)  )

dice 1.0 0.6666666666666666
simpson 1.0 1.0

In [9]:

from sklearn.gaussian_process.kernels import PairwiseKernel, WhiteKernel, ConstantKernel
from sklearn.gaussian_process import GaussianProcessRegressor

from sklearn.preprocessing import StandardScaler
from rdkit.Chem.AllChem import GetHashedMorganFingerprint, GetMorganFingerprintAsBitVect
from sklearn.model_selection import train_test_split, GridSearchCV

モデル選択 == パラメータ探索¶

sklearn.pipeline.Pipelineの利用
sklearn.model_selection.GridSearchCVの利用: 交差検定を利用したパラメータ探索
モデル学習

SMILESからMorganフィンガープリントを取り出す¶

In [24]:

from sklearn.base import BaseEstimator, TransformerMixin
class Mol2Fingerprint(BaseEstimator, TransformerMixin):
  def __init__(self, radius=2, n_dims=128):
    self.radius = int(radius)
    self.n_dims = int(n_dims)
    # self._func = 
  def fit(self, x, y=None):
    return self
  def transform(self, data):
    return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data)))

preprocessor = Mol2Fingerprint()
X = preprocessor.fit_transform(df_logS.mol)
print(X.shape, df_logS.mol.shape)

(1128, 128) (1128,)

Pipelineの利用¶

Pipelineとは？¶

sklearnで複数の前処理+予測（回帰/分類）モデルを結合して一つのモデルとみなすこと

In [18]:

type([1, 10, 3]), type(['a', 'cc', 'zzz']), type(('mol2fp', Mol2Fingerprint()))

Out[18]:

(list, list, tuple)

In [25]:

from sklearn.pipeline import make_pipeline, Pipeline 
pipeline = Pipeline([('mol2fp', Mol2Fingerprint()), 
                     ('gpr', GaussianProcessRegressor())])
print(pipeline)

Pipeline(memory=None,
         steps=[('mol2fp', Mol2Fingerprint(n_dims=128, radius=2)),
                ('gpr',
                 GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                                          kernel=None, n_restarts_optimizer=0,
                                          normalize_y=False,
                                          optimizer='fmin_l_bfgs_b',
                                          random_state=None))],
         verbose=False)

In [26]:

TARGET = ['measured log solubility in mols per litre']
y = df_logS[TARGET]
print(y.shape)

(1128, 1)

In [27]:

pipeline.fit(df_logS, y)

---------------------------------------------------------------------------
ArgumentError                             Traceback (most recent call last)
<ipython-input-27-75d5f787fed2> in <module>()
----> 1 pipeline.fit(df_logS, y)

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    572         else:
    573             # fit method of arity 2 (supervised transformation)
--> 574             return self.fit(X, y, **fit_params).transform(X)
    575 
    576 

<ipython-input-24-e99a42e81794> in transform(self, data)
      8     return self
      9   def transform(self, data):
---> 10     return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data)))
     11 
     12 preprocessor = Mol2Fingerprint()

<ipython-input-24-e99a42e81794> in <lambda>(m)
      8     return self
      9   def transform(self, data):
---> 10     return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data)))
     11 
     12 preprocessor = Mol2Fingerprint()

ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(str)
did not match C++ signature:
    GetMorganFingerprintAsBitVect(RDKit::ROMol mol, int radius, unsigned int nBits=2048, boost::python::api::object invariants=[], boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False)

In [ ]:

kernels = [
           ('tanimoto', PairwiseKernel(metric=jaccard)+WhiteKernel()),
           ('dice', PairwiseKernel(metric=dice)+WhiteKernel()),
          #  ('simpson', PairwiseKernel(metric=simpson)+WhiteKernel()), 
]

In [ ]:

params = {
    'm2fp__n_dims': 2 ** np.arange(3,8),
    'm2fp__radius': np.arange(1,4), # 'm2fp__asBit': np.ones(1, dtype=np.bool),
    'gpr__kernel': [PairwiseKernel(metric=jaccard)+WhiteKernel(),
                    PairwiseKernel(metric=dice)+WhiteKernel()]
}
pipeline = Pipeline([('m2fp', Mol2Fingerprint(as_bit=True)),
                     ('gpr', GaussianProcessRegressor())])
# pipeline.set_params(m2fp__n_dims=128, m2fp__radius=1, gpr__kernel=PairwiseKernel(metric=dice)+WhiteKernel(),
                    # m2fp__asBit=np.ones(1, dtype=np.bool))
model = GridSearchCV(pipeline, params, cv=5)
# model.fit(df_logS.mol,y)

model.fit(df_logS.mol, y)
# model.best_estimator_

/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Boost.Python.ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol)
did not match C++ signature:
    GetMorganFingerprintAsBitVect(RDKit::ROMol mol, int radius, unsigned int nBits=2048, boost::python::api::object invariants=[], boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False)

  FitFailedWarning)

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-66-01122cd32832> in <module>()
     12 # model.fit(df_logS.mol,y)
     13 
---> 14 model.fit(df_logS.mol, y)
     15 # model.best_estimator_

/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    734             # of the params are estimators as well.
    735             self.best_estimator_ = clone(clone(base_estimator).set_params(
--> 736                 **self.best_params_))
    737             refit_start_time = time.time()
    738             if y is not None:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     69     new_object_params = estimator.get_params(deep=False)
     70     for name, param in new_object_params.items():
---> 71         new_object_params[name] = clone(param, safe=False)
     72     new_object = klass(**new_object_params)
     73     params_set = new_object.get_params(deep=False)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0)
     57     # XXX: not handling dictionaries
     58     if estimator_type in (list, tuple, set, frozenset):
---> 59         return estimator_type([clone(e, safe=safe) for e in estimator])
     60     elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
     61         if not safe:

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe)
     80             raise RuntimeError('Cannot clone object %s, as the constructor '
     81                                'either does not set or modifies parameter %s' %
---> 82                                (estimator, name))
     83     return new_object
     84 

RuntimeError: Cannot clone object Mol2Fingerprint(as_bit=True, n_dims=8, radius=1), as the constructor either does not set or modifies parameter n_dims

In [ ]:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-8922de862c59> in <module>()
----> 1 model.best_estimator_

AttributeError: 'Pipeline' object has no attribute 'best_estimator_'

In [ ]:

%%time 
list_scores = []
for p in np.linspace(6, 8, 3):
  for radius in range(1,4):
    pow = 2 ** p
    
    m2f = Mol2Fingerprint(radius, pow, True)
    print('radius', radius, 'dimensions', pow)
    X=m2f.fit_transform(df_logS.mol)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.3, random_state=66)
    for name, kernel in kernels:
      print(name, end=' ')
      gpr = GaussianProcessRegressor(kernel=kernel)
      gpr.fit(Xtrain, ytrain)
      score_logli = gpr.log_marginal_likelihood()
      print('%.3f' % score_logli)
      list_scores.append([name, int(radius), int(pow), score_logli])

radius 1 dimensions 64.0
tanimoto -638.638
dice -670.836
radius 2 dimensions 64.0
tanimoto -661.545
dice -679.492
radius 3 dimensions 64.0
tanimoto -654.934
dice -673.287
radius 1 dimensions 128.0
tanimoto -626.873
dice -652.698
radius 2 dimensions 128.0
tanimoto -649.035
dice -669.259
radius 3 dimensions 128.0
tanimoto -643.639
dice -660.163
radius 1 dimensions 256.0
tanimoto -624.681
dice -632.691
radius 2 dimensions 256.0
tanimoto -642.977
dice -651.185
radius 3 dimensions 256.0
tanimoto -647.595
dice -653.056
CPU times: user 22min 55s, sys: 23.6 s, total: 23min 18s
Wall time: 22min 50s

In [ ]:

def pick(w, name='tanimoto'):
  if w[0] == name:
    return w[1:]
  else:
    return 0,0,0

In [ ]:

import matplotlib.pyplot as plt
import matplotlib as mpl

In [ ]:

t = np.matrix(list(map(lambda e:pick(e), list_scores)))
t = t[np.where(t.sum(axis=1)<0)[0],:]
smap=np.zeros((3, 3)) # score map
for row_e in t:
  row = row_e
  smap[int(row[0,0]-1), int(row[0,1]//128)] = row[0,2]
plt.imshow(smap)
plt.colorbar()
plt.xticks(range(0,3), [1,2,3])
plt.yticks(range(0,3), [64,128,256])
plt.xlabel('radius')
plt.ylabel('dimension')
plt.show()

In [ ]:

t = np.matrix(list(map(lambda e:pick(e, 'dice'), list_scores)))
t = t[np.where(t.sum(axis=1)<0)[0],:]
smap=np.zeros((3, 3)) # score map
for row_e in t:
  row = row_e
  smap[int(row[0,0]-1), int(row[0,1]//128)] = row[0,2]
plt.imshow(smap)
plt.colorbar()
plt.xticks(range(0,3), [1,2,3])
plt.yticks(range(0,3), [64,128,256])
plt.xlabel('radius')
plt.ylabel('dimension')
plt.show()

[Feature Work] Score Indicator for Gaussian Process Regression¶

For each sample of $n$ , GPR outputs the mean $m_n$ and the standard deviation $s_n$ of the predicted value. Then, the prediction errors through GPR should be evaluated for model $m$ with:

$er_m = \sum_{n} \frac{| y_{{\rm obs}, n} - m_n |}{s_n}.$

The $er_m$ evaluates the prediction error ratio taking into account the standard deviation.