Tanimoto vs. Dice coefficients as kernel function.
!wget https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6
!mv delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6 delaney-processed.csv
!ls delaney-processed.csv
from pandas import read_csv
df_logS = read_csv('delaney-processed.csv')
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
--2020-07-03 05:46:58-- https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6 Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 96699 (94K) [text/plain] Saving to: ‘delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6’ delaney-processed.c 100%[===================>] 94.43K --.-KB/s in 0.03s 2020-07-03 05:46:58 (3.67 MB/s) - ‘delaney-processed.csv?token=AAQJRTXGFXP44OXYO5VHHKK7A3VI6’ saved [96699/96699] delaney-processed.csv % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 100 2415 100 2415 0 0 4236 0 --:--:-- --:--:-- --:--:-- 4236
add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH python version: 3.6.9 fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh done installing miniconda to /root/miniconda done installing rdkit done rdkit-2020.03.3 installation finished!
CPU times: user 319 ms, sys: 190 ms, total: 510 ms Wall time: 56.9 s
from rdkit.Chem import MolFromSmiles
df_logS['mol'] = df_logS.smiles.apply(MolFromSmiles)
mols = df_logS.mol.tolist()
import numpy as np
from sklearn.metrics import pairwise as pw
# jaccard, dice, simpson
def jaccard(u,v,gamma=1.0):
return gamma*np.sum(np.logical_and(np.array(u), np.array(v)))/np.sum(np.logical_or(np.array(u), np.array(v)))
jaccard([1, 0, 1], [1, 0, 1], 1.0), jaccard([1, 0, 1], [1, 0, 0], 1.0)
(1.0, 0.5)
sum(map(sum, [[1, 0, 1], [1, 0, 0]]))
3
def dice(u,v,gamma=1.0):
return gamma*2.*np.sum(np.logical_and(np.array(u), np.array(v)))/ sum(map(sum, [u, v]))
print('dice', dice([1, 0, 1], [1, 0, 1], 1.0), dice([1, 0, 1], [1, 0, 0], 1.0) )
def simpson(u,v,gamma=1.0):
return gamma* np.sum(np.logical_and(np.array(u), np.array(v)))/ min(map(sum, [u, v]))
print('simpson', simpson([1, 0, 1], [1, 0, 1], 1.0), simpson([1, 0, 1], [1, 0, 0], 1.0) )
dice 1.0 0.6666666666666666 simpson 1.0 1.0
from sklearn.gaussian_process.kernels import PairwiseKernel, WhiteKernel, ConstantKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import StandardScaler
from rdkit.Chem.AllChem import GetHashedMorganFingerprint, GetMorganFingerprintAsBitVect
from sklearn.model_selection import train_test_split, GridSearchCV
sklearn.pipeline.Pipeline
の利用sklearn.model_selection.GridSearchCV
の利用: 交差検定を利用したパラメータ探索from sklearn.base import BaseEstimator, TransformerMixin
class Mol2Fingerprint(BaseEstimator, TransformerMixin):
def __init__(self, radius=2, n_dims=128):
self.radius = int(radius)
self.n_dims = int(n_dims)
# self._func =
def fit(self, x, y=None):
return self
def transform(self, data):
return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data)))
preprocessor = Mol2Fingerprint()
X = preprocessor.fit_transform(df_logS.mol)
print(X.shape, df_logS.mol.shape)
(1128, 128) (1128,)
type([1, 10, 3]), type(['a', 'cc', 'zzz']), type(('mol2fp', Mol2Fingerprint()))
(list, list, tuple)
from sklearn.pipeline import make_pipeline, Pipeline
pipeline = Pipeline([('mol2fp', Mol2Fingerprint()),
('gpr', GaussianProcessRegressor())])
print(pipeline)
Pipeline(memory=None, steps=[('mol2fp', Mol2Fingerprint(n_dims=128, radius=2)), ('gpr', GaussianProcessRegressor(alpha=1e-10, copy_X_train=True, kernel=None, n_restarts_optimizer=0, normalize_y=False, optimizer='fmin_l_bfgs_b', random_state=None))], verbose=False)
TARGET = ['measured log solubility in mols per litre']
y = df_logS[TARGET]
print(y.shape)
(1128, 1)
pipeline.fit(df_logS, y)
--------------------------------------------------------------------------- ArgumentError Traceback (most recent call last) <ipython-input-27-75d5f787fed2> in <module>() ----> 1 pipeline.fit(df_logS, y) /usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 348 This estimator 349 """ --> 350 Xt, fit_params = self._fit(X, y, **fit_params) 351 with _print_elapsed_time('Pipeline', 352 self._log_message(len(self.steps) - 1)): /usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params) 313 message_clsname='Pipeline', 314 message=self._log_message(step_idx), --> 315 **fit_params_steps[name]) 316 # Replace the transformer of the step with the fitted 317 # transformer. This is necessary when loading the transformer /usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs) 350 351 def __call__(self, *args, **kwargs): --> 352 return self.func(*args, **kwargs) 353 354 def call_and_shelve(self, *args, **kwargs): /usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params) 726 with _print_elapsed_time(message_clsname, message): 727 if hasattr(transformer, 'fit_transform'): --> 728 res = transformer.fit_transform(X, y, **fit_params) 729 else: 730 res = transformer.fit(X, y, **fit_params).transform(X) /usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 572 else: 573 # fit method of arity 2 (supervised transformation) --> 574 return self.fit(X, y, **fit_params).transform(X) 575 576 <ipython-input-24-e99a42e81794> in transform(self, data) 8 return self 9 def transform(self, data): ---> 10 return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data))) 11 12 preprocessor = Mol2Fingerprint() <ipython-input-24-e99a42e81794> in <lambda>(m) 8 return self 9 def transform(self, data): ---> 10 return np.matrix(list(map(lambda m:GetMorganFingerprintAsBitVect(m, radius=self.radius, nBits=self.n_dims), data))) 11 12 preprocessor = Mol2Fingerprint() ArgumentError: Python argument types in rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(str) did not match C++ signature: GetMorganFingerprintAsBitVect(RDKit::ROMol mol, int radius, unsigned int nBits=2048, boost::python::api::object invariants=[], boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False)
kernels = [
('tanimoto', PairwiseKernel(metric=jaccard)+WhiteKernel()),
('dice', PairwiseKernel(metric=dice)+WhiteKernel()),
# ('simpson', PairwiseKernel(metric=simpson)+WhiteKernel()),
]
params = {
'm2fp__n_dims': 2 ** np.arange(3,8),
'm2fp__radius': np.arange(1,4), # 'm2fp__asBit': np.ones(1, dtype=np.bool),
'gpr__kernel': [PairwiseKernel(metric=jaccard)+WhiteKernel(),
PairwiseKernel(metric=dice)+WhiteKernel()]
}
pipeline = Pipeline([('m2fp', Mol2Fingerprint(as_bit=True)),
('gpr', GaussianProcessRegressor())])
# pipeline.set_params(m2fp__n_dims=128, m2fp__radius=1, gpr__kernel=PairwiseKernel(metric=dice)+WhiteKernel(),
# m2fp__asBit=np.ones(1, dtype=np.bool))
model = GridSearchCV(pipeline, params, cv=5)
# model.fit(df_logS.mol,y)
model.fit(df_logS.mol, y)
# model.best_estimator_
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details: Boost.Python.ArgumentError: Python argument types in rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol) did not match C++ signature: GetMorganFingerprintAsBitVect(RDKit::ROMol mol, int radius, unsigned int nBits=2048, boost::python::api::object invariants=[], boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, boost::python::api::object bitInfo=None, bool includeRedundantEnvironments=False) FitFailedWarning)
--------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) <ipython-input-66-01122cd32832> in <module>() 12 # model.fit(df_logS.mol,y) 13 ---> 14 model.fit(df_logS.mol, y) 15 # model.best_estimator_ /usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params) 734 # of the params are estimators as well. 735 self.best_estimator_ = clone(clone(base_estimator).set_params( --> 736 **self.best_params_)) 737 refit_start_time = time.time() 738 if y is not None: /usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe) 69 new_object_params = estimator.get_params(deep=False) 70 for name, param in new_object_params.items(): ---> 71 new_object_params[name] = clone(param, safe=False) 72 new_object = klass(**new_object_params) 73 params_set = new_object.get_params(deep=False) /usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe) 57 # XXX: not handling dictionaries 58 if estimator_type in (list, tuple, set, frozenset): ---> 59 return estimator_type([clone(e, safe=safe) for e in estimator]) 60 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type): 61 if not safe: /usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0) 57 # XXX: not handling dictionaries 58 if estimator_type in (list, tuple, set, frozenset): ---> 59 return estimator_type([clone(e, safe=safe) for e in estimator]) 60 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type): 61 if not safe: /usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe) 57 # XXX: not handling dictionaries 58 if estimator_type in (list, tuple, set, frozenset): ---> 59 return estimator_type([clone(e, safe=safe) for e in estimator]) 60 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type): 61 if not safe: /usr/local/lib/python3.6/dist-packages/sklearn/base.py in <listcomp>(.0) 57 # XXX: not handling dictionaries 58 if estimator_type in (list, tuple, set, frozenset): ---> 59 return estimator_type([clone(e, safe=safe) for e in estimator]) 60 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type): 61 if not safe: /usr/local/lib/python3.6/dist-packages/sklearn/base.py in clone(estimator, safe) 80 raise RuntimeError('Cannot clone object %s, as the constructor ' 81 'either does not set or modifies parameter %s' % ---> 82 (estimator, name)) 83 return new_object 84 RuntimeError: Cannot clone object Mol2Fingerprint(as_bit=True, n_dims=8, radius=1), as the constructor either does not set or modifies parameter n_dims
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-23-8922de862c59> in <module>() ----> 1 model.best_estimator_ AttributeError: 'Pipeline' object has no attribute 'best_estimator_'
%%time
list_scores = []
for p in np.linspace(6, 8, 3):
for radius in range(1,4):
pow = 2 ** p
m2f = Mol2Fingerprint(radius, pow, True)
print('radius', radius, 'dimensions', pow)
X=m2f.fit_transform(df_logS.mol)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=.3, random_state=66)
for name, kernel in kernels:
print(name, end=' ')
gpr = GaussianProcessRegressor(kernel=kernel)
gpr.fit(Xtrain, ytrain)
score_logli = gpr.log_marginal_likelihood()
print('%.3f' % score_logli)
list_scores.append([name, int(radius), int(pow), score_logli])
radius 1 dimensions 64.0 tanimoto -638.638 dice -670.836 radius 2 dimensions 64.0 tanimoto -661.545 dice -679.492 radius 3 dimensions 64.0 tanimoto -654.934 dice -673.287 radius 1 dimensions 128.0 tanimoto -626.873 dice -652.698 radius 2 dimensions 128.0 tanimoto -649.035 dice -669.259 radius 3 dimensions 128.0 tanimoto -643.639 dice -660.163 radius 1 dimensions 256.0 tanimoto -624.681 dice -632.691 radius 2 dimensions 256.0 tanimoto -642.977 dice -651.185 radius 3 dimensions 256.0 tanimoto -647.595 dice -653.056 CPU times: user 22min 55s, sys: 23.6 s, total: 23min 18s Wall time: 22min 50s
def pick(w, name='tanimoto'):
if w[0] == name:
return w[1:]
else:
return 0,0,0
import matplotlib.pyplot as plt
import matplotlib as mpl
t = np.matrix(list(map(lambda e:pick(e), list_scores)))
t = t[np.where(t.sum(axis=1)<0)[0],:]
smap=np.zeros((3, 3)) # score map
for row_e in t:
row = row_e
smap[int(row[0,0]-1), int(row[0,1]//128)] = row[0,2]
plt.imshow(smap)
plt.colorbar()
plt.xticks(range(0,3), [1,2,3])
plt.yticks(range(0,3), [64,128,256])
plt.xlabel('radius')
plt.ylabel('dimension')
plt.show()
t = np.matrix(list(map(lambda e:pick(e, 'dice'), list_scores)))
t = t[np.where(t.sum(axis=1)<0)[0],:]
smap=np.zeros((3, 3)) # score map
for row_e in t:
row = row_e
smap[int(row[0,0]-1), int(row[0,1]//128)] = row[0,2]
plt.imshow(smap)
plt.colorbar()
plt.xticks(range(0,3), [1,2,3])
plt.yticks(range(0,3), [64,128,256])
plt.xlabel('radius')
plt.ylabel('dimension')
plt.show()
For each sample of n, GPR outputs the mean mn and the standard deviation sn of the predicted value. Then, the prediction errors through GPR should be evaluated for model m with:
erm=∑n|yobs,n−mn|sn.The erm evaluates the prediction error ratio taking into account the standard deviation.