from rdkit import Chem
from rdkit.Chem import AllChem
import matplotlib as mpl
import matplotlib.pyplot as plt
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
def sdf2df(filename):
    """Read an SDF file into a DataFrame of SMILES strings and logS values."""
    sppl = Chem.SDMolSupplier(filename)
    rows = []
    for mol in sppl:
        if mol is not None:  # skip molecules RDKit failed to parse
            rows.append([Chem.MolToSmiles(mol), float(mol.GetProp('logS'))])
    return pd.DataFrame(rows)
df=sdf2df('./logSdataset1290_2d.sdf')
df.rename(columns={0: 'smiles', 1: 'logS'}, inplace=True)
df.sample(2)
|     | smiles             | logS  |
|-----|--------------------|-------|
| 79  | CC(C)[C@H](C)O     | -0.2  |
| 427 | Oc1ccccc1-c1ccccc1 | -2.39 |
from rdkit.Chem import PandasTools
from sklearn.base import TransformerMixin, BaseEstimator
import gc
class Morgan(TransformerMixin, BaseEstimator):
    """Count-based Morgan fingerprints as a scikit-learn transformer."""
    def __init__(self, radius=3):
        self.radius = radius

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        # One {hashed substructure key: count} dict per SMILES; pandas
        # aligns the keys into columns, and absent keys become NaN.
        df = pd.DataFrame([
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi),
                                         self.radius).GetNonzeroElements()
            for smi in data])
        df.fillna(0., inplace=True)
        return df
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSRegression
import numpy as np
from sklearn.metrics import r2_score
base = Pipeline([('pls',PLSRegression())])
params = {'pls__n_components':np.arange(1, 11)}
model = GridSearchCV(base, params, cv=5)
X = Morgan().fit_transform(df['smiles'])
for r in [42, 66, 57, 10, 33]:
    X_train, X_test, ls_train, ls_test = train_test_split(X, df['logS'],
                                                          test_size=.2,
                                                          random_state=r)
    model.fit(X_train, ls_train)
    print('{:.4f} {:.4f}'.format(r2_score(ls_train, model.predict(X_train)),
                                 r2_score(ls_test, model.predict(X_test))))
0.9957 0.6879
0.9966 0.7776
0.9953 0.7357
0.9952 0.7415
0.9953 0.7673
X = Morgan(2).fit_transform(df['smiles'])
for r in [42, 66, 57, 10, 33]:
    X_train, X_test, ls_train, ls_test = train_test_split(X, df['logS'],
                                                          test_size=.2,
                                                          random_state=r)
    model.fit(X_train, ls_train)
    print('{:.4f} {:.4f}'.format(r2_score(ls_train, model.predict(X_train)),
                                 r2_score(ls_test, model.predict(X_test))))
0.9917 0.7468
0.9923 0.8222
0.9911 0.7801
0.9911 0.8040
0.9917 0.8192
X = Morgan(1).fit_transform(df['smiles'])
for r in [42, 66, 57, 10, 33]:
    X_train, X_test, ls_train, ls_test = train_test_split(X, df['logS'],
                                                          test_size=.2,
                                                          random_state=r)
    model.fit(X_train, ls_train)
    print('{:.4f} {:.4f}'.format(r2_score(ls_train, model.predict(X_train)),
                                 r2_score(ls_test, model.predict(X_test))))
0.9593 0.7576
0.9575 0.8396
0.9647 0.8096
0.9566 0.8336
0.9637 0.8558
With only 1,290 molecules, the dataset is too small for the higher-radius Morgan fingerprints: increasing the radius multiplies the number of distinct hashed keys (feature columns), which drives training R² up while test R² drops, a classic sign of overfitting.
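To see the imbalance directly, one can count the distinct hashed keys the count-based fingerprint produces over the whole dataset at each radius; a minimal sketch (not part of the original run), reusing the same RDKit calls as the `Morgan` transformer:

# Count distinct Morgan keys (i.e., feature columns) per radius; with only
# 1,290 molecules, the column count quickly outgrows the sample count.
for radius in (1, 2, 3):
    keys = set()
    for smi in df['smiles']:
        fp = AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi), radius)
        keys.update(fp.GetNonzeroElements())
    print(radius, len(keys))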
from rdkit.Chem import rdMolDescriptors, Descriptors
len(Descriptors.descList)
200
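For orientation, `descList` pairs each descriptor name with its function; a quick peek (a hypothetical cell, not in the original notebook):

# First few of the 200 (name, function) pairs that the transformer
# below will evaluate for every molecule.
print([name for name, func in Descriptors.descList[:5]])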
class RDKit(TransformerMixin, BaseEstimator):
    """All 200 RDKit 2D descriptors as a scikit-learn transformer."""
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        # Parse each SMILES once, then apply every descriptor function to it.
        mols = [Chem.MolFromSmiles(smi) for smi in data]
        funcs = [func for _, func in Descriptors.descList]
        return pd.DataFrame([[func(mol) for func in funcs] for mol in mols])
from sklearn.model_selection import cross_validate,cross_val_score,cross_val_predict
X = RDKit().fit_transform(df['smiles'])
for r in [42, 66, 57, 10, 33]:
    X_train, X_test, ls_train, ls_test = train_test_split(X, df['logS'],
                                                          test_size=.2,
                                                          random_state=r)
    model.fit(X_train, ls_train)
    print('{:.4f} {:.4f}'.format(r2_score(ls_train, model.predict(X_train)),
                                 r2_score(ls_test, model.predict(X_test))))
(scikit-learn repeats a DeprecationWarning about the `iid` parameter of GridSearchCV on every fit; the warnings are omitted below.)
0.8368 0.7857
0.8326 0.8152
0.8968 0.8885
0.8898 0.8890
0.9047 0.8786
Xall = RDKit().fit_transform(df.smiles)
cross_validate(PLSRegression(), Xall, df.logS,cv=5)
sklearn/utils/deprecation.py:125: FutureWarning: You are accessing a training score ('train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True
{'fit_time': array([0.07252812, 0.06805325, 0.05869269, 0.0587759 , 0.06913924]),
 'score_time': array([0.01473451, 0.02531195, 0.0130384 , 0.01367426, 0.01160932]),
 'test_score': array([-1.33943507, -3.05134004, -1.41332243,  0.89351645,  0.44010176]),
 'train_score': array([0.81350277, 0.85265739, 0.85562129, 0.76937422, 0.82636682])}
With its default two components and no scaling, plain PLS scores a negative test R² on three of the five folds, i.e., worse than always predicting the mean logS.
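One common remedy is to standardize the descriptors and tune `n_components` inside the cross-validation; a sketch along the lines of the pipelines used elsewhere in this notebook (not run here, so no scores are claimed):

from sklearn.preprocessing import StandardScaler

# Nested CV: GridSearchCV tunes n_components on each training fold,
# cross_val_score reports the outer-fold test R^2.
scaled_pls = Pipeline([('sc', StandardScaler()),
                       ('pls', PLSRegression())])
pls_search = GridSearchCV(scaled_pls,
                          {'pls__n_components': np.arange(1, 16)},
                          cv=5)
print(cross_val_score(pls_search, Xall, df.logS, cv=5))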
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
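Assembled, the lists form the `param_distributions` dict that `RandomizedSearchCV` samples from; the model-selection loop below rebuilds the same dict under the name `params`:

# The random grid for the random-forest search: over 4,000 combinations,
# of which RandomizedSearchCV will try n_iter=100.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}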
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
df_train, df_test = train_test_split(df, test_size=.2, random_state=66)
svr = Pipeline([('sc',StandardScaler()),('svr',SVR())])
Output from an earlier run of the model-selection loop below, with the three Morgan fingerprints as descriptors:
mfp1 pls fitting...
0.9575 0.8396
mfp1 svr fitting...
0.9989 0.4904
mfp1 rf fitting...
0.9997 0.8566
mfp2 pls fitting...
0.9945 0.8291
mfp2 svr fitting...
0.9993 0.0507
mfp2 rf fitting...
0.9766 0.8618
mfp3 pls fitting...
0.9985 0.7822
mfp3 svr fitting...
0.9994 0.0188
mfp3 rf fitting...
0.9764 0.8579
# %%time
scores = []
# Earlier runs iterated over the Morgan fingerprints as well:
# for desc, tr in zip(list(map(lambda x: 'mfp'+str(x), range(1, 4)))+['rdkit'],
#                     [Morgan(1), Morgan(2), Morgan(3), RDKit()]):
# for desc, tr in zip(['mfp1'], [Morgan(1)]):
#     for base in [('pls', PLSRegression())]:
for desc, tr in zip(['rdkit'], [RDKit()]):
    # Compute descriptors for train and test together, then split back apart.
    Xall = tr.fit_transform(df_train.smiles.tolist() + df_test.smiles.tolist())
    Xtrain = Xall.iloc[:df_train.shape[0], :]
    Xtest = Xall.iloc[df_train.shape[0]:, :]
    for base in [('pls', PLSRegression()),
                 ('svr', svr),
                 ('rf', RandomForestRegressor(n_estimators=500))]:
        print(desc, base[0], 'fitting...')
        if base[0] == 'pls':
            params = {'n_components': np.arange(1, 16)}
        elif base[0] == 'svr':
            params = {'svr__C': 2.**np.arange(-5, 6),
                      'svr__gamma': 2.**np.arange(-5, 6),
                      'svr__epsilon': 2.**np.arange(-5, 6)}
        elif base[0] == 'rf':
            params = {'n_estimators': n_estimators,
                      'max_features': max_features,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf': min_samples_leaf,
                      'bootstrap': bootstrap}
        if base[0] == 'pls':
            # The PLS grid is small enough for an exhaustive search.
            model = GridSearchCV(base[1], params, cv=5)
        else:
            # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
            model = RandomizedSearchCV(base[1],
                                       param_distributions=params,
                                       n_iter=100, cv=5,
                                       random_state=42,
                                       n_jobs=-1)
        model.fit(Xtrain, df_train.logS)
        scores.append([desc + base[0],
                       r2_score(df_train.logS, model.predict(Xtrain)),
                       r2_score(df_test.logS, model.predict(Xtest))])
        print('{:.4f} {:.4f}'.format(r2_score(df_train.logS, model.predict(Xtrain)),
                                     r2_score(df_test.logS, model.predict(Xtest))))
        joblib.dump(model, desc + base[0] + 'best.joblib')
rdkit pls fitting...
0.8326 0.8152
rdkit svr fitting...
0.9989 0.8069
rdkit rf fitting...
0.9881 0.9179
(The repeated `iid` DeprecationWarnings and StandardScaler DataConversionWarnings are omitted.)
from pprint import pprint
pprint(scores)  # scores from the earlier Morgan-fingerprint run
[['mfp1pls', 0.9574626534970376, 0.8395725598204108],
 ['mfp1svr', 0.9988716232526271, 0.4904020514687174],
 ['mfp1rf', 0.9996516941556608, 0.8566323785700072],
 ['mfp2pls', 0.9945497585374266, 0.8291318386353449],
 ['mfp2svr', 0.999337896118609, 0.05068749288040464],
 ['mfp2rf', 0.976580144178383, 0.8618371228433085],
 ['mfp3pls', 0.9985089538907509, 0.7822238708003658],
 ['mfp3svr', 0.9993675150556893, 0.018776592909252265],
 ['mfp3rf', 0.9764081178790016, 0.857928636290177]]
pprint(scores)  # scores from the RDKit-descriptor run above
[['rdkitpls', 0.8325879237930572, 0.8152240940788283],
 ['rdkitsvr', 0.9988676341522326, 0.8069065926433822],
 ['rdkitrf', 0.9882167059044964, 0.9166879005899062]]
Of all the descriptor/model combinations, the tuned random forest on the RDKit descriptors generalizes best (test R² ≈ 0.92), while the Morgan-fingerprint SVRs overfit badly.
Xtrain.to_csv('rdkit.x_train.csv')
Xtest.to_csv('rdkit.x_test.csv')
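Since each tuned search object was persisted with `joblib.dump`, it can be reloaded later and applied without refitting; a minimal sketch, assuming the `rdkitrfbest.joblib` and `rdkit.x_test.csv` files written above:

from sklearn.externals import joblib
import pandas as pd

# Reload the tuned random-forest search and predict on the saved
# test-set descriptors (index_col=0 restores the original index).
best_rf = joblib.load('rdkitrfbest.joblib')
X_test_saved = pd.read_csv('rdkit.x_test.csv', index_col=0)
print(best_rf.predict(X_test_saved)[:5])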