# --- Notebook setup cell (uses "!" shell and "%" line magics; not plain Python) ---
# Fetch the Delaney (ESOL) aqueous-solubility dataset shipped with the
# neural-fingerprint repository.
!git clone https://github.com/HIPS/neural-fingerprint.git
import pandas as pd
# Processed Delaney table: SMILES strings plus measured log solubility
# (1128 rows, per the X.shape output later in this notebook).
df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv')
# Install RDKit into the (Colab) runtime via a community installer script.
# NOTE(review): git.io shortlinks are deprecated/frozen — confirm this URL
# still resolves before rerunning.
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
import rdkit
Cloning into 'neural-fingerprint'... remote: Enumerating objects: 1760, done. remote: Total 1760 (delta 0), reused 0 (delta 0), pack-reused 1760 Receiving objects: 100% (1760/1760), 51.21 MiB | 24.78 MiB/s, done. Resolving deltas: 100% (1141/1141), done. % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 100 2415 100 2415 0 0 3018 0 --:--:-- --:--:-- --:--:-- 3018
add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH python version: 3.6.9 fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh done installing miniconda to /root/miniconda done installing rdkit done rdkit-2017.09.1 installation finished!
CPU times: user 313 ms, sys: 182 ms, total: 495 ms Wall time: 50.7 s
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_decomposition import PLSRegression
import matplotlib as mpl
import matplotlib.pyplot as plt
from rdkit.Chem import PandasTools, rdMolDescriptors, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
Traceback (most recent call last): File "/root/miniconda/lib/python3.6/site-packages/rdkit/Chem/PandasTools.py", line 130, in <module> if 'display.width' in pd.core.config._registered_options: AttributeError: module 'pandas.core' has no attribute 'config'
from rdkit import Chem

# Parse each SMILES string into an RDKit molecule object.
df['ROMol'] = [Chem.MolFromSmiles(m) for m in df.smiles]

# Compute every RDKit 2D descriptor for each molecule.
# Hoist the descriptor-function list out of the loop: the original rebuilt
# dict(Descriptors.descList) once per molecule. Also use np.array instead of
# the deprecated np.matrix (NumPy advises against np.matrix for new code);
# X is only used for its shape and to build a DataFrame, so behavior is kept.
descriptor_fns = list(dict(Descriptors.descList).values())
X = np.array([[fn(mol) for fn in descriptor_fns] for mol in df.ROMol])
X.shape
(1128, 200)
# Target property: column 8 of the Delaney CSV (the measured log solubility),
# kept as a one-element list so df[TARGET] stays a DataFrame.
TARGET = [df.columns[8]]

# Put descriptors and target side by side in a single frame so one
# train/test split keeps features and labels aligned row-for-row.
features = pd.DataFrame(X)
A = pd.concat([features, df[TARGET]], axis=1)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Hold out a third of the data for testing; fixed seed for reproducibility.
train, test = train_test_split(A, test_size=0.33, random_state=42)

# Every column except the last (the target) is a descriptor feature.
methods = A.columns[:-1]

# RBF kernel models the smooth structure; WhiteKernel learns the noise level
# from the data — hence alpha=0.0 below (no additional fixed jitter).
kernel = 1. * RBF(length_scale=100.,
                  length_scale_bounds=(1e-2, 1e4)) \
    + WhiteKernel(noise_level=1,
                  noise_level_bounds=(1e-10, 1e+1))

# Standardize descriptors before the GP: with a single shared RBF length
# scale, unscaled features would dominate the kernel distance (see the
# with/without-scaler comparison table at the bottom of this notebook).
model = make_pipeline(StandardScaler(),
                      GaussianProcessRegressor(kernel=kernel, alpha=0.))

# Fit on the named feature columns for consistency with the prediction
# calls below (train[methods]/test[methods]); since methods == A.columns[:-1],
# this selects exactly the same columns as the original iloc[:, :-1].
model.fit(train[methods], train[TARGET])
Pipeline(memory=None, steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('gaussianprocessregressor', GaussianProcessRegressor(alpha=0.0, copy_X_train=True, kernel=1**2 * RBF(length_scale=100) + WhiteKernel(noise_level=1), n_restarts_optimizer=0, normalize_y=False, optimizer='fmin_l_bfgs_b', random_state=None))], verbose=False)
# Parity plot: predicted vs. measured log solubility for both splits.
with mpl.style.context('seaborn'):
    fig = plt.figure()
    ax = fig.gca()
    # Bug fix: the original called plt.legend({'train', 'test'}) with a SET,
    # whose iteration order is arbitrary — the legend labels could be
    # attached to the wrong series. Label each series explicitly instead.
    plt.plot(train[TARGET], model.predict(train[methods]), 'o', label='train')
    plt.plot(test[TARGET], model.predict(test[methods]), 'o', label='test')
    xlim = ax.get_xlim()
    ax.set_ylim(xlim)
    # y = x reference diagonal: perfect predictions fall on this line.
    plt.plot(xlim, xlim, '-k')
    plt.axis('square')
    plt.xlabel('Measured log solubility')
    plt.ylabel('Estimated log solubility')
    plt.legend()
# Report RMSE (first line) and R^2 (second line) for train and test.
# Compute each GP prediction once — the original re-ran model.predict
# for every metric, doubling the (expensive) GP inference work.
train_pred = model.predict(train[methods])
test_pred = model.predict(test[methods])
print('%.4f %.4f' % (np.sqrt(metrics.mean_squared_error(train[TARGET], train_pred)),
                     np.sqrt(metrics.mean_squared_error(test[TARGET], test_pred))))
print('%.4f %.4f' % (metrics.r2_score(train[TARGET], train_pred),
                     metrics.r2_score(test[TARGET], test_pred)))
0.1574 0.6889 0.9942 0.8956
Metric | Train | Test |
---|---|---|
w/o scaler RMSE | 0.6615 | 1.4443 |
w/o scaler R2 | 0.8980 | 0.5412 |
w/ scaler RMSE | 0.1574 | 0.6889 |
w/ scaler R2 | 0.9942 | 0.8956 |
PLS w/ fr_ desc. RMSE | 1.1154 | 1.1615 |
PLS w/ fr_ desc. R2 | 0.7101 | 0.7032 |