!git clone https://github.com/HIPS/neural-fingerprint.git
import pandas as pd
df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv')
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
import rdkit
fatal: destination path 'neural-fingerprint' already exists and is not an empty directory. % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 100 2415 100 2415 0 0 4573 0 --:--:-- --:--:-- --:--:-- 485k
add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH rdkit is already installed
CPU times: user 2.38 ms, sys: 99 µs, total: 2.48 ms Wall time: 2.3 ms
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_decomposition import PLSRegression
import matplotlib as mpl
import matplotlib.pyplot as plt
from rdkit.Chem import PandasTools, rdMolDescriptors, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
count_frags =lambda moleobj: np.array(list(map(lambda m: getattr(Descriptors, m)( moleobj ),
[method for method in dir(Descriptors) if method.startswith('fr_')])))
methods = [method for method in dir(Descriptors) if method.startswith('fr_')]
n_descs = len(methods)
n_samples = df.shape[0]
X = np.zeros((n_samples,n_descs))
for ix, mol in enumerate(df.ROMol):
X[ix,:] = count_frags(mol)
TARGET = df.columns[8]
M = pd.DataFrame(X, columns = methods)
A = pd.concat((df,M),axis=1)
train, test = train_test_split(A, test_size=0.33, random_state = 42)
params = {'n_components': np.arange(1, 16)}
model = GridSearchCV(PLSRegression(), params, cv=3)
model.fit(train[methods], train[TARGET])
GridSearchCV(cv=3, error_score='raise-deprecating', estimator=PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06), iid='warn', n_jobs=None, param_grid={'n_components': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)
with mpl.style.context('seaborn'):
fig = plt.figure()
ax = fig.gca()
plt.plot(train[TARGET], model.predict(train[methods]), 'o')
plt.plot(test[TARGET], model.predict(test[methods]), 'o')
xlim = ax.get_xlim()
ax.set_ylim(xlim)
plt.plot(xlim,xlim, '-k')
plt.axis('square')
plt.xlabel('Measured log solubility')
plt.ylabel('Estimated log solubility')
plt.legend({'train', 'test'})
print(np.sqrt(metrics.mean_squared_error(train[TARGET], model.predict(train[methods]))),\
np.sqrt(metrics.mean_squared_error(test[TARGET], model.predict(test[methods]))))
print(metrics.r2_score(train[TARGET], model.predict(train[methods])),\
metrics.r2_score(test[TARGET], model.predict(test[methods])))
1.1153743787284136 1.1615487203232076 0.710121766066296 0.7032509664048506
def t2_score(data, model):
assert type(data)==pd.DataFrame or type(data)==np.ndarray, "input must be pandas.DataFrame or np.array"
explained_std_ = np.sqrt(model.best_estimator_.x_scores_.var(axis=0))
scores_whiten = model.transform(data) / explained_std_
return (scores_whiten ** 2.).sum(axis=1)
def q_value(data, model):
assert type(data)==pd.DataFrame or type(data)==np.ndarray, "input must be pandas.DataFrame or np.array"
x_reproduced_ = model.transform(data) \
@ model.best_estimator_.x_loadings_.T \
* model.best_estimator_.x_std_ + model.best_estimator_.x_mean_
return ((data - x_reproduced_)**2.).sum(axis=1)
x_reproduced_ = model.transform(train[methods]) \
@ model.best_estimator_.x_loadings_.T \
* model.best_estimator_.x_std_ + model.best_estimator_.x_mean_
((train[methods]-x_reproduced_)**2.).sum(axis=1)
df_t2q=pd.DataFrame([t2_score(train[methods], model),q_value(train[methods], model)]).T
df_t2q.rename(columns={ix:name for ix,name in enumerate(['t2','q'])},
index={ix: idx for ix,idx in enumerate(train.index)},
inplace=True)
df_t2q.head(2)
t2 | q | |
---|---|---|
84 | 1.195129 | 4.235145 |
432 | 35.129500 | 79.107809 |
# help(np.quantile)
for x in [df_t2q['t2'], df_t2q['q']]:
print('%.4f' % np.quantile(x, 0.997),end=' ')
103.2511 80.0971
with mpl.style.context('seaborn'):
fig = plt.figure()
for data in [train, test]:
plt.plot(t2_score(data[methods], model),
q_value(data[methods], model),
'o')
plt.vlines(np.quantile(df_t2q['t2'], 0.997), 0, 250)
plt.hlines(np.quantile(df_t2q['q'], 0.997), 0, 175)
plt.legend(['train','test'])
from copy import deepcopy
pls = deepcopy(model.best_estimator_)