!git clone https://github.com/HIPS/neural-fingerprint.git
import pandas as pd
df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv')
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
import rdkit
fatal: destination path 'neural-fingerprint' already exists and is not an empty directory. % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 100 2415 100 2415 0 0 3659 0 --:--:-- --:--:-- --:--:-- 3659
add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH rdkit is already installed
CPU times: user 1.94 ms, sys: 0 ns, total: 1.94 ms Wall time: 2.01 ms
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cross_decomposition import PLSRegression
import matplotlib as mpl
import matplotlib.pyplot as plt
from rdkit.Chem import rdMolDescriptors, Descriptors, MolFromSmiles
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
mpl.rcParams['figure.figsize'] = [12, 8]
mpl.rcParams['font.size'] = 30.
df['mol'] = df.smiles.apply(MolFromSmiles)
count_frags =lambda moleobj: np.array(list(map(lambda m: getattr(Descriptors, m)( moleobj ),
[method for method in dir(Descriptors) if method.startswith('fr_')])))
methods = [method for method in dir(Descriptors) if method.startswith('fr_')]
n_descs = len(methods)
n_samples = df.shape[0]
X = np.zeros((n_samples,n_descs))
for ix, mol in enumerate(df.mol):
X[ix,:] = count_frags(mol)
TARGET = df.columns[8]
M = pd.DataFrame(X, columns = methods)
A = pd.concat((df,M),axis=1)
train, test = train_test_split(A, test_size=0.33, random_state = 42)
params = {'n_components': np.arange(1, 16)}
model = GridSearchCV(PLSRegression(), params, cv=3)
model.fit(train[methods], train[TARGET])
GridSearchCV(cv=3, error_score=nan, estimator=PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06), iid='deprecated', n_jobs=None, param_grid={'n_components': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)
with mpl.style.context('seaborn'):
fig = plt.figure()
ax = fig.gca()
plt.plot(train[TARGET], model.predict(train[methods]), 'o')
plt.plot(test[TARGET], model.predict(test[methods]), 'o')
xlim = ax.get_xlim()
ax.set_ylim(xlim)
plt.plot(xlim,xlim, '-k')
plt.axis('square')
plt.xlabel('Measured log solubility')
plt.ylabel('Estimated log solubility')
plt.legend({'train', 'test'})
print(np.sqrt(metrics.mean_squared_error(train[TARGET], model.predict(train[methods]))),\
np.sqrt(metrics.mean_squared_error(test[TARGET], model.predict(test[methods]))))
print(metrics.r2_score(train[TARGET], model.predict(train[methods])),\
metrics.r2_score(test[TARGET], model.predict(test[methods])))
1.1153743787284136 1.1615487203232076 0.710121766066296 0.7032509664048506
where d denotes the number of the variables in X, tk denotes the x_scores of the k-th component, qk denotes the y_loading of the k-th component.
Let W d-by-A matrix, SS 1-by-A matrix. Then,
√d×W×SS′sum(SS)would be the VIP of the whole variables (j=1,…,d).
def vip(pls_model):
n_dims = pls_model.x_weights_.shape[0]
n_components = pls_model.n_components
t_a = pls_model.x_scores_
w_a = pls_model.x_weights_# d x a
q_a = pls_model.y_loadings_
denom = list(map(lambda ix:(q_a[:,ix] **2.)*t_a[:,ix].T@t_a[:,ix], range(n_components)))
denom = np.matrix(denom)
numer = ((w_a **2.) * denom.T).T
return (n_dims*numer)/denom.sum()
vipval =vip( model.best_estimator_ )
ixshow = np.where(vipval>1)[1]
print(len(ixshow))
plt.figure()
plt.plot(vipval.T, '-o')
plt.plot(ixshow, vipval[:,ixshow].T, 'o')
plt.xlabel('variable index')
plt.ylabel('VIP value')
plt.show()
14