!git clone https://github.com/HIPS/neural-fingerprint.git import pandas as pd df = pd.read_csv('./neural-fingerprint/data/2015-05-24-delaney/delaney-processed.csv') !curl -Lo rdkit_installer.py https://git.io/fxiPZ import rdkit_installer %time rdkit_installer.install() import rdkit import numpy as np from sklearn import metrics from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.cross_decomposition import PLSRegression import matplotlib as mpl import matplotlib.pyplot as plt from rdkit.Chem import PandasTools, rdMolDescriptors, Descriptors from rdkit.Chem.Draw import IPythonConsole from rdkit.Chem import Draw from rdkit import Chem df['ROMol'] = [Chem.MolFromSmiles(m)for m in df.smiles] X = np.matrix([np.array(list(map(lambda f:f(m), dict(Descriptors.descList).values()))) for m in df.ROMol] ) X.shape TARGET = [df.columns[8]] A = pd.DataFrame(X) A = pd.concat((A,df[TARGET]),axis=1) from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, WhiteKernel from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler train, test = train_test_split(A, test_size=0.33, random_state = 42) methods = A.columns[:-1] kernel = 1. * RBF(length_scale=100., length_scale_bounds=(1e-2, 1e4)) \ + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)) model = make_pipeline(StandardScaler(),GaussianProcessRegressor(kernel=kernel, alpha=0.)) model.fit(train.iloc[:,:-1],train[TARGET]) with mpl.style.context('seaborn'): fig = plt.figure() ax = fig.gca() plt.plot(train[TARGET], model.predict(train[methods]), 'o') plt.plot(test[TARGET], model.predict(test[methods]), 'o') xlim = ax.get_xlim() ax.set_ylim(xlim) plt.plot(xlim,xlim, '-k') plt.axis('square') plt.xlabel('Measured log solubility') plt.ylabel('Estimated log solubility') plt.legend({'train', 'test'}) print('%.4f %.4f' % (np.sqrt(metrics.mean_squared_error(train[TARGET], model.predict(train[methods]))),\ np.sqrt(metrics.mean_squared_error(test[TARGET], model.predict(test[methods]))))) print('%.4f %.4f' % (metrics.r2_score(train[TARGET], model.predict(train[methods])),\ metrics.r2_score(test[TARGET], model.predict(test[methods]))))