import numpy as np
import scipy as sp
from scipy.spatial.distance import pdist, cdist, squareform
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [12., 8.]
mpl.rcParams['font.size'] = 25.
mpl.rcParams['font.family'] = 'MS Gothic'
# 1D data
x = np.linspace(-3, 3, 51)
gauss_k = lambda a, b, beta: np.exp(-beta*(np.linalg.norm(a-b)**2.))
gauss_k(x[0],x[1],1)
0.985703184122443
D=squareform(pdist(x.reshape(-1,1), lambda a, b: gauss_k(a,b, 0.44)))
d = D.shape[0]
D+=np.diag(np.ones(d))
plt.imshow(D)
cbar = plt.colorbar()
plt.xlabel('sample index');plt.ylabel('sample index')
cbar.ax.set_ylabel('Gaussian-kernel similarity', rotation=90)
plt.show()
# sampling 10 times
mean_ = np.zeros(d)
cov_ = D
samples = [np.random.multivariate_normal(mean_, cov_) for _ in range(10)]
plt.figure()
for group in samples:
plt.plot(x, group, '-')
plt.xlabel('x')
plt.ylabel('mean of the response: m(x)')
plt.show()
!wget https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTRMLTL3G4MUTFQ72VS6F7KYU
!mv delaney-processed.csv?token=AAQJRTRMLTL3G4MUTFQ72VS6F7KYU delaney-processed.csv
!ls delaney-processed.csv
from pandas import read_csv
df_logS = read_csv('delaney-processed.csv')
--2020-01-21 06:33:48-- https://raw.githubusercontent.com/funatsu-lab/support-page/master/data/delaney-solubility/delaney-processed.csv?token=AAQJRTRMLTL3G4MUTFQ72VS6F7KYU Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 96699 (94K) [text/plain] Saving to: ‘delaney-processed.csv?token=AAQJRTRMLTL3G4MUTFQ72VS6F7KYU’ delaney-processed.c 100%[===================>] 94.43K --.-KB/s in 0.009s 2020-01-21 06:33:49 (10.3 MB/s) - ‘delaney-processed.csv?token=AAQJRTRMLTL3G4MUTFQ72VS6F7KYU’ saved [96699/96699] delaney-processed.csv
df_logS.head(3)
Compound ID | ESOL predicted log solubility in mols per litre | Minimum Degree | Molecular Weight | Number of H-Bond Donors | Number of Rings | Number of Rotatable Bonds | Polar Surface Area | measured log solubility in mols per litre | smiles | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Amigdalin | -0.974 | 1 | 457.432 | 7 | 3 | 7 | 202.32 | -0.77 | OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)... |
1 | Fenfuram | -2.885 | 1 | 201.225 | 1 | 2 | 2 | 42.24 | -3.30 | Cc1occc1C(=O)Nc2ccccc2 |
2 | citral | -2.579 | 1 | 152.237 | 0 | 0 | 4 | 17.07 | -2.06 | CC(C)=CCCC(C)=CC(=O) |
!curl -Lo rdkit_installer.py https://git.io/fxiPZ
import rdkit_installer
%time rdkit_installer.install()
from rdkit import Chem
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 100 2415 100 2415 0 0 4052 0 --:--:-- --:--:-- --:--:-- 4052
add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH python version: 3.6.9 fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh done installing miniconda to /root/miniconda done installing rdkit done rdkit-2017.09.1 installation finished!
CPU times: user 272 ms, sys: 180 ms, total: 452 ms Wall time: 39.9 s
df_logS['Mol'] = df_logS.smiles.apply(Chem.MolFromSmiles)
from rdkit.DataManip.Metric import GetTanimotoDistMat
from rdkit.Chem import AllChem
morganfps = [AllChem.GetMorganFingerprintAsBitVect(m,3, nBits=128) for m in df_logS.Mol]
distmat = GetTanimotoDistMat(morganfps)
def fill_lower_diag(a, fill_diagonal=False):
"""
https://stackoverflow.com/questions/51439271/convert-1d-array-to-lower-triangular-matrix
"""
n = int(np.sqrt(len(a)*2))+1
if fill_diagonal:
mask = np.tri(n, dtype=bool, k=0)
else:
mask = np.tri(n, dtype=bool, k=-1) # or np.arange(n)[:,None] > np.arange(n)
out = np.zeros((n,n),dtype=int)
out[mask] = a
return out
distMorgan = fill_lower_diag(distmat)
plt.imshow(distMorgan+distMorgan.T)
cbar = plt.colorbar()
plt.xlabel('sample index');plt.ylabel('sample index')
cbar.ax.set_ylabel('Tanimoto-kernel similarity', rotation=90)
plt.show()
a=list(AllChem.GetMorganFingerprintAsBitVect(df_logS.Mol[0],2))
b=list(AllChem.GetMorganFingerprintAsBitVect(df_logS.Mol[0],2))
tanimoto = lambda v1, v2: np.sum(np.bool(v1) and np.bool(v2))/np.sum(np.bool(v1) or np.bool(v2))
print('tanimoto kernel', tanimoto(a,b))
tanimoto_sp = lambda k,l:np.sum(np.logical_and(k,l))/np.sum(np.logical_or(k,l))
print('tanimoto kernel for scipy', tanimoto_sp(a, b)) # tanimoto kernel for
tanimoto kernel 1.0 tanimoto kernel for scipy 1.0
from sklearn.model_selection import train_test_split
train_mols, test_mols, ytrain, ytest = train_test_split(df_logS.Mol, df_logS['measured log solubility in mols per litre'], test_size=.66)
def gpr_tanimoto(data_mols, noise=0.01):
"in: data_mols, out: mean_, cov_. cov_=K_n,n + I_n * \sigma^2"
bitvec = np.matrix([list(AllChem.GetMorganFingerprintAsBitVect(m,3, nBits=128)) for m in data_mols])
n_samples = bitvec.shape[0]
mean_ = np.zeros(n_samples)
Knn = squareform(pdist(bitvec, lambda u,v:tanimoto_sp(u,v))) #Knn += Knn.T
Knn += np.eye(n_samples)
cov_ = Knn + np.eye(n_samples) * (noise**2.)
return mean_, cov_, bitvec, Knn
def gpr_pred_tanimoto(list_mols, da_mols, y_obs, noise):
mean_prior_, cov_prior_, Xnn, Knn = gpr_tanimoto(da_mols, noise)
n_data = Xnn.shape[0]
n_input = len(list_mols)
Xm = np.matrix([list(AllChem.GetMorganFingerprintAsBitVect(m,3, nBits=128)) for m in list_mols])
Knm = cdist(Xnn, Xm, lambda u,v: tanimoto_sp(u,v))
pairD = pdist(Xm, lambda u,v: tanimoto_sp(u,v))
Kmm = squareform(pairD) + np.eye(n_input) # + squareform(pairD).T
mean_posterior_ = Knm.T@np.linalg.pinv(Knn+np.eye(n_data)*(noise**2.))@(y_obs-mean_prior_)
cov_posterior_ = Kmm - Knm.T@np.linalg.pinv(Knn+np.eye(n_data)*(noise**2.))@Knm
return mean_posterior_,cov_posterior_
print('logS stdev.', ytrain.std())
m_, c_ = gpr_pred_tanimoto(test_mols, train_mols, ytrain, ytrain.std())
logS stdev. 2.0977356410603774
test_mols.shape, train_mols.shape
((745,), (383,))
plt.figure()
plt.plot(ytest, m_, 'o')
plt.axis('square')
plt.xlim([-10, 2])
plt.ylim([-10, 2])
plt.xlabel('measured value'); plt.ylabel('predicted value')
plt.show()