!python --version
Python 3.7.0
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit
--2018-10-31 13:08:02-- https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh Resolving repo.continuum.io (repo.continuum.io)... 104.16.19.10, 104.16.18.10, 2400:cb00:2048:1::6810:120a, ... Connecting to repo.continuum.io (repo.continuum.io)|104.16.19.10|:443... connected. HTTP request sent, awaiting response... 200 OK The file is already fully retrieved; nothing to do. PREFIX=/usr/local reinstalling: python-3.7.0-hc3d631a_0 ... Python 3.7.0 reinstalling: ca-certificates-2018.03.07-0 ... reinstalling: conda-env-2.6.0-1 ... reinstalling: libgcc-ng-8.2.0-hdf63c60_1 ... reinstalling: libstdcxx-ng-8.2.0-hdf63c60_1 ... reinstalling: libffi-3.2.1-hd88cf55_4 ... reinstalling: ncurses-6.1-hf484d3e_0 ... reinstalling: openssl-1.0.2p-h14c3975_0 ... reinstalling: xz-5.2.4-h14c3975_4 ... reinstalling: yaml-0.1.7-had09818_2 ... reinstalling: zlib-1.2.11-ha838bed_2 ... reinstalling: libedit-3.1.20170329-h6b74fdf_2 ... reinstalling: readline-7.0-h7b6447c_5 ... reinstalling: tk-8.6.8-hbc83047_0 ... reinstalling: sqlite-3.24.0-h84994c4_0 ... reinstalling: asn1crypto-0.24.0-py37_0 ... reinstalling: certifi-2018.8.24-py37_1 ... reinstalling: chardet-3.0.4-py37_1 ... reinstalling: idna-2.7-py37_0 ... reinstalling: pycosat-0.6.3-py37h14c3975_0 ... reinstalling: pycparser-2.18-py37_1 ... reinstalling: pysocks-1.6.8-py37_0 ... reinstalling: ruamel_yaml-0.15.46-py37h14c3975_0 ... reinstalling: six-1.11.0-py37_1 ... reinstalling: cffi-1.11.5-py37he75722e_1 ... reinstalling: setuptools-40.2.0-py37_0 ... reinstalling: cryptography-2.3.1-py37hc365091_0 ... reinstalling: wheel-0.31.1-py37_0 ... reinstalling: pip-10.0.1-py37_0 ... reinstalling: pyopenssl-18.0.0-py37_0 ... reinstalling: urllib3-1.23-py37_0 ... reinstalling: requests-2.19.1-py37_0 ... reinstalling: conda-4.5.11-py37_0 ... unlinking: ca-certificates-2018.10.15-ha4d7672_0 unlinking: certifi-2018.10.15-py37_1000 unlinking: conda-4.5.11-py37_1000 unlinking: openssl-1.0.2p-h470a237_1 installation finished. 
WARNING: You currently have a PYTHONPATH environment variable set. This may cause unexpected behavior when running the Python interpreter in Miniconda3. For best results, please verify that your PYTHONPATH only points to directories of packages that are compatible with the Python interpreter in Miniconda3: /usr/local real 0m15.630s user 0m13.287s sys 0m4.019s Solving environment: ...working... done ## Package Plan ## environment location: /usr/local added / updated specs: - rdkit The following packages will be UPDATED: ca-certificates: 2018.03.07-0 --> 2018.10.15-ha4d7672_0 conda-forge certifi: 2018.8.24-py37_1 --> 2018.10.15-py37_1000 conda-forge conda: 4.5.11-py37_0 --> 4.5.11-py37_1000 conda-forge openssl: 1.0.2p-h14c3975_0 --> 1.0.2p-h470a237_1 conda-forge Preparing transaction: ...working... done Verifying transaction: ...working... done Executing transaction: ...working... done real 0m40.560s user 0m38.017s sys 0m1.007s
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole
# Record the RDKit version used for this run (notebook displays the value).
rdBase.rdkitVersion
'2018.09.1'
# Load the RDKit solubility example SD files shipped with the RDKit docs.
trainsdf = Chem.SDMolSupplier(os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.train.sdf'))
testsdf = Chem.SDMolSupplier(os.path.join(RDConfig.RDDocsDir, 'Book/data/solubility.test.sdf'))
# Drop molecules that failed to parse; 'is not None' is the correct identity
# test (the original used '!= None', which is non-idiomatic and can misfire
# on types overriding __eq__).
train_mols = [mol for mol in trainsdf if mol is not None]
test_mols = [mol for mol in testsdf if mol is not None]
print(len(train_mols), len(test_mols))
# Map the SOL_classification string labels to integer class ids (3 classes).
sol_class = {"(A) low": 0, "(B) medium": 1, "(C) high": 2}
1025 257
# 2048 bit vector
def mol2arr(mol, radius=2, nBits=2048):
    """Convert an RDKit Mol to a NumPy array of Morgan fingerprint bits.

    Args:
        mol: RDKit Mol object.
        radius: Morgan fingerprint radius (default 2, as before).
        nBits: fingerprint length (default 2048, previously implicit via
            GetMorganFingerprintAsBitVect's own default).

    Returns:
        1-D numpy array of length `nBits` with 0/1 bit values.
    """
    # ConvertToNumpyArray resizes `arr` to the fingerprint length in place.
    arr = np.zeros((1,))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
# Sanity check: number of set bits in the first training molecule's fingerprint.
np.sum(mol2arr(train_mols[0]))
7.0
(2048,)
import tensorflow as tf
import tensorflow.keras as keras
# Use tensorflow.keras consistently: the original mixed `import
# tensorflow.keras as keras` with `from keras import ...`, which pulls the
# standalone Keras package — a different, incompatible library whose models
# do not interoperate with tf optimizers/utilities.
from tensorflow.keras import Model
from tensorflow.keras.layers import Activation, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical

# Featurize every molecule as a 2048-bit fingerprint array and one-hot
# encode the three solubility classes for categorical cross-entropy.
trainX = np.array([mol2arr(mol) for mol in train_mols])
trainY = [sol_class[mol.GetProp("SOL_classification")] for mol in train_mols]
trainY = to_categorical(trainY)
testX = np.array([mol2arr(mol) for mol in test_mols])
testY = [sol_class[mol.GetProp("SOL_classification")] for mol in test_mols]
testY = to_categorical(testY)
# Three-class classifier over 2048-bit fingerprints. Layer3 deliberately
# squeezes to 2 units so its activations can be plotted in 2D later.
inputs = Input(shape=(2048,), name='input')
x = Dense(100, activation='relu', name='Layer1')(inputs)
x = Dense(20, activation='relu', name='Layer2')(x)
x = Dense(2, activation='relu', name='Layer3')(x)
predictions = Dense(3, activation='softmax', name='output')(x)
model = Model(inputs=inputs, outputs=predictions)
# 'rmsprop' (Keras default lr = 0.001) replaces the deprecated TF1-only
# tf.train.RMSPropOptimizer(0.001); the string form is accepted by both
# standalone Keras and tf.keras models.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
# Auxiliary model exposing Layer3's 2-D activations for visualization;
# it shares weights with `model` (same graph, truncated output).
model_int = Model(inputs=inputs, outputs=model.get_layer(name='Layer3').output)
model_int.summary()
_________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input (InputLayer) (None, 2048) 0 _________________________________________________________________ Layer1 (Dense) (None, 100) 204900 _________________________________________________________________ Layer2 (Dense) (None, 20) 2020 _________________________________________________________________ Layer3 (Dense) (None, 2) 42 _________________________________________________________________ output (Dense) (None, 3) 9 ================================================================= Total params: 206,971 Trainable params: 206,971 Non-trainable params: 0 _________________________________________________________________ _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input (InputLayer) (None, 2048) 0 _________________________________________________________________ Layer1 (Dense) (None, 100) 204900 _________________________________________________________________ Layer2 (Dense) (None, 20) 2020 _________________________________________________________________ Layer3 (Dense) (None, 2) 42 ================================================================= Total params: 206,962 Trainable params: 206,962 Non-trainable params: 0 _________________________________________________________________
epochs = 50
# trainX is already an ndarray (built with np.array above), so the original
# np.array(trainX) re-wrap was a redundant copy.
hist = model.fit(trainX, trainY, batch_size=32, epochs=epochs)
Epoch 1/50 1025/1025 [==============================] - 0s 478us/step - loss: 1.0971 - acc: 0.4322 Epoch 2/50 1025/1025 [==============================] - 0s 175us/step - loss: 1.0896 - acc: 0.4244 Epoch 3/50 1025/1025 [==============================] - 0s 176us/step - loss: 1.0620 - acc: 0.4117 Epoch 4/50 1025/1025 [==============================] - 0s 178us/step - loss: 1.0195 - acc: 0.4351 Epoch 5/50 1025/1025 [==============================] - 0s 181us/step - loss: 0.9382 - acc: 0.5307 Epoch 6/50 1025/1025 [==============================] - 0s 173us/step - loss: 0.8520 - acc: 0.6010 Epoch 7/50 1025/1025 [==============================] - 0s 177us/step - loss: 0.7807 - acc: 0.6654 Epoch 8/50 1025/1025 [==============================] - 0s 189us/step - loss: 0.7262 - acc: 0.6995 Epoch 9/50 1025/1025 [==============================] - 0s 190us/step - loss: 0.6837 - acc: 0.7239 Epoch 10/50 1025/1025 [==============================] - 0s 186us/step - loss: 0.6427 - acc: 0.7434 Epoch 11/50 1025/1025 [==============================] - 0s 195us/step - loss: 0.6119 - acc: 0.7532 Epoch 12/50 1025/1025 [==============================] - 0s 188us/step - loss: 0.5826 - acc: 0.7610 Epoch 13/50 1025/1025 [==============================] - 0s 244us/step - loss: 0.5492 - acc: 0.7698 Epoch 14/50 1025/1025 [==============================] - 0s 182us/step - loss: 0.5258 - acc: 0.7776 Epoch 15/50 1025/1025 [==============================] - 0s 181us/step - loss: 0.4976 - acc: 0.8449 Epoch 16/50 1025/1025 [==============================] - 0s 193us/step - loss: 0.4642 - acc: 0.9580 Epoch 17/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.4318 - acc: 0.9600 Epoch 18/50 1025/1025 [==============================] - 0s 188us/step - loss: 0.3990 - acc: 0.9649 Epoch 19/50 1025/1025 [==============================] - 0s 192us/step - loss: 0.3606 - acc: 0.9756 Epoch 20/50 1025/1025 [==============================] - 0s 187us/step - loss: 0.3201 - acc: 0.9824 Epoch 
21/50 1025/1025 [==============================] - 0s 200us/step - loss: 0.2810 - acc: 0.9854 Epoch 22/50 1025/1025 [==============================] - 0s 187us/step - loss: 0.2484 - acc: 0.9824 Epoch 23/50 1025/1025 [==============================] - 0s 183us/step - loss: 0.2151 - acc: 0.9844 Epoch 24/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.1897 - acc: 0.9824 Epoch 25/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.1650 - acc: 0.9863 Epoch 26/50 1025/1025 [==============================] - 0s 198us/step - loss: 0.1451 - acc: 0.9854 Epoch 27/50 1025/1025 [==============================] - 0s 186us/step - loss: 0.1268 - acc: 0.9863 Epoch 28/50 1025/1025 [==============================] - 0s 190us/step - loss: 0.1071 - acc: 0.9824 Epoch 29/50 1025/1025 [==============================] - 0s 188us/step - loss: 0.0798 - acc: 0.9844 Epoch 30/50 1025/1025 [==============================] - 0s 183us/step - loss: 0.0574 - acc: 0.9873 Epoch 31/50 1025/1025 [==============================] - 0s 188us/step - loss: 0.0482 - acc: 0.9893 Epoch 32/50 1025/1025 [==============================] - 0s 199us/step - loss: 0.0440 - acc: 0.9863 Epoch 33/50 1025/1025 [==============================] - 0s 186us/step - loss: 0.0395 - acc: 0.9893 Epoch 34/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.0351 - acc: 0.9873 Epoch 35/50 1025/1025 [==============================] - 0s 182us/step - loss: 0.0393 - acc: 0.9854 Epoch 36/50 1025/1025 [==============================] - 0s 183us/step - loss: 0.0372 - acc: 0.9883 Epoch 37/50 1025/1025 [==============================] - 0s 193us/step - loss: 0.0362 - acc: 0.9834 Epoch 38/50 1025/1025 [==============================] - 0s 187us/step - loss: 0.0348 - acc: 0.9873 Epoch 39/50 1025/1025 [==============================] - 0s 190us/step - loss: 0.0295 - acc: 0.9893 Epoch 40/50 1025/1025 [==============================] - 0s 193us/step - loss: 0.0320 - acc: 0.9873 Epoch 
41/50 1025/1025 [==============================] - 0s 208us/step - loss: 0.0272 - acc: 0.9883 Epoch 42/50 1025/1025 [==============================] - 0s 243us/step - loss: 0.0372 - acc: 0.9873 Epoch 43/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.0325 - acc: 0.9873 Epoch 44/50 1025/1025 [==============================] - 0s 187us/step - loss: 0.0304 - acc: 0.9873 Epoch 45/50 1025/1025 [==============================] - 0s 187us/step - loss: 0.0251 - acc: 0.9902 Epoch 46/50 1025/1025 [==============================] - 0s 183us/step - loss: 0.0319 - acc: 0.9883 Epoch 47/50 1025/1025 [==============================] - 0s 192us/step - loss: 0.0265 - acc: 0.9893 Epoch 48/50 1025/1025 [==============================] - 0s 185us/step - loss: 0.0306 - acc: 0.9893 Epoch 49/50 1025/1025 [==============================] - 0s 190us/step - loss: 0.0284 - acc: 0.9883 Epoch 50/50 1025/1025 [==============================] - 0s 184us/step - loss: 0.0300 - acc: 0.9873
# Visualize training progress: per-epoch accuracy and loss curves.
epoch_axis = range(epochs)
plt.plot(epoch_axis, hist.history['acc'])
plt.plot(epoch_axis, hist.history['loss'])
[<matplotlib.lines.Line2D at 0x7f9e210f7550>]
from sklearn.metrics import classification_report
# Collapse one-hot / probability rows to class indices with a single
# vectorized argmax instead of the original per-row Python loops.
predY = np.argmax(model.predict(testX), axis=1)
Y = np.argmax(testY, axis=1)
print(classification_report(Y, predY))
precision recall f1-score support 0 0.78 0.73 0.75 102 1 0.69 0.78 0.73 115 2 0.77 0.60 0.68 40 avg / total 0.74 0.73 0.73 257
# Project the test set through the truncated model and scatter the 2-D
# Layer3 activations, colored by true solubility class.
intdata = model_int.predict(testX)
colormap = {0: "red", 1: "blue", 2: "green"}
c = []
for label in Y:
    c.append(colormap[label])
intdata[:, 0].shape
plt.scatter(intdata[:, 0], intdata[:, 1], c=c)
<matplotlib.collections.PathCollection at 0x7f9e20090e10>