Generating molecules with a ChEMBL_23 trained autoencoder

In [1]:
import numpy as np
import json
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from model import MoleculeVAE
from utils import encode_smiles, decode_latent_molecule, interpolate, get_unique_mols

# Number of latent dimensions used to represent each molecule.
# The model was trained with this value, so every operation involving the model must use the same number.
latent_dim = 292

# Trained model file: 0.99 validation accuracy,
# trained on 80% of ALL ChEMBL molecules and validated on the remaining 20%.
trained_model = 'chembl_23_model.h5'
# JSON file containing the charset that is passed to the model together with the weights.
charset_file = 'charset.json'

# SMILES string of aspirin, the reference molecule used throughout this notebook.
aspirin_smiles = 'CC(=O)Oc1ccccc1C(=O)O'
Using TensorFlow backend.

Load the model

In [2]:
# Load the charset (a JSON file) and the trained model.
# The path comes from `charset_file`, set in the configuration cell, so the
# file name is defined in exactly one place.
with open(charset_file, 'r') as charset_handle:
    charset = json.load(charset_handle)

# Build the VAE and load the trained weights; `latent_rep_size` has to match
# the latent dimension the model was trained with.
model = MoleculeVAE()
model.load(charset, trained_model, latent_rep_size=latent_dim)

Encode aspirin SMILES into its latent representation

  • Our aspirin now looks like a 292-dimensional continuous array...
In [3]:
# Encode the aspirin SMILES string with the trained model.
# As the output below shows, the result is a float32 array of shape (1, 292), i.e. (1, latent_dim).
aspirin_latent = encode_smiles(aspirin_smiles, model, charset)
# Bare last expression so the notebook displays the latent vector.
aspirin_latent
Out[3]:
array([[  9.71856061e-04,  -7.18720779e-02,  -1.07823825e-02,
          7.04334006e-02,   8.07407778e-05,   5.52062728e-02,
         -4.82345792e-03,  -1.46738719e-02,   5.00548892e-02,
         -3.61140221e-02,   4.96174470e-02,   9.41123366e-02,
         -5.76002114e-02,  -2.32497044e-02,  -2.03067763e-03,
          3.01523246e-02,  -9.35144536e-03,  -2.21620724e-02,
          1.16465613e-02,   9.56867486e-02,   5.99888433e-03,
         -1.14213698e-01,   3.02174240e-02,   9.38316062e-02,
          4.03911583e-02,  -4.41120490e-02,   1.50727615e-01,
         -1.07745416e-01,  -1.54005349e-01,  -3.73830693e-03,
         -3.86567339e-02,   3.16653028e-02,   2.70223357e-02,
         -8.04287791e-02,  -2.06629112e-02,  -3.46344523e-02,
         -1.71711408e-02,   9.32422429e-02,   2.97126323e-02,
          6.88279718e-02,  -7.57273361e-02,   1.01418652e-01,
         -6.64200336e-02,  -4.84784693e-02,  -7.68535286e-02,
         -6.34723157e-02,  -6.47386014e-02,  -1.04456255e-02,
         -1.00863941e-01,   6.46540150e-02,  -6.65204227e-02,
          4.34707403e-02,   2.47059185e-02,   8.00070241e-02,
         -4.73975614e-02,   7.38497600e-02,   7.84721076e-02,
          1.06323428e-01,  -7.15399161e-02,   3.18531618e-02,
         -1.22007821e-02,   2.13934965e-02,  -1.86711892e-01,
          8.20212066e-04,  -1.78792644e-02,   1.05424121e-01,
         -2.85565443e-02,  -6.50172681e-02,  -3.66293192e-02,
         -4.73038852e-03,  -1.02103099e-01,  -1.32479509e-02,
          9.39315036e-02,   3.52116339e-02,  -4.69638444e-02,
         -1.27485111e-01,   6.25211522e-02,  -3.63147818e-02,
          1.04968660e-01,  -9.07738954e-02,   3.89073081e-02,
         -3.99739929e-02,   1.31433934e-01,   7.20718876e-02,
         -2.88961567e-02,  -1.91392396e-02,  -3.81738618e-02,
         -8.70520622e-02,  -3.19942050e-02,   1.73808306e-01,
         -1.26380101e-01,   4.05026898e-02,   3.09970081e-02,
         -2.93224566e-02,   8.97004269e-04,  -2.51153409e-02,
         -1.02176517e-01,   2.41658818e-02,  -3.94056886e-02,
          8.34702477e-02,   4.08332646e-02,  -1.29018530e-01,
          1.62444860e-01,   1.50975212e-01,   1.24025449e-01,
         -2.21976824e-02,   1.41564645e-02,   5.77150062e-02,
         -9.43952650e-02,   7.26570338e-02,  -1.97977759e-02,
         -6.27060682e-02,   6.25068229e-03,   5.66705950e-02,
         -7.46692419e-02,  -5.35597932e-03,  -4.52673472e-02,
         -1.06255360e-01,   1.21805444e-03,   9.56868082e-02,
          4.57077064e-02,  -1.05798125e-01,  -2.33358424e-03,
          3.80463973e-02,   7.26596564e-02,   7.48837516e-02,
         -2.44056191e-02,   6.41506985e-02,   1.11634769e-01,
          1.47801548e-01,   7.49644786e-02,  -7.74692930e-03,
         -1.19351521e-01,  -3.96492556e-02,  -8.60037729e-02,
         -7.43885487e-02,  -4.94571552e-02,  -5.74258119e-02,
          4.47401777e-02,   8.71800184e-02,  -7.63173923e-02,
          8.56915265e-02,   5.50135002e-02,   1.53072774e-01,
         -6.87290207e-02,   9.51777250e-02,  -2.29161263e-01,
         -3.86219770e-02,  -5.11146486e-02,  -2.21403576e-02,
         -3.57111618e-02,   1.05116852e-01,  -1.56388059e-01,
         -2.32945949e-01,   3.02064978e-02,  -1.60570592e-02,
          6.04691170e-02,   1.21265367e-01,  -9.76143628e-02,
         -5.54246195e-02,  -9.69891250e-02,   2.36563478e-02,
          1.66263223e-01,  -1.36794776e-01,  -5.81177464e-03,
         -5.54939099e-02,   8.23493972e-02,   6.98872954e-02,
          1.14865322e-03,  -1.27370715e-01,  -6.21549487e-02,
         -7.09266495e-03,   7.19872937e-02,   8.81983116e-02,
          2.22912189e-02,   3.83054242e-02,   3.97429131e-02,
          8.56342092e-02,   5.39878383e-02,  -5.75969592e-02,
         -2.28973106e-02,   1.39037403e-03,   3.41783352e-02,
          3.19306403e-02,  -1.30027812e-02,  -1.41367450e-01,
         -6.18742220e-03,   3.90469171e-02,   5.92161939e-02,
          6.72234967e-02,   7.77835399e-02,  -7.25040436e-02,
         -6.12245053e-02,  -6.56979606e-02,  -5.48129305e-02,
         -9.09105875e-03,   4.95510735e-02,  -9.74308252e-02,
          4.71654981e-02,  -7.41969571e-02,   1.27165467e-01,
          1.08109936e-01,  -1.41882569e-01,  -5.22158071e-02,
         -3.68368141e-02,  -1.09284617e-01,   9.24470276e-02,
          8.90988708e-02,   7.14422911e-02,   1.71037763e-02,
         -6.42426983e-02,   2.81091500e-02,  -7.53262788e-02,
         -2.22312883e-02,   5.43231070e-02,   1.13657244e-01,
          5.80681674e-02,  -1.28053240e-02,  -9.11020339e-02,
          2.29092482e-02,  -2.34639179e-02,  -1.20296881e-01,
          1.97384432e-02,  -1.27541676e-01,   2.12547630e-02,
          1.57594308e-03,   3.59715298e-02,  -1.67556271e-01,
         -4.08001840e-02,  -2.31258348e-02,  -1.04855210e-01,
          6.01465907e-03,  -8.74107629e-02,  -1.58640981e-01,
          6.85472414e-02,   1.97597280e-01,   5.25407046e-02,
         -3.59934568e-02,  -1.31010830e-01,  -7.52944425e-02,
         -5.66634815e-03,   9.81045589e-02,   6.92072213e-02,
         -2.03930885e-01,  -5.03597483e-02,  -4.51382101e-02,
          2.36538380e-01,   2.66888235e-02,   9.34257824e-03,
          1.53316393e-01,  -4.11946587e-02,  -4.19318303e-02,
          8.43373388e-02,   1.70614496e-02,  -2.36854814e-02,
         -4.03766297e-02,   7.86749870e-02,   1.06246576e-01,
         -6.65645581e-03,  -1.40359448e-02,   2.12623533e-02,
          6.49262965e-02,   4.03402792e-03,   6.06924444e-02,
          1.80055261e-01,  -1.09377708e-02,  -1.85241878e-01,
         -1.92784593e-02,   1.16975829e-02,   3.07626883e-03,
         -4.59671989e-02,   1.13580979e-01,  -3.57096898e-03,
         -9.87143368e-02,   2.70727091e-02,   9.43240616e-03,
         -3.33302990e-02,   4.09079306e-02,  -1.54041946e-01,
         -4.18284312e-02,   3.23817022e-02,  -4.69824634e-02,
          4.75761481e-02,  -6.54134378e-02,  -3.98326889e-02,
          8.50882083e-02,  -3.09867803e-02,  -7.53753260e-02,
         -3.33729833e-02,   1.14400871e-03,   1.61657482e-01,
         -7.92967435e-03]], dtype=float32)

Silly check: Are we reconstructing our aspirin properly?

  • Looks like we are able to convert the aspirin SMILES to a 292-dimensional array and back again, good!
In [4]:
# Round trip: decode the aspirin latent vector back to a SMILES string and
# draw it next to the original molecule for a visual comparison.
reconstructed_aspirin = decode_latent_molecule(aspirin_latent, model, charset, latent_dim)
original = Chem.MolFromSmiles(aspirin_smiles)
reconstructed = Chem.MolFromSmiles(reconstructed_aspirin)

# Label the panels so it is clear which drawing is the input and which is the
# model's reconstruction.
Draw.MolsToGridImage([original, reconstructed], legends=['original', 'reconstructed'])
Out[4]:

Looking for 1k aspirin neighbours

  • Generating 1k random 292-dimensional continuous arrays with mean = aspirin_latent and stdev = 0.1 to sample the latent space around aspirin.
In [5]:
# Sample the latent space around aspirin: Gaussian noise with standard
# deviation `stdev` is added to the aspirin latent vector.
stdev = 0.1
n_samples = 1000
# Use a seeded generator so the sampled neighbours (and every result derived
# from them) are identical after Restart Kernel -> Run All.
rng = np.random.RandomState(42)
# The (n_samples, latent_dim) noise matrix broadcasts against aspirin_latent.
latent_mols = stdev * rng.randn(n_samples, latent_dim) + aspirin_latent

Decode the randomly generated, aspirin-centered latent representations

  • We just generated some arrays, we now want molecules!
In [6]:
# Decode every sampled latent vector, keeping the results in sampling order.
decoded_molecules = [
    decode_latent_molecule(latent_vector, model, charset, latent_dim)
    for latent_vector in latent_mols
]

Validate the molecules using RDKit

  • RDKit is always a handy tool to check molecule validity.
  • Most of the 1k latent representations won't decode to a valid molecule; this is completely normal due to the complexity of chemical space. Also notice that this is NOT a perfect autoencoder: it was trained to a validation accuracy of 0.99, so some molecules won't be correctly decoded after the encoding phase.
In [7]:
# `Chem` is already imported in the first cell; only the logger is new here.
from rdkit import RDLogger

# Silence RDKit warnings and errors in the notebook (there are lots of them,
# because many generated strings are not valid molecules).
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Keep only the decoded strings that RDKit can parse into a molecule.
working_mols = []
for smiles in decoded_molecules:
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort filtering: skip anything RDKit refuses to accept.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit through, so the cell can be interrupted.
        continue
    # MolFromSmiles returns None when the SMILES cannot be parsed.
    if mol is not None:
        working_mols.append(mol)

Let's visualise the aspirinish molecules we artificially generated!

In [8]:
# Render the valid generated molecules as a grid, ten per row.
Draw.MolsToGridImage(list(working_mols), molsPerRow=10)
Out[8]: