#!/usr/bin/env python
# coding: utf-8

# # Generating molecules with a ChEMBL_23 trained autoencoder

# In[1]:


import numpy as np
import json
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from model import MoleculeVAE
from utils import encode_smiles, decode_latent_molecule, interpolate, get_unique_mols

# number of dimensions to represent the molecules
# as the model was trained with this number, any operation made with the model must share the dimensions.
latent_dim = 292

# trained_model 0.99 validation accuracy
# trained with 80% of ALL chembl molecules, validated on the other 20.
trained_model = 'chembl_23_model.h5'
charset_file = 'charset.json'

aspirin_smiles = 'CC(=O)Oc1ccccc1C(=O)O'


# # Load the model

# In[2]:


# load charset and model
with open('charset.json', 'r') as outfile:
    charset = json.load(outfile)

model = MoleculeVAE()
model.load(charset, trained_model, latent_rep_size = latent_dim)


# # Encode aspirin SMILES into it's latent representation
# - Our aspirin now looks like a 292 dimension continuous array...

# In[3]:


aspirin_latent = encode_smiles(aspirin_smiles, model, charset)
aspirin_latent


# # Silly check: Are we reconstructing our aspirin properly?
# - Looks like we are able to convert aspirin SMILES to a 292 dimension array and go back from it, good!

# In[4]:


reconstructed_aspirin = decode_latent_molecule(aspirin_latent, model, charset, latent_dim)
original = Chem.MolFromSmiles(aspirin_smiles)
reconstructued = Chem.MolFromSmiles(reconstructed_aspirin)

Draw.MolsToGridImage([original, reconstructued])


# # Looking for 1k aspirin neighbours 
# - Generating 1k random 292 dimension continuous arrays with mean=aspirin_latent and stdev==0.1 to sample the latent space next to the aspirin. 

# In[5]:


stdev = 0.1
latent_mols = stdev * np.random.randn(1000, latent_dim) + aspirin_latent


# # Decode the randomly aspirin centered generated latent representations
# - We just generated some arrays, we now want molecules!

# In[6]:


decoded_molecules = []
for lm in latent_mols:
    decoded_molecules.append(decode_latent_molecule(lm, model, charset, latent_dim))


# # Validate the molecules using RDKit
# - RDKit is always handful tool to check molecule validity.
# - Most of the 1k latent representations won't end in a valid molecule, this is completelly normal due to the complexity of the chemical space. Also notice that this is NOT a perfect autoencoder, it's trained with a validation accuracy of 0.99, so some molecules won't be correctly decoded after the encoding phase.

# In[7]:


from rdkit import Chem
from rdkit import RDLogger

# remove warnings and errors from notebook (lots of them due non valid molecule generation)
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

working_mols = []
for smiles in decoded_molecules:
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            working_mols.append(mol)
    except:
        continue


# # Let's visualise the aspirinish molecules we artificially generated!

# In[8]:


Draw.MolsToGridImage([mol for mol in working_mols], molsPerRow=10)


# # Check how many of them we can find using UniChem 
# - We managed to generate many unique valid molecules, most of them not existing in ChEMBL. 
# - This means that we can sample the model which is representing the ChEMBL chemical space to retrieve molecules that don't exist in the database.

# In[9]:


import requests

url = 'https://www.ebi.ac.uk/unichem/rest/inchikey/{}'

unique_mols = get_unique_mols(working_mols)

in_unichem = []
for um in unique_mols:
    res = requests.get(url.format(um[1]))
    if res.status_code == 200:
        in_unichem.append([um[0], res.json()])

not_in_chembl = filter(lambda mol: 1 not in [int(res['src_id']) for res in mol[1]], in_unichem)

print "rdkit valid mols:", len(working_mols)
print "unique molecules:", len(unique_mols)
print "found in unichem:", len(in_unichem)
print "found in unichem and not in chembl:", len(not_in_chembl)


# # Show mols found in UniChem and not in ChEMBL

# In[10]:


Draw.MolsToGridImage([mol[0] for mol in not_in_chembl], molsPerRow=10)


# # Interpolation of two molecules
# - Let's make a linear interpolation from paracetamol to celecoxib latent representations and let's check what we got...
# - Our results with linear interpolation are not as good as the ones showed in the original paper...

# In[11]:


paracetamol_smiles = 'CC(=O)Nc1ccc(O)cc1'
celecoxib_smiles = 'Cc1ccc(cc1)c2cc(nn2c3ccc(cc3)S(=O)(=O)N)C(F)(F)F'

steps = 300
results = interpolate(paracetamol_smiles, celecoxib_smiles, steps, charset, model, latent_dim)
    
working_results = []
for smiles in results:
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            working_results.append(mol)
    except:
        continue
                
Draw.MolsToGridImage(working_results, molsPerRow=10)


# In[ ]: