In [1]:
from rdkit import Chem
In [2]:
def clogSw(molObj):
    """
    Calculate an estimation of water solubility.

    Input: RDKit molecule object

    Output: clogSw estimation
    """
    from rdkit import Chem
    from rdkit.Chem import Descriptors

    # calculate MolWeight
    MolWeight = Descriptors.MolWt(molObj)
    # calculate clogP
    clogP = Descriptors.MolLogP(molObj)
    # calculate RotBonds
    RotBonds = Descriptors.NumRotatableBonds(molObj)
    # calculate the number of aromatic heavyatoms in the molecule
    aromaticHeavyatoms = len(molObj.GetSubstructMatches(
                             Chem.MolFromSmarts("[a]")))
    # calculate total number of atoms in the molecule
    numAtoms = molObj.GetNumAtoms()
    # calculate Aromatic Proportion
    AromProp = float(aromaticHeavyatoms) / numAtoms

    # then calculate clogSw...
    #clogSw_value = 0.16 \
    #               - 0.63 * clogP \
    #               - 0.0062 * MolWeight \
    #               + 0.066 * RotBonds \
    #               - 0.74 * AromProp
    
    # New clogSw with coefficients calculated using Ridge Regression with Crossvalidation
    clogSw_value = 0.233743817233 \
                   -0.74253027 * clogP \
                   -0.00676305 * MolWeight \
                   +0.01580559 * RotBonds \
                   -0.35483585 * AromProp

    return clogSw_value
In [10]:
logSw = dict()
supplier = Chem.SmilesMolSupplier("Delaney_SupplData.smi", delimiter="\t", titleLine=True)
for mol in supplier:
    if mol:
        mol_id = "ESOL_" + mol.GetProp("_Name")
        mol.SetProp("_Name", mol_id)
        logSw[mol_id] = dict()
        logSw[mol_id]["CANSMILES"] = Chem.MolToSmiles(mol)
        # RDKit clogSw
        logSw[mol_id]["RDKit_clogSw"] = clogSw(mol)
        mol.SetProp("RDKit_clogSw", str(logSw[mol_id]["RDKit_clogSw"]))
        # ESOL
        logSw[mol_id]["ESOL predicted log(solubility:mol/L)"] = mol.GetProp("ESOL predicted log(solubility:mol/L)")
        # Measured Solubility
        logSw[mol_id]["measured log(solubility:mol/L)"] = mol.GetProp("measured log(solubility:mol/L)")
    # end if
# end for

# Dump logSw in csv
import csv
f = open('ESOL_Results.csv','wb')
w = csv.writer(f)
w.writerow(["Canonical SMILES", "Mol_Id", "measured log(solubility:mol/L)", "ESOL predicted log(solubility:mol/L)", "RDKit_clogSw"])
for mol_id in logSw:
    w.writerow([logSw[mol_id]["CANSMILES"], 
                mol_id, 
                logSw[mol_id]["measured log(solubility:mol/L)"], 
                logSw[mol_id]["ESOL predicted log(solubility:mol/L)"],
                logSw[mol_id]["RDKit_clogSw"]])
# end for
f.flush()
f.close()
1143
1143
In [ ]: