Descriptors and Regression

Helper Methods

In [2]:
def createImportancePlot(splt, desc, importances, caption):
    """Draw a bar chart of the (up to) 10 descriptors with the largest
    absolute importance/weight on the given matplotlib axes.

    splt        -- matplotlib axes object to draw on
    desc        -- descriptor names, parallel to `importances`
    importances -- per-descriptor weights (bars keep the sign)
    caption     -- plot title

    BUG FIX: the original imported `numpy.numarray` (removed from numpy
    long ago) and relied on a pylab-global `sort`; neither is needed.
    """
    # Threshold = 11th-largest absolute weight, so only strictly larger
    # values (at most 10) are kept.  With 10 or fewer weights, keep all
    # (the original raised IndexError in that case).
    ranked = sorted(abs(w) for w in importances)
    threshold = ranked[-11] if len(ranked) > 10 else -1.0
    labels = []
    weights = []
    for name, weight in zip(desc, importances):
        if abs(weight) > threshold:
            labels.append(name)
            weights.append(weight)
    xlocations = [i + 0.5 for i in range(len(labels))]
    width = 0.8
    splt.bar(xlocations, weights, width=width)
    splt.set_xticks([r + 1 for r in range(len(labels))])
    splt.set_xticklabels(labels)
    splt.set_xlim(0, xlocations[-1] + width * 2)
    splt.set_title(caption)
    # ticks only on the bottom/left edges, matching the notebook's style
    splt.get_xaxis().tick_bottom()
    splt.get_yaxis().tick_left()

Dataset

Publicly available quantitative solubility data from "Estimation of Aqueous Solubility for a Diverse Set of Organic Compounds Based on Molecular Topology", Jarmo Huuskonen, J. Chem. Inf. Comput. Sci., 2000, 40, 773-777

Training data set obtained from cheminformatics.org: http://cheminformatics.org/datasets/huuskonen/train.smi

Test set obtained from cheminformatics.org: http://cheminformatics.org/datasets/huuskonen/test1.smi

Training data

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# Training set: http://cheminformatics.org/datasets/huuskonen/train.smi
# Whitespace-separated columns; the value in column 4 (index 3) is the
# observed solubility used as the regression target -- presumably logS,
# per the Huuskonen paper cited above.  SMILES is the last column.
with open('data/train.smi', 'r') as train_file:
    train_file.readline()  # skip header line
    raw = [line.split() for line in train_file.readlines()]
    temp = []
    for d in raw:
        try:
            float(d[3])  # keep only rows with a numeric target value
            temp.append(d)
        except (ValueError, IndexError):
            # Malformed row (e.g. name fused with a number, see the
            # printed examples below) -- report it and skip.
            # (Original used a bare `except:`; narrowed to the two
            # exceptions the parse can actually raise.)
            print(d)
molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
observations = [float(d[3]) for d in temp]
# Drop molecules RDKit failed to parse (MolFromSmiles returns None).
data = [(x, y) for x, y in zip(molecules, observations) if x is not None]
['976', '?', '7-Chloropteridine-0.87', '?', '-0.29', 'c1c(Cl)nc2cncnc2n1']
['979', '?', '4-Mercaptopteridine-2.77', '?', '-0.24', 'c1cnc2cnc(S)nc2n1']
['980', '?', '7-Mercaptopteridine-2.71', '?', '-0.24', 'c1c(S)nc2cncnc2n1']

Test data

In [4]:
# Loading and processing the external test data set
# http://cheminformatics.org/datasets/huuskonen/test1.smi
# Same column layout as the training file: numeric target in column 4,
# SMILES in the last column.
with open('data/test1.smi', 'r') as test_file:
    test_file.readline()  # skip header line
    test_raw = [line.split() for line in test_file.readlines()]
    temp = []
    for d in test_raw:
        try:
            float(d[3])  # keep only rows with a numeric target value
            temp.append(d)
        except (ValueError, IndexError):
            # malformed row -- report it and skip
            print(d)

test_molecules = [Chem.MolFromSmiles(d[-1]) for d in temp]
test_observations = [float(d[3]) for d in temp]
# Drop molecules RDKit failed to parse.
test_data = [(x, y) for x, y in zip(test_molecules, test_observations) if x is not None]

# Compute every RDKit descriptor for each test molecule.  A molecule is
# dropped entirely if any single descriptor fails, so the test feature
# matrix stays rectangular and aligned with test_ys.
test_patterns = []
test_ys = []
for sample in test_data:
    pattern = []
    use = True
    for D in Descriptors.descList:  # (name, function) pairs
        if D[0] == 'MolecularFormula':
            continue  # non-numeric descriptor, excluded from the features
        try:
            pattern.append(float(D[1](sample[0])))
        except Exception:
            use = False
            print('Error computing descriptor %s for %s' % (D[0], sample[0]))
            break
    if use:
        test_patterns.append(pattern)
        test_ys.append(sample[1])
['978', '?', '2-Mercaptopteridine-2.36', '?', '-0.24', 'c1cnc2c(S)ncnc2n1']

Descriptor calculation

In [5]:
# Compute the training feature matrix with RDKit's bulk descriptor
# calculator, using the same descriptor list (minus MolecularFormula)
# as the per-descriptor loop above.
patterns = []
ys = []
descriptors = []

for D in Descriptors._descList:  # (name, function) pairs
    if D[0] == 'MolecularFormula':
        continue  # non-numeric descriptor, excluded from the features
    descriptors.append(D[0])
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
for sample in data:
    use = True
    try:
        pattern = calculator.CalcDescriptors(sample[0])
    except Exception as e:
        # BUG FIX: the original read `except e:` which would itself raise
        # NameError on any failure; also its message blamed a stale loop
        # variable (D[0]) instead of reporting the actual error.
        use = False
        print('Error computing descriptors for %s: %s' % (sample[0], e))
    if use:
        patterns.append(pattern)
        ys.append(sample[1])

Descriptive Modelling - Revealing patterns in the data

In [6]:
# Fit three models on the training set and show measured-vs-predicted
# parity plots side by side.
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

def setupParityAxes(ax, title):
    """Format an axes for a measured-vs-predicted logS parity plot."""
    ax.set_aspect('equal')
    ax.set_title(title)
    ax.set_xlabel('Measured logS')
    ax.set_ylabel('Predicted logS')
    ax.set_xlim(-12, 2)
    ax.set_ylim(-12, 2)

# simple linear regression
from sklearn import linear_model
simple_linearreg = linear_model.LinearRegression()
simple_linearreg.fit(patterns, ys)
simple_prediction = simple_linearreg.predict(patterns)
plt1.scatter(ys, simple_prediction)
setupParityAxes(plt1, 'Simple linear regression')

# unconstrained decision tree regression (expected to overfit: see the
# near-perfect training score below)
from sklearn import tree
simple_tree = tree.DecisionTreeRegressor(compute_importances=True)
simple_tree.fit(patterns, ys)
simple_treeprediction = simple_tree.predict(patterns)
plt2.scatter(ys, simple_treeprediction)
setupParityAxes(plt2, 'Standard Decision Tree')

# regularized decision tree regression
custom_tree = tree.DecisionTreeRegressor(max_depth=10, min_samples_split=50, compute_importances=True)
custom_tree.fit(patterns, ys)
custom_treeprediction = custom_tree.predict(patterns)
plt3.scatter(ys, custom_treeprediction)
setupParityAxes(plt3, 'Custom Decision Tree (regularized)')

from sklearn.metrics import mean_squared_error
print('Explained variance (Simple): %0.5f' % (simple_linearreg.score(patterns, ys)))
print('MSE (Simple): %0.5f' % (mean_squared_error(ys, simple_prediction)))
# BUG FIX: the next two lines were labelled 'Custom' in the original but
# actually report the unregularized standard tree.
print('Explained variance (Tree): %0.5f' % (simple_tree.score(patterns, ys)))
print('MSE (Tree): %0.5f' % (mean_squared_error(ys, simple_treeprediction)))
print('Explained variance (Custom): %0.5f' % (custom_tree.score(patterns, ys)))
print('MSE (Custom): %0.5f' % (mean_squared_error(ys, custom_treeprediction)))
Explained variance (Simple): 0.90063
MSE (Simple): 0.41525
Explained variance (Custom): 0.99998
MSE (Custom): 0.00007
Explained variance (Custom): 0.91630
MSE (Custom): 0.34978

Analyzing the models

In [7]:
# Visualize which descriptors each model leans on (top 10 by |weight|).
figure, (plt1, plt2, plt3) = pyplot.subplots(3, 1)
figure.set_size_inches(15, 5)
figure.tight_layout()
imp = simple_linearreg.coef_
createImportancePlot(plt1, descriptors, imp, "Most important descriptors in linear model")
imp2 = simple_tree.feature_importances_  # Gini importances
createImportancePlot(plt2, descriptors, imp2, "Most important descriptors in simple tree model")
imp3 = custom_tree.feature_importances_  # Gini importances
createImportancePlot(plt3, descriptors, imp3, "Most important descriptors in custom tree model")

# inspect logP contribution
# BUG FIX: import pearsonr from the public scipy.stats package; the
# scipy.stats.stats module is a deprecated private path.
from scipy.stats import pearsonr
ind = descriptors.index('MolLogP')
logP = [d[ind] for d in patterns]
fig, ax = pyplot.subplots(1, 1)
ax.scatter(logP, ys)
ax.set_title('rdkit:mollogP vs logS')
ax.set_xlabel('rdkit:mollogP')
ax.set_ylabel('Measured logS')
print('Pearson Correlation (MolLogP): %0.5f' % pearsonr(logP, ys)[0])
Pearson Correlation (MolLogP): -0.84915

Infer generally applicable models

In [8]:
# Evaluate the three models fit above on the external Huuskonen test set.
figure, (plt1, plt2, plt3) = pyplot.subplots(1, 3)
figure.set_size_inches(15, 15)

# simple linear regression
simple_ext_prediction = simple_linearreg.predict(test_patterns)
plt1.scatter(test_ys, simple_ext_prediction)
plt1.set_aspect('equal')
plt1.set_title('Simple linear regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# unconstrained decision tree regression
simple_tree_ext_prediction = simple_tree.predict(test_patterns)
plt2.scatter(test_ys, simple_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Standard Decision Tree')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

# regularized decision tree regression
custom_tree_ext_prediction = custom_tree.predict(test_patterns)
plt3.scatter(test_ys, custom_tree_ext_prediction)
plt3.set_aspect('equal')
plt3.set_title('Custom Decision Tree (regularized)')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('Predicted logS')
plt3.set_xlim(-12, 2)
plt3.set_ylim(-12, 2)

from sklearn.metrics import mean_squared_error
print('Explained variance (Simple): %0.5f' % (simple_linearreg.score(test_patterns, test_ys)))
print('MSE (Simple): %0.5f' % (mean_squared_error(test_ys, simple_ext_prediction)))
# BUG FIX: the next two lines were labelled 'Custom' in the original but
# actually report the unregularized standard tree.
print('Explained variance (Tree): %0.5f' % (simple_tree.score(test_patterns, test_ys)))
print('MSE (Tree): %0.5f' % (mean_squared_error(test_ys, simple_tree_ext_prediction)))
print('Explained variance (Custom): %0.5f' % (custom_tree.score(test_patterns, test_ys)))
print('MSE (Custom): %0.5f' % (mean_squared_error(test_ys, custom_tree_ext_prediction)))
Explained variance (Simple): 0.89059
MSE (Simple): 0.45049
Explained variance (Custom): 0.81740
MSE (Custom): 0.75182
Explained variance (Custom): 0.86516
MSE (Custom): 0.55519

Finding the optimal parameters

In [10]:
# Hyper-parameter optimization: exhaustive grid search with 10-fold CV,
# ranked by the estimator's default score (R^2 for regressors).
# NOTE(review): sklearn.cross_validation / sklearn.grid_search and the
# KFold(n=..., k=..., indices=...) signature are old scikit-learn APIs;
# later releases moved these to sklearn.model_selection -- confirm the
# installed version before porting.
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
# compute_importances is kept in the grid so feature_importances_ is
# available on the refit best estimator (old-sklearn behavior).
params = {'max_depth':[2,5,10,20],'min_samples_split':[2,8,32,128],'min_samples_leaf':[1,2,5,10],'compute_importances':[True]}
cv = KFold(n=len(patterns),k=10,indices=False,shuffle=True)
# refit=True: the best parameter set is re-fit on the full training data.
gs = GridSearchCV(simple_tree, params, cv=cv,verbose=1,refit=True)
gs.fit(patterns, ys)
print 'Best score: %0.2f'%gs.best_score_
print 'Training set performance using best parameters (%s)'%gs.best_params_
best_treemodel = gs.best_estimator_
figure,(plt1,plt2) = pyplot.subplots(1,2)
figure.set_size_inches(15,15)
#training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys,best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12,2)
plt1.set_ylim(-12,2)

#test set evaluation (external Huuskonen test set)
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys,best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12,2)
plt2.set_ylim(-12,2)

print 'Explained variance (Internal): %0.5f'%(best_treemodel.score(patterns,ys))
print 'MSE (Internal): %0.5f'%(mean_squared_error(ys,best_tree_int_prediction))
print 'Explained variance (External): %0.5f'%(best_treemodel.score(test_patterns,test_ys))
print 'MSE (External): %0.5f'%(mean_squared_error(test_ys,best_tree_ext_prediction))
fig,a = pyplot.subplots(1,1)
fig.set_size_inches(15,5)
createImportancePlot(a,descriptors,best_treemodel.feature_importances_,"Most important descriptors in the optimized tree")
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    4.1s
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:   13.8s
Best score: 0.85
Training set performance using best parameters ({'compute_importances': True, 'min_samples_split': 32, 'max_depth': 20, 'min_samples_leaf': 10})
Explained variance (Internal): 0.92493
MSE (Internal): 0.31371
Explained variance (External): 0.86615
MSE (External): 0.55110
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed:   22.9s finished
In [12]:
# Repeat the grid search, but rank parameter sets by mean squared error
# instead of the regressor's default R^2 score.
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
# BUG FIX: import from the public sklearn.metrics package; the original
# used the deprecated private module path sklearn.metrics.metrics.
from sklearn.metrics import mean_squared_error

params = {'max_depth': [2, 5, 10, 20],
          'min_samples_split': [2, 8, 32, 128],
          'min_samples_leaf': [1, 2, 5, 10],
          'compute_importances': [True]}
cv = KFold(n=len(patterns), k=10, indices=False, shuffle=True)
# loss_func selects by a lower-is-better criterion -- presumably why the
# reported best_score_ below is negative; confirm against the installed
# scikit-learn's GridSearchCV semantics.
gs = GridSearchCV(simple_tree, params, cv=cv, verbose=1, refit=True, loss_func=mean_squared_error)
gs.fit(patterns, ys)
print('Best score: %0.2f' % gs.best_score_)
print('Training set performance using best parameters (%s)' % gs.best_params_)
best_treemodel = gs.best_estimator_
figure, (plt1, plt2) = pyplot.subplots(1, 2)
figure.set_size_inches(15, 15)

# training set evaluation
best_tree_int_prediction = best_treemodel.predict(patterns)
plt1.scatter(ys, best_tree_int_prediction)
plt1.set_aspect('equal')
plt1.set_title('Optimized tree on training set')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12, 2)
plt1.set_ylim(-12, 2)

# test set evaluation (external Huuskonen test set)
best_tree_ext_prediction = best_treemodel.predict(test_patterns)
plt2.scatter(test_ys, best_tree_ext_prediction)
plt2.set_aspect('equal')
plt2.set_title('Optimized tree on test set')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('Predicted logS')
plt2.set_xlim(-12, 2)
plt2.set_ylim(-12, 2)

print('Explained variance (Internal): %0.5f' % (best_treemodel.score(patterns, ys)))
print('MSE (Internal): %0.5f' % (mean_squared_error(ys, best_tree_int_prediction)))
print('Explained variance (External): %0.5f' % (best_treemodel.score(test_patterns, test_ys)))
print('MSE (External): %0.5f' % (mean_squared_error(test_ys, best_tree_ext_prediction)))
fig, a = pyplot.subplots(1, 1)
fig.set_size_inches(15, 5)
createImportancePlot(a, descriptors, best_treemodel.feature_importances_, "Most important descriptors in the optimized tree")
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:    4.0s
[Parallel(n_jobs=1)]: Done 450 jobs       | elapsed:   13.7s
Best score: -0.59
Training set performance using best parameters ({'compute_importances': True, 'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 10})
Explained variance (Internal): 0.93597
MSE (Internal): 0.26756
Explained variance (External): 0.85485
MSE (External): 0.59763
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed:   22.8s finished

Feature extraction

In [13]:
# Standardize descriptors (zero mean, unit variance), fitting the scaler
# on the training set only and applying it to both sets.
# NOTE(review): Scaler is the old name for what later scikit-learn calls
# StandardScaler -- confirm the installed version before porting.
from sklearn.preprocessing import Scaler
scaler = Scaler().fit(patterns)
std_patterns = scaler.transform(patterns)
std_test_patterns = scaler.transform(test_patterns)

# Compute PCA, keeping the two most descriptive principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(std_patterns)
print 'rel. explained variance per pc',pca.explained_variance_ratio_
pc_patterns = pca.transform(std_patterns)
pc_test = pca.transform(std_test_patterns)
fig,(plt1,plt2) = pyplot.subplots(2,1)
fig.set_size_inches(15,5)
createImportancePlot(plt1,descriptors,pca.components_[0],"Most important descriptors in 1st principal component (Hypothesis: Size)")
createImportancePlot(plt2,descriptors,pca.components_[1],"Most important descriptors in 2nd principal component (Hypothesis: Electrostatic interactions)")

# Evaluate modelling performance of a linear model fit on the two PCs.
figure,(plt1,plt2,plt3) = pyplot.subplots(1,3)
figure.set_size_inches(15,5)

# training set: regression on both PCs, then each PC against measured logS
pc_linearreg = linear_model.LinearRegression()
pc_linearmodel = pc_linearreg.fit(pc_patterns,ys)
pc_prediction = pc_linearmodel.predict(pc_patterns)
plt1.scatter(ys,pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12,2)
plt1.set_ylim(-12,2)

plt2.scatter(ys, [pc[0] for pc in pc_patterns])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')


plt3.scatter(ys, [pc[1] for pc in pc_patterns])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')


# test set: same three plots on the external test data
figure,(plt1,plt2,plt3) = pyplot.subplots(1,3)
figure.set_size_inches(15,5)

pc_prediction = pc_linearmodel.predict(pc_test)
plt1.scatter(test_ys,pc_prediction)
plt1.set_aspect('equal')
plt1.set_title('Full model regression')
plt1.set_xlabel('Measured logS')
plt1.set_ylabel('Predicted logS')
plt1.set_xlim(-12,2)
plt1.set_ylim(-12,2)

plt2.scatter(test_ys, [pc[0] for pc in pc_test])
plt2.set_title('1st PC')
plt2.set_xlabel('Measured logS')
plt2.set_ylabel('1st PC')


plt3.scatter(test_ys, [pc[1] for pc in pc_test])
plt3.set_title('2nd PC')
plt3.set_xlabel('Measured logS')
plt3.set_ylabel('2nd PC')
rel. explained variance per pc [ 0.33812708  0.1139046 ]
Out[13]:
<matplotlib.text.Text at 0x105390950>