In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import RDConfig
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# This is a new class for blending (a local module, not part of scikit-learn)
from blending_classification import BlendingClassifier

datadir = os.path.join(RDConfig.RDDocsDir, "Book/data")
print(rdBase.rdkitVersion)
2018.09.1
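The BlendingClassifier imported above comes from a local blending_classification module that is not shown in this post. As a rough orientation only, a minimal sketch of such a two-layer blending classifier might look like the following; it assumes that out-of-fold class probabilities from the level-one models are used as features for the level-two model (consistent with the fold-by-fold log and the (1025, 4, 3) array printed further below), and the class name SimpleBlendingClassifier and all details are illustrative, not the author's implementation.

# Illustrative sketch, not the author's blending_classification module:
# level-one classifiers produce out-of-fold class probabilities via K-fold CV,
# and a level-two classifier is trained on those probabilities.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

class SimpleBlendingClassifier:
    def __init__(self, l1_clfs, l2_clf, n_folds=5, verbose=0):
        self.l1_clfs = l1_clfs
        self.l2_clf = l2_clf
        self.n_folds = n_folds
        self.verbose = verbose

    def fit(self, X, y):
        classes = np.unique(y)
        # out-of-fold probabilities: (n_samples, n_l1_clfs, n_classes)
        oof = np.zeros((X.shape[0], len(self.l1_clfs), len(classes)))
        skf = StratifiedKFold(n_splits=self.n_folds)
        self.l1_clfs_ = []
        for i, clf in enumerate(self.l1_clfs):
            for train_idx, valid_idx in skf.split(X, y):
                fold_clf = clone(clf).fit(X[train_idx], y[train_idx])
                oof[valid_idx, i, :] = fold_clf.predict_proba(X[valid_idx])
            # refit each level-one classifier on the full training set for prediction time
            self.l1_clfs_.append(clone(clf).fit(X, y))
        if self.verbose:
            print(oof.shape)
        # the level-two classifier learns from the flattened probability block
        self.l2_clf.fit(oof.reshape(X.shape[0], -1), y)
        return self

    def predict(self, X):
        probs = np.stack([clf.predict_proba(X) for clf in self.l1_clfs_], axis=1)
        return self.l2_clf.predict(probs.reshape(X.shape[0], -1))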
In [2]:
def mols2feat(mols):
    # Convert each molecule to a Morgan (radius-2, ECFP4-like) bit fingerprint as a numpy array
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)
    res = []
    for mol in mols:
        fp = generator.GetFingerprint(mol)
        arr = np.zeros(0,)
        DataStructs.ConvertToNumpyArray(fp, arr)
        res.append(arr)
    return res
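For example (an added illustration, not in the original notebook), the generator returns 2048-bit fingerprints by default, so each molecule becomes a length-2048 vector:

# Quick check on a single molecule (hypothetical example)
mol = Chem.MolFromSmiles('c1ccccc1O')   # phenol
print(mols2feat([mol])[0].shape)        # (2048,) with the generator's default fpSize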
In [3]:
# load train and test data
train_mol = [mol for mol in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.train.sdf')) if mol is not None]
test_mol = [mol for mol in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.test.sdf')) if mol is not None]

cls = list(set([mol.GetProp('SOL_classification') for mol in train_mol]))
cls_dic = {}
for i, cl in enumerate(cls):
    cls_dic[cl] = i
In [4]:
# make train X, y and test X, y
train_X = np.array(mols2feat(train_mol))
train_y = np.array([cls_dic[mol.GetProp('SOL_classification')] for mol in train_mol])

test_X = np.array(mols2feat(test_mol))
test_y = np.array([cls_dic[mol.GetProp('SOL_classification')] for mol in test_mol])
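A quick sanity check of the feature matrices (an added illustration; the 1025-row count matches the shape reported in the blending classifier's verbose output below, and 2048 is the generator's default bit size):

# Expected roughly (1025, 2048) / (1025,) for train and (257, 2048) / (257,) for test
print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)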
In [5]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
gbc = GradientBoostingClassifier(learning_rate=0.01)
xgbc = XGBClassifier(n_estimators=100, n_jobs=-1) 
# To get class probabilities from SVC, the probability option must be True
# (note: svc is defined here but not included in l1_clfs below)
svc = SVC(probability=True, gamma='auto')

l1_clfs = [rf, et, gbc, xgbc]
l2_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
In [6]:
blendclf = BlendingClassifier(l1_clfs, l2_clf, verbose=1)
In [7]:
blendclf.fit(train_X, train_y)
pred_y = blendclf.predict(test_X)
4
Fitting 4 l1_classifiers...
3 classes classification
1-1th hold, <class 'sklearn.ensemble.forest.RandomForestClassifier'> classifier
1-2th hold, <class 'sklearn.ensemble.forest.RandomForestClassifier'> classifier
1-3th hold, <class 'sklearn.ensemble.forest.RandomForestClassifier'> classifier
1-4th hold, <class 'sklearn.ensemble.forest.RandomForestClassifier'> classifier
1-5th hold, <class 'sklearn.ensemble.forest.RandomForestClassifier'> classifier
2-1th hold, <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> classifier
2-2th hold, <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> classifier
2-3th hold, <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> classifier
2-4th hold, <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> classifier
2-5th hold, <class 'sklearn.ensemble.forest.ExtraTreesClassifier'> classifier
3-1th hold, <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> classifier
3-2th hold, <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> classifier
3-3th hold, <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> classifier
3-4th hold, <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> classifier
3-5th hold, <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'> classifier
4-1th hold, <class 'xgboost.sklearn.XGBClassifier'> classifier
4-2th hold, <class 'xgboost.sklearn.XGBClassifier'> classifier
4-3th hold, <class 'xgboost.sklearn.XGBClassifier'> classifier
4-4th hold, <class 'xgboost.sklearn.XGBClassifier'> classifier
4-5th hold, <class 'xgboost.sklearn.XGBClassifier'> classifier
--- Blending ---
(1025, 4, 3)
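The (1025, 4, 3) array printed above is the block of level-one out-of-fold probabilities: 1025 training molecules × 4 level-one classifiers × 3 solubility classes. Assuming the block is flattened before the level-two fit (as in the sketch earlier), the level-two random forest sees 12 probability features per molecule:

# Illustration only: 4 classifiers x 3 classes -> 12 meta-features per molecule
l1_probs = np.zeros((1025, 4, 3))         # same shape as the verbose output above
print(l1_probs.reshape(1025, -1).shape)   # (1025, 12)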
In [8]:
print(classification_report(test_y, pred_y))
              precision    recall  f1-score   support

           0       0.56      0.68      0.61        40
           1       0.74      0.83      0.78       102
           2       0.70      0.57      0.63       115

   micro avg       0.69      0.69      0.69       257
   macro avg       0.67      0.69      0.68       257
weighted avg       0.70      0.69      0.69       257

In [9]:
from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(test_y, pred_y)
plot_confusion_matrix(cm)
Out[9]:
(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a191b8f98>)
In [10]:
mono_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
mono_rf.fit(train_X, train_y)
pred_y2 = mono_rf.predict(test_X) 
print(classification_report(test_y, pred_y2))
              precision    recall  f1-score   support

           0       0.71      0.75      0.73        40
           1       0.81      0.81      0.81       102
           2       0.75      0.73      0.74       115

   micro avg       0.77      0.77      0.77       257
   macro avg       0.76      0.76      0.76       257
weighted avg       0.77      0.77      0.77       257

Interestingly, the single random forest outperforms the blended model on this test set (micro-avg F1 0.77 vs. 0.69). Next, check how similar the models are by running PCA on their learned feature importances.

In [11]:
labels = ["rf", "et", "gbc", "xgbc", "mono_rf"]
feature_importances_list = [clf.feature_importances_ for clf in blendclf.l1_clfs_]
feature_importances_list.append(mono_rf.feature_importances_)
In [12]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
res = pca.fit_transform(feature_importances_list)
In [13]:
res.shape
Out[13]:
(5, 2)
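How much of the variance in the importance vectors the two components retain is not shown in the original notebook, but could be checked with:

# Fraction of variance captured by each of the two principal components
print(pca.explained_variance_ratio_)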
In [14]:
from adjustText import adjust_text
x, y = res[:,0], res[:,1]
plt.plot(x, y, 'bo')
plt.xlim(-0.1, 0.3)
plt.ylim(-0.05, 0.1)

texts = [plt.text(x[i], y[i], '{}'.format(labels[i])) for i in range(len(labels))]
adjust_text(texts)
Out[14]:
6
  • The PCA plot indicates that RF and ET in layer one and the mono_rf model learned similar feature importances.
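As a complementary check (an addition, not in the original notebook), the pairwise similarity of the models can also be quantified directly with a correlation matrix over the per-bit importance vectors:

# Pairwise Pearson correlation of the five feature-importance vectors
corr = np.corrcoef(np.array(feature_importances_list))
for label, row in zip(labels, corr):
    print(label, np.round(row, 2))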