# %matplotlib inline  (IPython magic: render plots inline; valid only inside a notebook)
import matplotlib.pyplot as plt
import os
import numpy as np
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import RDConfig
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# This is new class for blending
# Custom blending (stacked-generalization) classifier from the local blending_classification module
from blending_classification import BlendingClassifier
# Directory containing the RDKit cookbook example data (solubility SDF files).
datadir = os.path.join(RDConfig.RDDocsDir, "Book/data")
# Record the RDKit version so this run is reproducible.
print(rdBase.rdkitVersion)
# Output: 2018.09.1
def mols2feat(mols, radius=2, fp_size=2048):
    """Convert RDKit molecules into dense Morgan-fingerprint feature vectors.

    Parameters
    ----------
    mols : iterable of rdkit.Chem.Mol
        Molecules to featurize.
    radius : int, optional
        Morgan fingerprint radius (default 2, matching the original
        hard-coded value).
    fp_size : int, optional
        Number of bits per fingerprint (default 2048, the RDKit default).

    Returns
    -------
    list of numpy.ndarray
        One dense vector per input molecule.
    """
    # Build the generator once and reuse it for every molecule.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius,
                                                          fpSize=fp_size)
    res = []
    for mol in mols:
        fp = generator.GetFingerprint(mol)
        # ConvertToNumpyArray resizes arr in place to the fingerprint length.
        arr = np.zeros(0,)
        DataStructs.ConvertToNumpyArray(fp, arr)
        res.append(arr)
    return res
# Load train/test molecules, skipping any records the SDF parser rejects.
train_mol = [m for m in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.train.sdf')) if m is not None]
test_mol = [m for m in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.test.sdf')) if m is not None]

# Map each solubility class label seen in the training set to an integer index.
cls = list({m.GetProp('SOL_classification') for m in train_mol})
cls_dic = {label: idx for idx, label in enumerate(cls)}

# Feature matrices (Morgan fingerprints) and integer label vectors per split.
train_X = np.array(mols2feat(train_mol))
train_y = np.array([cls_dic[m.GetProp('SOL_classification')] for m in train_mol])
test_X = np.array(mols2feat(test_mol))
test_y = np.array([cls_dic[m.GetProp('SOL_classification')] for m in test_mol])
# --- Level-1 base learners for the blending ensemble ---
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
gbc = GradientBoostingClassifier(learning_rate=0.01)
xgbc = XGBClassifier(n_estimators=100, n_jobs=-1)
# To use SVC, probability option must be True
# (probability=True enables predict_proba; note svc is constructed here
# but is NOT included in l1_clfs below)
svc = SVC(probability=True, gamma='auto')
l1_clfs = [rf, et, gbc, xgbc]
# Level-2 meta-learner trained on the level-1 classifiers' predictions.
l2_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
blendclf = BlendingClassifier(l1_clfs, l2_clf, verbose=1)
blendclf.fit(train_X, train_y)
pred_y = blendclf.predict(test_X)
# Output: Fitting 4 l1_classifiers... 3 classes classification; 5 hold-out folds fitted per level-1 classifier (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, XGBClassifier); --- Blending --- (1025, 4, 3)
# Per-class precision/recall/F1 of the blending ensemble on the test set.
print(classification_report(test_y, pred_y))
# Output: precision/recall/f1/support — class 0: 0.56/0.68/0.61 (40); class 1: 0.74/0.83/0.78 (102); class 2: 0.70/0.57/0.63 (115); micro avg 0.69; macro avg 0.67/0.69/0.68; weighted avg 0.70/0.69/0.69 (n=257)
# Visualize the blending ensemble's confusion matrix on the test set.
from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(test_y, pred_y)
plot_confusion_matrix(cm)
# Output: (<Figure size 432x288 with 1 Axes>, <matplotlib.axes._subplots.AxesSubplot at 0x1a191b8f98>)
# Baseline: a single random forest trained on the same features,
# to compare against the blending ensemble above.
mono_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
mono_rf.fit(train_X, train_y)
pred_y2 = mono_rf.predict(test_X)
print(classification_report(test_y, pred_y2))
# Output: precision/recall/f1/support — class 0: 0.71/0.75/0.73 (40); class 1: 0.81/0.81/0.81 (102); class 2: 0.75/0.73/0.74 (115); micro avg 0.77; macro avg 0.76; weighted avg 0.77 (n=257)
labels = ["rf", "et", "gbc", "xgbc", "mono_rf"]
feature_importances_list = [clf.feature_importances_ for clf in blendclf.l1_clfs_]
feature_importances_list.append(mono_rf.feature_importances_)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
res = pca.fit_transform(feature_importances_list)
res.shape
# Output: (5, 2)
from adjustText import adjust_text

# Scatter the five models in the 2-D PCA space of their feature importances.
x, y = res[:, 0], res[:, 1]
plt.plot(x, y, 'bo')
plt.xlim(-0.1, 0.3)
plt.ylim(-0.05, 0.1)
# One text label per model; adjust_text then nudges labels apart
# so they do not overlap.
texts = [plt.text(px, py, '{}'.format(name)) for px, py, name in zip(x, y, labels)]
adjust_text(texts)
# Output: 6  (number of label-position iterations returned by adjust_text)