#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import os
import numpy as np
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem import DataStructs
from rdkit.Chem import RDConfig
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# BlendingClassifier is a custom class for blending (stacked generalization)
from blending_classification import BlendingClassifier

datadir = os.path.join(RDConfig.RDDocsDir, "Book/data")
print(rdBase.rdkitVersion)


# In[2]:


def mols2feat(mols):
    """Convert molecules to Morgan fingerprint (radius 2) numpy arrays."""
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)
    res = []
    for mol in mols:
        fp = generator.GetFingerprint(mol)
        # ConvertToNumpyArray resizes arr in place to the fingerprint length
        arr = np.zeros((0,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        res.append(arr)
    return res


# In[3]:


# Load the training and test molecules, skipping any that fail to parse
train_mol = [mol for mol in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.train.sdf')) if mol is not None]
test_mol = [mol for mol in Chem.SDMolSupplier(os.path.join(datadir, 'solubility.test.sdf')) if mol is not None]

# Sort the class labels so the class -> index mapping is deterministic across runs
cls = sorted(set(mol.GetProp('SOL_classification') for mol in train_mol))
cls_dic = {cl: i for i, cl in enumerate(cls)}


# In[4]:


# Build the train/test feature matrices and label vectors
train_X = np.array(mols2feat(train_mol))
train_y = np.array([cls_dic[mol.GetProp('SOL_classification')] for mol in train_mol])
test_X = np.array(mols2feat(test_mol))
test_y = np.array([cls_dic[mol.GetProp('SOL_classification')] for mol in test_mol])


# In[5]:


rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
et = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
gbc = GradientBoostingClassifier(learning_rate=0.01)
xgbc = XGBClassifier(n_estimators=100, n_jobs=-1)
# To use SVC as a level-one model, probability must be True so that
# predict_proba is available (svc is defined here but not added to l1_clfs)
svc = SVC(probability=True, gamma='auto')

l1_clfs = [rf, et, gbc, xgbc]
l2_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)


# In[6]:


blendclf = BlendingClassifier(l1_clfs, l2_clf, verbose=1)


# In[7]:


blendclf.fit(train_X, train_y)
pred_y = blendclf.predict(test_X)


# In[8]:


print(classification_report(test_y, pred_y))


# In[9]:


from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(test_y, pred_y)
plot_confusion_matrix(cm)


# In[10]:


# Baseline: a single random forest trained on the same features
mono_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
mono_rf.fit(train_X, train_y)
pred_y2 = mono_rf.predict(test_X)
print(classification_report(test_y, pred_y2))


# ## Check the model correlation with PCA

# In[11]:


labels = ["rf", "et", "gbc", "xgbc", "mono_rf"]
feature_importances_list = [clf.feature_importances_ for clf in blendclf.l1_clfs_]
feature_importances_list.append(mono_rf.feature_importances_)


# In[12]:


from sklearn.decomposition import PCA
pca = PCA(n_components=2)
res = pca.fit_transform(feature_importances_list)


# In[13]:


print(res.shape)  # (5, 2): five models projected onto two principal components


# In[14]:


from adjustText import adjust_text
x, y = res[:, 0], res[:, 1]
plt.plot(x, y, 'bo')
plt.xlim(-0.1, 0.3)
plt.ylim(-0.05, 0.1)
texts = [plt.text(x[i], y[i], label) for i, label in enumerate(labels)]
adjust_text(texts)

# - The PCA plot indicates that the layer-one RF and ET models and the standalone mono_rf learned similar feature importances.
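# ## Appendix: a minimal sketch of BlendingClassifier
#
# - blending_classification.py is not included here, so the class below is
# only a sketch of how such a blending (hold-out stacking) classifier could
# be implemented, not the actual code. The constructor signature, the
# hypothetical holdout_ratio parameter, and the l1_clfs_ attribute are chosen
# to match how the class is used above: the level-one models are fit on part
# of the training data, and their predicted class probabilities on the
# held-out part become the features for the level-two model.

# In[ ]:


from sklearn.base import clone
from sklearn.model_selection import train_test_split


class BlendingClassifier:
    def __init__(self, l1_clfs, l2_clf, holdout_ratio=0.2, verbose=0):
        self.l1_clfs = l1_clfs
        self.l2_clf = l2_clf
        self.holdout_ratio = holdout_ratio  # hypothetical parameter
        self.verbose = verbose

    def fit(self, X, y):
        # The level-one models never see the hold-out split, so their
        # probabilities on it are unbiased meta-features for level two
        X_base, X_hold, y_base, y_hold = train_test_split(
            X, y, test_size=self.holdout_ratio, stratify=y, random_state=0)
        self.l1_clfs_ = []
        metas = []
        for clf in self.l1_clfs:
            fitted = clone(clf).fit(X_base, y_base)
            self.l1_clfs_.append(fitted)
            metas.append(fitted.predict_proba(X_hold))
            if self.verbose:
                print('fitted {}'.format(type(fitted).__name__))
        # Stack each model's class probabilities column-wise and train
        # the level-two classifier on them
        self.l2_clf_ = clone(self.l2_clf).fit(np.hstack(metas), y_hold)
        return self

    def _meta_features(self, X):
        return np.hstack([clf.predict_proba(X) for clf in self.l1_clfs_])

    def predict(self, X):
        return self.l2_clf_.predict(self._meta_features(X))

    def predict_proba(self, X):
        return self.l2_clf_.predict_proba(self._meta_features(X))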