#!/usr/bin/env python # coding: utf-8 # 이 노트북의 코드에 대한 설명은 [분류기 체인: ClassifierChain](https://tensorflow.blog/2018/02/16/%EB%B6%84%EB%A5%98%EA%B8%B0-%EC%B2%B4%EC%9D%B8-classifierchain/) 글을 참고하세요. # In[1]: import matplotlib.pyplot as plt import numpy as np from sklearn.datasets import fetch_openml # fetch yeast dataset (https://www.openml.org/d/40597) # # 사이킷런 0.20 버전에서 `fetch_openml`이 추가되었습니다. `fetch_mldata`는 0.22 버전에서 삭제될 예정입니다. # In[2]: yeast = fetch_openml('yeast', version=4) # In[3]: X = yeast['data'] Y = yeast['target'] # In[4]: X.shape, Y.shape # In[5]: Y # In[6]: Y = Y == 'TRUE' # In[7]: Y # In[8]: from sklearn.model_selection import train_test_split X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42) # In[9]: from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier # for OVR model # In[10]: ovr = OneVsRestClassifier(LogisticRegression(solver='liblinear')) ovr.fit(X_train, Y_train) pred_ovr = ovr.predict(X_test) # In[11]: from sklearn.metrics import jaccard_score ovr_score = jaccard_score(Y_test, pred_ovr, average='micro') ovr_score # for CC model (https://www.cs.waikato.ac.nz/ml/publications/2009/chains.pdf) # In[12]: from sklearn.multioutput import ClassifierChain # In[13]: cc = ClassifierChain(LogisticRegression(solver='liblinear'), order='random', random_state=42) cc.fit(X_train, Y_train) pred_cc = cc.predict(X_test) cc_score = jaccard_score(Y_test, pred_cc, average='micro') cc_score # for CC ensemble # In[14]: chains = [ClassifierChain(LogisticRegression(solver='liblinear'), order='random', random_state=42+i) for i in range(10)] for chain in chains: chain.fit(X_train, Y_train) # In[15]: pred_chains = np.array([chain.predict(X_test) for chain in chains]) chain_scores = [jaccard_score(Y_test, pred_chain, average='micro') for pred_chain in pred_chains] chain_scores # In[16]: plt.bar(np.arange(11), [ovr_score]+chain_scores) plt.show() # In[17]: proba_chains = np.array([chain.predict_proba(X_test) for chain in chains]) proba_ensemble = proba_chains.mean(axis=0) ensemble_score = jaccard_score(Y_test, proba_ensemble >= 0.5, average='micro') ensemble_score # In[18]: plt.bar(np.arange(12), [ovr_score]+chain_scores+[ensemble_score]) plt.show()