%matplotlib inline
import sys
sys.path.append('..')
from preamble import *
# Draw mglearn's example decision-tree figure (presumably the
# animal-classification tree, going by the helper's name -- confirm).
mglearn.plots.plot_animal_tree()
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the breast-cancer dataset and hold out a test split, stratified on
# the target, with a fixed seed.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=42,
)

# Fit a tree without passing any depth limit, then report accuracy on the
# training and test splits.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("훈련 세트 정확도: {:.3f}".format(train_accuracy))
print("테스트 세트 정확도: {:.3f}".format(test_accuracy))
훈련 세트 정확도: 1.000 테스트 세트 정확도: 0.937
# Refit on the same split with the tree depth capped at 4, then report
# accuracy on both splits.
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
for template, features, labels in (
    ("훈련 세트 정확도: {:.3f}", X_train, y_train),
    ("테스트 세트 정확도: {:.3f}", X_test, y_test),
):
    print(template.format(tree.score(features, labels)))
훈련 세트 정확도: 0.988 테스트 세트 정확도: 0.951
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz .dot file, labelling the two classes
# and the features, colouring the nodes and omitting the impurity values.
export_graphviz(
    tree,
    out_file="tree.dot",
    class_names=["악성", "양성"],
    feature_names=cancer.feature_names,
    impurity=False,
    filled=True,
)

# Visualise the exported tree with graphviz.
import graphviz

# Read the file back explicitly as UTF-8: it contains the non-ASCII class
# names above, and open() otherwise falls back to the platform's default
# encoding, which may be unable to decode them.
with open("tree.dot", encoding="utf-8") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))
# Print the feature_importances_ array of the current `tree` (at this point
# the max_depth=4 classifier fitted above).
print("특성 중요도:\n{}".format(tree.feature_importances_))
특성 중요도: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01 0.048 0. 0. 0.002 0. 0. 0. 0. 0. 0.727 0.046 0. 0. 0.014 0. 0.018 0.122 0.012 0. ]
def plot_feature_importances_cancer(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    One bar is drawn per column of the module-level ``cancer.data`` and
    labelled with the matching entry of ``cancer.feature_names``, so the
    model is expected to have been fitted on that dataset.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` attribute.
    """
    feature_count = cancer.data.shape[1]
    bar_positions = np.arange(feature_count)
    plt.barh(bar_positions, model.feature_importances_, align='center')
    plt.yticks(bar_positions, cancer.feature_names)
    plt.xlabel("feature importances")
    plt.ylabel("feature")
    # Leave one unit of padding below the first bar and above the last one.
    plt.ylim(-1, feature_count)
# Plot the importances of the current `tree` (the max_depth=4 classifier at
# this point in the script).
plot_feature_importances_cancer(tree)
# NOTE: this rebinds `tree` to the helper's return value, replacing the
# classifier used above. Presumably a displayable graph object -- confirm
# against mglearn.
tree = mglearn.plots.plot_tree_not_monotone()
display(tree)
Feature importances: [ 0. 1.]
import pandas as pd
import os
# Load the RAM price history that ships in mglearn's data directory.
ram_price_csv = os.path.join(mglearn.datasets.DATA_PATH, "ram_price.csv")
ram_prices = pd.read_csv(ram_price_csv)

# Plot price against date with a logarithmic y-axis.
plt.semilogy(ram_prices["date"], ram_prices["price"])
plt.xlabel("year")
plt.ylabel("price ($/Mbyte)")
Text(0,0.5,'price ($/Mbyte)')
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
# Use the data before 2000 for training and the data from 2000 onward for
# testing.
data_train = ram_prices[ram_prices.date < 2000]
data_test = ram_prices[ram_prices.date >= 2000]

# The date is the only input feature. Convert the Series to a NumPy array
# before adding the feature axis: indexing a pandas Series directly with
# [:, np.newaxis] is multi-dimensional indexing, which current pandas
# releases no longer support.
X_train = data_train.date.to_numpy()[:, np.newaxis]
# Fit on log(price) to simplify the relationship between data and target.
y_train = np.log(data_train.price)

tree = DecisionTreeRegressor().fit(X_train, y_train)
linear_reg = LinearRegression().fit(X_train, y_train)

# Predict over the whole period, training and test years alike.
X_all = ram_prices.date.to_numpy()[:, np.newaxis]
pred_tree = tree.predict(X_all)
pred_lr = linear_reg.predict(X_all)

# Undo the log transform so the predictions are prices again.
price_tree = np.exp(pred_tree)
price_lr = np.exp(pred_lr)

plt.semilogy(data_train.date, data_train.price, label="Train Data")
plt.semilogy(data_test.date, data_test.price, label="Test Data")
plt.semilogy(ram_prices.date, price_tree, label="Tree Prediction")
plt.semilogy(ram_prices.date, price_lr, label="Linear Regression Prediction")
plt.legend()
<matplotlib.legend.Legend at 0x1c1a6659e8>