%matplotlib inline
import sys
sys.path.append('..')
from preamble import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)
fig, axes = plt.subplots(2, 3, figsize=(20,10))
for i, (ax, tree) in enumerate(zip(axes.ravel(), forest.estimators_)):  # the individual fitted trees are stored in forest.estimators_
    ax.set_title("Tree {}".format(i))
    mglearn.plots.plot_tree_partition(X, y, tree, ax=ax)
mglearn.plots.plot_2d_separator(forest, X, fill=True, ax=axes[-1, -1], alpha=.4)
axes[-1, -1].set_title("Random Forest")
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
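# Quick check of this five-tree forest on the moons split above; a minimal
# sketch using the standard score method.
print("Train set accuracy: {:.3f}".format(forest.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(forest.score(X_test, y_test)))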
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
forest100 = RandomForestClassifier(n_estimators=100, random_state=0)
forest100.fit(X_train, y_train)
forest10 = RandomForestClassifier(n_estimators=10, random_state=0)
forest10.fit(X_train, y_train)
forest1000 = RandomForestClassifier(n_estimators=1000, random_state=0)
forest1000.fit(X_train, y_train)
print("n_estimator is 10 Train set accuracy: {:.3f}".format(forest10.score(X_train, y_train)))
print("n_estimator is 10 Test set accuracy: {:.3f}".format(forest10.score(X_test, y_test)))
print("n_estimator is 100 Train set accuracy: {:.3f}".format(forest100.score(X_train, y_train)))
print("n_estimator is 100 Test set accuracy: {:.3f}".format(forest100.score(X_test, y_test)))
print("n_estimator is 1000 Train set accuracy: {:.3f}".format(forest1000.score(X_train, y_train)))
print("n_estimator is 1000 Test set accuracy: {:.3f}".format(forest1000.score(X_test, y_test)))
n_estimators=10 Train set accuracy: 1.000
n_estimators=10 Test set accuracy: 0.951
n_estimators=100 Train set accuracy: 1.000
n_estimators=100 Test set accuracy: 0.972
n_estimators=1000 Train set accuracy: 1.000
n_estimators=1000 Test set accuracy: 0.972
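# The three classifiers above differ only in n_estimators, so the same
# comparison can be written as a sweep; a minimal sketch over the same split
# (value list chosen for illustration).
for n in [10, 100, 1000]:
    rf = RandomForestClassifier(n_estimators=n, random_state=0)
    rf.fit(X_train, y_train)
    print("n_estimators={:4d} Train: {:.3f} Test: {:.3f}".format(
        n, rf.score(X_train, y_train), rf.score(X_test, y_test)))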
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.ylim(-1, n_features)
plot_feature_importances_cancer(forest10)
plot_feature_importances_cancer(forest100)
plot_feature_importances_cancer(forest1000)
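# The bar charts are hard to compare by eye; since scikit-learn normalizes
# feature_importances_ to sum to 1, sorting gives a quick numeric ranking.
# A minimal sketch for forest100 (top-5 cutoff chosen for illustration).
import numpy as np
order = np.argsort(forest100.feature_importances_)[::-1]
for idx in order[:5]:
    print("{:<25} {:.3f}".format(cancer.feature_names[idx],
                                 forest100.feature_importances_[idx]))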
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
print("Train set Accuracy: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Test set Accuracy: {:.3f}".format(gbrt.score(X_test, y_test)))
Train set Accuracy: 1.000
Test set Accuracy: 0.958
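# The default 100-tree model fits the training set perfectly, so it is worth
# seeing how test accuracy evolves as trees are added; a minimal sketch using
# staged_predict on the gbrt model fit above.
from sklearn.metrics import accuracy_score
test_acc = [accuracy_score(y_test, y_pred)
            for y_pred in gbrt.staged_predict(X_test)]
plt.plot(range(1, len(test_acc) + 1), test_acc)
plt.xlabel("Number of trees")
plt.ylabel("Test set accuracy")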
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)  # limit tree depth to 1 for stronger pre-pruning
gbrt.fit(X_train, y_train)
print("Train set Accuracy: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Test set Accuracy: {:.3f}".format(gbrt.score(X_test, y_test)))
Train set Accuracy: 0.991
Test set Accuracy: 0.972
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01)  # lower the learning rate so each tree makes a weaker correction
gbrt.fit(X_train, y_train)
print("Train set Accuracy: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Test set Accuracy: {:.3f}".format(gbrt.score(X_test, y_test)))
Train set Accuracy: 0.988
Test set Accuracy: 0.965
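# max_depth, learning_rate, and n_estimators interact, so they are usually
# tuned together; a minimal grid-search sketch (parameter grid chosen for
# illustration, not an exhaustive search).
from sklearn.model_selection import GridSearchCV
param_grid = {"max_depth": [1, 3], "learning_rate": [0.01, 0.1]}
grid = GridSearchCV(GradientBoostingClassifier(random_state=0), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))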
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)
plot_feature_importances_cancer(gbrt)
gbrt = GradientBoostingClassifier(random_state=0, max_depth=10)
gbrt.fit(X_train, y_train)
plot_feature_importances_cancer(gbrt)
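# Gradient boosting often drives some feature importances to exactly zero,
# unlike the random forest; a minimal sketch counting unused features for
# the max_depth=10 model just fit.
import numpy as np
n_ignored = np.sum(gbrt.feature_importances_ == 0)
print("Features with zero importance: {} of {}".format(n_ignored, cancer.data.shape[1]))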