# Load the iris flower dataset bundled with scikit-learn.
from sklearn import datasets

iris = datasets.load_iris()

# Inspect the class labels and the measured features.
print(iris.target_names)
print(iris.feature_names)
# Expected output:
# ['setosa' 'versicolor' 'virginica']
# ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# First five samples: four measurements (cm) per flower.
print(iris.data[0:5])
# [[5.1 3.5 1.4 0.2] [4.9 3.  1.4 0.2] [4.7 3.2 1.3 0.2] [4.6 3.1 1.5 0.2] [5.  3.6 1.4 0.2]]

print(iris.target)
# 150 labels: 50 each of classes 0 (setosa), 1 (versicolor), 2 (virginica).

# Both data and target are NumPy arrays, so we convert to pandas below.
# Fix: the original checked type(iris.data) twice; the second check should
# be the target array.
print(type(iris.data), type(iris.target))
# (<class 'numpy.ndarray'>, <class 'numpy.ndarray'>)
import pandas as pd
import numpy as np

# Build one DataFrame holding the four features plus the class label.
# np.insert appends the target vector as a fifth column (axis=1).
data_np = np.array(iris.data)
target_np = np.array(iris.target)
combined = np.insert(data_np, 4, target_np, axis=1)
data = pd.DataFrame(
    combined,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width', 'species'],
)
# np.insert promoted everything to float; restore integer class labels.
data['species'] = data['species'].astype(int)
print(data.head())
# Expected output:
#    sepal length  sepal width  petal length  petal width  species
# 0           5.1          3.5           1.4          0.2        0
# 1           4.9          3.0           1.4          0.2        0
# 2           4.7          3.2           1.3          0.2        0
# 3           4.6          3.1           1.5          0.2        0
# 4           5.0          3.6           1.4          0.2        0
# Separate features (first four columns) and target (last column).
X = data.iloc[:, :4]
Y = data.iloc[:, -1]

# Split into training and test sets, 70/30; fixed random_state so the
# split (and hence the numbers below) is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

# n_estimators=100 means the forest contains 100 decision trees
# (one forest of 100 trees — not 100 forests).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Evaluate on the held-out 30%.
from sklearn import metrics
print(metrics.confusion_matrix(y_test, y_pred))
# e.g. [[15  0  0]
#       [ 0 15  1]
#       [ 0  2 12]]
print(metrics.accuracy_score(y_test, y_pred))
# ~0.9333

# Classify one new flower (sepal 7x2 cm, petal 6x2 cm) -> class 2 (virginica).
print(clf.predict([[7, 2, 6, 2]]))
# [2]
import pandas as pd

# Rank the four features by the forest's impurity-based importance scores.
feature_imp = pd.Series(
    clf.feature_importances_, index=iris.feature_names
).sort_values(ascending=False)
print(feature_imp)
# Petal width/length dominate; the sepal measurements contribute little, e.g.:
# petal width (cm)     0.472576
# petal length (cm)    0.436343
# sepal length (cm)    0.070123
# sepal width (cm)     0.020958

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: '%matplotlib inline' is a Jupyter-only magic and a SyntaxError in a
# plain script; plt.show() below renders the figure in either context.

sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title('visualizing Important Features')
plt.show()
# Retrain using only the two most important features (petal length/width),
# again with a 70/30 split and the same random_state for comparability.
from sklearn.model_selection import train_test_split
x = data[['petal length', 'petal width']]
y = data['species']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)

# Same model as before: one forest of 100 trees (n_estimators=100).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(metrics.confusion_matrix(y_test, y_pred))
# e.g. [[15  0  0]
#       [ 0 15  1]
#       [ 0  0 14]]
print(metrics.accuracy_score(y_test, y_pred))
# ~0.9778 — dropping the low-importance sepal features slightly improves accuracy.