# Load the iris flower dataset bundled with scikit-learn.
from sklearn import datasets

iris = datasets.load_iris()

# Inspect the class labels and the measured features.
print(iris.target_names)
print(iris.feature_names)
# Expected output:
# ['setosa' 'versicolor' 'virginica']
# ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# First five samples: four measurements (cm) per flower.
print(iris.data[0:5])
# [[5.1 3.5 1.4 0.2] [4.9 3.  1.4 0.2] [4.7 3.2 1.3 0.2] [4.6 3.1 1.5 0.2] [5.  3.6 1.4 0.2]]

print(iris.target)
# 150 labels: 50 each of classes 0 (setosa), 1 (versicolor), 2 (virginica).

# Both data and target are NumPy arrays, so we convert to pandas below.
# Fix: the original checked type(iris.data) twice; the second check should
# be the target array.
print(type(iris.data), type(iris.target))
# (<class 'numpy.ndarray'>, <class 'numpy.ndarray'>)
import pandas as pd
import numpy as np

# Build one DataFrame holding the four features plus the class label.
# np.insert appends the target vector as a fifth column (axis=1).
data_np = np.array(iris.data)
target_np = np.array(iris.target)
combined = np.insert(data_np, 4, target_np, axis=1)
data = pd.DataFrame(
    combined,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width', 'species'],
)
# np.insert promoted everything to float; restore integer class labels.
data['species'] = data['species'].astype(int)
print(data.head())
# Expected output:
#    sepal length  sepal width  petal length  petal width  species
# 0           5.1          3.5           1.4          0.2        0
# 1           4.9          3.0           1.4          0.2        0
# 2           4.7          3.2           1.3          0.2        0
# 3           4.6          3.1           1.5          0.2        0
# 4           5.0          3.6           1.4          0.2        0
# Separate features (first four columns) and target (last column).
X = data.iloc[:, :4]
Y = data.iloc[:, -1]

# Split into training and test sets, 70/30; fixed random_state so the
# split (and hence the numbers below) is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

# n_estimators=100 means the forest contains 100 decision trees
# (one forest of 100 trees — not 100 forests).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Evaluate on the held-out 30%.
from sklearn import metrics
print(metrics.confusion_matrix(y_test, y_pred))
# e.g. [[15  0  0]
#       [ 0 15  1]
#       [ 0  2 12]]
print(metrics.accuracy_score(y_test, y_pred))
# ~0.9333

# Classify one new flower (sepal 7x2 cm, petal 6x2 cm) -> class 2 (virginica).
print(clf.predict([[7, 2, 6, 2]]))
# [2]
import pandas as pd

# Rank the four features by the forest's impurity-based importance scores.
feature_imp = pd.Series(
    clf.feature_importances_, index=iris.feature_names
).sort_values(ascending=False)
print(feature_imp)
# Petal width/length dominate; the sepal measurements contribute little, e.g.:
# petal width (cm)     0.472576
# petal length (cm)    0.436343
# sepal length (cm)    0.070123
# sepal width (cm)     0.020958

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: '%matplotlib inline' is a Jupyter-only magic and a SyntaxError in a
# plain script; plt.show() below renders the figure in either context.

sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title('visualizing Important Features')
plt.show()
# Retrain using only the two most important features (petal length/width),
# again with a 70/30 split and the same random_state for comparability.
from sklearn.model_selection import train_test_split
x = data[['petal length', 'petal width']]
y = data['species']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=5)

# Same model as before: one forest of 100 trees (n_estimators=100).
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(metrics.confusion_matrix(y_test, y_pred))
# e.g. [[15  0  0]
#       [ 0 15  1]
#       [ 0  0 14]]
print(metrics.accuracy_score(y_test, y_pred))
# ~0.9778 — dropping the low-importance sepal features slightly improves accuracy.