from google.colab import drive
drive.mount('/data/')
data_dir = '/data/My Drive/Colab Notebooks/Experiment'
!ls '/data/My Drive/Colab Notebooks/Experiment'
!pip install matplotlib
Mounted at /data/
Iris.csv  m_data.csv  w_data.csv
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)
import pandas as pd
df = pd.read_csv(data_dir+'/Iris.csv')
df.head()
|   | Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species |
|---|----|---------------|--------------|---------------|--------------|---------|
| 0 | 1  | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 2  | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 3  | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4  | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5  | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             150 non-null    int64
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
df['Species'].value_counts()
Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: Species, dtype: int64
df.drop('Id', axis=1, inplace=True)
df
|     | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species |
|-----|---------------|--------------|---------------|--------------|---------|
| 0   | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1   | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2   | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3   | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4   | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |

150 rows × 5 columns
figure = df[df['Species']=='Iris-setosa'].plot.scatter(x='SepalLengthCm', y='SepalWidthCm', color='red', label='Setosa')
df[df['Species']=='Iris-versicolor'].plot.scatter(x='SepalLengthCm', y='SepalWidthCm', color='blue', label='Versicolor', ax=figure)
df[df['Species']=='Iris-virginica'].plot.scatter(x='SepalLengthCm', y='SepalWidthCm', color='green', label='Virginica', ax=figure)
figure.set_xlabel("Sepal Length")
figure.set_ylabel("Sepal Width")
figure.set_title("Sepal Length vs. Sepal Width")
figure=plt.gcf()
figure.set_size_inches(10, 6)
plt.show()
figure = df[df['Species']=='Iris-setosa'].plot.scatter(x='PetalLengthCm', y='PetalWidthCm', color='red', label='Setosa')
df[df['Species']=='Iris-versicolor'].plot.scatter(x='PetalLengthCm', y='PetalWidthCm', color='blue', label='Versicolor', ax=figure)
df[df['Species']=='Iris-virginica'].plot.scatter(x='PetalLengthCm', y='PetalWidthCm', color='green', label='Virginica', ax=figure)
figure.set_xlabel("Petal Length")
figure.set_ylabel("Petal Width")
figure.set_title("Petal Length vs. Petal Width")
figure=plt.gcf()
figure.set_size_inches(10, 6)
plt.show()
df.hist(edgecolor='green', linewidth=1.2)
figure=plt.gcf()
figure.set_size_inches(12, 6)
plt.show()
plt.figure(figsize=(12,6))
plt.subplot(2,2,1)
sns.violinplot(x='Species',y='SepalLengthCm',data=df)
plt.subplot(2,2,2)
sns.violinplot(x='Species',y='SepalWidthCm',data=df)
plt.subplot(2,2,3)
sns.violinplot(x='Species',y='PetalLengthCm',data=df)
plt.subplot(2,2,4)
sns.violinplot(x='Species',y='PetalWidthCm',data=df)
plt.show()
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
df.shape
(150, 5)
plt.figure(figsize=(7,4))
sns.heatmap(df.corr(),annot=True,cmap='cubehelix_r')
plt.show()
# Note: without a fixed random_state, this split (and the accuracies reported below) will vary between runs.
train, test = train_test_split(df, test_size=0.3)
print(train.shape)
print(test.shape)
(105, 5)
(45, 5)
train_x = train[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
train_y=train['Species']
test_x= test[['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']]
test_y =test['Species']
train_x.head()
|     | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm |
|-----|---------------|--------------|---------------|--------------|
| 74  | 6.4 | 2.9 | 4.3 | 1.3 |
| 129 | 7.2 | 3.0 | 5.8 | 1.6 |
| 16  | 5.4 | 3.9 | 1.3 | 0.4 |
| 124 | 6.7 | 3.3 | 5.7 | 2.1 |
| 19  | 5.1 | 3.8 | 1.5 | 0.3 |
train_y.head()
74     Iris-versicolor
129     Iris-virginica
16         Iris-setosa
124     Iris-virginica
19         Iris-setosa
Name: Species, dtype: object
model = svm.SVC()
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print('The accuracy of the SVM is:', metrics.accuracy_score(prediction, test_y))
The accuracy of the SVM is: 0.9555555555555556
model = LogisticRegression()
model.fit(train_x,train_y)
prediction=model.predict(test_x)
print('The accuracy of the Logistic Regression is',metrics.accuracy_score(prediction,test_y))
The accuracy of the Logistic Regression is 0.9555555555555556
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
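The warning means lbfgs did not fully converge within the default 100 iterations; the accuracy above is still usable, but the fit is not clean. A minimal sketch of the two remedies the warning itself suggests, raising max_iter or standardizing the features first (the value 1000 is an arbitrary illustrative choice, not from the original notebook):

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Option 1: allow the solver more iterations.
model = LogisticRegression(max_iter=1000)
model.fit(train_x, train_y)

# Option 2: standardize the features so the solver converges faster.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(train_x, train_y)
print('Accuracy with scaled features:', metrics.accuracy_score(model.predict(test_x), test_y))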
model = DecisionTreeClassifier()
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print('The accuracy of Decision Tree is:', metrics.accuracy_score(prediction, test_y))
The accuracy of Decision Tree is: 0.9555555555555556
model = KNeighborsClassifier(n_neighbors=3)
model.fit(train_x, train_y)
prediction = model.predict(test_x)
print('The accuracy of the KNeighbors is:', metrics.accuracy_score(prediction, test_y))
The accuracy of the KNeighbors is: 0.9777777777777777
# Evaluate KNN accuracy for k = 1 through 10
a_index = list(range(1, 11))
accuracies = []
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_x, train_y)
    prediction = model.predict(test_x)
    accuracies.append(metrics.accuracy_score(prediction, test_y))
plt.plot(a_index, accuracies)
plt.xticks(a_index)
plt.show()
petal=df[['PetalLengthCm','PetalWidthCm','Species']]
sepal=df[['SepalLengthCm','SepalWidthCm','Species']]
train_p,test_p=train_test_split(petal,test_size=0.3,random_state=0)
train_x_p=train_p[['PetalWidthCm','PetalLengthCm']]
train_y_p=train_p.Species
test_x_p=test_p[['PetalWidthCm','PetalLengthCm']]
test_y_p=test_p.Species
train_s,test_s=train_test_split(sepal,test_size=0.3,random_state=0)
train_x_s=train_s[['SepalWidthCm','SepalLengthCm']]
train_y_s=train_s.Species
test_x_s=test_s[['SepalWidthCm','SepalLengthCm']]
test_y_s=test_s.Species
model=svm.SVC()
model.fit(train_x_p,train_y_p)
prediction=model.predict(test_x_p)
print('The accuracy of the SVM using Petals is:',metrics.accuracy_score(prediction,test_y_p))
model=svm.SVC()
model.fit(train_x_s,train_y_s)
prediction=model.predict(test_x_s)
print('The accuracy of the SVM using Sepals is:',metrics.accuracy_score(prediction,test_y_s))
The accuracy of the SVM using Petals is: 0.9777777777777777
The accuracy of the SVM using Sepals is: 0.8
model = LogisticRegression()
model.fit(train_x_p,train_y_p)
prediction=model.predict(test_x_p)
print('The accuracy of the Logistic Regression using Petals is:',metrics.accuracy_score(prediction,test_y_p))
model.fit(train_x_s,train_y_s)
prediction=model.predict(test_x_s)
print('The accuracy of the Logistic Regression using Sepals is:',metrics.accuracy_score(prediction,test_y_s))
The accuracy of the Logistic Regression using Petals is: 0.9777777777777777
The accuracy of the Logistic Regression using Sepals is: 0.8222222222222222
model=DecisionTreeClassifier()
model.fit(train_x_p,train_y_p)
prediction=model.predict(test_x_p)
print('The accuracy of the Decision Tree using Petals is:',metrics.accuracy_score(prediction,test_y_p))
model.fit(train_x_s,train_y_s)
prediction=model.predict(test_x_s)
print('The accuracy of the Decision Tree using Sepals is:',metrics.accuracy_score(prediction,test_y_s))
The accuracy of the Decision Tree using Petals is: 0.9555555555555556
The accuracy of the Decision Tree using Sepals is: 0.6666666666666666
model=KNeighborsClassifier(n_neighbors=3)
model.fit(train_x_p,train_y_p)
prediction=model.predict(test_x_p)
print('The accuracy of the KNN using Petals is:',metrics.accuracy_score(prediction,test_y_p))
model.fit(train_x_s,train_y_s)
prediction=model.predict(test_x_s)
print('The accuracy of the KNN using Sepals is:',metrics.accuracy_score(prediction,test_y_s))
The accuracy of the KNN using Petals is: 0.9777777777777777
The accuracy of the KNN using Sepals is: 0.7333333333333333
Across all four classifiers, training on the petal measurements gives noticeably higher accuracy than training on the sepal measurements.
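To make that comparison easier to read, the petal and sepal accuracies above can be collected into one table. A minimal sketch reusing the splits and classifiers already defined; the helper name `evaluate`, the max_iter value, and the table layout are my own additions, not part of the original notebook:

def evaluate(model, tr_x, tr_y, te_x, te_y):
    # Fit on the training split and return accuracy on the test split.
    model.fit(tr_x, tr_y)
    return metrics.accuracy_score(model.predict(te_x), te_y)

models = {
    'SVM': svm.SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),  # higher max_iter to avoid the earlier convergence warning
    'Decision Tree': DecisionTreeClassifier(),
    'KNN (k=3)': KNeighborsClassifier(n_neighbors=3),
}

summary = pd.DataFrame({
    name: {
        'Petals': evaluate(m, train_x_p, train_y_p, test_x_p, test_y_p),
        'Sepals': evaluate(m, train_x_s, train_y_s, test_x_s, test_y_s),
    }
    for name, m in models.items()
}).T  # rows = models, columns = feature sets

print(summary)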