import pandas as pd
import numpy as np
abalone = pd.read_csv("abalone.csv")
abalone.head(5)
 | Sex | Length | Diameter | Height | Whole weight | Shucked weight | Viscera weight | Shell weight | Rings
---|---|---|---|---|---|---|---|---|---
0 | M | 0.455 | 0.365 | 0.095 | 0.5140 | 0.2245 | 0.1010 | 0.150 | 15
1 | M | 0.350 | 0.265 | 0.090 | 0.2255 | 0.0995 | 0.0485 | 0.070 | 7
2 | F | 0.530 | 0.420 | 0.135 | 0.6770 | 0.2565 | 0.1415 | 0.210 | 9
3 | M | 0.440 | 0.365 | 0.125 | 0.5160 | 0.2155 | 0.1140 | 0.155 | 10
4 | I | 0.330 | 0.255 | 0.080 | 0.2050 | 0.0895 | 0.0395 | 0.055 | 7
abalone.shape
(4177, 9)
abalone.isnull().sum()
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64
# Split the dataset into training data and test data.
# Training data is the first 75% of the rows (3,132); test data is the remaining 25%.
abalone_new = abalone.copy()
abalone_new['Rings'] = abalone_new['Rings'].astype(float)
abalone_train = abalone_new[0:3132]
abalone_test = abalone_new[3132:]
abalone_new['Rings'].describe()
count    4177.000000
mean        9.933684
std         3.224169
min         1.000000
25%         8.000000
50%         9.000000
75%        11.000000
max        29.000000
Name: Rings, dtype: float64
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
def knn_train_test(train_col, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_abalone = df.reindex(shuffled_index)
    # Select the first 75% of rows (3,132) as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    # Fit a KNN model using the default k value.
    knn.fit(train_abalone[[train_col]], train_abalone[target_col])
    # Make predictions using the model.
    predicted_labels = knn.predict(test_abalone[[train_col]])
    # Calculate and return the RMSE.
    mse = mean_squared_error(test_abalone[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse
print("RMSE of 'Length':", knn_train_test('Length','Rings', abalone))
print("RMSE of 'Diameter':", knn_train_test('Diameter','Rings', abalone))
print("RMSE of 'Height':", knn_train_test('Height','Rings', abalone))
print("RMSE of 'Whole weight':", knn_train_test('Whole weight','Rings', abalone))
print("RMSE of 'Shucked weight':", knn_train_test('Shucked weight','Rings', abalone))
print("RMSE of 'Viscera weight':", knn_train_test('Viscera weight','Rings', abalone))
print("RMSE of 'Shell weight':", knn_train_test('Shell weight','Rings', abalone))
RMSE of 'Length': 3.1153455651710007
RMSE of 'Diameter': 2.8890289092145673
RMSE of 'Height': 2.7378823933836163
RMSE of 'Whole weight': 2.911807344171207
RMSE of 'Shucked weight': 3.078637615793387
RMSE of 'Viscera weight': 2.8558208835044763
RMSE of 'Shell weight': 2.5934699004233526
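# Optional sketch (not in the original notebook): the repeated print calls above can be
# condensed into a single loop over the feature names, sorted from best (lowest RMSE) to worst.
# Assumes the knn_train_test function and abalone data frame defined above.
feature_cols = ['Length', 'Diameter', 'Height', 'Whole weight',
                'Shucked weight', 'Viscera weight', 'Shell weight']
rmse_by_feature = {col: knn_train_test(col, 'Rings', abalone) for col in feature_cols}
for col, rmse in sorted(rmse_by_feature.items(), key=lambda pair: pair[1]):
    print(f"RMSE of '{col}': {rmse}")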
# Two-feature model
def knn_train_test(train_col_1, train_col_2, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_abalone = df.reindex(shuffled_index)
    # Select the first 75% of rows (3,132) as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    # Fit a KNN model using the default k value.
    knn.fit(train_abalone[[train_col_1, train_col_2]], train_abalone[target_col])
    # Make predictions using the model.
    predicted_labels = knn.predict(test_abalone[[train_col_1, train_col_2]])
    # Calculate and return the RMSE.
    mse = mean_squared_error(test_abalone[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse
## Try pairwise combinations of the four strongest single features and look for the lowest root mean squared error
print("RMSE of 'Shell weight' and 'Height':", knn_train_test('Shell weight','Height','Rings', abalone))
print("RMSE of 'Shell weight' and 'Viscera weight':", knn_train_test('Shell weight', 'Viscera weight','Rings', abalone))
print("RMSE of 'Shell weight' and 'Diameter':", knn_train_test('Shell weight','Diameter','Rings', abalone))
print("RMSE of 'Height' and 'Viscera weight':", knn_train_test('Height','Viscera weight','Rings', abalone))
print("RMSE of 'Height' and 'Diameter':", knn_train_test('Height','Diameter','Rings', abalone))
print("RMSE of 'Viscera weight' and 'Diameter':", knn_train_test('Viscera weight','Diameter','Rings', abalone))
RMSE of 'Shell weight' and 'Height': 2.6617258853975208
RMSE of 'Shell weight' and 'Viscera weight': 2.658099475747006
RMSE of 'Shell weight' and 'Diameter': 2.4966733848124028
RMSE of 'Height' and 'Viscera weight': 2.764297327155431
RMSE of 'Height' and 'Diameter': 2.7920598285359866
RMSE of 'Viscera weight' and 'Diameter': 2.888850038588806
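# Optional sketch (not in the original notebook): instead of hand-picking the pairs,
# itertools.combinations can generate every pair from the four strongest single features.
# Assumes the two-feature knn_train_test defined above; the best_features list is an
# assumption drawn from the univariate results.
from itertools import combinations

best_features = ['Shell weight', 'Height', 'Viscera weight', 'Diameter']
for col_1, col_2 in combinations(best_features, 2):
    rmse = knn_train_test(col_1, col_2, 'Rings', abalone)
    print(f"RMSE of '{col_1}' and '{col_2}': {rmse}")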
# Three-feature model
def knn_train_test(train_col_1, train_col_2, train_col_3, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_abalone = df.reindex(shuffled_index)
    # Select the first 75% of rows (3,132) as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    # Fit a KNN model using the default k value.
    knn.fit(train_abalone[[train_col_1, train_col_2, train_col_3]], train_abalone[target_col])
    # Make predictions using the model.
    predicted_labels = knn.predict(test_abalone[[train_col_1, train_col_2, train_col_3]])
    # Calculate and return the RMSE.
    mse = mean_squared_error(test_abalone[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse
print("RMSE of 'Shell weight', 'Diameter', and 'Height':", knn_train_test('Shell weight','Diameter','Height','Rings', abalone))
print("RMSE of 'Shell weight', 'Diameter', and 'Viscera weight':", knn_train_test('Shell weight','Diameter', 'Viscera weight','Rings', abalone))
RMSE of 'Shell weight', 'Diameter', and 'Height': 2.5050456260349323
RMSE of 'Shell weight', 'Diameter', and 'Viscera weight': 2.482150634094339
# Four-feature model
def knn_train_test(train_col_1, train_col_2, train_col_3, train_col_4, target_col, df):
    knn = KNeighborsRegressor()
    np.random.seed(1)
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(df.index)
    rand_abalone = df.reindex(shuffled_index)
    # Select the first 75% of rows (3,132) as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    # Fit a KNN model using the default k value.
    knn.fit(train_abalone[[train_col_1, train_col_2, train_col_3, train_col_4]], train_abalone[target_col])
    # Make predictions using the model.
    predicted_labels = knn.predict(test_abalone[[train_col_1, train_col_2, train_col_3, train_col_4]])
    # Calculate and return the RMSE.
    mse = mean_squared_error(test_abalone[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse
print("RMSE of 'Shell weight', 'Diameter', 'Viscera weight', and 'Height':", knn_train_test('Shell weight','Diameter', 'Viscera weight', 'Height','Rings', abalone))
RMSE of 'Shell weight', 'Diameter', 'Viscera weight', and 'Height': 2.493912204975228
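# Optional sketch (not in the original notebook): rather than redefining knn_train_test for
# each feature count, a single version that accepts a list of columns covers the one- to
# four-feature cases under the same 75/25 split and default k. The knn_train_test_multi
# name is illustrative.
def knn_train_test_multi(train_cols, target_col, df):
    """Fit a default KNN regressor on any list of feature columns and return the test RMSE."""
    knn = KNeighborsRegressor()
    np.random.seed(1)
    # Same shuffled 75%/25% split as above.
    shuffled_index = np.random.permutation(df.index)
    rand_abalone = df.reindex(shuffled_index)
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    knn.fit(train_abalone[train_cols], train_abalone[target_col])
    predicted_labels = knn.predict(test_abalone[train_cols])
    return np.sqrt(mean_squared_error(test_abalone[target_col], predicted_labels))

# Example: reproduces the four-feature result above.
print(knn_train_test_multi(['Shell weight', 'Diameter', 'Viscera weight', 'Height'], 'Rings', abalone))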
import matplotlib.pyplot as plt
%matplotlib inline
features = ['Shell weight', 'Diameter', 'Viscera weight']
hyper_params = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
mse_values1 = list()
for item in hyper_params:
    np.random.seed(1)
    abalone_new = abalone.copy()
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(abalone_new.index)
    rand_abalone = abalone_new.reindex(shuffled_index)
    # Select the first 75% of rows as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    knn = KNeighborsRegressor(n_neighbors=item, algorithm='brute')
    knn.fit(train_abalone[features], train_abalone['Rings'])
    predictions = knn.predict(test_abalone[features])
    mse = mean_squared_error(test_abalone['Rings'], predictions)
    mse_values1.append(mse)
plt.scatter(hyper_params, mse_values1)
plt.title('MSE by number of neighbors (k), three-feature model')
plt.xlabel('k neighbors')
plt.ylabel('MSE')
plt.show()
print(mse_values1)
[10.964593301435407, 7.832057416267943, 6.8990962254120145, 6.589055023923445, 6.1885167464114845, 6.075544922913345, 6.047944536666341, 5.994617224880383, 5.870057298127474, 5.840124401913876, 5.8631104432757315, 5.860041201488569, 5.822332323546898, 5.776721023337565, 5.721730994152046, 5.716234300239234, 5.695736825549246, 5.712106444562584, 5.721737862662196, 5.718435406698564]
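# Optional sketch (not in the original notebook): read the best k straight off the curve
# using the hyper_params and mse_values1 lists already built above.
best_k = hyper_params[int(np.argmin(mse_values1))]
print("Best k for the three-feature model:", best_k, "with MSE:", min(mse_values1))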
features = ['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']
hyper_params = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
mse_values2 = list()
for item in hyper_params:
    np.random.seed(1)
    abalone_new = abalone.copy()
    # Randomize the order of rows in the data frame.
    shuffled_index = np.random.permutation(abalone_new.index)
    rand_abalone = abalone_new.reindex(shuffled_index)
    # Select the first 75% of rows as the training set
    # and the remaining 25% as the test set.
    train_abalone = rand_abalone.iloc[0:3132]
    test_abalone = rand_abalone.iloc[3132:]
    knn = KNeighborsRegressor(n_neighbors=item, algorithm='brute')
    knn.fit(train_abalone[features], train_abalone['Rings'])
    predictions = knn.predict(test_abalone[features])
    mse = mean_squared_error(test_abalone['Rings'], predictions)
    mse_values2.append(mse)
plt.scatter(hyper_params, mse_values2)
plt.title('MSE by number of neighbors (k), all-features model')
plt.xlabel('k neighbors')
plt.ylabel('MSE')
plt.show()
print(mse_values2)
[8.432535885167464, 6.610047846889952, 5.814460393407762, 5.407655502392345, 5.127502392344497, 5.005635300372142, 4.911883605116688, 4.8977721291866025, 4.86187016362455, 4.780143540669856, 4.735829807426153, 4.688549973418394, 4.6913281050932865, 4.736036519871107, 4.772236044657097, 4.748355263157895, 4.75411334249433, 4.7273701931596674, 4.732606131293987, 4.75333971291866]
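# Optional sketch (not in the original notebook): overlay the two k-vs-MSE curves to compare
# the feature sets directly, using the mse_values1 and mse_values2 lists built above.
plt.plot(hyper_params, mse_values1, marker='o', label='Three features')
plt.plot(hyper_params, mse_values2, marker='o', label='All seven features')
plt.title('MSE by number of neighbors (k)')
plt.xlabel('k neighbors')
plt.ylabel('MSE')
plt.legend()
plt.show()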
# Cross-validation using the three best features.
# Rings is treated as a class label here, so a KNN classifier is used and the
# cross-validation scores below are accuracies.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Create a new KNN model on a shuffled copy of the data.
abalone_new = abalone.copy()
abalone_new = abalone_new.loc[np.random.permutation(len(abalone_new))]
knn_cv = KNeighborsClassifier(n_neighbors=17)
# Train the model with 5-fold cross-validation.
cv_scores = cross_val_score(knn_cv, abalone_new[['Shell weight', 'Diameter', 'Viscera weight']], abalone_new['Rings'], cv=5)
# Print each cv score (accuracy) and their mean.
print(cv_scores)
print('cv_scores mean:', np.mean(cv_scores))
[0.23086124 0.26196172 0.2491018  0.24550898 0.24431138]
cv_scores mean: 0.24634902443915996
UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.
# Cross-validation using all the features.
abalone_new = abalone.copy()
abalone_new = abalone_new.loc[np.random.permutation(len(abalone_new))]
knn_cv = KNeighborsClassifier(n_neighbors=12)
# Train the model with 11-fold cross-validation.
cv_scores = cross_val_score(knn_cv, abalone_new[['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']], abalone_new['Rings'], cv=11)
# Print each cv score (accuracy) and their mean.
print(cv_scores)
print('cv_scores mean:', np.mean(cv_scores))
[0.25526316 0.24473684 0.23421053 0.26578947 0.23684211 0.24736842
 0.26842105 0.22894737 0.22163588 0.26121372 0.22691293]
cv_scores mean: 0.24466740730454103
UserWarning: The least populated class in y has only 1 members, which is less than n_splits=11.
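# Optional sketch (not in the original notebook): for a score that lines up with the RMSE
# numbers earlier in the notebook, cross-validation can also be run with the regressor
# instead of the classifier, using scikit-learn's 'neg_root_mean_squared_error' scorer
# (available in scikit-learn >= 0.22). The feature list matches the three-feature model above.
reg_cv = KNeighborsRegressor(n_neighbors=17)
neg_rmse = cross_val_score(reg_cv,
                           abalone[['Shell weight', 'Diameter', 'Viscera weight']],
                           abalone['Rings'],
                           cv=5,
                           scoring='neg_root_mean_squared_error')
print(-neg_rmse)
print('mean RMSE:', -np.mean(neg_rmse))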