from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots that follow.
plots.style.use('fivethirtyeight')
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Args:
        pt1: Coordinates of the first point: a NumPy array or any sequence
            of numbers (list, tuple, table row).
        pt2: Coordinates of the second point, with the same shape as pt1.

    Returns:
        The square root of the summed squared coordinate differences.
    """
    # Convert to float arrays so plain sequences are accepted and unsigned or
    # narrow integer inputs cannot wrap around when subtracted or squared.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # Reduce along the first axis, exactly as the built-in sum() did, but
    # inside NumPy instead of a Python-level loop.
    return np.sqrt(np.sum(diff ** 2, axis=0))
def row_distance(row1, row2):
    """Return the distance between two numerical table rows.

    Each row is first converted to a one-dimensional NumPy array and the
    pair is then handed to distance().
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)
def distances(training, example):
    """Compute the distance from example to every row of training.

    Args:
        training: Table containing a 'Class' column; every other column is
            treated as an attribute and takes part in the distance.
        example: Attribute values of the point to compare against, in the
            same order as the attribute columns of training.

    Returns:
        A copy of training with an extra 'Distance' column holding, for
        each row, its distance to example.
    """
    # The label is not an attribute, so it must not influence the distance.
    attributes = training.drop('Class')
    # Build the whole column in one go; calling np.append inside the loop
    # would re-copy the growing array on every iteration.
    row_distances = np.array(
        [row_distance(row, example) for row in attributes.rows]
    )
    return training.with_column('Distance', row_distances)
def closest(training, example, k):
    """Return a table of the k training rows nearest to example.

    The result is training with an added 'Distance' column, sorted by that
    column in ascending order and cut down to its first k rows.
    """
    with_distances = distances(training, example)
    nearest_first = with_distances.sort('Distance')
    first_k = np.arange(k)
    return nearest_first.take(first_k)
def majority_class(topk):
    """Return the 'Class' value that occurs most often in topk."""
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class labels; the first entry
    # after the descending sort is the one with the highest count.
    labels = most_common_first.column(0)
    return labels.item(0)
def classify(training, example, k):
    """Return the majority class among the k nearest neighbors of example."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)
#Data: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
#Class: Default payment (1=yes, 0=no)
#LIMIT_BAL: Amount of credit granted, in New Taiwan (NT) dollars; includes individual and family/supplementary credit
# Repayment status scale (0 = paid duly, 1 = payment delayed one month,
#   2 = payment delayed two months, ...
#   8 = payment delayed eight months,
#   9 = payment delayed nine months or more)
# PAY_0: Repayment status in September, 2005
# PAY_2: Repayment status in August, 2005 (scale same as above)
# PAY_3: Repayment status in July, 2005 (scale same as above)
# PAY_4: Repayment status in June, 2005 (scale same as above)
# PAY_5: Repayment status in May, 2005 (scale same as above)
# PAY_6: Repayment status in April, 2005 (scale same as above)
# Load the credit-card default data set described in the comments above.
credit = Table.read_table('credit.csv')
# Preview the first 10 rows.
credit.show(10)
# Keep every column except LIMIT_BAL.
# NOTE(review): presumably dropped because its NT-dollar magnitude would
# dominate the unscaled Euclidean distance - confirm.
credit_payments = credit.drop('LIMIT_BAL')
# Bare expression: shown as cell output in a notebook, no effect in a script.
credit_payments
# Attribute values (no 'Class' label) of row 123, used as a single test point.
example123 = credit_payments.drop('Class').row(123)
example123
# Classify row 123 from its 5 nearest neighbors among all the other rows.
classify(credit_payments.exclude(123), example123, 5)
# Show the full row, including its actual 'Class', to compare with the prediction.
credit_payments.row(123)
credit_payments.num_rows
# Unshuffled split: rows 0-499 for training, rows 500-999 for testing.
training_set = credit_payments.take(np.arange(500))
test_set = credit_payments.take(np.arange(500, 1000))
# Confirm the size of each split.
print(training_set.num_rows)
print(test_set.num_rows)
def evaluate_accuracy(training, test, k):
    """Return the fraction of test rows that the classifier labels correctly.

    Every row of test is classified from its k nearest neighbors in
    training, and the prediction is compared with that row's 'Class' value.
    """
    test_attributes = test.drop('Class')
    actual_classes = test.column('Class')
    # Each comparison yields a boolean; summing them counts the matches.
    matches = sum(
        classify(training, test_attributes.row(i), k) == actual_classes.item(i)
        for i in np.arange(test.num_rows)
    )
    return matches / test.num_rows
# Accuracy of 3-nearest-neighbors on the unshuffled split.
evaluate_accuracy(training_set, test_set, 3)
# Inspect the labels and the class counts in each split.
credit_payments.column('Class')
training_set.group('Class')
test_set.group('Class')
# Sample rows without replacement.
# NOTE(review): with no sample size given this is presumably a random
# reordering of all rows (datascience default) - confirm.
shuffled = credit_payments.sample(with_replacement=False)
# Rebuild the 500/500 split from the shuffled rows.
training_set = shuffled.take(np.arange(500))
test_set = shuffled.take(np.arange(500, 1000))
# Compare accuracy for k = 3 and k = 5 on the shuffled split.
evaluate_accuracy(training_set, test_set, 3)
evaluate_accuracy(training_set, test_set, 5)
# Score the training set against itself with k = 1. Every row is at distance
# 0 from itself, so this is expected to be at or near 1.0 unless rows with
# identical attributes carry different classes.
evaluate_accuracy(training_set, training_set, 1)
# Pull out the first training row (attributes only) and remove it from the
# training table, then list its distance to every remaining row, nearest first.
training_example = training_set.drop("Class").row(0)
new_training = training_set.exclude(0)
training_example
distances(new_training, np.array([training_example])[0]).sort("Distance")