In [ ]:

from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Classification

In [ ]:

def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    pt1, pt2: the coordinates of each point, given as arrays or as any
        numeric sequence (list, tuple, table row) of the same length.

    Returns the square root of the sum of squared coordinate differences
    (a numpy float for one-dimensional inputs).
    """
    # Work in floating point: plain sequences become arrays, and unsigned or
    # narrow integer dtypes can no longer wrap around when subtracted/squared.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # Reduce along the first axis, which is the axis the builtin sum()
    # iterated over, but done in a single vectorized call.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def row_distance(row1, row2):
    """Return the distance between two table rows holding numerical values.

    Each row is converted to a numpy array and the pair is handed to
    distance().
    """
    # Wrap each row in a one-element list and take element 0 to obtain the
    # row's values as an array.
    first_point = np.array([row1])[0]
    second_point = np.array([row2])[0]
    return distance(first_point, second_point)

def distances(training, example):
    """Compute the distance between example and every row in training.

    training: table with a 'Class' column; all of its other columns are
        used as the numerical attributes of each row.
    example: attribute values of the point to compare against, in the same
        order as the non-'Class' columns of training.

    Returns training augmented with a 'Distance' column whose i-th entry is
    the distance from example to the i-th row of training.
    """
    attributes = training.drop('Class')
    # Collect the values in a list and convert once: np.append copies the
    # whole array on every call, which made the original loop quadratic.
    row_distances = np.array(
        [row_distance(row, example) for row in attributes.rows],
        dtype=float,
    )
    return training.with_column('Distance', row_distances)

def closest(training, example, k):
    """Return a table holding the k rows of training nearest to example.

    The rows are ordered by their 'Distance' value, smallest first.
    """
    by_distance = distances(training, example).sort('Distance')
    first_k = np.arange(k)
    return by_distance.take(first_k)

def majority_class(topk):
    """Return the 'Class' value that occurs most often in topk."""
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table is read as the class label; its first
    # entry belongs to the largest count after the descending sort.
    return most_common_first.column(0).item(0)

def classify(training, example, k):
    """Return the majority class among the k nearest neighbors of example.

    The neighbors are the k rows of training closest to example.
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [ ]:

# Data: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
#
# Columns:
#   Class:     default payment (1 = yes, 0 = no)
#   LIMIT_BAL: amount of given credit in NT dollars (New Taiwan dollars);
#              includes individual and family/supplementary credit
#
# Repayment-status scale used by the PAY_* columns:
#   0 = pay duly, 1 = payment delay for one month,
#   2 = payment delay for two months, ...
#   8 = payment delay for eight months,
#   9 = payment delay for nine months and above
#
#   PAY_0: repayment status in September 2005
#   PAY_2: repayment status in August 2005    (no PAY_1 is listed here)
#   PAY_3: repayment status in July 2005
#   PAY_4: repayment status in June 2005
#   PAY_5: repayment status in May 2005
#   PAY_6: repayment status in April 2005
#
# Read the table from credit.csv (path is relative to the working directory)
# and preview it with show(10).
credit = Table.read_table('credit.csv')
credit.show(10)

In [ ]:

# Drop LIMIT_BAL so it is not among the attribute columns that the distance
# computation uses (every column except 'Class' is treated as an attribute).
credit_payments = credit.drop('LIMIT_BAL')
credit_payments

In [ ]:

# Attribute values (without the 'Class' label) of the row at index 123;
# this is the example to classify.
example123 = credit_payments.drop('Class').row(123)
example123

In [ ]:

# Classify that example from its 5 nearest neighbors. exclude(123) removes
# the row from the training table so it cannot be its own neighbor.
classify(credit_payments.exclude(123), example123, 5)

In [ ]:

# The full row 123, including its 'Class' value, for comparison with the
# predicted class.
credit_payments.row(123)

Evaluation

In [ ]:

# Number of rows available for building the training and test sets.
credit_payments.num_rows

In [ ]:

# Split by position: rows 0-499 form the training set and rows 500-999 the
# test set. The rows are taken in their existing order (no shuffling here).
training_set = credit_payments.take(np.arange(500))
test_set = credit_payments.take(np.arange(500, 1000))

In [ ]:

# Sanity check: print the size of each set.
print(training_set.num_rows)
print(test_set.num_rows)

In [ ]:

def evaluate_accuracy(training, test, k):
    """Return the fraction of rows in test that are classified correctly.

    Each test row (without its 'Class' value) is classified from its k
    nearest neighbors in training, and the prediction is compared with the
    row's own 'Class' value.
    """
    test_attributes = test.drop('Class')
    actual_classes = test.column('Class')
    # Each comparison yields True/False; summing them counts the matches.
    hits = sum(
        classify(training, test_attributes.row(index), k) == actual_classes.item(index)
        for index in np.arange(test.num_rows)
    )
    return hits / test.num_rows

In [ ]:

# Accuracy of the 3-nearest-neighbor classifier on the current split.
evaluate_accuracy(training_set, test_set, 3)

In [ ]:

# The 'Class' labels of all rows, in table order.
credit_payments.column('Class')

In [ ]:

# Number of rows per class in the training set ...
training_set.group('Class')

In [ ]:

# ... and in the test set, to compare the class balance of the two sets.
test_set.group('Class')

In [ ]:

# Sample the rows without replacement and split by position again, so the
# training/test membership no longer follows the original row order.
# NOTE(review): presumably sample() with no size returns every row in random
# order - confirm against the datascience docs. No random seed is set in the
# visible cells, so this split likely differs from run to run.
shuffled = credit_payments.sample(with_replacement=False)
training_set = shuffled.take(np.arange(500))
test_set = shuffled.take(np.arange(500, 1000))

In [ ]:

# Accuracy on the shuffled split with k = 3.
evaluate_accuracy(training_set, test_set, 3)

In [ ]:

# Same split with k = 5.
evaluate_accuracy(training_set, test_set, 5)

In [ ]:

# Evaluate on the training set itself with k = 1: every evaluated row is also
# present in the training table, so it can be its own nearest neighbor.
evaluate_accuracy(training_set, training_set, 1)

In [ ]:

# Take the first training row (attributes only) as an example and build a
# training table with that row removed.
training_example = training_set.drop("Class").row(0)
new_training = training_set.exclude(0)
training_example

In [ ]:

# Distance from that example to every remaining training row, sorted by the
# 'Distance' column.
distances(new_training, np.array([training_example])[0]).sort("Distance")

In [ ]: