from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
# Use the 'fivethirtyeight' matplotlib style for every plot that follows.
plots.style.use('fivethirtyeight')
# Load the banknote data. The plots below read its WaveletVar, WaveletCurt,
# WaveletSkew, Entropy and Class columns.
banknotes = Table.read_table('banknote.csv')
# Bare expression: presumably the end of a notebook cell, where it displays
# the table; it has no effect when run as a plain script.
banknotes
# Two 2-D views of the data, with points colored by the Class column.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')
# 3-D view of the banknotes: one axis per wavelet statistic, colored by Class.
fig = plots.figure(figsize=(8, 8))
# Create the axes through the figure so they are actually attached to it.
# A directly constructed Axes3D(fig) is no longer added to the figure
# automatically by recent Matplotlib releases, which leaves the plot blank.
ax = fig.add_subplot(projection='3d')
ax.scatter(
    banknotes.column('WaveletSkew'),
    banknotes.column('WaveletVar'),
    banknotes.column('WaveletCurt'),
    c=banknotes.column('Class'),
    cmap='viridis',
    s=50,
);  # trailing semicolon keeps the notebook from echoing the return value
# Encoding of the Class column in the breast-cancer data:
#   Class 1 = malignant (cancer)
#   Class 0 = benign (not cancer)
# Load the patient data and drop the ID column, which is not used below.
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients.show(5)
# A number of points are layered on top of each other in this plot, which is
# why a jittered version of the same two columns is plotted further down.
patients.scatter('Bland Chromatin', 'Single Epithelial Cell Size', colors='Class')
#Function to "jitter" the points (for visualization purposes)
def randomize_column(a, scale=0.09):
    """Return ``a`` with independent zero-mean Gaussian noise added to each entry.

    The noise "jitters" points that would otherwise sit exactly on top of
    each other in a scatter plot; the result is meant for visualization only.

    Parameters:
        a: one-dimensional array of numbers.
        scale: standard deviation of the noise. The default, 0.09, is the
            value this function previously hard-coded.

    Returns:
        A new array with the same length as ``a``; ``a`` itself is not modified.
    """
    return a + np.random.normal(0.0, scale, size=len(a))
# Build a table holding jittered copies of the two plotted columns next to the
# unmodified Class column. with_columns receives a single list of alternating
# column labels and column values.
jittered = Table().with_columns([
'Bland Chromatin (jittered)',
randomize_column(patients.column('Bland Chromatin')),
'Single Epithelial Cell Size (jittered)',
randomize_column(patients.column('Single Epithelial Cell Size')),
'Class',
patients.column('Class')
])
# Columns 0 and 1 are the two jittered columns, in the order listed above.
jittered.scatter(0, 1, colors='Class')
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters:
        pt1, pt2: points given as one-dimensional sequences of numbers
            (arrays, lists or tuples) of equal length.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    # Work in floating point so integer inputs (unsigned ones in particular)
    # cannot wrap around when subtracted or squared.
    difference = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # axis=0 folds along the first axis, as the builtin sum did, but without
    # iterating over the array element by element in Python.
    return np.sqrt(np.sum(difference ** 2, axis=0))
def row_distance(row1, row2):
    """Return the distance between two numerical rows of a table.

    Each row is first converted to an array of its values and the two arrays
    are then handed to distance().
    """
    # Wrapping the row in a list and taking element 0 pulls the row's values
    # out as an array.
    first_values = np.array([row1])[0]
    second_values = np.array([row2])[0]
    return distance(first_values, second_values)
# Everything except the label column: the values used to measure distance.
attributes = patients.drop('Class')
attributes.show(3)
# Pull the values out of a row as an array (the same conversion that
# row_distance applies to each of its arguments).
np.array([attributes.row(1)])[0]
# Distance between two different rows, then from a row to itself (which is 0).
row_distance(attributes.row(0), attributes.row(1))
row_distance(attributes.row(0), attributes.row(0))
def distances(training, example):
    """Compute the distance between example and every row in training.

    Parameters:
        training: table with a 'Class' column; every other column is treated
            as a numerical attribute.
        example: row of attribute values (without a 'Class' entry), in the
            same order as the attribute columns of ``training``.

    Returns:
        ``training`` augmented with a 'Distance' column whose i-th entry is
        the distance from ``example`` to the i-th row's attributes.
    """
    attribute_rows = training.drop('Class').rows
    # Build the whole column in one pass: appending to an array inside the
    # loop would copy it on every iteration.
    row_distances = np.array(
        [row_distance(row, example) for row in attribute_rows],
        dtype=float,
    )
    return training.with_column('Distance', row_distances)
# Hold out row 15 as the example to classify: show the full row, then take
# just its attribute values (attributes is patients without 'Class').
patients.take(15)
example = attributes.row(15)
example
# Distances from the example to every other patient, smallest first.
distances(patients.exclude(15), example).sort('Distance')
###########################################################
def closest(training, example, k):
    """Return a table holding the k rows of training nearest to example.

    The returned rows carry the 'Distance' column added by distances() and
    are ordered from nearest to farthest.
    """
    with_distance = distances(training, example)
    nearest_first = with_distance.sort('Distance')
    first_k = np.arange(k)
    return nearest_first.take(first_k)
# The 5 nearest neighbors of the held-out example (row 15 itself is excluded).
closest(patients.exclude(15), example, 5)
def majority_class(topk):
    """Return the 'Class' value that occurs most often among the rows of topk."""
    # One row per distinct class, together with how many times it occurs.
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 holds the class values; its first entry is the winner.
    return most_common_first.column(0).item(0)
def classify(training, example, k):
    """Return the majority class among the k nearest neighbors of example."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)
# Predict the class of the held-out row 15 from its 5 nearest neighbors, then
# show the full row (including its actual Class) for comparison.
classify(patients.exclude(15), example, 5)
patients.take(15)
# Same check for row 10.
new_example = attributes.row(10)
classify(patients.exclude(10), new_example, 5)
patients.take(10)
# Total number of rows in patients.
patients.num_rows
# Randomly permute the rows, then split them into a training and a test set.
shuffled = patients.sample(with_replacement=False)
# Size the split from the table itself rather than hard-coding row counts, so
# it keeps covering every row if the data file changes. The training set gets
# the extra row when the count is odd: 683 rows -> 342 training, 341 test.
num_training = (shuffled.num_rows + 1) // 2
training_set = shuffled.take(np.arange(num_training))
test_set = shuffled.take(np.arange(num_training, shuffled.num_rows))
print(training_set.num_rows)
print(test_set.num_rows)
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples in the test set.

    Parameters:
        training: table of labelled rows passed to classify() as the
            training data.
        test: table with the same columns as ``training``; its 'Class'
            column holds the labels the predictions are compared against.
        k: number of nearest neighbors that vote on each prediction.

    Returns:
        The number of correct predictions divided by ``test.num_rows``.

    Raises:
        ZeroDivisionError: if ``test`` has no rows.
    """
    test_attributes = test.drop('Class')
    # The label column does not change inside the loop, so fetch it once.
    actual_classes = test.column('Class')
    num_correct = 0
    for i in range(test.num_rows):
        predicted = classify(training, test_attributes.row(i), k)
        if predicted == actual_classes.item(i):
            num_correct += 1
    return num_correct / test.num_rows
# Accuracy on the held-out test set for several neighborhood sizes k.
evaluate_accuracy(training_set, test_set, 5)
evaluate_accuracy(training_set, test_set, 3)
evaluate_accuracy(training_set, test_set, 11)
# k=1 scored on the training set itself: every row is at distance 0 from its
# own copy in the training data, so this is presumably (near-)perfect and says
# little about accuracy on new data. The test-set score for k=1 follows.
evaluate_accuracy(training_set, training_set, 1)
evaluate_accuracy(training_set, test_set, 1)