#!/usr/bin/env python
# coding: utf-8

# In[2]:

# HIDDEN
import matplotlib
#matplotlib.use('Agg')
from datascience import *
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import math
import scipy.stats as stats
plt.style.use('fivethirtyeight')


# In[3]:

# HIDDEN
def standard_units(x):
    return (x - np.mean(x))/np.std(x)


# In[4]:

# HIDDEN
def distance(pt1, pt2):
    """The Euclidean distance between two arrays of numbers."""
    return np.sqrt(np.sum((pt1 - pt2)**2))

def distance_from_individual(attribute_table, i, p):
    """The Euclidean distance between p (an array of numbers)
    and the numbers in row i of attribute_table."""
    return distance(np.array(attribute_table.row(i)), p)

def table_with_dists(training, p):
    """A copy of the training table with the Euclidean distance
    from each row to array p."""
    dists = make_array()
    attributes = training.drop('Class')
    for i in np.arange(training.num_rows):
        dists = np.append(dists, distance_from_individual(attributes, i, p))
    return training.with_column('Distance', dists)

def closest(training, p, k):
    """A table containing the k closest rows in the training table to array p."""
    with_dists = table_with_dists(training, p)
    sorted_by_dist = with_dists.sort('Distance')
    topk = sorted_by_dist.take(np.arange(k))
    return topk

def majority(topkclasses):
    """1 if the majority of the "Class" column is 1s, and 0 otherwise."""
    ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    if ones > zeros:
        return 1
    else:
        return 0

def classify(training, p, k):
    """Classify an example with attributes p using k-nearest neighbor
    classification with the given training table."""
    closestk = closest(training, p, k)
    topkclasses = closestk.select('Class')
    return majority(topkclasses)


# ### Nearest Neighbors ###
# In this section we'll develop the *nearest neighbor* method of classification. Just focus on the ideas for now and don't worry if some of the code is mysterious. Later in the chapter we'll see how to organize our ideas into code that performs the classification.

# ### Chronic kidney disease
# 
# Let's work through an example. We're going to work with a data set that was collected to help doctors diagnose chronic kidney disease (CKD). Each row in the data set represents a single patient who was treated in the past and whose diagnosis is known. For each patient, we have a bunch of measurements from a blood test. We'd like to find which measurements are most useful for diagnosing CKD, and develop a way to classify future patients as "has CKD" or "doesn't have CKD" based on their blood test results.

# In[5]:

ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd


# Some of the attributes are categorical (words like "abnormal" or "normal"), and some are quantitative. The quantitative variables all have different scales. We're going to want to make comparisons and estimate distances, often by eye, so let's select just a few of the variables and work in standard units. Then we won't have to worry about the scale of each of the different variables.
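# Before converting the table, here is a quick check of what `standard_units` does, using a small made-up array (the values below are hypothetical and not taken from the `ckd` table): the converted values come out with mean 0 and standard deviation 1.

# In[ ]:

# Hypothetical values, purely to illustrate standard_units.
demo_values = make_array(12.0, 13.5, 11.2, 14.8, 9.9)
demo_su = standard_units(demo_values)
# The converted array should have mean 0 and standard deviation 1.
demo_su, np.round(np.mean(demo_su), 3), np.round(np.std(demo_su), 3)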
# In[6]:

ckd = Table().with_columns(
    'Hemoglobin', standard_units(ckd.column('Hemoglobin')),
    'Glucose', standard_units(ckd.column('Glucose')),
    'White Blood Cell Count', standard_units(ckd.column('White Blood Cell Count')),
    'Class', ckd.column('Class')
)
ckd


# Let's look at two columns in particular: the hemoglobin level (in the patient's blood), and the blood glucose level (at a random time in the day; without fasting specially for the blood test).
# 
# We'll draw a scatter plot to visualize the relation between the two variables. Blue dots are patients with CKD; gold dots are patients without CKD. What kind of medical test results seem to indicate CKD?

# In[7]:

color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)
ckd = ckd.join('Class', color_table)


# In[8]:

ckd.scatter('Hemoglobin', 'Glucose', colors='Color')


# Suppose Alice is a new patient who is not in the data set. If I tell you Alice's hemoglobin level and blood glucose level, could you predict whether she has CKD? It sure looks like it! You can see a very clear pattern here: points in the lower-right tend to represent people who don't have CKD, and the rest tend to be folks with CKD. To a human, the pattern is obvious. But how can we program a computer to automatically detect patterns such as this one?

# ### A Nearest Neighbor Classifier ###
# 
# There are lots of kinds of patterns one might look for, and lots of algorithms for classification. But I'm going to tell you about one that turns out to be surprisingly effective. It is called *nearest neighbor classification*. Here's the idea. If we have Alice's hemoglobin and glucose numbers, we can put her somewhere on this scatterplot; the hemoglobin is her x-coordinate, and the glucose is her y-coordinate. Now, to predict whether she has CKD or not, we find the nearest point in the scatterplot and check whether it is blue or gold; we predict that Alice should receive the same diagnosis as that patient.
# 
# In other words, to classify Alice as CKD or not, we find the patient in the training set who is "nearest" to Alice, and then use that patient's diagnosis as our prediction for Alice. The intuition is that if two points are near each other in the scatterplot, then the corresponding measurements are pretty similar, so we might expect them to receive the same diagnosis (more likely than not). We don't know Alice's diagnosis, but we do know the diagnosis of all the patients in the training set, so we find the patient in the training set who is most similar to Alice, and use that patient's diagnosis to predict Alice's diagnosis.

# In the graph below, the red dot represents Alice. It is joined with a black line to the point that is nearest to it – its *nearest neighbor* in the training set. The figure is drawn by a function called `show_closest`. It takes an array that represents the $x$ and $y$ coordinates of Alice's point. Vary those to see how the closest point changes! Note especially when the closest point is blue and when it is gold.
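# We can also find that nearest neighbor numerically, using the `closest` helper defined at the top of this section. The cell below is a sketch, assuming the same coordinates used in the figure (hemoglobin 0 and glucose 1.5, both in standard units); the single row it returns is Alice's nearest neighbor in the training set.

# In[ ]:

# Sketch: find Alice's single nearest neighbor with the closest helper.
# Assumed coordinates (in standard units): Hemoglobin = 0, Glucose = 1.5.
alice = make_array(0, 1.5)
# Keep only Class, Hemoglobin, and Glucose so distances use those two attributes.
hemoglobin_glucose = ckd.drop('White Blood Cell Count', 'Color')
closest(hemoglobin_glucose, alice, 1)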
# In[12]:

# HIDDEN
def show_closest(point):
    """point = array([x, y]) gives the coordinates of a new point, shown in red"""
    HemoGl = ckd.drop('White Blood Cell Count', 'Color')
    t = closest(HemoGl, point, 1)
    x_closest = t.row(0).item(1)
    y_closest = t.row(0).item(2)
    ckd.scatter('Hemoglobin', 'Glucose', colors='Color')
    plt.scatter(point.item(0), point.item(1), color='red', s=30)
    plt.plot(make_array(point.item(0), x_closest), make_array(point.item(1), y_closest), color='k', lw=2);


# In[14]:

# In this example, Alice's Hemoglobin attribute is 0 and her Glucose is 1.5.
alice = make_array(0, 1.5)
show_closest(alice)


# Thus our *nearest neighbor classifier* works like this:
# - Find the point in the training set that is nearest to the new point.
# - If that nearest point is a "CKD" point, classify the new point as "CKD". If the nearest point is a "not CKD" point, classify the new point as "not CKD".
# 
# The scatterplot suggests that this nearest neighbor classifier should be pretty accurate. Points in the lower-right will tend to receive a "no CKD" diagnosis, as their nearest neighbor will be a gold point. The rest of the points will tend to receive a "CKD" diagnosis, as their nearest neighbor will be a blue point. So the nearest neighbor strategy seems to capture our intuition pretty well, for this example.

# ### Decision boundary ###
# 
# Sometimes a helpful way to visualize a classifier is to map out the kinds of attributes where the classifier would predict 'CKD', and the kinds where it would predict 'not CKD'. We end up with some boundary between the two, where points on one side of the boundary will be classified 'CKD' and points on the other side will be classified 'not CKD'. This boundary is called the *decision boundary*. Each different classifier will have a different decision boundary; the decision boundary is just a way to visualize what criteria the classifier is using to classify points.
# 
# For example, suppose the coordinates of Alice's point are (0, 1.5). Notice that the nearest neighbor is blue. Now try reducing the height (the $y$-coordinate) of the point. You'll see that at around $y = 0.95$ the nearest neighbor turns from blue to gold.

# In[15]:

alice = make_array(0, 0.97)
show_closest(alice)


# Here are hundreds of new unclassified points, all in red.

# In[16]:

# HIDDEN
x_array = make_array()
y_array = make_array()
for x in np.arange(-2, 2.1, 0.1):
    for y in np.arange(-2, 2.1, 0.1):
        x_array = np.append(x_array, x)
        y_array = np.append(y_array, y)

test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)


# In[17]:

# HIDDEN
test_grid.scatter('Hemoglobin', 'Glucose', color='red', alpha=0.4, s=30)
plt.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Color'), edgecolor='k')
plt.xlim(-2, 2)
plt.ylim(-2, 2);


# Each of the red points has a nearest neighbor in the training set (the same blue and gold points as before). For some red points you can easily tell whether the nearest neighbor is blue or gold. For others, it's a little more tricky to make the decision by eye. Those are the points near the decision boundary.
# 
# But the computer can easily determine the nearest neighbor of each point. So let's get it to apply our nearest neighbor classifier to each of the red points:
# 
# For each red point, it must find the closest point in the training set; it must then change the color of the red point to become the color of the nearest neighbor.
# 
# The resulting graph shows which points will get classified as 'CKD' (all the blue ones), and which as 'not CKD' (all the gold ones).

# In[18]:

# HIDDEN
def classify_grid(training, test, k):
    c = make_array()
    for i in range(test.num_rows):
        # Run the classifier on the ith patient in the test set
        c = np.append(c, classify(training, make_array(test.row(i)), k))
    return c


# In[19]:

# HIDDEN
c = classify_grid(ckd.drop('White Blood Cell Count', 'Color'), test_grid, 1)


# In[20]:

# HIDDEN
test_grid = test_grid.with_column('Class', c).join('Class', color_table)
test_grid.scatter('Hemoglobin', 'Glucose', colors='Color', alpha=0.4, s=30)
plt.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Color'), edgecolor='k')
plt.xlim(-2, 2)
plt.ylim(-2, 2);


# The decision boundary is where the classifier switches from turning the red points blue to turning them gold.

# ### k-Nearest Neighbors ###
# 
# However, the separation between the two classes won't always be quite so clean. For instance, suppose that instead of hemoglobin levels we were to look at white blood cell count. Look at what happens:

# In[21]:

ckd.scatter('White Blood Cell Count', 'Glucose', colors='Color')


# As you can see, non-CKD individuals are all clustered in the lower-left. Most of the patients with CKD are above or to the right of that cluster... but not all. There are some patients with CKD who are in the lower left of the above figure (as indicated by the handful of blue dots scattered among the gold cluster). What this means is that you can't tell for certain whether someone has CKD from just these two blood test measurements.
# 
# If we are given Alice's glucose level and white blood cell count, can we predict whether she has CKD? Yes, we can make a prediction, but we shouldn't expect it to be 100% accurate. Intuitively, it seems like there's a natural strategy for predicting: plot where Alice lands in the scatter plot; if she is in the lower-left, predict that she doesn't have CKD, otherwise predict she has CKD.
# 
# This isn't perfect -- our predictions will sometimes be wrong. (Take a minute and think it through: for which patients will it make a mistake?) As the scatterplot above indicates, sometimes people with CKD have glucose and white blood cell levels that look identical to those of someone without CKD, so any classifier is inevitably going to make the wrong prediction for them.
# 
# Can we automate this on a computer? Well, the nearest neighbor classifier would be a reasonable choice here too. Take a minute and think it through: how will its predictions compare to those from the intuitive strategy above? When will they differ?
# 
# Its predictions will be pretty similar to our intuitive strategy, but occasionally it will make a different prediction. In particular, if Alice's blood test results happen to put her right near one of the blue dots in the lower-left, the intuitive strategy would predict 'not CKD', whereas the nearest neighbor classifier will predict 'CKD'.
# 
# There is a simple generalization of the nearest neighbor classifier that fixes this anomaly. It is called the *k-nearest neighbor classifier*. To predict Alice's diagnosis, rather than looking at just the one neighbor closest to her, we can look at the 3 points that are closest to her, and use the diagnosis of each of those 3 points to predict Alice's diagnosis. In particular, we'll use the majority value among those 3 diagnoses as our prediction for Alice's diagnosis.
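# The helper functions defined at the top of this section (`closest`, `majority`, and `classify`) carry out exactly these steps. The cell below is a sketch of a 3-nearest neighbor prediction for Alice based on her glucose and white blood cell count; her coordinates here are made up for illustration.

# In[ ]:

# Sketch: 3-nearest neighbor classification using Glucose and White Blood Cell Count.
# Alice's coordinates (in standard units) are hypothetical.
alice = make_array(0.5, 1.2)   # (Glucose, White Blood Cell Count)
# Keep Class, Glucose, and White Blood Cell Count so distances use those two attributes.
glucose_wbc = ckd.drop('Hemoglobin', 'Color')
classify(glucose_wbc, alice, 3)   # 1 means 'CKD', 0 means 'not CKD'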
# Of course, there's nothing special about the number 3: we could use 4, or 5, or more. (It's often convenient to pick an odd number, so that we don't have to deal with ties.) In general, we pick a number $k$, and our predicted diagnosis for Alice is based on the $k$ patients in the training set who are closest to Alice. Intuitively, these are the $k$ patients whose blood test results were most similar to Alice's, so it seems reasonable to use their diagnoses to predict Alice's diagnosis.
# 
# The $k$-nearest neighbor classifier will now behave just like our intuitive strategy above.

# ### Banknote authentication
# 
# Let's do another example. This time we'll look at predicting whether a banknote (e.g., a \$20 bill) is counterfeit or legitimate. Researchers have put together a data set for us, based on photographs of many individual banknotes: some counterfeit, some legitimate. They computed a few numbers from each image, using techniques that we won't worry about for this course. So, for each banknote, we know a few numbers that were computed from a photograph of it, as well as its class (whether it is counterfeit or not). Let's load the data into a table and take a look.

# In[22]:

banknotes = Table.read_table('banknote.csv')
banknotes


# Let's look at whether the first two numbers tell us anything about whether the banknote is counterfeit or not. Here's a scatterplot:

# In[23]:

banknotes = banknotes.join('Class', color_table)


# In[24]:

banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Color')


# Pretty interesting! Those two measurements do seem helpful for predicting whether the banknote is counterfeit or not. However, in this example you can now see that there is some overlap between the blue cluster and the gold cluster. This indicates that there will be some images where it's hard to tell whether the banknote is legitimate based on just these two numbers. Still, you could use a $k$-nearest neighbor classifier to predict the legitimacy of a banknote.
# 
# Take a minute and think it through: Suppose we used $k = 11$ (say). What parts of the plot would the classifier get right, and what parts would it make errors on? What would the decision boundary look like?
# 
# The patterns that show up in the data can get pretty wild. For instance, here's what we'd get if we used a different pair of measurements from the images:

# In[25]:

banknotes.scatter('WaveletSkew', 'Entropy', colors='Color')


# There does seem to be a pattern, but it's a pretty complex one. Nonetheless, the $k$-nearest neighbors classifier can still be used, and it will effectively "discover" patterns like this one. This illustrates how powerful machine learning can be: it can take advantage of patterns that we would not have anticipated, or that we would not have thought to "program into" the computer.

# ### Multiple attributes
# 
# So far I've been assuming that we have exactly 2 attributes that we can use to help us make our prediction. What if we have more than 2? For instance, what if we have 3 attributes?
# 
# Here's the cool part: you can use the same ideas for this case, too. All you have to do is make a 3-dimensional scatterplot, instead of a 2-dimensional plot. You can still use the $k$-nearest neighbors classifier, but now computing distances in 3 dimensions instead of just 2. It just works. Very cool!
# 
# In fact, there's nothing special about 2 or 3. If you have 4 attributes, you can use the $k$-nearest neighbors classifier in 4 dimensions. 5 attributes? Work in 5-dimensional space. And no need to stop there!
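# The `distance` function defined at the top of this section already works for any number of attributes, since it just subtracts the two arrays elementwise. Here is a tiny sketch with two made-up points that have five attributes each.

# In[ ]:

# Sketch: Euclidean distance between two hypothetical points with five attributes.
point_a = make_array(0.0, 1.0, -0.5, 2.0, 0.3)
point_b = make_array(1.0, 0.5, 0.0, 1.5, -0.2)
distance(point_a, point_b)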
# This all works for arbitrarily many attributes; you just work in a very high-dimensional space. It gets wicked-impossible to visualize, but that's OK. The computer algorithm generalizes very nicely: all you need is the ability to compute the distance, and that's not hard. Mind-blowing stuff!
# 
# For instance, let's see what happens if we try to predict whether a banknote is counterfeit or not using 3 of the measurements, instead of just 2. Here's what you get:

# In[49]:

ax = plt.figure(figsize=(8,8)).add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Color'));


# Awesome! With just 2 attributes, there was some overlap between the two clusters (which means that the classifier was bound to make some mistakes for points in the overlap). But when we use these 3 attributes, the two clusters have almost no overlap. In other words, a classifier that uses these 3 attributes will be more accurate than one that only uses the 2 attributes.
# 
# This is a general phenomenon in classification. Each attribute can potentially give you new information, so more attributes sometimes help you build a better classifier. Of course, the cost is that now we have to gather more information to measure the value of each attribute, but this cost may be well worth it if it significantly improves the accuracy of our classifier.
# 
# To sum up: you now know how to use $k$-nearest neighbor classification to predict the answer to a yes/no question, based on the values of some attributes, assuming you have a training set with examples where the correct prediction is known. The general roadmap is this:
# 
# 1. Identify some attributes that you think might help you predict the answer to the question.
# 2. Gather a training set of examples where you know the values of the attributes as well as the correct prediction.
# 3. To make predictions in the future, measure the value of the attributes and then use $k$-nearest neighbor classification to predict the answer to the question. (A short sketch putting these steps together follows below.)
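# As a closing sketch (not part of the original analysis), here is one way to string those three steps together for the banknote data, using three of the image measurements and $k = 5$. The new banknote's measurements below are made up for illustration and are on the original (unstandardized) scale of those columns.

# In[ ]:

# Step 1: choose attributes -- here, three of the wavelet measurements.
# Select the columns explicitly so the attribute order is unambiguous.
training = banknotes.select('Class', 'WaveletVar', 'WaveletSkew', 'WaveletCurt')

# Step 2: the training set above already pairs those attributes with the known Class.

# Step 3: classify a hypothetical new banknote with 5-nearest neighbors.
# Made-up measurements, ordered (WaveletVar, WaveletSkew, WaveletCurt).
new_note = make_array(2.0, 5.0, -1.5)
classify(training, new_note, 5)   # returns one of the two values in the Class column


# In practice you might first convert each attribute to standard units, as we did for the `ckd` table, so that no single measurement dominates the distance.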