import Levenshtein as L import pandas as pd Students = pd.DataFrame({'student_id': [1, 2], 'name': ['Alice', 'Bob']}) Students Grades = pd.DataFrame({'student_id': [1, 1, 2, 2], 'class_id': [1, 2, 1, 3], 'grade': ['A', 'C', 'B', 'B']}) Grades pd.merge(Students, Grades, on='student_id') Classes = pd.DataFrame({'class_id': [1, 2, 3], 'title': ['Math', 'English', 'Spanish']}) resto = pd.read_csv('restaurants.csv') resto.info() resto[:10] clusters = pd.merge(resto, resto, on='cluster') clusters = clusters[clusters.id_x != clusters.id_y] clusters[:10] resto['dummy'] = 0 prod = pd.merge(resto, resto, on='dummy') # Clean up del prod['dummy'] del resto['dummy'] # Show that prod is the size of "resto" squared: print len(prod), len(resto)**2 prod[:10] import Levenshtein as L L.distance('Hello, World!', 'Hallo, World!') # This takes a minute or two to run prod['distance'] = prod.apply(lambda r: L.distance(r['name_x'], r['name_y']), axis=1) %matplotlib inline import pylab def accuracy(max_distance): similar = prod[prod.distance < max_distance] correct = float(sum(similar.cluster_x == similar.cluster_y)) precision = correct / len(similar) recall = correct / len(clusters) return (precision, recall) thresholds = range(1, 11) p = [] r = [] for t in thresholds: acc = accuracy(t) p.append(acc[0]) r.append(acc[1]) pylab.plot(thresholds, p) pylab.plot(thresholds, r) pylab.legend(['precision', 'recall'], loc=2)