class Learner:
    """Abstract Learner. A Learner can be trained with a dataset X and the
    corresponding labels y. After training, the Learner can be asked to
    predict the outcome of an example."""

    def fit(self, X, y):
        """Fit or train the learner."""
        self.X = X
        self.y = y

    def predict(self, x):
        """Predict the outcome of an example x."""
        raise NotImplementedError


class MajorityLearner(Learner):
    """Dumb Learner that always predicts that the class of a new, unseen
    example is the label that occurs most often in the training data."""

    def fit(self, X, y):
        "Find the label in the training data that occurs most often."
        # insert your code here

    def predict(self, x):
        "Always predict that `x`'s label is equal to the most popular one."
        return self.most_popular


# these tests should return True if your code is correct
learner = MajorityLearner()
X = [('a', 'a', 'b'), ('a', 'a', 'a'), ('b', 'b', 'b')]
y = [0, 0, 1]
learner.fit(X, y)
print(learner.predict(('c', 'c', 'c')) == 0)

# insert your code here

# insert your code here

# insert your code here
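# A building block worth knowing for the `fit` exercise above:
# collections.Counter tallies the labels, and its most_common method returns
# (label, count) pairs sorted from most to least frequent.
from collections import Counter

label_counts = Counter([0, 0, 1])
print(label_counts.most_common())   # [(0, 2), (1, 1)]
print(label_counts.most_common(1))  # [(0, 2)]: just the most frequent label
most_popular, count = label_counts.most_common(1)[0]
print(most_popular == 0)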
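# The naive bayes exercises below also need feature counts per class. One
# common pattern, used later in this notebook as well, nests Counter inside
# defaultdict: indexing with an unseen class label transparently creates an
# empty Counter, and adding another Counter to it accumulates the counts.
from collections import defaultdict, Counter

feature_counts = defaultdict(Counter)
feature_counts[1] += Counter(('a', 'a', 'b'))
feature_counts[1] += Counter(('a',))
print(feature_counts[1]['a'] == 3 and feature_counts[1]['b'] == 1)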
from collections import defaultdict, Counter


class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = # insert your code here (class counts)
        self.N = # insert your code here (feature counts per class)
        # add the feature counts per class here
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))  # number of unique features

    def predict(self, x):
        """Predict the outcome for example x. Choose the most likely outcome
        out of all possible outcomes."""
        pass


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b')], [1, 0]
nb.fit(X, y)
print(nb.N[1]['b'] == 1 and nb.C[1] == 1)


class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        for x, y_x in zip(X, y):
            self.N[y_x] += Counter(x)
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))

    def prior(self, y):
        """Return the prior probability of class y."""
        pass

    def probability(self, x, y):
        """Apply Laplace Smoothing to give a probability estimate of feature
        x given y."""
        # insert your code here


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(abs(nb.probability('a', 1) - 0.66666) < 0.00001)
print(abs(nb.probability('c', 0) - 0.125) < 0.00001)

# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(round(nb.prior(1), 2) == 0.33)

x = 1e-10
for i in range(10):
    x = x * x
print(x)

from math import log

x = log(1e-10)
for i in range(10):
    x = x + x
print(x)
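# The two cells above show the problem and the remedy: a product of many
# small probabilities underflows to 0.0, while the equivalent sum of
# logarithms stays representable. A minimal sketch of the same contrast:
from math import log

probabilities = [1e-10] * 50

product = 1.0
for p in probabilities:
    product *= p
print(product)  # 0.0: the product has underflowed

log_sum = 0.0
for p in probabilities:
    log_sum += log(p)
print(log_sum)  # roughly -1151.3: no underflow

# Because log is monotonic, comparing summed log-probabilities across class
# labels picks the same label as comparing the underflow-prone products
# would. One possible shape for the `predict` exercise that follows (a
# sketch, not necessarily the intended solution) is therefore to score each
# label y with
#     log(self.prior(y)) + sum(log(self.probability(f, y)) for f in x)
# and return the label with the highest score, e.g. with max(self.C, key=...).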
class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        for x, y_x in zip(X, y):
            self.N[y_x] += Counter(x)
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))

    def prior(self, y):
        """Return the prior probability of class y."""
        return self.C[y] / sum(self.C.values())

    def probability(self, x, y):
        """Apply Laplace Smoothing to give a probability estimate of feature
        x given y."""
        return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + self.V)

    def predict(self, x):
        """Predict the outcome for example x. Choose the most likely outcome
        out of all possible outcomes."""
        # insert your code here


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(nb.predict(('a', 'a')) == 1 and
      nb.predict(('a', 'b')) == 0 and
      nb.predict(('b', 'b')) == 0)

from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])
p = Point(1, y=2)
print(p.x)   # access by name
print(p[1])  # access by index
x, y = p     # unpack like a regular tuple
print(x)

Document = namedtuple("Document", ["author", "title", "text"])


def ngrams(iterable, ngram_range=(1, 1)):
    "Return the iterable as a sequence of n-grams."
    min_n, max_n = ngram_range
    if isinstance(iterable, (list, set)):
        iterable = tuple(iterable)
    ngrams = []
    # insert your code here
    return ngrams


# these tests should return True if your code is correct
print(ngrams("humanities", ngram_range=(2, 3)) == [
    'hu', 'um', 'ma', 'an', 'ni', 'it', 'ti', 'ie', 'es',
    'hum', 'uma', 'man', 'ani', 'nit', 'iti', 'tie', 'ies'])
print(ngrams("teach me Python".split(), ngram_range=(1, 2)) == [
    ('teach',), ('me',), ('Python',), ('teach', 'me'), ('me', 'Python')])

from os.path import basename
from itertools import islice
import re


def tokenize(text, lowercase=True):
    "Tokenize a string into word-like tokens."
    text = text.lower() if lowercase else text
    for match in re.finditer(r"\w+(\.?\w+)*", text):
        yield match.group()


def make_document(filename, ngram_range=(1, 1), lowercase=True,
                  ngram_type='word', sample=5000):
    with open(filename) as infile:
        text = list(islice(tokenize(infile.read(), lowercase=lowercase), sample))
    if ngram_type == 'char':
        text = ' '.join(text)
    author, title = basename(filename).replace('.txt', '').split('_')
    return Document(author, title, ngrams(text, ngram_range=ngram_range))
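# One possible way to fill in the `ngrams` exercise above; a sketch that
# assumes slicing is the intended mechanism, not necessarily the official
# solution. The tests pin down the behaviour: strings yield string n-grams,
# other sequences yield tuples, shorter n-grams first. The name
# `ngrams_sketch` is ours, chosen so it does not clobber the exercise.
def ngrams_sketch(iterable, ngram_range=(1, 1)):
    "Return the iterable as a sequence of n-grams."
    min_n, max_n = ngram_range
    if isinstance(iterable, (list, set)):
        iterable = tuple(iterable)
    grams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(iterable) - n + 1):
            # a slice of a string is a string and a slice of a tuple a tuple,
            # which matches the two test cases above
            grams.append(iterable[i:i + n])
    return grams

print(ngrams_sketch("teach me Python".split(), ngram_range=(1, 2)) == [
    ('teach',), ('me',), ('Python',), ('teach', 'me'), ('me', 'Python')])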
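# A quick check of the tokenizer defined above: the regular expression keeps
# word-internal periods together (so '3.14' survives as one token), drops
# other punctuation, splits contractions, and lowercases by default.
print(list(tokenize("Pi is roughly 3.14, isn't it?")))
# ['pi', 'is', 'roughly', '3.14', 'isn', 't', 'it']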
class AuthorshipLearner(NaiveBayesLearner):
    """Subclass of NaiveBayesLearner that targets the authorship attribution
    problem. The learner can be initialized using

        >>> al = AuthorshipLearner(n_most_frequent=100)

    where the argument `n_most_frequent` is used to specify the n most
    frequent features. If set to None, all features will be used."""

    def __init__(self, n_most_frequent=100):
        self.n_most_frequent = n_most_frequent

    def fit(self, X, y):
        """Fit or train the Authorship Learner. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        # insert your code here


# these tests should return True if your code is correct
X = [('a', 'a', 'a', 'b', 'b', 'c'), ('a', 'c', 'b', 'b'), ('b', 'b', 'b', 'c', 'c', 'a')]
y = [1, 0, 0]
al = AuthorshipLearner(n_most_frequent=2)
al.fit(X, y)
print('c' not in al.N[1] and set(['b', 'c']) == set(al.N[0].keys()))


def accuracy(pred_labels, true_labels):
    """Return the accuracy, defined as the proportion of correctly predicted
    labels."""
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(accuracy(predictions, true) == 0.8)


def error_statistics(preds, trues):
    """Given a list of predictions and ground-truth labels, return a matrix
    in which, for each unique label, we count the number of true and false
    positives and the number of false negatives."""
    errors = defaultdict(Counter)
    for pred, true in zip(preds, trues):
        if pred == true:
            errors[true]['tp'] += 1
        else:
            errors[pred]['fp'] += 1  # we wrongly predicted `pred`: a false positive
            errors[true]['fn'] += 1  # we missed the true label: a false negative
    return errors


from statistics import mean  # only available in Python 3.4+


def precision(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    scores = {}
    for label in set(true_labels):
        try:
            scores[label] = matrix[label]['tp'] / (matrix[label]['tp'] + matrix[label]['fp'])
        except ZeroDivisionError:
            scores[label] = 0.0
    return scores if not average else mean(scores.values())

print(precision(predictions, true))

from statistics import mean


def recall(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(abs(recall(predictions, true) - 0.8889) < 0.0001)


def f_score(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(abs(f_score(predictions, true, average=True) - 0.8222) < 0.0001)


def test(learner, test_X, test_y, score_fn=accuracy, verbose=False):
    """Return the score (by default the proportion of correctly predicted
    examples) of the learner on test_X. The learner must already be fitted."""
    pred_y = []
    for y_x, x in zip(test_y, test_X):
        prediction = learner.predict(x)
        pred_y.append(prediction)
        if verbose >= 3:
            print('%s! (true = %s, pred = %s)' % (
                'Wrong' if y_x != prediction else 'Correct', y_x, prediction))
    return score_fn(pred_y, test_y)


def train_and_test(learner, X, y, start, end, score_fn=accuracy, verbose=False):
    """Train and test a Learner. We reserve X[start:end] for testing and
    train on the remainder. Returns the proportion of test examples that are
    correctly predicted by the learner."""
    train_X, train_y = X[:start] + X[end:], y[:start] + y[end:]
    test_X, test_y = X[start:end], y[start:end]
    learner.fit(train_X, train_y)
    return test(learner, test_X, test_y, score_fn=score_fn, verbose=verbose)


import random
from statistics import mean


def shuffle(*args, seed=None):
    """Similar to random.shuffle, but takes multiple arguments that will be
    shuffled in parallel."""
    data = list(zip(*args))
    random.seed(seed)  # seed the generator so shuffles can be reproduced
    random.shuffle(data)
    return zip(*data)


def cross_validate(learner, X, y, k=10, score_fn=accuracy, verbose=False):
    if k is None:  # interpret k=None as leave-one-out cross-validation
        k = len(X)
    n = len(X)
    X, y = shuffle(X, y)
    scores = []
    for i in range(k):
        start, end = int(i * (n / k)), int((i + 1) * (n / k))
        score = train_and_test(learner, X, y, start, end,
                               score_fn=score_fn, verbose=verbose)
        if verbose >= 2:
            print("Cross validation on fold %d/%d = %.3f" % (i + 1, k, score))
        scores.append(score)
    return mean(scores)


from glob import glob

documents = [make_document(f) for f in glob('data/british-novels/*.txt')]
authors, titles, texts = zip(*documents)

cross_validate(AuthorshipLearner(), texts, authors, k=None, score_fn=f_score)

# execute this line first
%matplotlib inline

scores = {}
# insert your code here

import seaborn as sb  # restyles matplotlib's plots on import
import matplotlib.pyplot as plt

samples, scores = zip(*sorted(scores.items()))
plt.plot(samples, scores)

scores = {}
# insert your code here

n_most_frequent, scores = zip(*sorted(scores.items()))
plt.plot(n_most_frequent, scores)

from itertools import product


def grid_search(learner, directory, n_folds=10, params={}, score_fn=accuracy, verbose=0):
    """Exhaustively try all combinations of the parameter values in `params`,
    cross-validating each setting, and return the best (setting, score) pair."""
    if not isinstance(learner, Learner):
        raise ValueError("Learner is not initialized")
    arguments, values = zip(*params.items())
    grid_scores = []
    for param_setting in product(*values):
        param_setting = dict(zip(arguments, param_setting))
        if verbose >= 1:
            print('Searching grid with %s' % param_setting)
        # split the parameters into settings for the learner itself and
        # settings for the feature extraction in make_document
        learner_settings, feature_settings = {}, {}
        for arg, value in param_setting.items():
            if arg in learner.__dict__:
                learner_settings[arg] = value
            else:
                feature_settings[arg] = value
        for arg, value in learner_settings.items():
            setattr(learner, arg, value)
        documents = [make_document(f, **feature_settings)
                     for f in glob(directory + '/*.txt')]
        y, _, X = zip(*documents)  # authors are the labels, texts the examples
        score = cross_validate(learner, X, y, k=n_folds,
                               score_fn=score_fn, verbose=verbose)
        grid_scores.append((param_setting, score))
    grid_scores.sort(key=lambda i: i[1])
    return grid_scores[-1]  # the highest-scoring parameter setting


params = {'n_most_frequent': [50, 100], 'sample': [500, 1000, 1500]}
grid_search(AuthorshipLearner(), 'data/british-novels/', params=params,
            n_folds=None, score_fn=f_score, verbose=1)

# insert your code here

from IPython.core.display import HTML


def css_styling():
    styles = open("styles/custom.css", "r").read()
    return HTML(styles)

css_styling()