class Learner:
    """Abstract Learner. A Learner can be trained with a dataset X and the
    corresponding labels y. After training, the Learner can be asked to
    predict the outcome of an example."""

    def fit(self, X, y):
        """Fit or train the learner."""
        self.X = X
        self.y = y

    def predict(self, x):
        """Predict the outcome of an example x."""
        raise NotImplementedError


class MajorityLearner(Learner):
    """Dumb Learner that always predicts that the class of a new, unseen
    example is the label that occurs most often in the training data."""

    def fit(self, X, y):
        "Find the label in the training data that occurs most often."
        # insert your code here

    def predict(self, x):
        "Always predict that `x`'s label is equal to the most popular one."
        return self.most_popular


# these tests should return True if your code is correct
learner = MajorityLearner()
X = [('a', 'a', 'b'), ('a', 'a', 'a'), ('b', 'b', 'b')]
y = [0, 0, 1]
learner.fit(X, y)
print(learner.predict(('c', 'c', 'c')) == 0)

# insert your code here

# insert your code here

# insert your code here
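# A building block worth knowing for the `fit` exercise above:
# collections.Counter tallies the labels, and its most_common method returns
# (label, count) pairs sorted from most to least frequent.
from collections import Counter

label_counts = Counter([0, 0, 1])
print(label_counts.most_common())   # [(0, 2), (1, 1)]
print(label_counts.most_common(1))  # [(0, 2)]: just the most frequent label
most_popular, count = label_counts.most_common(1)[0]
print(most_popular == 0)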
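# The naive bayes exercises below also need feature counts per class. One
# common pattern, used later in this notebook as well, nests Counter inside
# defaultdict: indexing with an unseen class label transparently creates an
# empty Counter, and adding another Counter to it accumulates the counts.
from collections import defaultdict, Counter

feature_counts = defaultdict(Counter)
feature_counts[1] += Counter(('a', 'a', 'b'))
feature_counts[1] += Counter(('a',))
print(feature_counts[1]['a'] == 3 and feature_counts[1]['b'] == 1)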
from collections import defaultdict, Counter


class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = # insert your code here (class counts)
        self.N = # insert your code here (feature counts per class)
        # add the feature counts per class here
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))  # number of unique features

    def predict(self, x):
        """Predict the outcome for example x. Choose the most likely outcome
        out of all possible outcomes."""
        pass


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b')], [1, 0]
nb.fit(X, y)
print(nb.N[1]['b'] == 1 and nb.C[1] == 1)


class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        for x, y_x in zip(X, y):
            self.N[y_x] += Counter(x)
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))

    def prior(self, y):
        """Return the prior probability of class y."""
        pass

    def probability(self, x, y):
        """Apply Laplace Smoothing to give a probability estimate of feature
        x given y."""
        # insert your code here


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(abs(nb.probability('a', 1) - 0.66666) < 0.00001)
print(abs(nb.probability('c', 0) - 0.125) < 0.00001)

# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(round(nb.prior(1), 2) == 0.33)

x = 1e-10
for i in range(10):
    x = x * x
print(x)

from math import log

x = log(1e-10)
for i in range(10):
    x = x + x
print(x)
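# The two cells above show the problem and the remedy: a product of many
# small probabilities underflows to 0.0, while the equivalent sum of
# logarithms stays representable. A minimal sketch of the same contrast:
from math import log

probabilities = [1e-10] * 50

product = 1.0
for p in probabilities:
    product *= p
print(product)  # 0.0: the product has underflowed

log_sum = 0.0
for p in probabilities:
    log_sum += log(p)
print(log_sum)  # roughly -1151.3: no underflow

# Because log is monotonic, comparing summed log-probabilities across class
# labels picks the same label as comparing the underflow-prone products
# would. One possible shape for the `predict` exercise that follows (a
# sketch, not necessarily the intended solution) is therefore to score each
# label y with
#     log(self.prior(y)) + sum(log(self.probability(f, y)) for f in x)
# and return the label with the highest score, e.g. with max(self.C, key=...).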
class NaiveBayesLearner(Learner):
    """Naive Bayes Learner. This learner can be initialized using

        nb = NaiveBayesLearner()

    to construct a classifier that incorporates the prior probabilities of
    each possible class label."""

    def fit(self, X, y):
        """Fit or train the naive bayes classifier. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        for x, y_x in zip(X, y):
            self.N[y_x] += Counter(x)
        self.V = len(set(x for y_x in self.N for x in self.N[y_x]))

    def prior(self, y):
        """Return the prior probability of class y."""
        return self.C[y] / sum(self.C.values())

    def probability(self, x, y):
        """Apply Laplace Smoothing to give a probability estimate of feature
        x given y."""
        return (self.N[y][x] + 1.0) / (sum(self.N[y].values()) + self.V)

    def predict(self, x):
        """Predict the outcome for example x. Choose the most likely outcome
        out of all possible outcomes."""
        # insert your code here


# these tests should return True if your code is correct
nb = NaiveBayesLearner()
X, y = [('a', 'a', 'a', 'b'), ('a', 'b', 'b'), ('b', 'b', 'b')], [1, 0, 0]
nb.fit(X, y)
print(nb.predict(('a', 'a')) == 1 and
      nb.predict(('a', 'b')) == 0 and
      nb.predict(('b', 'b')) == 0)

from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])
p = Point(1, y=2)
print(p.x)   # access by name
print(p[1])  # access by index
x, y = p     # unpack like a regular tuple
print(x)

Document = namedtuple("Document", ["author", "title", "text"])


def ngrams(iterable, ngram_range=(1, 1)):
    "Return the iterable as a sequence of n-grams."
    min_n, max_n = ngram_range
    if isinstance(iterable, (list, set)):
        iterable = tuple(iterable)
    ngrams = []
    # insert your code here
    return ngrams


# these tests should return True if your code is correct
print(ngrams("humanities", ngram_range=(2, 3)) == [
    'hu', 'um', 'ma', 'an', 'ni', 'it', 'ti', 'ie', 'es',
    'hum', 'uma', 'man', 'ani', 'nit', 'iti', 'tie', 'ies'])
print(ngrams("teach me Python".split(), ngram_range=(1, 2)) == [
    ('teach',), ('me',), ('Python',), ('teach', 'me'), ('me', 'Python')])

from os.path import basename
from itertools import islice
import re


def tokenize(text, lowercase=True):
    "Tokenize a string into word-like tokens."
    text = text.lower() if lowercase else text
    for match in re.finditer(r"\w+(\.?\w+)*", text):
        yield match.group()


def make_document(filename, ngram_range=(1, 1), lowercase=True,
                  ngram_type='word', sample=5000):
    with open(filename) as infile:
        text = list(islice(tokenize(infile.read(), lowercase=lowercase), sample))
    if ngram_type == 'char':
        text = ' '.join(text)
    author, title = basename(filename).replace('.txt', '').split('_')
    return Document(author, title, ngrams(text, ngram_range=ngram_range))
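# One possible way to fill in the `ngrams` exercise above; a sketch that
# assumes slicing is the intended mechanism, not necessarily the official
# solution. The tests pin down the behaviour: strings yield string n-grams,
# other sequences yield tuples, shorter n-grams first. The name
# `ngrams_sketch` is ours, chosen so it does not clobber the exercise.
def ngrams_sketch(iterable, ngram_range=(1, 1)):
    "Return the iterable as a sequence of n-grams."
    min_n, max_n = ngram_range
    if isinstance(iterable, (list, set)):
        iterable = tuple(iterable)
    grams = []
    for n in range(min_n, max_n + 1):
        for i in range(len(iterable) - n + 1):
            # a slice of a string is a string and a slice of a tuple a tuple,
            # which matches the two test cases above
            grams.append(iterable[i:i + n])
    return grams

print(ngrams_sketch("teach me Python".split(), ngram_range=(1, 2)) == [
    ('teach',), ('me',), ('Python',), ('teach', 'me'), ('me', 'Python')])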
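# A quick check of the tokenizer defined above: the regular expression keeps
# word-internal periods together (so '3.14' survives as one token), drops
# other punctuation, splits contractions, and lowercases by default.
print(list(tokenize("Pi is roughly 3.14, isn't it?")))
# ['pi', 'is', 'roughly', '3.14', 'isn', 't', 'it']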
class AuthorshipLearner(NaiveBayesLearner):
    """Subclass of NaiveBayesLearner that targets the authorship attribution
    problem. The learner can be initialized using

        >>> al = AuthorshipLearner(n_most_frequent=100)

    where the argument `n_most_frequent` is used to specify the n most
    frequent features. If set to None, all features will be used."""

    def __init__(self, n_most_frequent=100):
        self.n_most_frequent = n_most_frequent

    def fit(self, X, y):
        """Fit or train the Authorship Learner. X must be an iterable of
        examples where each example is an iterable as well."""
        self.C = Counter(y)
        self.N = defaultdict(Counter)
        # insert your code here


# these tests should return True if your code is correct
X = [('a', 'a', 'a', 'b', 'b', 'c'), ('a', 'c', 'b', 'b'), ('b', 'b', 'b', 'c', 'c', 'a')]
y = [1, 0, 0]
al = AuthorshipLearner(n_most_frequent=2)
al.fit(X, y)
print('c' not in al.N[1] and set(['b', 'c']) == set(al.N[0].keys()))


def accuracy(pred_labels, true_labels):
    """Return the accuracy, defined as the proportion of correctly predicted
    labels."""
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(accuracy(predictions, true) == 0.8)


def error_statistics(preds, trues):
    """Given a list of predictions and ground-truth labels, return a matrix
    in which, for each unique label, we count the number of true and false
    positives and the number of false negatives."""
    errors = defaultdict(Counter)
    for pred, true in zip(preds, trues):
        if pred == true:
            errors[true]['tp'] += 1
        else:
            errors[pred]['fp'] += 1  # we wrongly predicted `pred`: a false positive
            errors[true]['fn'] += 1  # we missed the true label: a false negative
    return errors


from statistics import mean  # only available in Python 3.4+


def precision(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    scores = {}
    for label in set(true_labels):
        try:
            scores[label] = matrix[label]['tp'] / (matrix[label]['tp'] + matrix[label]['fp'])
        except ZeroDivisionError:
            scores[label] = 0.0
    return scores if not average else mean(scores.values())

print(precision(predictions, true))

from statistics import mean


def recall(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(abs(recall(predictions, true) - 0.8889) < 0.0001)


def f_score(pred_labels, true_labels, average=True, matrix=None):
    if matrix is None:
        matrix = error_statistics(pred_labels, true_labels)
    # insert your code here


# these tests should return True if your code is correct
predictions = [1, 2, 0, 0, 1]
true = [1, 2, 0, 1, 1]
print(abs(f_score(predictions, true, average=True) - 0.8222) < 0.0001)


def test(learner, test_X, test_y, score_fn=accuracy, verbose=False):
    """Return the score (by default the proportion of correctly predicted
    examples) of the learner on test_X. The learner must already be fitted."""
    pred_y = []
    for y_x, x in zip(test_y, test_X):
        prediction = learner.predict(x)
        pred_y.append(prediction)
        if verbose >= 3:
            print('%s! (true = %s, pred = %s)' % (
                'Wrong' if y_x != prediction else 'Correct', y_x, prediction))
    return score_fn(pred_y, test_y)


def train_and_test(learner, X, y, start, end, score_fn=accuracy, verbose=False):
    """Train and test a Learner. We reserve X[start:end] for testing and
    train on the remainder. Returns the proportion of test examples that are
    correctly predicted by the learner."""
    train_X, train_y = X[:start] + X[end:], y[:start] + y[end:]
    test_X, test_y = X[start:end], y[start:end]
    learner.fit(train_X, train_y)
    return test(learner, test_X, test_y, score_fn=score_fn, verbose=verbose)


import random
from statistics import mean


def shuffle(*args, seed=None):
    """Similar to random.shuffle, but takes multiple arguments that will be
    shuffled in parallel."""
    data = list(zip(*args))
    random.seed(seed)  # seed the generator so shuffles can be reproduced
    random.shuffle(data)
    return zip(*data)


def cross_validate(learner, X, y, k=10, score_fn=accuracy, verbose=False):
    if k is None:  # interpret k=None as leave-one-out cross-validation
        k = len(X)
    n = len(X)
    X, y = shuffle(X, y)
    scores = []
    for i in range(k):
        start, end = int(i * (n / k)), int((i + 1) * (n / k))
        score = train_and_test(learner, X, y, start, end,
                               score_fn=score_fn, verbose=verbose)
        if verbose >= 2:
            print("Cross validation on fold %d/%d = %.3f" % (i + 1, k, score))
        scores.append(score)
    return mean(scores)


from glob import glob

documents = [make_document(f) for f in glob('data/british-novels/*.txt')]
authors, titles, texts = zip(*documents)

cross_validate(AuthorshipLearner(), texts, authors, k=None, score_fn=f_score)

# execute this line first
%matplotlib inline

scores = {}
# insert your code here

import seaborn as sb  # restyles matplotlib's plots on import
import matplotlib.pyplot as plt

samples, scores = zip(*sorted(scores.items()))
plt.plot(samples, scores)

scores = {}
# insert your code here

n_most_frequent, scores = zip(*sorted(scores.items()))
plt.plot(n_most_frequent, scores)

from itertools import product


def grid_search(learner, directory, n_folds=10, params={}, score_fn=accuracy, verbose=0):
    """Exhaustively try all combinations of the parameter values in `params`,
    cross-validating each setting, and return the best (setting, score) pair."""
    if not isinstance(learner, Learner):
        raise ValueError("Learner is not initialized")
    arguments, values = zip(*params.items())
    grid_scores = []
    for param_setting in product(*values):
        param_setting = dict(zip(arguments, param_setting))
        if verbose >= 1:
            print('Searching grid with %s' % param_setting)
        # split the parameters into settings for the learner itself and
        # settings for the feature extraction in make_document
        learner_settings, feature_settings = {}, {}
        for arg, value in param_setting.items():
            if arg in learner.__dict__:
                learner_settings[arg] = value
            else:
                feature_settings[arg] = value
        for arg, value in learner_settings.items():
            setattr(learner, arg, value)
        documents = [make_document(f, **feature_settings)
                     for f in glob(directory + '/*.txt')]
        y, _, X = zip(*documents)  # authors are the labels, texts the examples
        score = cross_validate(learner, X, y, k=n_folds,
                               score_fn=score_fn, verbose=verbose)
        grid_scores.append((param_setting, score))
    grid_scores.sort(key=lambda i: i[1])
    return grid_scores[-1]  # the highest-scoring parameter setting


params = {'n_most_frequent': [50, 100], 'sample': [500, 1000, 1500]}
grid_search(AuthorshipLearner(), 'data/british-novels/', params=params,
            n_folds=None, score_fn=f_score, verbose=1)

# insert your code here

from IPython.core.display import HTML


def css_styling():
    styles = open("styles/custom.css", "r").read()
    return HTML(styles)

css_styling()