# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

# Load the digit data either from mldata.org, or once downloaded to data_home, from disk. The data is about 53MB so this cell
# should take a while the first time your run it.
mnist = fetch_mldata('MNIST original', data_home='~/datasets/mnist')
X, Y = mnist.data, mnist.target

# Rescale grayscale values to [0,1].
X = X / 255.0

# Shuffle the input: create a random permutation of the integers between 0 and the number of data points and apply this
# permutation to X and Y.
# NOTE: Each time you run this cell, you'll re-shuffle the data, resulting in a different ordering.
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]

print 'data shape: ', X.shape
print 'label shape:', Y.shape

# Set some variables to hold test, dev, and training data.
test_data, test_labels = X[61000:], Y[61000:]
dev_data, dev_labels = X[60000:61000], Y[60000:61000]
train_data, train_labels = X[:60000], Y[:60000]
mini_train_data, mini_train_labels = X[:1000], Y[:1000]

def P1(num_examples=10):
    """Plot a grid of training digits: one row per digit, `num_examples` images per row.

    The first `num_examples` occurrences of each digit in train_labels are shown,
    taken from train_data and reshaped to 28x28.

    Args:
        num_examples: number of images to show for each digit (at least 1).

    Raises:
        ValueError: if `num_examples` is smaller than 1.
    """

### STUDENT START ###

    if num_examples < 1:
        raise ValueError('num_examples must be at least 1')

    num_digits = 10 # number of digits to show

    ## Select `num_examples` examples for each digit

    # Maps each digit to the positions (in the training set) of its selected examples
    index_dict = { d : [] for d in xrange(num_digits) }
    digits_completed = 0

    for i, label in enumerate(train_labels):
        selected = index_dict[label]

        # Skip labels whose row is already full
        if len(selected) >= num_examples:
            continue
        selected.append(i)

        # Stop scanning as soon as every digit has all of its examples
        if len(selected) == num_examples:
            digits_completed += 1
            if digits_completed == num_digits:
                break

    ## Generate the figure
    # squeeze=False keeps `ax` two-dimensional even when num_examples == 1
    fig, ax = plt.subplots(num_digits, num_examples, figsize=(10,10), squeeze=False)

    for digit in xrange(num_digits): # for each digit
        for j in xrange(num_examples): # for each plot within each digit
            # A cell stays empty when fewer than `num_examples` examples were found
            if j < len(index_dict[digit]):
                _ind = index_dict[digit][j]
                ax[digit,j].imshow(train_data[_ind].reshape(28,28), cmap=plt.cm.gray)

            # Plot with no axes
            ax[digit,j].axis('off')

    # Minor figure adjustments
    fig.suptitle('Showing {} examples for each digit'.format(num_examples), size=18)
    fig.subplots_adjust(wspace=.02, hspace=.02)

### STUDENT END ###

P1(10)

def P2(k_values, extra=False):

### STUDENT START ###

    # For extra report
    extra_reports = []

    for k in k_values:
        # Create and train model
        _model = KNeighborsClassifier(n_neighbors=k)
        _model.fit(mini_train_data, mini_train_labels)
        
        # Predict
        _predicted = _model.predict(dev_data)
    
    ## Extra report comparing the overall performance of each of the models
        if extra:
            _p = classification_report(y_true=dev_labels, y_pred=_predicted, digits=3)
            
            # Report for k==1
            if k == 1:
                _p = classification_report(y_true=dev_labels, y_pred=_predicted, digits=3)
                print 'For k==1 neighbors:'
                print _p
            
            # Parse report (obtain prec, recall and F1 for avg / total)
            line = _p.split('\n')[-2]
            _1, precision, recall, f1, _2 = [x for x in line.split('  ') if len(x)]
            
            # Calculate accuracy and append to list
            _correct = sum([ i == j for i,j in zip(_predicted, dev_labels) ])
            accuracy = 1. * _correct / len(dev_labels)

            extra_reports.append( (k, accuracy, precision, recall, f1) )
    
    # Print extra report
    if extra:
        print '  MODEL      Accuracy  Precision   Recall  F1score'
        for entry in extra_reports:
            print 'Neighbors {0}:    {1:.1%}     {2}    {3}   {4}'.format(*entry)
    
### STUDENT END ###

k_values = [1, 3, 5, 7, 9]
P2(k_values, extra=True)

def P3(train_sizes, accuracies):

### STUDENT START ###

    print '  TRAIN SIZE   Accuracy   Precision   Recall  F1score    Time'

    for size in train_sizes:
        # Train model
        _model = KNeighborsClassifier(n_neighbors=1)
        _model.fit(train_data[:size], train_labels[:size])

        # Predict and time the prediction
        _start = time.time()
        _predicted = _model.predict(dev_data)
        _elapsed = time.time() - _start
        
        # Calculate accuracy and append to list
        _correct = sum([ i == j for i,j in zip(_predicted, dev_labels) ])
        accuracy = 1. * _correct / len(dev_labels)
        accuracies.append(accuracy)

        # Extra reporting
        _p = classification_report(y_true=dev_labels, y_pred=_predicted, digits=3)
        # Parse report (obtain prec, recall and F1 for avg / total)
        line = _p.split('\n')[-2]
        _1, precision, recall, f1, _2 = [x for x in line.split('  ') if len(x)]
        
        # Format time and print along with accuracies
        _t = str(_elapsed).split('.')
        print '{s:5} records:    {a:<.1%}      {p}    {r}   {f}  {t[0]:>2}.{t[1]:.2}s'.format(
            s=size, a=accuracy, p=precision, r=recall, f=f1, t=_t)

### STUDENT END ###

train_sizes = [100, 200, 400, 800, 1600, 3200, 6400, 12800, 25000]
accuracies = []
P3(train_sizes, accuracies)

def P4(transform=False, size=60000):

### STUDENT START ###

    # Access data from P3 and reshape train sizes (to matrix)
    _ts = map( lambda x: [x,], train_sizes )
    
    # If specified, transform data
    if transform:
        _acc = map( lambda x: 1./(1 - x) , accuracies )
    else:
        _acc = accuracies
    
    # Fit model and predict for 600k
    _m = LinearRegression()
    _m.fit(_ts, _acc)
    pred = _m.predict([[size]])

    # If transformed, untransform data
    if transform:
        _pred = 1 -  1. / pred[0]
    else:
        _pred = pred[0]

    # Print result
    print 'Transformation {w:>3} -> Accuracy predicted (size={s}k): {p:>6.1%}'.format(
        w= 'ON' if transform else 'OFF', s=size/1000, p=_pred )

### STUDENT END ###

P4()
P4(transform=True)

def P5():

### STUDENT START ###

    # Fit model and get predictions
    _m = KNeighborsClassifier(n_neighbors=1)
    _m.fit(mini_train_data, mini_train_labels)
    _p = _m.predict(dev_data)

    # Print confusion matrix
    print 'CONFUSION MATRIX'
    print confusion_matrix(y_true=dev_labels, y_pred=_p, labels= range(10))    

    # Print examples of mislabeled 9s (two 9s as 3s and two 9s as 7s)
    fig, ax = plt.subplots(2, 2, figsize=(6,6))
    
    _num_as3, _num_as7 = 0, 0 # aux for keeping track of examples plotted
    
    for i, label in enumerate(dev_labels):
        if label == 9:
            
            # If 9 mislabeled as 3
            if _p[i] == 3 and _num_as3 < 2:
                ax[0,_num_as3].imshow(dev_data[i].reshape(28,28), cmap=plt.cm.gray)
                ax[0,_num_as3].axis('off')
                _num_as3 += 1

            # If 9 mislabeled as 7
            if _p[i] == 7 and _num_as7 < 2:
                ax[1,_num_as7].imshow(dev_data[i].reshape(28,28), cmap=plt.cm.gray)
                ax[1,_num_as7].axis('off')
                _num_as7 += 1
            
            if _num_as3 + _num_as7 >= 4:
                break

    fig.suptitle('TOP: 9s classified as 3s , BOTTOM: 9s classified as 7s', size=15)
    fig.subplots_adjust(wspace=.1, hspace=.1)
    
### STUDENT END ###

P5()

class GaussianBlur(object):
    '''Implements a Gaussian Blur with the specified blur_range and std_deviation

    Attributes:
        blur_range [int]: Length of the blur's radius (Defaults to 1)
        std_dev [float]: Standard deviation of the Gaussian applied (Defaults to 1)
        weights [matrix of floats]: weights for the specified `blur_range` and `std_dev`;
            weights[i, j] is the Gaussian density at a pixel offset of (i, j)

    '''

    def __init__(self, blur_range=1, std_dev=1):
        '''Initialize a GaussianBlur with specified blur length and standard deviation'''
        self.blur_range = blur_range
        self.std_dev = std_dev

        # Calculate weights for every non-negative (row, column) offset up to blur_range.
        # The Gaussian is symmetric, so negative offsets reuse the entry of their absolute value.
        offsets = np.arange(self.blur_range + 1)
        squared_distance = offsets[:, None]**2 + offsets[None, :]**2
        variance = float(self.std_dev)**2
        self.weights = np.exp( -1. * squared_distance / (2 * variance) ) / ( 2 * np.pi * variance )

    def blur(self, matrix):
        '''Apply Gaussian Blur

        Every output pixel is the weighted average of the input pixels that lie within
        `blur_range` rows and columns of it. Neighbors that fall outside the matrix are
        left out and the remaining weights are renormalized.

        Args:
            matrix: two-dimensional array of pixel values.

        Returns:
            A new float array with the same shape as `matrix`.
        '''
        # Work in floating point: an output buffer with an integer dtype would silently
        # truncate the weighted averages
        matrix = np.asarray(matrix, dtype=float)
        height, width = matrix.shape
        blurred = np.zeros_like(matrix)
        radius = self.blur_range

        for r in range(height):
            # Rows of the window around r, clipped to the matrix
            r_lo, r_hi = max(0, r - radius), min(height, r + radius + 1)
            row_weights = self.weights[np.abs(np.arange(r_lo, r_hi) - r)]

            for c in range(width):
                # Columns of the window around c, clipped to the matrix
                c_lo, c_hi = max(0, c - radius), min(width, c + radius + 1)
                window_weights = row_weights[:, np.abs(np.arange(c_lo, c_hi) - c)]

                window = matrix[r_lo:r_hi, c_lo:c_hi]
                blurred[r, c] = np.sum(window * window_weights) / np.sum(window_weights)

        return blurred

def P6(blur_train=True, blur_dev=True):

### STUDENT START ###

    _start = time.time() # 'start' timer
    _m = KNeighborsClassifier(n_neighbors=1)
    
    # Initialize GaussianBlur if necessary
    if blur_train or blur_dev:
        gblur = GaussianBlur()

    # Blur (if specified) and train
    if blur_train:
        _train = map( lambda x: gblur.blur( x.reshape(28,28) ).flatten() , mini_train_data )
    else:
        _train = mini_train_data
    _m.fit(_train, mini_train_labels)

    # Blur (if specified) and predict
    if blur_dev:
        _dev = map( lambda x: gblur.blur( x.reshape(28,28) ).flatten() , dev_data )
    else:
        _dev = dev_data
    _predicted = _m.predict(_dev)
    
    _elapsed = time.time() - _start # 'stop' timer
    _t = str(_elapsed).split('.')
    
    # Calculate accuracy and print report
    _correct = sum([ i == j for i,j in zip(_predicted, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)
    
    print '{tr:^16}    {dv:^12}      {a:.1%}   {t[0]:>2}.{t[1]:.2}s'.format(
        tr= 'ON' if blur_train else 'OFF',
        dv= 'ON' if blur_dev else 'OFF',
        a=accuracy, t=_t )

### STUDENT END ###

print 'Blurred Training    Blurred Dev    Accuracy    Time'
P6(blur_train=False, blur_dev=False)
P6(blur_train=False, blur_dev=True)
P6(blur_train=True, blur_dev=False)
P6(blur_train=True, blur_dev=True)

def binarize(data, thresholds=(.5,)):
    '''Split data into different classes given a set of thresholds (defaults to binarizing at .5)

    Each value is replaced by the index of the interval it falls in, as computed by
    np.digitize: 0 below the first threshold, 1 from the first threshold up to the
    second, and so on.

    Args:
        data: iterable of arrays (one per example) holding the values to discretize.
        thresholds: increasing sequence of interval boundaries.

    Returns:
        A list with one array of class indices per example.
    '''

    # An immutable default avoids sharing one list object between calls, and the
    # comprehension always yields a list, whichever way `map` behaves
    return [ np.digitize(d, thresholds) for d in data ]

rep_template = '{m:^15}  {th:^12}    {a:.1%}   {tm[0]:>2}.{tm[1]:.3}s'

def P7(thresholds):

### STUDENT START ###

    _start = time.time() # 'start' timer
    
    # Select Binomial or Multinomial
    if len(thresholds) == 1:
        mod = BernoulliNB(alpha=1., fit_prior=False)
    else:
        mod = MultinomialNB(alpha=1., fit_prior=False)

    # Train and fit (use binarize with .5, and alpha 1)
    mod.fit( binarize(mini_train_data, thresholds=thresholds) , mini_train_labels) 
    _p = mod.predict( binarize(dev_data,thresholds=thresholds) )

    _elapsed = time.time() - _start # 'stop' timer
    _t = str(_elapsed).split('.')
    
    # Calculate and report accuracy
    _correct = sum([ i == j for i,j in zip(_p, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)
    
    print rep_template.format(
        m='BernoulliNB' if len(thresholds) == 1 else 'MultinomialNB',
        th=thresholds, a=accuracy, tm=_t)

### STUDENT END ###

# Report head
print '{:^15}  Threshold(s)  Accuracy    Time'.format('MODEL')

# BernoulliNB
P7([.5])
P7([.9])
P7([.1])
# MultinomialNB
P7([.33, .66])
P7([.1, .9])
P7([.4, .6])

def P8(alphas):

### STUDENT START ###
    
    nb = BernoulliNB(binarize = .5, fit_prior=False) # initialize estimator
    gr_srch = GridSearchCV(nb, alphas)
    gr_srch.fit(mini_train_data, mini_train_labels) # fit grid search
    
    return gr_srch

### STUDENT END ###

alphas = {'alpha': [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]}
nb = P8(alphas)

## Report accuracies and scores for each alpha specified
print 'ALPHA   AVG-Score    SCORES'
for row in nb.grid_scores_:
    print "{0[alpha]:<6}   {1:5.1%}   [ {2[0]:5.1%}, {2[1]:5.1%}, {2[2]:5.1%} ]".format(*row)

print nb.best_params_

report_template = '{n}. {ap:>24}    {ac:>5.1%}'

def P9():
### STUDENT START ###
    """Trains a Gaussian Naive Bayes model (assumes P(x|y)~N(theta,sigma))
    
    Various approaches tested:
        1. Raw - using the gnb model as given
        2. Set theta and sigma based on the data's mean and std deviation
        3. Fix sigma to be 1
        4. Standardize data using the GNB's estimated theta and sigma
    
    NOTE: A fifth approach was tried, standardizing data according to the data's theta and sigma.
        However, the approach failed due to some sigma's being zero.
    """
    
    gnb = GaussianNB()
    gnb.fit(mini_train_data, mini_train_labels)

    # Record original params
    sigma_gnb = gnb.sigma_.copy()
    theta_gnb = gnb.theta_.copy()

    # Estimate data's mean and standard deviation
    sigma_data = sigma_gnb.copy()
    theta_data = theta_gnb.copy()
    for c in xrange(10):
        _filter = mini_train_labels == c
        sigma_data[c] = np.std(mini_train_data[ _filter ], axis=0)
        theta_data[c] = np.average(mini_train_data[ _filter ], axis=0)
    
## 1. Raw approach
    # Predict and get accuracies 
    _p = gnb.predict(dev_data)
    _correct = sum([ i == j for i,j in zip(_p, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)
    
    # Print report
    print report_template.format(n=1, ap='Raw (no transform)', ac=accuracy)


## 2. Calculate std dev from the data and apply to sigma parameter
    gnb.theta_ = theta_data
    gnb.sigma_ = sigma_data

    # Predict and get accuracies 
    _p = gnb.predict(dev_data)
    _correct = sum([ i == j for i,j in zip(_p, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)

    # Print report
    print report_template.format(n=2, ap="Data's parameters", ac=accuracy)


## 3. Fix sigma to be 1
    gnb.theta_ = theta_gnb  # revert
    gnb.sigma_ = np.ones_like(gnb.sigma_)

    # Predict and get accuracies 
    _p = gnb.predict(dev_data)
    _correct = sum([ i == j for i,j in zip(_p, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)

    print report_template.format(n=3, ap='Fix Sigma=1' ,ac=accuracy)
    
    
## 4. Standardize data approach using original GNB's estimated theta and sigma
    std_mini_train_data = np.zeros_like(mini_train_data)
    for i, c in enumerate(mini_train_labels):
        std_mini_train_data[i] = (mini_train_data[i] - theta_gnb[c]) / sigma_gnb[c]

    std_dev_data = np.zeros_like(dev_data)
    for i, c in enumerate(dev_labels):
        std_dev_data[i] = (dev_data[i] - theta_gnb[c]) / sigma_gnb[c]


    gnb2 = GaussianNB()
    gnb2.fit(std_mini_train_data, mini_train_labels)
    _p = gnb2.predict(std_dev_data)
    _correct = sum([ i == j for i,j in zip(_p, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)

    print report_template.format(n=4, ap='Standardize Data (GNB)', ac=accuracy)
 
    
### STUDENT END ###

print '            APPROACH          Accuracy'
print report_template.format(n=0, ap='BernoulliNB (P8)', ac=.821)
P9()

def P10(num_examples=20):
    """Plot digits sampled from a BernoulliNB model fit on the mini training set.

    For every digit, `num_examples` images are generated by switching each pixel on
    with the probability the model estimated for that pixel and digit.

    Args:
        num_examples: number of generated images per digit (at least 1).

    Raises:
        ValueError: if `num_examples` is smaller than 1.
    """

### STUDENT START ###
    if num_examples < 1:
        raise ValueError('num_examples must be at least 1')

    num_digits = 10 # number of digits to show

    # Fit BernoulliNB model with alpha=0.01 (as per P8's response)
    bnb = BernoulliNB(alpha=0.01, binarize=.5, fit_prior=False)
    bnb.fit( mini_train_data , mini_train_labels)

    ## Generate the figure
    # squeeze=False keeps `ax` two-dimensional even when num_examples == 1
    fig, ax = plt.subplots(num_digits, num_examples, figsize=(16,8), squeeze=False)

    for dig in xrange(num_digits): # for each digit
        # Per-pixel "on" probabilities of this digit; the same for every example in the row
        pixel_on_prob = np.exp(bnb.feature_log_prob_[dig])

        for j in xrange(num_examples): # for each plot within each digit

            # Generate example: a pixel is on when its uniform random draw falls below its probability
            _example = (np.random.rand(784) < pixel_on_prob).reshape(28,28)

            # Plot with no axes
            ax[dig,j].imshow(_example, cmap=plt.cm.gray)
            ax[dig,j].axis('off')

    # Minor figure adjustments
    fig.suptitle('Showing {} randomly generated examples for each digit'.format(num_examples), size=18)
    fig.subplots_adjust(wspace=.02, hspace=.02)


### STUDENT END ###

P10(20)

def P11(buckets, correct, total):
    
### STUDENT START ###

    # Fit BernoulliNB model with alpha=0.01 (as per P8's response)
    bnb = BernoulliNB(alpha=0.01, binarize=.5, fit_prior=False)
    bnb.fit( mini_train_data , mini_train_labels)
    _pred_prob = np.exp( bnb.predict_log_proba(dev_data) )
    
    # Get predicted probabilities and labels
    posterior_prob = np.max(_pred_prob, axis=1)
    predicted_label = np.argmax(_pred_prob, axis=1)
        
    # Update total and correct counts accordingly
    for i, p in enumerate(posterior_prob):  # for each prediction
        for j, b in enumerate(buckets):  # for each posterior bucket
            if p >= b:
                # add to total count
                total[j] += 1
                # check if correct
                if predicted_label[i] == dev_labels[i]:
                    correct[j] += 1
                
### STUDENT END ###

buckets = [0.5, 0.9, 0.999, 0.99999, 0.9999999, 0.999999999, 0.99999999999, 0.9999999999999, 1.0]
correct = [0 for i in buckets]
total = [0 for i in buckets]

P11(buckets, correct, total)

print '  POSTERIOR BUCKET    Accuracy  Total  Correct'
for i in range(len(buckets)):
    accuracy = 0.0
    if (total[i] > 0): accuracy = 1. * correct[i] / total[i]
    print 'P <= {:<15.13}    {:>5.1%}    {:>4d}      {:>3d}'.format(buckets[i],accuracy, total[i], correct[i])

# Additional imports necessary for image processing
from skimage import exposure
from skimage.filters import sobel
from skimage.feature import canny, hog
from skimage.measure import label, regionprops

import matplotlib.patches as mpatches # for plotting regions

# Create index dictionary: position of the first example of each digit in the mini training set
index_dict = {}

for i, lbl in enumerate(mini_train_labels):
    # If we have not seen this label yet, record where it is
    if lbl not in index_dict:
        index_dict[lbl] = i

        # When we have found our 10 digits, stop iterating through labels
        if len(index_dict) >= 10:
            break

## PLOT EXAMPLES
# One row per digit, one column per extracted feature
fig, ax = plt.subplots(10, 7, figsize=(10,14))

for i in xrange(10): # for each digit

## Feature Extraction
    # Get the image, its Sobel filter and its edge map.
    # NOTE(review): the edge map comes from `canny` (an edge detector) although it is
    # named and titled "Hough" throughout - names and titles are left as they were.
    _img = mini_train_data[index_dict[i]].reshape(28,28)
    _sobel = sobel(_img)
    _hough = canny(_img, sigma=1.25)

    # Histogram of Oriented Gradients (HOG)
    _, _hog = hog(_img, orientations=8, pixels_per_cell=(4, 4),
                    cells_per_block=(1, 1), visualise=True)
    # rescale HOG's histogram for better display
    _hog_adj = exposure.rescale_intensity(_hog, in_range=(0, 0.02))

    # Region detection on Raw
    _labeled = label(_img, connectivity=_img.ndim)
    _regions = regionprops(_labeled)

    # Region detection on Hough
    _labeled_hou = label(_hough, connectivity=_img.ndim)
    _regions_hou = regionprops(_labeled_hou)


## Plots (no axes)
    # Columns: raw image, Sobel filter, edge map, HOG, adjusted HOG, regions on raw and
    # regions on the edge map (the last column shows the labeled edge map, matching its title)
    _panels = [_img, _sobel, _hough, _hog, _hog_adj, _labeled, _labeled_hou]
    for _col, _panel in enumerate(_panels):
        ax[i,_col].imshow(_panel, cmap=plt.cm.gray)
        ax[i,_col].axis('off')

    # Draw a red bounding box around every detected region in the two region columns
    for _col, _found in ((5, _regions), (6, _regions_hou)):
        for reg in _found:
            minr, minc, maxr, maxc = reg.bbox
            rect = mpatches.Rectangle((minc, minr), maxc - minc, maxr - minr,
                                      fill=False, edgecolor='red', linewidth=2)
            ax[i,_col].add_patch(rect)


# Add column titles and figure title (once, after all rows are drawn, rather than
# stacking identical text on every pass of the loop)
fig.text(.175,.915, 'Raw\nImage', ha='center', fontsize=17)
fig.text(.285,.915, 'Sobel\nFilter', ha='center', fontsize=17)
fig.text(.4,.915, 'Hough\nTransform', ha='center', fontsize=17)
fig.text(.515,.925, 'HOG', ha='center', fontsize=17)
fig.text(.625,.915, 'Adjusted\nHOG', ha='center', fontsize=17)
fig.text(.74,.915, 'Regions\non Raw', ha='center', fontsize=17)
fig.text(.85,.915, 'Regions\non Hough', ha='center', fontsize=17)

fig.suptitle('Additional Features Extraction', fontsize=21)
        

## AUXILIARY FUNCTIONS for cleaner model comparison

accuracy_header = '{m:^40}  {ac:^5}   {tm:^7}'.format(m='MODEL', ac='Accuracy', tm='Time')
accuracy_template = '{m:<40}    {ac:>5.1%}   {tm[0]:>2}.{tm[1]:.3}s'

def report_model(model_label, predicted, time_elapsed):
    ''' Reports the model accuracy and time of computation '''
    # Calculate accuracy
    _correct = sum([ i == j for i,j in zip(predicted, dev_labels) ])
    accuracy = 1. * _correct / len(dev_labels)
    
    # Process time
    _t = str(time_elapsed).split('.')
    
    # Print report
    print accuracy_template.format(m=model_label, ac=accuracy, tm=_t)


def transform_and_train(model_label, transformations):
    ''' Applies the transformations to the data, trains a BernoulliNB model and reports on it

    Args:
        model_label: text identifying the model in the report.
        transformations: functions applied in order to every example of both the
            mini training set and the dev set.
    '''
    timer_start = time.time() # the reported time covers transforming, fitting and predicting

    # Work on copies of the training and development data
    train_features = mini_train_data.copy()
    dev_features = dev_data.copy()
    for transform in transformations:
        train_features = [ transform(example) for example in train_features ]
        dev_features = [ transform(example) for example in dev_features ]

    # Fit on the transformed training data and predict the transformed development data
    classifier = BernoulliNB(alpha=0.01, binarize=.5, fit_prior=False)
    classifier.fit( train_features , mini_train_labels)
    dev_predictions = classifier.predict(dev_features)

    # Stop the timer and report model performance
    report_model(model_label, dev_predictions, time.time() - timer_start)


## FEATURE EXTRACTION 

# Sum of rows and columns
def sum_rows_cols(flat_array):
    """Append the sums along both axes of the 28x28 image to the feature vector.

    The image is taken from the first 784 entries of `flat_array`, so features
    appended by earlier transformations are ignored when computing the sums.
    The result is `flat_array` followed by the 28 sums along axis 0 and then
    the 28 sums along axis 1.
    """
    image = flat_array[:784].reshape(28,28)
    axis0_totals = image.sum(axis=0)
    axis1_totals = image.sum(axis=1)
    return np.concatenate([flat_array, axis0_totals, axis1_totals], axis=0)


# Sobel Filter
def sobel_filter(flat_array):
    """Append the flattened output of `sobel` for the 28x28 image to the feature vector.

    The image is taken from the first 784 entries of `flat_array`.
    """
    image = flat_array[:784].reshape(28,28)
    edge_features = sobel(image).flatten()
    return np.concatenate([flat_array, edge_features], axis=0)


# Hough Transform
def hough_transform(flat_array):
    """Append the flattened output of `canny` (sigma=1.25) for the 28x28 image to the feature vector.

    The image is taken from the first 784 entries of `flat_array`.
    """
    image = flat_array[:784].reshape(28,28)
    edge_map = canny(image, sigma=1.25)
    return np.concatenate([flat_array, edge_map.flatten()], axis=0)


# HOG
def hog_transform(flat_array):
    """Append the flattened HOG visualisation image of the 28x28 image to the feature vector.

    The image is taken from the first 784 entries of `flat_array`. Only the second
    value returned by `hog` is used.
    """
    image = flat_array[:784].reshape(28,28)
    hog_outputs = hog(image, orientations=8, pixels_per_cell=(4, 4),
                      cells_per_block=(1, 1), visualise=True)
    hog_image = hog_outputs[1]
    return np.concatenate([flat_array, hog_image.flatten()], axis=0)


# Adusted HOG
def hog_adj_transform(flat_array):
    """Append the intensity-rescaled HOG visualisation image to the feature vector.

    The image is taken from the first 784 entries of `flat_array`. The second value
    returned by `hog` is rescaled with an input range of (0, 0.02) and flattened.
    """
    image = flat_array[:784].reshape(28,28)
    hog_outputs = hog(image, orientations=8, pixels_per_cell=(4, 4),
                      cells_per_block=(1, 1), visualise=True)
    rescaled = exposure.rescale_intensity(hog_outputs[1], in_range=(0, 0.02))
    return np.concatenate([flat_array, rescaled.flatten()], axis=0)


# Regions on Hough - just adds number of regions
def regions_on_hough(flat_array):
    """Append the number of labeled regions found in the edge map to the feature vector.

    The 28x28 image is taken from the first 784 entries of `flat_array`, its edge
    map is computed with `canny` (sigma=1.25), the edge map is labeled and the
    regions returned by `regionprops` are counted.

    Returns:
        `flat_array` with one extra entry holding the number of regions.
    """
    _shaped_array = flat_array[:784].reshape(28,28)
    _hough = canny(_shaped_array, sigma=1.25)
    # The connectivity is taken from the array being labeled, not from a module-level
    # variable that only exists once the plotting code has been run
    _labeled_hou = label(_hough, connectivity=_hough.ndim)
    _regions_hou = regionprops(_labeled_hou)
    return np.concatenate([ flat_array, [len(_regions_hou)] ], axis=0)


## MODEL COMPARISON
print accuracy_header

transform_and_train('BernoulliNB', transformations= [lambda x: x])

print '\n--- Single Feature Added ---'
transform_and_train('+ Sum of rows/columns', [sum_rows_cols])
transform_and_train('+ Sobel Filter', [sobel_filter])
transform_and_train('+ Hough Transform', [hough_transform])
transform_and_train('+ Histogram of Oriented Gradients', [hog_transform])
transform_and_train('+ Adjusted HOG', [hog_adj_transform])
transform_and_train('+ Regions on Hough', [regions_on_hough])

print '\n--- Two Features Added ---'
transform_and_train('+ Sum + Sobel', [sum_rows_cols, sobel_filter])
transform_and_train('+ Sum + Hough', [sum_rows_cols, hough_transform])
transform_and_train('+ Sum + Adj HOG', [sum_rows_cols, hog_adj_transform])
transform_and_train('+ Sum + Regions', [sum_rows_cols, regions_on_hough])
transform_and_train('+ Sobel + Hough', [sobel_filter, hough_transform])
transform_and_train('+ Sobel + Regions', [sobel_filter, regions_on_hough])
transform_and_train('+ Hough + Regions', [hough_transform, regions_on_hough])
transform_and_train('+ Adj HOG + Regions', [hog_adj_transform, regions_on_hough])

print '\n--- Multiple Features Added ---'
transform_and_train('+ Sum + Sobel + Regions', [sum_rows_cols, sobel_filter, regions_on_hough])
transform_and_train('+ Sum + Adj HOG + Regions', [sum_rows_cols, hog_adj_transform, regions_on_hough])
transform_and_train('+ Sobel + Adj HOG + Regions', [sobel_filter, hog_adj_transform, regions_on_hough])
transform_and_train('+ Sum + Sobel + Adj HOG + Regions',
                    [sum_rows_cols, sobel_filter, hog_adj_transform, regions_on_hough])


## FINAL MODEL Performance

# Final Model specification
train_size = 5000
transformations = [hog_adj_transform, regions_on_hough]
# transformations = [regions_on_hough]


_start = time.time() # 'start' timer

# Transform data (training, development and test)
transformed_train = train_data[:train_size].copy()
transformed_dev = dev_data.copy()
transformed_test = test_data.copy()
for tr in transformations:
    transformed_train = map( tr, transformed_train )
    transformed_dev = map( tr, transformed_dev )
    transformed_test = map( tr, transformed_test )

# Train model
bnb = BernoulliNB(alpha=0.01, binarize=.5, fit_prior=False)
bnb.fit( transformed_train , train_labels[:train_size])
_pred_dev = bnb.predict(transformed_dev)
_pred_test = bnb.predict(transformed_test)

# Process time
_elapsed = time.time() - _start # 'stop' timer
_minutes = str(_elapsed / 60).split('.')
_t = [ _minutes[0], float('.'+_minutes[1]) * 60 ]

# Calculate accuracies (on development and test data)
d_correct = sum([ i == j for i,j in zip(_pred_dev, dev_labels) ])
daccuracy = 1. * d_correct / len(dev_labels)

t_correct = sum([ i == j for i,j in zip(_pred_test, test_labels) ])
taccuracy = 1. * t_correct / len(test_labels)


# Final Report
print '{m:^30}   {dac:^9}  {tac:^9}  {tm:^7}'.format(m='MODEL', dac='Dev Acc.', tac='Test Acc.', tm='Time')
print '{m:<30}    {dac:>5.1%}     {tac:>5.1%}   {tm[0]:>2}m {tm[1]:.1f}s'.format(
    m='BernoulliNB + Adj HOG + Regions', dac=daccuracy, tac=taccuracy, tm=_t)