import io, os, itertools

import numpy, pandas
import matplotlib.pyplot as plt
from sklearn import (feature_extraction, preprocessing, decomposition,
                     cross_validation, svm, metrics)

# Tweak how tables are displayed
pandas.set_option('display.precision', 4)
pandas.set_option('display.max_colwidth', 30)
pandas.set_option('display.colheader_justify', 'left')

# Set up a simple BOW model
vectorizer = feature_extraction.text.TfidfVectorizer(
    lowercase=True, token_pattern=r'\b[-A-Za-z]{3,}\b',
    min_df=0.1, max_df=0.5, max_features=5000,
    use_idf=True, sublinear_tf=True)

# Get a list of all filenames in the 'train/' folder,
# and add their text to the BOW table 'X'.
filenames = os.listdir('train/')
X = vectorizer.fit_transform(
    io.open('train/' + a, encoding='utf8').read() for a in filenames)

vec = vectorizer.transform([
    'It was a dark and stormy night; '
    'the rain fell in torrents — except at occasional intervals, '
    'when it was checked by a violent gust of wind which swept '
    'up the streets (for it is in London that our scene lies), '
    'rattling along the housetops, and fiercely agitating the scanty '
    'flame of the lamps that struggled against the darkness.'])

# We get back a sparse vector with a value for each possible word;
# show a table with only the non-zero items in the vector:
feature_names = vectorizer.get_feature_names()
pandas.DataFrame(
    [(feature_names[b], (a, b), vec[a, b]) for a, b in zip(*vec.nonzero())],
    columns=['word', 'index', 'weight'])

# Print the first 5 lines to see what the metadata looks like:
print(''.join(io.open('metadata.csv', encoding='utf8').readlines()[:5]))

# Load the metadata; the fourth column (the filename) becomes the index.
metadata = pandas.read_csv('metadata.csv', index_col=3, encoding='utf8')
genres = dict(zip(metadata.index, metadata['Dataset']))

# Convert the genre labels to integers
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform([genres[a] for a in filenames])

# Create an abbreviated label "Author_Title" for each text
authors = dict(zip(metadata.index, metadata['Author']))
titles = dict(zip(metadata.index, metadata['Title']))
labels = ['%s_%s' % (authors[a].split(',')[0].title(), titles[a][:15].title())
          for a in filenames]

# Reduce the BOW model to 2 dimensions
dec = decomposition.TruncatedSVD(n_components=2)
X_r = dec.fit_transform(X)
print('Explained variance:', dec.explained_variance_ratio_)

# Make a scatter plot with the author/title of each text as label.
# LabelEncoder numbers the classes from 0, so the class labels in 'y'
# line up with a plain enumerate() over encoder.classes_.
plt.figure(figsize=(12, 8))
for c, (i, target_name) in zip('rbmkycg', enumerate(encoder.classes_)):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    for n, xpos, ypos in zip(
            (y == i).nonzero()[0], X_r[y == i, 0], X_r[y == i, 1]):
        plt.annotate(labels[n], xy=(xpos, ypos), xytext=(5, 5),
                     textcoords='offset points', color=c,
                     fontsize='small', ha='left', va='top')
plt.legend()
plt.title('%s of dataset' % dec.__class__.__name__)
plt.show()

# Randomly select 80% as training set, 20% as test/validation set,
# but make sure that each genre is well-represented (stratified).
train, test = next(iter(cross_validation.StratifiedShuffleSplit(
    y, test_size=0.2, n_iter=1, random_state=42)))
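# Not part of the original pipeline: a quick sanity check that the
# stratified split preserved the genre proportions. 'train' and 'test'
# are the index arrays produced above; each genre should occur in
# roughly the same proportion in every subset.
for name, subset in [('all', y), ('train', y[train]), ('test', y[test])]:
    counts = numpy.bincount(subset, minlength=len(encoder.classes_))
    print(name, dict(zip(encoder.classes_,
                         numpy.round(counts / float(counts.sum()), 2))))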
# Train an SVM classifier and predict the genre of the items in the test set.
clf = svm.LinearSVC(C=1.0, random_state=42)
clf.fit(X[train], y[train])
pred = clf.predict(X[test])

print('Overall accuracy:\t%4.1f %%\n'
      % (100 * metrics.accuracy_score(y[test], pred)))
print(metrics.classification_report(y[test], pred,
                                    target_names=encoder.classes_))
pandas.DataFrame(metrics.confusion_matrix(y[test], pred),
                 index=sorted(encoder.classes_),
                 columns=sorted(encoder.classes_))

# Show the 10 test documents the classifier is least confident about,
# i.e., those with the weakest maximum decision-function value.
data = sorted(zip(test, clf.decision_function(X[test])),
              key=lambda x: max(x[1]))[:10]
pandas.DataFrame([a for _, a in data],
                 index=[labels[n] for n, _ in data],
                 columns=encoder.classes_)

# Sort the feature weights of the classifier and take the 10 most
# strongly weighted features for each genre.
data = []
for n, target in enumerate(encoder.classes_):
    data.append([(feature_names[m], clf.coef_[n][m])
                 for m in numpy.argsort(clf.coef_[n])[-10:][::-1]])
pandas.DataFrame(
    [list(itertools.chain(*a)) for a in zip(*data)],
    columns=list(itertools.chain(*((target, '')
                                   for target in encoder.classes_))),
    index=range(1, 11))

# Since we now evaluate on an external test set, we can use all of the
# original data as training data.
clf.fit(X, y)

# Transform the new files to the format of the existing BOW table
newfiles = os.listdir('test/')
X1 = vectorizer.transform(
    io.open('test/' + a, encoding='utf8').read() for a in newfiles)
predictions = encoder.inverse_transform(clf.predict(X1))
pandas.DataFrame(
    [(authors[a].title(), titles[a].title(), genres[a], b)
     for a, b in zip(newfiles, predictions)],
    index=newfiles,
    columns=['Author', 'Title', 'Actual', 'Predicted'])
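# A possible final step, not in the original code: persist the fitted
# vectorizer, label encoder, and classifier together, so the model can be
# reused later without re-reading the training corpus. A minimal sketch
# using the standard library's pickle module; the filename 'genre_clf.pkl'
# is arbitrary.
import pickle
with open('genre_clf.pkl', 'wb') as out:
    pickle.dump((vectorizer, encoder, clf), out)

# Later, load all three and classify new text in one step:
with open('genre_clf.pkl', 'rb') as inp:
    vectorizer, encoder, clf = pickle.load(inp)
print(encoder.inverse_transform(
    clf.predict(vectorizer.transform(['Once upon a time ...']))))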