import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, cross_validation

# Load the stored book collection from the shelf file.
# The context manager closes the shelf deterministically, even if the key
# lookup raises, instead of relying on `del` / garbage collection.
# NOTE(review): shelve unpickles the stored objects; only open shelf files
# that come from a trusted source.
with shelve.open("storage_new.db") as myShelf:
    aBookCollection = myShelf['aBookCollection']
print(len(aBookCollection))
del myShelf  # the closed handle is not used any further

# Narrow the full collection with exclude_authors_below(4)
# (presumably drops authors with fewer than 4 books -- TODO confirm in bc).
anotherCollection = (
    aBookCollection
    .selection()
    .exclude_authors_below(4)
)
print(len(anotherCollection))

# Split the filtered collection into two parts per author
# (presumably 70% of each author's books go to the first part -- TODO confirm).
train_collection, test_collection = (
    anotherCollection
    .selection()
    .split_per_author_percentage(0.7)
)

# Tabular view of the filtered collection; the rest of the script reads its
# 'Object' and 'Author' columns.
collection_dataframe = anotherCollection.as_dataframe()
def label_for(book):
    """Return 'Train' if *book* is among train_collection's books, else 'Test'.

    Reads the module-level ``train_collection``.
    """
    # NOTE(review): books() is called again on every invocation; if it returns
    # a list this is a linear scan per book -- consider hoisting it once the
    # return type of books() is confirmed.
    in_training_set = book in train_collection.books()
    return 'Train' if in_training_set else 'Test'
# Tag every row with the split ('Train' or 'Test') its book belongs to.
collection_dataframe['Set'] = collection_dataframe['Object'].map(label_for)

# Number of books per author, one column per split; computed once and reused
# for both the summary statistics and the chart.
books_per_author = collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set')
print(books_per_author.describe())

# matplotlib.pyplot has no figsize() function (it raised AttributeError), so
# the figure size is set when the figure is created and the axes are handed to
# pandas explicitly.
_, split_axes = plt.subplots(figsize=(12, 4))
books_per_author.plot(kind='bar', stacked=True, ax=split_axes)

# Separate figure: number of books per author in the test split only.
_, test_axes = plt.subplots(figsize=(12, 4))
test_collection.as_dataframe().groupby('Author').size().plot(kind='bar', ax=test_axes)

from sklearn.metrics import classification_report, confusion_matrix

# Feature extraction pipeline built from the book_classification helpers.
tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)  # presumably groups of 500 tokens -- TODO confirm
extractor = bc.EntropiesExtractor(tokenizer, grouper)
# Alternative feature set: bc.FrequenciesExtractor(tokenizer)

# Fit on the training split and classify the held-out split. The original
# referenced the undefined names `training` / `testing` (NameError); the split
# produced earlier in the script is train_collection / test_collection.
# TruncatedSVD(50) requests 50 components; SVC() uses scikit-learn's defaults.
model = bc.ClassificationModel(
    train_collection,
    extractor,
    decomposition.TruncatedSVD(50),
    svm.SVC(),
)
results = model.classify(test_collection)

# Materialise the test books once so expected and predicted labels are
# guaranteed to be aligned element by element.
test_books = list(test_collection.books())
expected = [book.author() for book in test_books]
predicted = [results[book] for book in test_books]

# Compute the confusion matrix once and reuse it for the plot and the printout.
confusion = confusion_matrix(expected, predicted)

# Draw the heat map on its own figure instead of on top of whatever figure
# happens to be current.
plt.figure()
plt.pcolor(confusion)

print(classification_report(expected, predicted))

print(confusion)