"""Train and evaluate an author classifier on a stored book collection.

The script loads a book collection from the ``storage_new.db`` shelf,
filters it with ``exclude_authors_below(4)``, splits it per author with
``split_per_author_percentage(0.7)``, plots how many books each author has
in each split, then fits a classification model on the training split and
reports its predictions on the test split.
"""
import shelve

import matplotlib.pyplot as plt
import numpy
import pandas
# NOTE(review): ``cross_validation`` is not used in this script and the
# module no longer exists in recent scikit-learn releases; kept because
# other code may rely on the import -- confirm before removing.
from sklearn import svm, decomposition, cross_validation
from sklearn.metrics import classification_report, confusion_matrix

import book_classification as bc


# Load the stored collection and release the shelf file right away.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    myShelf.close()
del myShelf
print(len(aBookCollection))

# Presumably drops authors with fewer than 4 books -- confirm against
# book_classification.
anotherCollection = aBookCollection.selection().exclude_authors_below(4)
print(len(anotherCollection))

# Presumably a 70% / 30% train/test split within each author -- confirm.
train_collection, test_collection = (
    anotherCollection.selection().split_per_author_percentage(0.7)
)
collection_dataframe = anotherCollection.as_dataframe()


def label_for(book):
    """Return 'Train' if *book* is in the training split, otherwise 'Test'."""
    if book in train_collection.books():
        return 'Train'
    return 'Test'


collection_dataframe['Set'] = collection_dataframe['Object'].map(label_for)

# ``pyplot`` has no ``figsize`` function; the default figure size is set
# through rcParams and applies to every figure created below.
plt.rcParams['figure.figsize'] = (12, 4)

# Number of books per author, one column per split.
books_per_set = (
    collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set')
)
print(books_per_set.describe())
books_per_set.plot(kind='bar', stacked=True)

# Start a fresh figure so this plot does not draw onto the previous axes.
plt.figure()
test_collection.as_dataframe().groupby('Author').size().plot(kind='bar')

tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
# Alternative feature extractor:
# extractor = bc.FrequenciesExtractor(tokenizer)

# Fit on the training split and classify the held-out test split.
model = bc.ClassificationModel(
    train_collection,
    extractor,
    decomposition.TruncatedSVD(50),
    svm.SVC(),
)
results = model.classify(test_collection)

# Pair each test book's true author with the model's prediction for it.
test_books = list(test_collection.books())
expected = [book.author() for book in test_books]
predicted = [results[book] for book in test_books]

confusion = confusion_matrix(expected, predicted)

plt.figure()
plt.pcolor(confusion)
print(classification_report(expected, predicted))
print(confusion)