import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, cross_validation
myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
print(len(aBookCollection))
del myShelf
anotherCollection = aBookCollection.selection().exclude_authors_below(4)
print(len(anotherCollection))
train_collection, test_collection = anotherCollection.selection().split_per_author_percentage(0.7)
collection_dataframe = anotherCollection.as_dataframe()
def label_for(book):
if book in train_collection.books():
return 'Train'
else:
return 'Test'
collection_dataframe['Set'] = collection_dataframe['Object'].map(label_for)
575
plt.figsize(12, 4)
print(collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set').describe())
collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set').plot(kind='bar', stacked=True)
#both_collections.groupby('Author').count().size().plot(kind='bar')
Set Test Train count 38.000000 38.000000 mean 4.447368 10.684211 std 5.693512 13.154190 min 1.000000 3.000000 25% 1.250000 4.000000 50% 2.000000 6.000000 75% 4.000000 9.750000 max 28.000000 66.000000
<matplotlib.axes.AxesSubplot at 0x7fca44dd9710>
plt.figsize(12, 4)
test_collection.as_dataframe().groupby('Author').size().plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x7fca47cce690>
tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
#extractor = bc.FrequenciesExtractor(tokenizer)
model = bc.ClassificationModel(training, extractor, decomposition.TruncatedSVD(50), svm.SVC())
results = model.classify(testing)
from sklearn.metrics import classification_report, confusion_matrix
expected = []
predicted = []
for book in testing.books():
expected.append(book.author())
predicted.append(results[book])
plt.pcolor(confusion_matrix(expected, predicted))
<matplotlib.collections.PolyCollection at 0x7f089f88fa50>
print(classification_report(expected, predicted))
precision recall f1-score support Anthony Hamilton 1.00 1.00 1.00 4 Arthur Robert Harding 1.00 1.00 1.00 2 Bret Harte 1.00 0.62 0.77 53 Charles Dickens 0.84 0.50 0.63 42 E. Raymond Hall 0.68 1.00 0.81 17 Edward E. Hale 1.00 0.67 0.80 3 Ernst Haeckel 1.00 1.00 1.00 2 F. Colburn Adams 0.00 0.00 0.00 2 Frank Harris 0.18 0.40 0.25 5 George Ade 0.75 1.00 0.86 3 H. Irving Hancock 1.00 1.00 1.00 31 H. Rider Haggard 1.00 0.81 0.89 47 Harry Harrison 0.08 0.67 0.15 6 Ian Hay 0.75 0.60 0.67 5 James B. Hendryx 0.71 1.00 0.83 5 Joel Chandler Harris 1.00 0.67 0.80 6 John Hay 0.15 1.00 0.27 2 Julian Hawthorne 0.09 0.44 0.15 9 Lafcadio Hearn 1.00 0.50 0.67 8 Nathaniel Hawthorne 0.51 0.26 0.34 89 O. Henry 1.00 0.89 0.94 9 Samuel Hopkins Adams 1.00 1.00 1.00 3 Thomas Bailey Aldrich 0.25 0.14 0.18 14 Thomas Hardy 0.82 0.86 0.84 21 avg / total 0.76 0.61 0.65 388
print(confusion_matrix(expected, predicted))
[[ 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 33 0 2 0 0 0 6 1 0 0 2 0 0 0 1 2 0 6 0 0 0 0] [ 0 0 0 21 0 0 0 0 1 0 0 0 2 0 0 0 1 10 0 7 0 0 0 0] [ 0 0 0 0 17 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0] [ 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 2 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 31 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 1 0 0 38 1 1 0 0 1 3 0 2 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 4 0 2 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3 0 0 1 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 4 1 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4 0 0 0 0 1 3] [ 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 4 1 0 0 0 0] [ 0 0 0 3 5 0 0 0 0 0 0 0 30 0 0 0 2 23 0 23 0 0 2 1] [ 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 8 0 0 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0] [ 0 0 0 0 1 0 0 0 0 0 0 0 5 0 0 0 0 0 0 6 0 0 2 0] [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 18]]