Notebook

In [1]:

import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn import svm, decomposition, cross_validation

In [2]:

myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
print(len(aBookCollection))
del myShelf

In [34]:

anotherCollection = aBookCollection.selection().exclude_authors_below(4)
print(len(anotherCollection))
train_collection, test_collection = anotherCollection.selection().split_per_author_percentage(0.7)

collection_dataframe = anotherCollection.as_dataframe()
def label_for(book):
    if book in train_collection.books():
        return 'Train'
    else:
        return 'Test'
collection_dataframe['Set'] = collection_dataframe['Object'].map(label_for)

In [83]:

plt.figsize(12, 4)
print(collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set').describe())
collection_dataframe.groupby(['Set', 'Author']).size().unstack('Set').plot(kind='bar', stacked=True)
#both_collections.groupby('Author').count().size().plot(kind='bar')

Set         Test      Train
count  38.000000  38.000000
mean    4.447368  10.684211
std     5.693512  13.154190
min     1.000000   3.000000
25%     1.250000   4.000000
50%     2.000000   6.000000
75%     4.000000   9.750000
max    28.000000  66.000000

Out[83]:

<matplotlib.axes.AxesSubplot at 0x7fca44dd9710>

In [53]:

plt.figsize(12, 4)
test_collection.as_dataframe().groupby('Author').size().plot(kind='bar')

Out[53]:

<matplotlib.axes.AxesSubplot at 0x7fca47cce690>

In [59]:

tokenizer = bc.BasicTokenizer()
grouper = bc.FixedGrouper(500)
extractor = bc.EntropiesExtractor(tokenizer, grouper)
#extractor = bc.FrequenciesExtractor(tokenizer)
model = bc.ClassificationModel(training, extractor, decomposition.TruncatedSVD(50), svm.SVC())

In [60]:

results = model.classify(testing)

In [61]:

from sklearn.metrics import classification_report, confusion_matrix
expected = []
predicted = []
for book in testing.books():
    expected.append(book.author())
    predicted.append(results[book])
plt.pcolor(confusion_matrix(expected, predicted))

Out[61]:

<matplotlib.collections.PolyCollection at 0x7f089f88fa50>

In [62]:

print(classification_report(expected, predicted))

             precision    recall  f1-score   support

Anthony Hamilton       1.00      1.00      1.00         4
Arthur Robert Harding       1.00      1.00      1.00         2
 Bret Harte       1.00      0.62      0.77        53
Charles Dickens       0.84      0.50      0.63        42
E. Raymond Hall       0.68      1.00      0.81        17
Edward E. Hale       1.00      0.67      0.80         3
Ernst Haeckel       1.00      1.00      1.00         2
F. Colburn Adams       0.00      0.00      0.00         2
Frank Harris       0.18      0.40      0.25         5
 George Ade       0.75      1.00      0.86         3
H. Irving Hancock       1.00      1.00      1.00        31
H. Rider Haggard       1.00      0.81      0.89        47
Harry Harrison       0.08      0.67      0.15         6
    Ian Hay       0.75      0.60      0.67         5
James B. Hendryx       0.71      1.00      0.83         5
Joel Chandler Harris       1.00      0.67      0.80         6
   John Hay       0.15      1.00      0.27         2
Julian Hawthorne       0.09      0.44      0.15         9
Lafcadio Hearn       1.00      0.50      0.67         8
Nathaniel Hawthorne       0.51      0.26      0.34        89
   O. Henry       1.00      0.89      0.94         9
Samuel Hopkins Adams       1.00      1.00      1.00         3
Thomas Bailey Aldrich       0.25      0.14      0.18        14
Thomas Hardy       0.82      0.86      0.84        21

avg / total       0.76      0.61      0.65       388

In [63]:

print(confusion_matrix(expected, predicted))

[[ 4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 33  0  2  0  0  0  6  1  0  0  2  0  0  0  1  2  0  6  0  0  0  0]
 [ 0  0  0 21  0  0  0  0  1  0  0  0  2  0  0  0  1 10  0  7  0  0  0  0]
 [ 0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  2  0  0  0  2  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 31  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0 38  1  1  0  0  1  3  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  4  0  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  3  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  4  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  4  0  0  0  0  1  3]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  1  1  4  1  0  0  0  0]
 [ 0  0  0  3  5  0  0  0  0  0  0  0 30  0  0  0  2 23  0 23  0  0  2  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  8  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  5  0  0  0  0  0  0  6  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  2 18]]

In [ ]: