import io, os, itertools

import numpy, pandas
import matplotlib.pyplot as plt
from sklearn import (feature_extraction, preprocessing, decomposition,
                     cross_validation, svm, metrics)

# Tweak how tables are displayed
pandas.set_option('display.precision', 4)
pandas.set_option('display.max_colwidth', 30)
pandas.set_option('display.colheader_justify', 'left')

# Set up a simple BOW model
vectorizer = feature_extraction.text.TfidfVectorizer(
    lowercase=True, token_pattern=r'\b[-A-Za-z]{3,}\b',
    min_df=0.1, max_df=0.5, max_features=5000,
    use_idf=True, sublinear_tf=True)

# Get a list of all filenames in the 'train/' folder,
# and add their text to the BOW table 'X'.
filenames = os.listdir('train/')
X = vectorizer.fit_transform(
    io.open('train/' + a, encoding='utf8').read() for a in filenames)

vec = vectorizer.transform([
    'It was a dark and stormy night; '
    'the rain fell in torrents — except at occasional intervals, '
    'when it was checked by a violent gust of wind which swept '
    'up the streets (for it is in London that our scene lies), '
    'rattling along the housetops, and fiercely agitating the scanty '
    'flame of the lamps that struggled against the darkness.'])

# We get back a sparse vector with a value for each possible word;
# show a table with only the non-zero items in the vector:
feature_names = vectorizer.get_feature_names()
pandas.DataFrame(
    [(feature_names[b], (a, b), vec[a, b]) for a, b in zip(*vec.nonzero())],
    columns=['word', 'index', 'weight'])

# Print the first 5 lines to see what the metadata looks like:
print(''.join(io.open('metadata.csv', encoding='utf8').readlines()[:5]))

# Load the metadata; the fourth column (the filename) becomes the index.
metadata = pandas.read_csv('metadata.csv', index_col=3, encoding='utf8')
genres = dict(zip(metadata.index, metadata['Dataset']))

# Convert the genre labels to integers
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform([genres[a] for a in filenames])

# Create an abbreviated label "Author_Title" for each text
authors = dict(zip(metadata.index, metadata['Author']))
titles = dict(zip(metadata.index, metadata['Title']))
labels = ['%s_%s' % (authors[a].split(',')[0].title(), titles[a][:15].title())
          for a in filenames]

# Reduce the BOW model to 2 dimensions
dec = decomposition.TruncatedSVD(n_components=2)
X_r = dec.fit_transform(X)
print('Explained variance:', dec.explained_variance_ratio_)

# Make a scatter plot with the author/title of each text as label.
# LabelEncoder numbers the classes from 0, so the class labels in 'y'
# line up with a plain enumerate() over encoder.classes_.
plt.figure(figsize=(12, 8))
for c, (i, target_name) in zip('rbmkycg', enumerate(encoder.classes_)):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
    for n, xpos, ypos in zip(
            (y == i).nonzero()[0], X_r[y == i, 0], X_r[y == i, 1]):
        plt.annotate(labels[n], xy=(xpos, ypos), xytext=(5, 5),
                     textcoords='offset points', color=c,
                     fontsize='small', ha='left', va='top')
plt.legend()
plt.title('%s of dataset' % dec.__class__.__name__)
plt.show()

# Randomly select 80% as training set, 20% as test/validation set,
# but make sure that each genre is well-represented (stratified).
train, test = next(iter(cross_validation.StratifiedShuffleSplit(
    y, test_size=0.2, n_iter=1, random_state=42)))
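# Not part of the original pipeline: a quick sanity check that the
# stratified split preserved the genre proportions. 'train' and 'test'
# are the index arrays produced above; each genre should occur in
# roughly the same proportion in every subset.
for name, subset in [('all', y), ('train', y[train]), ('test', y[test])]:
    counts = numpy.bincount(subset, minlength=len(encoder.classes_))
    print(name, dict(zip(encoder.classes_,
                         numpy.round(counts / float(counts.sum()), 2))))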
# Train an SVM classifier and predict the genre of the items in the test set.
clf = svm.LinearSVC(C=1.0, random_state=42)
clf.fit(X[train], y[train])
pred = clf.predict(X[test])

print('Overall accuracy:\t%4.1f %%\n'
      % (100 * metrics.accuracy_score(y[test], pred)))
print(metrics.classification_report(y[test], pred,
                                    target_names=encoder.classes_))
pandas.DataFrame(metrics.confusion_matrix(y[test], pred),
                 index=sorted(encoder.classes_),
                 columns=sorted(encoder.classes_))

# Show the 10 test documents the classifier is least confident about,
# i.e., those with the weakest maximum decision-function value.
data = sorted(zip(test, clf.decision_function(X[test])),
              key=lambda x: max(x[1]))[:10]
pandas.DataFrame([a for _, a in data],
                 index=[labels[n] for n, _ in data],
                 columns=encoder.classes_)

# Sort the feature weights of the classifier and take the 10 most
# strongly weighted features for each genre.
data = []
for n, target in enumerate(encoder.classes_):
    data.append([(feature_names[m], clf.coef_[n][m])
                 for m in numpy.argsort(clf.coef_[n])[-10:][::-1]])
pandas.DataFrame(
    [list(itertools.chain(*a)) for a in zip(*data)],
    columns=list(itertools.chain(*((target, '')
                                   for target in encoder.classes_))),
    index=range(1, 11))

# Since we now evaluate on an external test set, we can use all of the
# original data as training data.
clf.fit(X, y)

# Transform the new files to the format of the existing BOW table
newfiles = os.listdir('test/')
X1 = vectorizer.transform(
    io.open('test/' + a, encoding='utf8').read() for a in newfiles)
predictions = encoder.inverse_transform(clf.predict(X1))
pandas.DataFrame(
    [(authors[a].title(), titles[a].title(), genres[a], b)
     for a, b in zip(newfiles, predictions)],
    index=newfiles,
    columns=['Author', 'Title', 'Actual', 'Predicted'])
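# A possible final step, not in the original code: persist the fitted
# vectorizer, label encoder, and classifier together, so the model can be
# reused later without re-reading the training corpus. A minimal sketch
# using the standard library's pickle module; the filename 'genre_clf.pkl'
# is arbitrary.
import pickle
with open('genre_clf.pkl', 'wb') as out:
    pickle.dump((vectorizer, encoder, clf), out)

# Later, load all three and classify new text in one step:
with open('genre_clf.pkl', 'rb') as inp:
    vectorizer, encoder, clf = pickle.load(inp)
print(encoder.inverse_transform(
    clf.predict(vectorizer.transform(['Once upon a time ...']))))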