from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram

# %matplotlib inline  # IPython magic: only valid inside a notebook, not a plain .py file


def plot_corpus_similarity(corpus, vectorizer):
    """Visualise pairwise document similarity for the documents of a corpus.

    Draws two plots: (1) a Ward-linkage dendrogram and (2) a 2-D MDS
    scatter of the documents, both based on cosine distance over the
    document-term matrix produced by ``vectorizer``.

    Parameters
    ----------
    corpus : NLTK corpus reader
        Must expose ``fileids()`` and ``raw(fileid)``.
    vectorizer : scikit-learn vectorizer
        ``fit_transform`` is called on it, so any previous fit is replaced.
    """
    # Build the document-term matrix and turn cosine similarity into a
    # distance (0 = identical direction, up to 2 = opposite).
    texts = [corpus.raw(fileid) for fileid in corpus.fileids()]
    document_term_matrix = vectorizer.fit_transform(texts)
    distances = 1 - cosine_similarity(document_term_matrix)

    # Project the precomputed distance matrix into 2-D for the scatter plot.
    mds = MDS(dissimilarity="precomputed", random_state=1)
    positions = mds.fit_transform(distances)

    # Plot the dendrogram.
    # NOTE(review): ward() is handed a square distance matrix rather than the
    # condensed form scipy documents; scipy then treats the rows as raw
    # observations. This matches common tutorial usage but is worth confirming.
    linkage_matrix = ward(distances)
    plt.figure(figsize=(8, 10))
    dendrogram(linkage_matrix, labels=corpus.fileids(), orientation="right")
    plt.show()

    # Plot the MDS scatter, one labelled point per document.
    xvalues = positions[:, 0]
    yvalues = positions[:, 1]
    plt.figure(figsize=(20, 10))
    for x, y, name in zip(xvalues, yvalues, corpus.fileids()):
        plt.scatter(x, y)
        # The colour-coding here is a hard-coded hack for this particular
        # corpus (otherwise the function is reusable): green for texts not
        # marked "Other", red for the rest.
        plt.text(x, y, name.replace(".txt", "")[:25],
                 color='red' if 'Other' in name else 'green')
    plt.show()


import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Load the plain-text corpus of philosophy (and other) e-texts.
data_dir = "../../data/philosophy"
corpus = PlaintextCorpusReader(data_dir + "/texts", r".*\.txt")

# Experiment 1: plain relative term frequencies (TF only, IDF disabled).
simple_vectorizer = TfidfVectorizer(use_idf=False)
plot_corpus_similarity(corpus, simple_vectorizer)

# Experiment 2: drop English stopwords and cap the vocabulary size.
stoplist_vectorizer = TfidfVectorizer(
    use_idf=False,
    stop_words=nltk.corpus.stopwords.words("english"),
    max_features=5000)
plot_corpus_similarity(corpus, stoplist_vectorizer)

# Experiment 3: restrict the vocabulary to a hand-curated keyword list.
with open(data_dir + '/keywords.txt') as keywords_file:
    keywords = [line.rstrip('\n') for line in keywords_file]
keywords_vectorizer = TfidfVectorizer(use_idf=False, vocabulary=keywords)
plot_corpus_similarity(corpus, keywords_vectorizer)

# Imports for the classifier benchmark that follows.
import random
from pandas import DataFrame
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid


def benchmark_svms(labelled_texts, runs, vectorizer):
    """Benchmark a battery of scikit-learn classifiers on labelled texts.

    For each of ``runs`` random 50/50 train/test splits, every classifier is
    fitted and scored on a binary "Philosophy" vs "Other" task. Prints the
    mean accuracy per classifier (best first) and plots accuracy per run.

    Parameters
    ----------
    labelled_texts : list of (category, text) pairs
        A category containing "Philosophy" is treated as the positive class.
    runs : int
        Number of shuffled train/test splits to average over.
    vectorizer : scikit-learn vectorizer
        Re-fitted on each run's training half.
    """
    results = defaultdict(list)
    split = int(len(labelled_texts) / 2)
    # Shuffle a copy so the caller's list is not reordered as a side effect.
    labelled_texts = list(labelled_texts)
    for _ in range(runs):
        random.shuffle(labelled_texts)
        train_set, test_set = labelled_texts[split:], labelled_texts[:split]
        # Binarise the labels: anything not marked "Philosophy" is "Other".
        train_set_categories = ["Philosophy" if "Philosophy" in category else "Other"
                                for category, text in train_set]
        test_set_categories = ["Philosophy" if "Philosophy" in category else "Other"
                               for category, text in test_set]
        X_train = vectorizer.fit_transform([text for category, text in train_set])
        X_test = vectorizer.transform([text for category, text in test_set])
        # NOTE(review): n_iter / loss='l2' are the pre-0.19 sklearn spellings;
        # modern releases renamed them to max_iter / loss='squared_hinge'.
        for clf, name in (
                (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
                (Perceptron(n_iter=50), "Perceptron"),
                (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
                (KNeighborsClassifier(n_neighbors=10), "kNN"),
                (LinearSVC(), "LinearSVC"),
                (LinearSVC(loss='l2', penalty="l2", dual=False, tol=1e-3), "LinearSVC l2"),
                (LinearSVC(loss='l2', penalty="l1", dual=False, tol=1e-3), "LinearSVC l1"),
                (SGDClassifier(alpha=.0001, n_iter=50, penalty="l2"), "SGD l2"),
                (SGDClassifier(alpha=.0001, n_iter=50, penalty="l1"), "SGD l1"),
                (SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"), "SGD elasticnet"),
                (NearestCentroid(), "NearestCentroid (aka Rocchio classifier)"),
                (MultinomialNB(alpha=.01), "Naïve Bayes Multinomial"),
                (BernoulliNB(alpha=.01), "Naïve Bayes Bernoulli")):
            clf.fit(X_train, train_set_categories)
            results[name].append(clf.score(X_test, test_set_categories))
    # Freeze one ordering of the results so index and values stay aligned.
    orderedresults = [(name, values) for name, values in results.items()]
    results_df = DataFrame([values for name, values in orderedresults],
                           index=[name for name, values in orderedresults])
    print("Ordered averages:")
    # Series.order() was removed from pandas; sort_values is the replacement.
    print(results_df.mean(axis=1).sort_values(ascending=False))
    results_df.transpose().plot(figsize=(20, 10))


# Benchmark on the full corpus...
labelled_texts = [(fileid, corpus.raw(fileid)) for fileid in corpus.fileids()]
benchmark_svms(labelled_texts, 5, stoplist_vectorizer)

# ...and again with the two known outlier texts removed.
filtered_labelled_texts = [
    (fileid, text) for fileid, text in labelled_texts
    if "GameOfLogic" not in fileid and "ThusSpakeZarathustr" not in fileid]
benchmark_svms(filtered_labelled_texts, 5, stoplist_vectorizer)

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Define the training corpus to use (while filtering out our philosophical outliers).
data_dir = "../../data/philosophy"
corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    data_dir + "/texts", r".*\.txt")
filtered_fileids = [
    fileid for fileid in corpus.fileids()
    if "GameOfLogic" not in fileid and "ThusSpakeZarathustr" not in fileid]

# Create a TF vectorizer (actually relative frequencies: IDF disabled).
vectorizer = TfidfVectorizer(
    use_idf=False,
    stop_words=nltk.corpus.stopwords.words("english"),
    max_features=10000)
X_train = vectorizer.fit_transform(
    [corpus.raw(fileid) for fileid in filtered_fileids])
categories = ["Philosophy" if "Philosophy" in fileid else "Other"
              for fileid in filtered_fileids]

# Create a classifier.
clf = MultinomialNB(alpha=.01)
clf.fit(X_train, categories)

# Now we should be able to predict new instances given a frequencies vector.
# (Note that we're using TF-IDF values that may not correspond to the new corpus.)
# clf.predict(X_test)