%pylab inline
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True

from sklearn.feature_extraction.text import CountVectorizer

# Fit a vocabulary on a single document: tokens of later documents that are
# not in this vocabulary are silently ignored at transform time
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit([
    "The cat sat on the mat.",
])
vectorizer.vocabulary_

X = vectorizer.transform([
    "The cat sat on the mat.",
    "This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)

# Refit on a larger corpus: the vocabulary, and hence the dimensionality of
# the extracted vectors, depends on the documents seen during fit
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit([
    "The cat sat on the mat.",
    "The quick brown fox jumps over the lazy dog.",
])
vectorizer.vocabulary_

X = vectorizer.transform([
    "The cat sat on the mat.",
    "This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)

# Sentiment140: tweets labeled with polarity 0 (negative) or 4 (positive)
import os

sentiment140_folder = os.path.join('datasets', 'sentiment140')
training_csv_file = os.path.join(sentiment140_folder,
                                 'training.1600000.processed.noemoticon.csv')
testing_csv_file = os.path.join(sentiment140_folder,
                                'testdata.manual.2009.06.14.csv')

!ls -lh datasets/sentiment140/training.1600000.processed.noemoticon.csv

FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text')

def read_csv(csv_file, fieldnames=FIELDNAMES, max_count=None,
             n_partitions=1, partition_id=0):
    """Parse the Sentiment140 CSV into (texts, targets) with targets in {-1, 1}"""
    import csv  # put the import inside for use in IPython.parallel

    texts = []
    targets = []
    with open(csv_file, 'rb') as f:
        reader = csv.DictReader(f, fieldnames=fieldnames,
                                delimiter=',', quotechar='"')
        pos_count, neg_count = 0, 0
        for i, d in enumerate(reader):
            if i % n_partitions != partition_id:
                # Skip entry if not in the requested partition
                continue

            if d['polarity'] == '4':
                if max_count and pos_count >= max_count / 2:
                    continue
                pos_count += 1
                texts.append(d['text'])
                targets.append(1)

            elif d['polarity'] == '0':
                if max_count and neg_count >= max_count / 2:
                    continue
                neg_count += 1
                texts.append(d['text'])
                targets.append(-1)

    return texts, targets

%time text_train_all, target_train_all = read_csv(training_csv_file, max_count=200000)

len(text_train_all), len(target_train_all)

for text in text_train_all[:3]:
    print(text + "\n")

print(target_train_all[:3])

for text in text_train_all[-3:]:
    print(text + "\n")

print(target_train_all[-3:])

# Keep half of the loaded data as a held-out validation set
from sklearn.cross_validation import train_test_split

text_train_small, text_validation, target_train_small, target_validation = train_test_split(
    text_train_all, target_train_all, test_size=.5, random_state=42)

len(text_train_small)

(target_train_small == -1).sum(), (target_train_small == 1).sum()

len(text_validation)

(target_validation == -1).sum(), (target_validation == 1).sum()

text_test_all, target_test_all = read_csv(testing_csv_file)

len(text_test_all), len(target_test_all)

# The hashing trick: derive the column index of a token directly from a hash
# of the token, modulo the number of features (2 ** 20 is the
# HashingVectorizer default)
from sklearn.utils.murmurhash import murmurhash3_bytes_u32

for word in "the cat sat on the mat".split():
    print("{0} => {1}".format(
        word, murmurhash3_bytes_u32(word, 0) % 2 ** 20))

# The hashing vectorizer is stateless: there is no vocabulary to fit
from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(charset='latin-1')
h_vectorizer

analyzer = h_vectorizer.build_analyzer()
analyzer('This is a test sentence.')

%time X_train_small = h_vectorizer.transform(text_train_small)

X_train_small

# Chain the stateless hashing vectorizer with an online linear classifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

h_pipeline = Pipeline((
    ('vec', HashingVectorizer(charset='latin-1')),
    ('clf', PassiveAggressiveClassifier(C=1, n_iter=1)),
))

%time h_pipeline.fit(text_train_small, target_train_small).score(text_validation, target_validation)

h_pipeline.score(text_test_all, target_test_all)
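# A small aside (not part of the original notebook): the hashing trick trades
# memory for the possibility of collisions. The sketch below reuses the
# murmurhash3_bytes_u32 helper from above to show that, with a deliberately
# tiny bucket count (n_buckets is a made-up value chosen for illustration),
# distinct words can share the same column index. With the default 2 ** 20
# buckets collisions are far rarer, but never impossible.
n_buckets = 10
buckets = {}
for word in "the quick brown fox jumps over the lazy dog".split():
    idx = murmurhash3_bytes_u32(word, 0) % n_buckets
    buckets.setdefault(idx, set()).add(word)

# Buckets holding more than one distinct word are collisions (the dict may be
# empty in the unlikely case that no two words collide)
print({idx: words for idx, words in buckets.items() if len(words) > 1})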
# For comparison: a vectorizer that builds an in-memory vocabulary
# (term frequency weighting only, IDF disabled)
from sklearn.feature_extraction.text import TfidfVectorizer

vocabulary_vec = TfidfVectorizer(charset='latin-1', use_idf=False)
vocabulary_pipeline = Pipeline((
    ('vec', vocabulary_vec),
    ('clf', PassiveAggressiveClassifier(C=1, n_iter=1)),
))

%time vocabulary_pipeline.fit(text_train_small, target_train_small).score(text_validation, target_validation)

len(vocabulary_vec.vocabulary_)

from random import Random

class InfiniteStreamGenerator(object):
    """Simulate random polarity queries on the twitter streaming API"""

    def __init__(self, texts, targets, seed=0, batchsize=100):
        self.texts_pos = [text for text, target in zip(texts, targets)
                          if target > 0]
        self.texts_neg = [text for text, target in zip(texts, targets)
                          if target <= 0]
        self.rng = Random(seed)
        self.batchsize = batchsize

    def next_batch(self, batchsize=None):
        batchsize = self.batchsize if batchsize is None else batchsize
        texts, targets = [], []
        for i in range(batchsize):
            # Select the polarity randomly
            target = self.rng.choice((-1, 1))
            targets.append(target)

            # Combine 2 random texts of the right polarity
            pool = self.texts_pos if target > 0 else self.texts_neg
            text = self.rng.choice(pool) + " " + self.rng.choice(pool)
            texts.append(text)
        return texts, targets

infinite_stream = InfiniteStreamGenerator(text_train_small, target_train_small)

texts_in_batch, targets_in_batch = infinite_stream.next_batch(batchsize=3)

for t in texts_in_batch:
    print(t + "\n")

targets_in_batch

# Out-of-core learning: repeatedly pull a batch from the simulated stream,
# vectorize it with the stateless hashing vectorizer and update the model
# with partial_fit
n_batches = 1000
validation_scores = []
training_set_size = []

# Build the vectorizer and the classifier
h_vectorizer = HashingVectorizer(charset='latin-1')
clf = PassiveAggressiveClassifier(C=1)

# Extract the features for the validation set once and for all
X_validation = h_vectorizer.transform(text_validation)

classes = np.array([-1, 1])

n_samples = 0
for i in range(n_batches):
    texts_in_batch, targets_in_batch = infinite_stream.next_batch()
    n_samples += len(texts_in_batch)

    # Vectorize the text documents in the batch
    X_batch = h_vectorizer.transform(texts_in_batch)

    # Incrementally train the model on the new batch
    clf.partial_fit(X_batch, targets_in_batch, classes=classes)

    if n_samples % 100 == 0:
        # Compute the validation score of the current state of the model
        score = clf.score(X_validation, target_validation)
        validation_scores.append(score)
        training_set_size.append(n_samples)

    if i % 100 == 0:
        print("n_samples: {0}, score: {1:.4f}".format(n_samples, score))

pl.plot(training_set_size, validation_scores)
pl.ylim(0.5, 1)
pl.xlabel("Number of samples")
pl.ylabel("Validation score")

# Connect to the IPython.parallel cluster and give each engine its own
# partition id
from IPython.parallel import Client

client = Client()
len(client)

dv = client.direct_view()

dv.scatter('partition_ids', range(len(client)), block=True)

%px print(partition_ids)

%px partition_id = partition_ids[0]
%px print(partition_id)

# Push the vectorizer, the CSV parser and the data file path to all engines
from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(charset='latin-1')

dv['h_vectorizer'] = h_vectorizer
dv['read_csv'] = read_csv
dv['training_csv_file'] = training_csv_file
dv['n_partitions'] = len(client)

%px print(training_csv_file)
%px print(n_partitions)
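# A quick illustration (not part of the original notebook) of how read_csv
# partitions the work across engines: each engine keeps only the CSV rows
# whose index is congruent to its partition_id modulo n_partitions. The
# values below are made up purely for the demonstration.
n_partitions_demo = 4
for partition_id_demo in range(n_partitions_demo):
    kept = [i for i in range(12) if i % n_partitions_demo == partition_id_demo]
    print("partition %d keeps rows %r" % (partition_id_demo, kept))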
%%px
# On each engine: parse its own partition of the CSV, shuffle, vectorize and
# fit a linear model
max_count = 50000
print("Parsing %d items for partition %d..." % (max_count, partition_id))

texts, targets = read_csv(training_csv_file, n_partitions=n_partitions,
                          partition_id=partition_id, max_count=50000)

print("Shuffling the positive and negative examples...")

from sklearn.utils import shuffle
texts, targets = shuffle(texts, targets, random_state=1)

print("Vectorizing text data...")

vectors = h_vectorizer.transform(texts)

print("Fitting a linear model...")

from sklearn.linear_model import Perceptron
clf = Perceptron(n_iter=1).fit(vectors, targets)

print("Done!")

# Collect the per-engine classifiers back on the client
classifiers = dv.gather('clf', block=True)
classifiers

from copy import copy

def average_linear_model(models):
    """Compute a linear model that is the average of the others"""
    avg = copy(models[0])

    avg.coef_ = np.sum([m.coef_ for m in models], axis=0)
    avg.coef_ /= len(models)

    avg.intercept_ = np.sum([m.intercept_ for m in models], axis=0)
    avg.intercept_ /= len(models)
    return avg

clf = average_linear_model(classifiers)

# Score the averaged model, then each individual per-partition model, on the
# manually labeled test set
clf.score(h_vectorizer.transform(text_test_all), target_test_all)

for c in classifiers:
    print(c.score(h_vectorizer.transform(text_test_all), target_test_all))
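# A possible follow-up (not part of the original notebook): instead of
# averaging the linear models, combine the per-partition Perceptrons by
# majority vote on their predictions and compare the resulting accuracy with
# the averaged model scored above. Ties (possible with an even number of
# engines) are broken in favor of the negative class here, an arbitrary choice.
X_test = h_vectorizer.transform(text_test_all)
votes = np.sum([c.predict(X_test) for c in classifiers], axis=0)
vote_predictions = np.where(votes > 0, 1, -1)
print(np.mean(vote_predictions == np.asarray(target_test_all)))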