%pylab inline
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True

from sklearn.feature_extraction.text import CountVectorizer

# Fit a vocabulary on a single document: tokens of later documents that are
# not in this vocabulary are silently ignored at transform time
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit([
    "The cat sat on the mat.",
])
vectorizer.vocabulary_

X = vectorizer.transform([
    "The cat sat on the mat.",
    "This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)

# Refit on a larger corpus: the vocabulary, and hence the dimensionality of
# the extracted vectors, depends on the documents seen during fit
vectorizer = CountVectorizer(min_df=1)
vectorizer.fit([
    "The cat sat on the mat.",
    "The quick brown fox jumps over the lazy dog.",
])
vectorizer.vocabulary_

X = vectorizer.transform([
    "The cat sat on the mat.",
    "This cat is a nice cat.",
]).toarray()

print(len(vectorizer.vocabulary_))
print(vectorizer.get_feature_names())
print(X)

# Sentiment140: tweets labeled with polarity 0 (negative) or 4 (positive)
import os

sentiment140_folder = os.path.join('datasets', 'sentiment140')
training_csv_file = os.path.join(sentiment140_folder,
                                 'training.1600000.processed.noemoticon.csv')
testing_csv_file = os.path.join(sentiment140_folder,
                                'testdata.manual.2009.06.14.csv')

!ls -lh datasets/sentiment140/training.1600000.processed.noemoticon.csv

FIELDNAMES = ('polarity', 'id', 'date', 'query', 'author', 'text')

def read_csv(csv_file, fieldnames=FIELDNAMES, max_count=None,
             n_partitions=1, partition_id=0):
    """Parse the Sentiment140 CSV into (texts, targets) with targets in {-1, 1}"""
    import csv  # put the import inside for use in IPython.parallel

    texts = []
    targets = []
    with open(csv_file, 'rb') as f:
        reader = csv.DictReader(f, fieldnames=fieldnames,
                                delimiter=',', quotechar='"')
        pos_count, neg_count = 0, 0
        for i, d in enumerate(reader):
            if i % n_partitions != partition_id:
                # Skip entry if not in the requested partition
                continue

            if d['polarity'] == '4':
                if max_count and pos_count >= max_count / 2:
                    continue
                pos_count += 1
                texts.append(d['text'])
                targets.append(1)

            elif d['polarity'] == '0':
                if max_count and neg_count >= max_count / 2:
                    continue
                neg_count += 1
                texts.append(d['text'])
                targets.append(-1)

    return texts, targets

%time text_train_all, target_train_all = read_csv(training_csv_file, max_count=200000)

len(text_train_all), len(target_train_all)

for text in text_train_all[:3]:
    print(text + "\n")

print(target_train_all[:3])

for text in text_train_all[-3:]:
    print(text + "\n")

print(target_train_all[-3:])

# Keep half of the loaded data as a held-out validation set
from sklearn.cross_validation import train_test_split

text_train_small, text_validation, target_train_small, target_validation = train_test_split(
    text_train_all, target_train_all, test_size=.5, random_state=42)

len(text_train_small)

(target_train_small == -1).sum(), (target_train_small == 1).sum()

len(text_validation)

(target_validation == -1).sum(), (target_validation == 1).sum()

text_test_all, target_test_all = read_csv(testing_csv_file)

len(text_test_all), len(target_test_all)

# The hashing trick: derive the column index of a token directly from a hash
# of the token, modulo the number of features (2 ** 20 is the
# HashingVectorizer default)
from sklearn.utils.murmurhash import murmurhash3_bytes_u32

for word in "the cat sat on the mat".split():
    print("{0} => {1}".format(
        word, murmurhash3_bytes_u32(word, 0) % 2 ** 20))

# The hashing vectorizer is stateless: there is no vocabulary to fit
from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(charset='latin-1')
h_vectorizer

analyzer = h_vectorizer.build_analyzer()
analyzer('This is a test sentence.')

%time X_train_small = h_vectorizer.transform(text_train_small)

X_train_small

# Chain the stateless hashing vectorizer with an online linear classifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

h_pipeline = Pipeline((
    ('vec', HashingVectorizer(charset='latin-1')),
    ('clf', PassiveAggressiveClassifier(C=1, n_iter=1)),
))

%time h_pipeline.fit(text_train_small, target_train_small).score(text_validation, target_validation)

h_pipeline.score(text_test_all, target_test_all)
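# A small aside (not part of the original notebook): the hashing trick trades
# memory for the possibility of collisions. The sketch below reuses the
# murmurhash3_bytes_u32 helper from above to show that, with a deliberately
# tiny bucket count (n_buckets is a made-up value chosen for illustration),
# distinct words can share the same column index. With the default 2 ** 20
# buckets collisions are far rarer, but never impossible.
n_buckets = 10
buckets = {}
for word in "the quick brown fox jumps over the lazy dog".split():
    idx = murmurhash3_bytes_u32(word, 0) % n_buckets
    buckets.setdefault(idx, set()).add(word)

# Buckets holding more than one distinct word are collisions (the dict may be
# empty in the unlikely case that no two words collide)
print({idx: words for idx, words in buckets.items() if len(words) > 1})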
# For comparison: a vectorizer that builds an in-memory vocabulary
# (term frequency weighting only, IDF disabled)
from sklearn.feature_extraction.text import TfidfVectorizer

vocabulary_vec = TfidfVectorizer(charset='latin-1', use_idf=False)
vocabulary_pipeline = Pipeline((
    ('vec', vocabulary_vec),
    ('clf', PassiveAggressiveClassifier(C=1, n_iter=1)),
))

%time vocabulary_pipeline.fit(text_train_small, target_train_small).score(text_validation, target_validation)

len(vocabulary_vec.vocabulary_)

from random import Random

class InfiniteStreamGenerator(object):
    """Simulate random polarity queries on the twitter streaming API"""

    def __init__(self, texts, targets, seed=0, batchsize=100):
        self.texts_pos = [text for text, target in zip(texts, targets)
                          if target > 0]
        self.texts_neg = [text for text, target in zip(texts, targets)
                          if target <= 0]
        self.rng = Random(seed)
        self.batchsize = batchsize

    def next_batch(self, batchsize=None):
        batchsize = self.batchsize if batchsize is None else batchsize
        texts, targets = [], []
        for i in range(batchsize):
            # Select the polarity randomly
            target = self.rng.choice((-1, 1))
            targets.append(target)

            # Combine 2 random texts of the right polarity
            pool = self.texts_pos if target > 0 else self.texts_neg
            text = self.rng.choice(pool) + " " + self.rng.choice(pool)
            texts.append(text)
        return texts, targets

infinite_stream = InfiniteStreamGenerator(text_train_small, target_train_small)

texts_in_batch, targets_in_batch = infinite_stream.next_batch(batchsize=3)

for t in texts_in_batch:
    print(t + "\n")

targets_in_batch

# Out-of-core learning: repeatedly pull a batch from the simulated stream,
# vectorize it with the stateless hashing vectorizer and update the model
# with partial_fit
n_batches = 1000
validation_scores = []
training_set_size = []

# Build the vectorizer and the classifier
h_vectorizer = HashingVectorizer(charset='latin-1')
clf = PassiveAggressiveClassifier(C=1)

# Extract the features for the validation set once and for all
X_validation = h_vectorizer.transform(text_validation)

classes = np.array([-1, 1])

n_samples = 0
for i in range(n_batches):
    texts_in_batch, targets_in_batch = infinite_stream.next_batch()
    n_samples += len(texts_in_batch)

    # Vectorize the text documents in the batch
    X_batch = h_vectorizer.transform(texts_in_batch)

    # Incrementally train the model on the new batch
    clf.partial_fit(X_batch, targets_in_batch, classes=classes)

    if n_samples % 100 == 0:
        # Compute the validation score of the current state of the model
        score = clf.score(X_validation, target_validation)
        validation_scores.append(score)
        training_set_size.append(n_samples)

    if i % 100 == 0:
        print("n_samples: {0}, score: {1:.4f}".format(n_samples, score))

pl.plot(training_set_size, validation_scores)
pl.ylim(0.5, 1)
pl.xlabel("Number of samples")
pl.ylabel("Validation score")

# Connect to the IPython.parallel cluster and give each engine its own
# partition id
from IPython.parallel import Client

client = Client()
len(client)

dv = client.direct_view()

dv.scatter('partition_ids', range(len(client)), block=True)

%px print(partition_ids)

%px partition_id = partition_ids[0]
%px print(partition_id)

# Push the vectorizer, the CSV parser and the data file path to all engines
from sklearn.feature_extraction.text import HashingVectorizer

h_vectorizer = HashingVectorizer(charset='latin-1')

dv['h_vectorizer'] = h_vectorizer
dv['read_csv'] = read_csv
dv['training_csv_file'] = training_csv_file
dv['n_partitions'] = len(client)

%px print(training_csv_file)
%px print(n_partitions)
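# A quick illustration (not part of the original notebook) of how read_csv
# partitions the work across engines: each engine keeps only the CSV rows
# whose index is congruent to its partition_id modulo n_partitions. The
# values below are made up purely for the demonstration.
n_partitions_demo = 4
for partition_id_demo in range(n_partitions_demo):
    kept = [i for i in range(12) if i % n_partitions_demo == partition_id_demo]
    print("partition %d keeps rows %r" % (partition_id_demo, kept))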
%%px
# On each engine: parse its own partition of the CSV, shuffle, vectorize and
# fit a linear model
max_count = 50000
print("Parsing %d items for partition %d..." % (max_count, partition_id))

texts, targets = read_csv(training_csv_file, n_partitions=n_partitions,
                          partition_id=partition_id, max_count=50000)

print("Shuffling the positive and negative examples...")

from sklearn.utils import shuffle
texts, targets = shuffle(texts, targets, random_state=1)

print("Vectorizing text data...")

vectors = h_vectorizer.transform(texts)

print("Fitting a linear model...")

from sklearn.linear_model import Perceptron
clf = Perceptron(n_iter=1).fit(vectors, targets)

print("Done!")

# Collect the per-engine classifiers back on the client
classifiers = dv.gather('clf', block=True)
classifiers

from copy import copy

def average_linear_model(models):
    """Compute a linear model that is the average of the others"""
    avg = copy(models[0])

    avg.coef_ = np.sum([m.coef_ for m in models], axis=0)
    avg.coef_ /= len(models)

    avg.intercept_ = np.sum([m.intercept_ for m in models], axis=0)
    avg.intercept_ /= len(models)
    return avg

clf = average_linear_model(classifiers)

# Score the averaged model, then each individual per-partition model, on the
# manually labeled test set
clf.score(h_vectorizer.transform(text_test_all), target_test_all)

for c in classifiers:
    print(c.score(h_vectorizer.transform(text_test_all), target_test_all))
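# A possible follow-up (not part of the original notebook): instead of
# averaging the linear models, combine the per-partition Perceptrons by
# majority vote on their predictions and compare the resulting accuracy with
# the averaged model scored above. Ties (possible with an even number of
# engines) are broken in favor of the negative class here, an arbitrary choice.
X_test = h_vectorizer.transform(text_test_all)
votes = np.sum([c.predict(X_test) for c in classifiers], axis=0)
vote_predictions = np.where(votes > 0, 1, -1)
print(np.mean(vote_predictions == np.asarray(target_test_all)))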