#!/usr/bin/env python
# coding: utf-8

# ## Using wrappers for the Scikit-learn API

# This tutorial is about using gensim models as part of your scikit-learn workflow with the help of the wrappers found in ```gensim.sklearn_api```.

# The wrappers available (as of now) are:
#
# * LdaModel (```gensim.sklearn_api.ldamodel.LdaTransformer```), which implements gensim's ```LDA Model``` in a scikit-learn interface
#
# * LsiModel (```gensim.sklearn_api.lsimodel.LsiTransformer```), which implements gensim's ```LSI Model``` in a scikit-learn interface
#
# * RpModel (```gensim.sklearn_api.rpmodel.RpTransformer```), which implements gensim's ```Random Projections Model``` in a scikit-learn interface
#
# * LdaSeqModel (```gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer```), which implements gensim's ```LdaSeqModel``` in a scikit-learn interface
#
# * Word2Vec (```gensim.sklearn_api.w2vmodel.W2VTransformer```), which implements gensim's ```Word2Vec``` in a scikit-learn interface
#
# * AuthorTopicModel (```gensim.sklearn_api.atmodel.AuthorTopicTransformer```), which implements gensim's ```AuthorTopicModel``` in a scikit-learn interface
#
# * Doc2Vec (```gensim.sklearn_api.d2vmodel.D2VTransformer```), which implements gensim's ```Doc2Vec``` in a scikit-learn interface
#
# * Text2Bow (```gensim.sklearn_api.text2bow.Text2BowTransformer```), which implements gensim's ```Dictionary``` in a scikit-learn interface
#
# * TfidfModel (```gensim.sklearn_api.tfidf.TfIdfTransformer```), which implements gensim's ```TfidfModel``` in a scikit-learn interface
#
# * HdpModel (```gensim.sklearn_api.hdp.HdpTransformer```), which implements gensim's ```HdpModel``` in a scikit-learn interface

# ### LDA Model

# To use LdaModel, begin by importing the LdaTransformer wrapper:

# In[1]:

from gensim.sklearn_api import LdaTransformer

# Next we create a dummy set of texts and convert it into a corpus:

# In[2]:

from gensim.corpora import Dictionary

texts = [
    ['compiler', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer']
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Then we fit the LDA wrapper on the corpus and transform it:

# In[3]:

model = LdaTransformer(num_topics=2, id2word=dictionary, iterations=20, random_state=1)
model.fit(corpus)
model.transform(corpus)
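# The output of ```transform``` above is a ```(num_docs, num_topics)``` matrix of topic probabilities. As an optional sanity check, we can also look at the topics themselves through the trained gensim model, which the wrapper exposes via its ```gensim_model``` attribute (the same attribute the custom scoring function further below relies on):

# In[ ]:

# print the top 3 words of each of the 2 learned topics
for topic_id, topic in model.gensim_model.print_topics(num_topics=2, num_words=3):
    print(topic_id, topic)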
# #### Integration with Sklearn

# To provide a more realistic example of how the wrapper fits into a scikit-learn workflow, let's generate topics from the [20 Newsgroups data set](http://qwone.com/~jason/20Newsgroups/). We will only use the categories rec.sport.baseball and sci.crypt.

# In[4]:

import numpy as np
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.datasets import fetch_20newsgroups
from gensim.sklearn_api.ldamodel import LdaTransformer

# In[5]:

np.random.seed(1)  # seed the global RNG so results are reproducible
cats = ['rec.sport.baseball', 'sci.crypt']
data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)

# Next, we use the loaded data to create our dictionary and corpus.

# In[6]:

data_texts = [doc.split() for doc in data.data]
id2word = Dictionary(data_texts)
corpus = [id2word.doc2bow(text) for text in data_texts]

# Then we just need to fit the corpus and id2word to our LDA wrapper.

# In[7]:

obj = LdaTransformer(id2word=id2word, num_topics=5, iterations=20)
lda = obj.fit(corpus)

# #### Example of Using Grid Search

# In[9]:

from sklearn.model_selection import GridSearchCV

# The built-in `score` function of the LDA wrapper class provides two modes, `perplexity` and `u_mass`, for computing the scores of the candidate models. The preferred mode is specified using the `scorer` parameter of the wrapper as follows:

# In[10]:

obj = LdaTransformer(id2word=id2word, num_topics=2, iterations=5, scorer='u_mass')  # 'scorer' can be 'perplexity' or 'u_mass'
parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}

# set `scoring` to `None` to use the built-in score function of the `LdaTransformer` class
model = GridSearchCV(obj, parameters, cv=3, scoring=None)
model.fit(corpus)
model.best_params_

# You can also supply a custom scoring function of your choice using the `scoring` parameter of `GridSearchCV`. The example below uses the `c_v` mode of the `CoherenceModel` class for computing the scores of the candidate models.

# In[11]:

from gensim.models.coherencemodel import CoherenceModel

# supplying a custom scoring function
def scoring_function(estimator, X, y=None):
    goodcm = CoherenceModel(model=estimator.gensim_model, texts=data_texts,
                            dictionary=estimator.gensim_model.id2word, coherence='c_v')
    return goodcm.get_coherence()

obj = LdaTransformer(id2word=id2word, num_topics=5, iterations=5)
parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}

# set `scoring` to your custom scoring function
model = GridSearchCV(obj, parameters, cv=2, scoring=scoring_function)
model.fit(corpus)
model.best_params_

# #### Example of Using Pipeline

# In[8]:

from sklearn.pipeline import Pipeline
from sklearn import linear_model

def print_features_pipe(clf, vocab, n=10):
    '''Better printing for a sorted list of features.'''
    coef = clf.named_steps['classifier'].coef_[0]
    print(coef)
    print('Positive features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0])))
    print('Negative features: %s' % (' '.join(['%s:%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0])))

# In[9]:

id2word = Dictionary([doc.split() for doc in data.data])
corpus = [id2word.doc2bow(doc.split()) for doc in data.data]

# In[10]:

model = LdaTransformer(num_topics=15, id2word=id2word, iterations=10, random_state=37)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model), ('classifier', clf)])
pipe.fit(corpus, data.target)
print_features_pipe(pipe, list(id2word.values()))
print(pipe.score(corpus, data.target))
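# The fitted pipeline can also classify documents it has not seen before, as long as they are converted to bag-of-words form with the same ```id2word``` dictionary. A minimal sketch (the two example strings below are made up for illustration):

# In[ ]:

new_docs = ['the pitcher threw a fastball', 'they cracked the encryption key']
new_corpus = [id2word.doc2bow(doc.split()) for doc in new_docs]
print(pipe.predict(new_corpus))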
# ### LSI Model

# To use LsiModel, begin by importing the LsiTransformer wrapper:

# In[11]:

from gensim.sklearn_api import LsiTransformer

# #### Example of Using Pipeline

# In[12]:

model = LsiTransformer(num_topics=15, id2word=id2word)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model), ('classifier', clf)])
pipe.fit(corpus, data.target)
print_features_pipe(pipe, list(id2word.values()))
print(pipe.score(corpus, data.target))

# ### Random Projections Model

# To use RpModel, begin by importing the RpTransformer wrapper:

# In[13]:

from gensim.sklearn_api import RpTransformer

# #### Example of Using Pipeline

# In[14]:

model = RpTransformer(num_topics=2)
np.random.seed(1)  # seed the global RNG so results are reproducible
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model), ('classifier', clf)])
pipe.fit(corpus, data.target)
print_features_pipe(pipe, list(id2word.values()))
print(pipe.score(corpus, data.target))

# ### LDASeq Model

# To use LdaSeqModel, begin by importing the LdaSeqTransformer wrapper:

# In[15]:

from gensim.sklearn_api import LdaSeqTransformer

# #### Example of Using Pipeline

# In[16]:

test_data = data.data[0:2]
test_target = data.target[0:2]
id2word_ldaseq = Dictionary([doc.split() for doc in test_data])
corpus_ldaseq = [id2word_ldaseq.doc2bow(doc.split()) for doc in test_data]

# the entries of time_slice must sum to the number of documents: one document per time slice here
model = LdaSeqTransformer(id2word=id2word_ldaseq, num_topics=2, time_slice=[1, 1], initialize='gensim')
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)  # l2 penalty used
pipe = Pipeline([('features', model), ('classifier', clf)])
pipe.fit(corpus_ldaseq, test_target)
print_features_pipe(pipe, list(id2word_ldaseq.values()))
print(pipe.score(corpus_ldaseq, test_target))

# ### Word2Vec Model

# To use the Word2Vec model, begin by importing the W2VTransformer wrapper:

# In[17]:

from gensim.sklearn_api import W2VTransformer

# #### Example of Using Pipeline

# In[18]:

w2v_texts = [
    ['calculus', 'is', 'the', 'mathematical', 'study', 'of', 'continuous', 'change'],
    ['geometry', 'is', 'the', 'study', 'of', 'shape'],
    ['algebra', 'is', 'the', 'study', 'of', 'generalizations', 'of', 'arithmetic', 'operations'],
    ['differential', 'calculus', 'is', 'related', 'to', 'rates', 'of', 'change', 'and', 'slopes', 'of', 'curves'],
    ['integral', 'calculus', 'is', 'related', 'to', 'accumulation', 'of', 'quantities', 'and', 'the', 'areas', 'under', 'and', 'between', 'curves'],
    ['physics', 'is', 'the', 'natural', 'science', 'that', 'involves', 'the', 'study', 'of', 'matter', 'and', 'its', 'motion', 'and', 'behavior', 'through', 'space', 'and', 'time'],
    ['the', 'main', 'goal', 'of', 'physics', 'is', 'to', 'understand', 'how', 'the', 'universe', 'behaves'],
    ['physics', 'also', 'makes', 'significant', 'contributions', 'through', 'advances', 'in', 'new', 'technologies', 'that', 'arise', 'from', 'theoretical', 'breakthroughs'],
    ['advances', 'in', 'the', 'understanding', 'of', 'electromagnetism', 'or', 'nuclear', 'physics', 'led', 'directly', 'to', 'the', 'development', 'of', 'new', 'products', 'that', 'have', 'dramatically', 'transformed', 'modern', 'day', 'society']
]

model = W2VTransformer(size=10, min_count=1)
model.fit(w2v_texts)

class_dict = {'mathematics': 1, 'physics': 0}
train_data = [
    ('calculus', 'mathematics'), ('mathematical', 'mathematics'),
    ('geometry', 'mathematics'), ('operations', 'mathematics'),
    ('curves', 'mathematics'), ('natural', 'physics'), ('nuclear', 'physics'),
    ('science', 'physics'), ('electromagnetism', 'physics'), ('natural', 'physics')
]
train_input = [x[0] for x in train_data]
train_target = [class_dict[x[1]] for x in train_data]

clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
clf.fit(model.transform(train_input), train_target)
text_w2v = Pipeline([('features', model), ('classifier', clf)])
score = text_w2v.score(train_input, train_target)
print(score)
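# Beyond feeding word vectors to a classifier, the trained Word2Vec model itself can be queried directly: like the LDA wrapper, ```W2VTransformer``` exposes the underlying gensim model as ```gensim_model``` after ```fit```. A small similarity query looks like this (with only nine tiny sentences and ```size=10```, the neighbours will of course be noisy):

# In[ ]:

# nearest neighbours of 'physics' in the learned embedding space
print(model.gensim_model.wv.most_similar('physics', topn=3))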
# ### AuthorTopic Model

# To use the AuthorTopic model, begin by importing the AuthorTopicTransformer wrapper:

# In[19]:

from gensim.sklearn_api import AuthorTopicTransformer

# #### Example of Using Pipeline

# In[20]:

from sklearn import cluster

atm_texts = [
    ['compiler', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
    ['graph', 'flow', 'network', 'graph'],
    ['loading', 'computer', 'system'],
    ['user', 'server', 'system'],
    ['tree', 'hamiltonian'],
    ['graph', 'trees'],
    ['computer', 'kernel', 'malfunction', 'computer'],
    ['server', 'system', 'computer'],
]
atm_dictionary = Dictionary(atm_texts)
atm_corpus = [atm_dictionary.doc2bow(text) for text in atm_texts]
author2doc = {
    'john': [0, 1, 2, 3, 4, 5, 6],
    'jane': [2, 3, 4, 5, 6, 7, 8],
    'jack': [0, 2, 4, 6, 8],
    'jill': [1, 3, 5, 7]
}

model = AuthorTopicTransformer(id2word=atm_dictionary, author2doc=author2doc, num_topics=10, passes=100)
model.fit(atm_corpus)

# create and train the clustering model
clstr = cluster.MiniBatchKMeans(n_clusters=2)
authors_full = ['john', 'jane', 'jack', 'jill']
clstr.fit(model.transform(authors_full))

# stack the two models together in a pipeline
text_atm = Pipeline([('features', model), ('cluster', clstr)])
author_list = ['jane', 'jack', 'jill']
ret_val = text_atm.predict(author_list)
print(ret_val)

# ### Doc2Vec Model

# To use the Doc2Vec model, begin by importing the D2VTransformer wrapper:

# In[21]:

from gensim.sklearn_api import D2VTransformer

# #### Example of Using Pipeline

# In[22]:

from gensim.models import doc2vec

d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(w2v_texts)]
model = D2VTransformer(min_count=1)
model.fit(d2v_sentences)

class_dict = {'mathematics': 1, 'physics': 0}
train_data = [
    (['calculus', 'mathematical'], 'mathematics'),
    (['geometry', 'operations', 'curves'], 'mathematics'),
    (['natural', 'nuclear'], 'physics'),
    (['science', 'electromagnetism', 'natural'], 'physics')
]
train_input = [x[0] for x in train_data]
train_target = [class_dict[x[1]] for x in train_data]

clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
clf.fit(model.transform(train_input), train_target)
text_d2v = Pipeline([('features', model), ('classifier', clf)])
score = text_d2v.score(train_input, train_target)
print(score)

# ### Text2Bow Model

# To use the Text2Bow model, begin by importing the Text2BowTransformer wrapper:

# In[23]:

from gensim.sklearn_api import Text2BowTransformer

# #### Example of Using Pipeline

# In[24]:

text2bow_model = Text2BowTransformer()
lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=0)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
text_t2b = Pipeline([('bow_model', text2bow_model), ('ldamodel', lda_model), ('classifier', clf)])
text_t2b.fit(data.data, data.target)
score = text_t2b.score(data.data, data.target)
print(score)

# ### TfIdf Model

# To use the TfIdf model, begin by importing the TfIdfTransformer wrapper:

# In[25]:

from gensim.sklearn_api import TfIdfTransformer

# #### Example of Using Pipeline

# In[26]:

tfidf_model = TfIdfTransformer()
tfidf_model.fit(corpus)
lda_model = LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=0)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
text_tfidf = Pipeline([('tfidf_model', tfidf_model), ('ldamodel', lda_model), ('classifier', clf)])
text_tfidf.fit(corpus, data.target)
score = text_tfidf.score(corpus, data.target)
print(score)
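# The TfIdf pipeline above starts from a pre-built bag-of-words corpus. By placing ```Text2BowTransformer``` in front, essentially the same chain can run end to end on raw strings instead. The following is a minimal sketch combining the transformers already introduced above (the hyperparameters are carried over unchanged, not tuned):

# In[ ]:

raw_text_pipe = Pipeline([
    ('bow_model', Text2BowTransformer()),
    ('tfidf_model', TfIdfTransformer()),
    ('ldamodel', LdaTransformer(num_topics=2, passes=10, minimum_probability=0, random_state=0)),
    ('classifier', linear_model.LogisticRegression(penalty='l2', C=0.1))
])
raw_text_pipe.fit(data.data, data.target)
print(raw_text_pipe.score(data.data, data.target))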
# ### HDP Model

# To use the HDP model, begin by importing the HdpTransformer wrapper:

# In[27]:

from gensim.sklearn_api import HdpTransformer

# #### Example of Using Pipeline

# In[28]:

model = HdpTransformer(id2word=id2word)
clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
text_hdp = Pipeline([('features', model), ('classifier', clf)])
text_hdp.fit(corpus, data.target)
score = text_hdp.score(corpus, data.target)
print(score)
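# Since the wrappers behave like ordinary scikit-learn estimators, a fitted pipeline can be persisted like any other. Below is a minimal sketch using the standard library's ```pickle```; note that this is only an illustration, and the larger gensim models also provide their own ```save```/```load``` methods, which may be preferable for big models.

# In[ ]:

import pickle

# serialize the fitted HDP pipeline to disk and load it back
with open('text_hdp_pipeline.pkl', 'wb') as fout:
    pickle.dump(text_hdp, fout)
with open('text_hdp_pipeline.pkl', 'rb') as fin:
    loaded_pipe = pickle.load(fin)

print(loaded_pipe.score(corpus, data.target))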