#!/usr/bin/env python
# coding: utf-8

# # Import necessary dependencies and settings

# In[1]:

import pandas as pd
import numpy as np
import re
import nltk

# # Sample corpus of text documents

# In[2]:

corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'
          ]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

# # Simple text pre-processing

# In[3]:

wpt = nltk.WordPunctTokenizer()
# nltk.download('stopwords')  # uncomment if the stopwords corpus is not yet installed
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower-case and remove special characters / extra whitespace
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

# In[4]:

norm_corpus = normalize_corpus(corpus)
norm_corpus

# # Bag of Words Model

# In[5]:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

# In[6]:

vocab = cv.get_feature_names_out()
pd.DataFrame(cv_matrix, columns=vocab)

# # Bag of N-Grams Model

# In[7]:

bv = CountVectorizer(ngram_range=(2, 2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names_out()
pd.DataFrame(bv_matrix, columns=vocab)

# # TF-IDF Model

# In[8]:

from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

# # Document Similarity

# In[9]:

from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

# ## Clustering documents using similarity features

# In[10]:

from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

# # Topic models

# In[11]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features

# ## Show topics and their weights

# In[12]:

tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()

# ## Clustering documents using topic model features

# In[13]:

km = KMeans(n_clusters=2)
km.fit_transform(features)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)
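# ## Comparing cluster labels with the original categories

# A minimal follow-up sketch (not part of the original notebook): cross-tabulating the
# K-Means cluster labels against the hand-assigned 'Category' column gives a quick check
# of how well the topic-model features separate the two themes. It reuses `features`,
# `corpus_df`, and `KMeans` from the cells above; `km_topics` and `topic_clusters` are
# new, illustrative names, and the fixed `random_state` is only there for repeatability.

# In[ ]:

# refit on the topic-model features so the labels line up with this sketch
km_topics = KMeans(n_clusters=2, random_state=42)
topic_clusters = km_topics.fit_predict(features)
# rows: true categories, columns: cluster ids
pd.crosstab(corpus_df['Category'], pd.Series(topic_clusters, name='ClusterLabel'))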
# # Word Embeddings

# In[14]:

from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10    # Word vector dimensionality
window_context = 10  # Context window size
min_word_count = 1   # Minimum word count
sample = 1e-3        # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, vector_size=feature_size,
                              window=window_context, min_count=min_word_count,
                              sample=sample)

# In[15]:

w2v_model.wv['sky']

# In[16]:

def average_word_vectors(words, model, vocabulary, num_features):
    # average the vectors of all in-vocabulary words in a document
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

# In[17]:

w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                              num_features=feature_size)
pd.DataFrame(w2v_feature_array)

# In[18]:

from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation()
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)
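# ## Inspecting the learned embeddings

# A small sketch (not from the original notebook): `most_similar` on the trained
# word vectors lists the nearest neighbours of a word in embedding space, and
# `cosine_similarity` on the averaged document vectors mirrors the TF-IDF similarity
# matrix computed earlier. With a toy corpus this small the neighbours are noisy,
# so treat the output as illustrative only.

# In[ ]:

# nearest neighbours of 'sky' by cosine similarity in the 10-dimensional space
w2v_model.wv.most_similar('sky', topn=3)

# In[ ]:

# document-document similarity using the averaged word vectors
pd.DataFrame(np.round(cosine_similarity(w2v_feature_array), 2))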