https://github.com/JasonKessler/scattertext
Cite as: Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.
Link to preprint: https://arxiv.org/abs/1703.00565
@inproceedings{kessler2017scattertext, author = {Kessler, Jason S.}, title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ}, booktitle = {ACL System Demonstrations}, year = {2017}, }
# %matplotlib inline
# NOTE(review): the line above is IPython magic — a SyntaxError in a plain
# Python script. Re-enable it only when running inside a Jupyter notebook.
import scattertext as st
from gensim.models import word2vec
import re, io, itertools
from pprint import pprint
import pandas as pd
import numpy as np
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML

# Widen the notebook container so the interactive scatterplots have room.
display(HTML("<style>.container { width:98% !important; }</style>"))

# NOTE(review): 'en' is the spaCy 1.x/2.x shortcut; spaCy 3+ requires the full
# package name, e.g. spacy.load('en_core_web_sm') — confirm installed version.
nlp = spacy.load('en')
# If this doesn't work, please uncomment the following line and use a regex-based parser instead
#nlp = st.whitespace_nlp_with_sentences

# Load the 2012 political convention speeches sample corpus and parse each text
# with spaCy, storing the parsed Doc objects alongside the raw text.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(nlp)
# Build a term-document corpus from the parsed speeches, keyed by party,
# then restrict it to unigrams (word2vec operates on single tokens).
corpus = (st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed')
          .build()
          .get_unigram_corpus())

# Train word2vec embeddings on the convention corpus.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4+ renamed it to
# `vector_size` — confirm the installed version before running.
model = word2vec.Word2Vec(size=100, window=5, min_count=10, workers=4)
model = st.Word2VecFromParsedCorpus(corpus, model).train(epochs=10000)

# Inspect the terms closest to 'jobs' in the trained embedding space.
# (In the original notebook the bare expression displayed its value; in a
# script we must print it explicitly or the result is silently discarded.)
pprint(model.wv.most_similar('jobs'))
# Example output from one run (embeddings are not deterministic):
# [('create', 0.919), ('businesses', 0.881), ('million', 0.840), ('taxes', 0.830), ...]

# Total sentence count across the corpus — was 9677 in the original run.
print(corpus._df[corpus._parsed_col].apply(lambda x: len(list(x.sents))).sum())
#model.corpus_count
target_term = 'jobs'

# Render an interactive scatterplot coloring each term by its embedding
# similarity to `target_term`, contrasting Democratic vs. Republican usage.
html = st.word_similarity_explorer_gensim(corpus,
                                          category='democrat',
                                          category_name='Democratic',
                                          not_category_name='Republican',
                                          target_term=target_term,
                                          minimum_term_frequency=5,
                                          width_in_pixels=1000,
                                          word2vec=model,
                                          metadata=convention_df['speaker'])

file_name = 'output/demo_similarity_gensim.html'
# Create the output directory if needed (the original write failed when
# 'output/' was absent) and close the handle deterministically via `with`
# instead of leaking it from a bare open(...).write(...).
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Note: this will fail if you did not use spaCy as your parser.
# Same visualization as above, but similarity comes from spaCy's built-in
# word vectors rather than the gensim model trained on this corpus.
html = st.word_similarity_explorer(corpus,
                                   category='democrat',
                                   category_name='Democratic',
                                   not_category_name='Republican',
                                   target_term='jobs',
                                   minimum_term_frequency=5,
                                   width_in_pixels=1000,
                                   metadata=convention_df['speaker'])

file_name = 'output/demo_similarity.html'
# Ensure the output directory exists and close the file deterministically
# (the original bare open(...).write(...) leaked the handle).
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)