#!/usr/bin/env python
# coding: utf-8

# ## Natural Language Visualization With Scattertext
# ## Jason S. Kessler @jasonkessler
# ### Global AI Conference 2018, Seattle, WA. April 27, 2018.
#
# The GitHub repository for this talk is at [https://github.com/JasonKessler/GlobalAI2018](https://github.com/JasonKessler/GlobalAI2018).
#
# Visualizations were made using [Scattertext](https://github.com/JasonKessler/scattertext).
#
# Please cite as:
# Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. ACL System Demonstrations. 2017.

# In[4]:

import pandas as pd
import numpy as np
import scattertext as st
import spacy
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML(""))
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# In[5]:

assert st.__version__ >= '0.0.2.25'

# ### The data
#
# The dataset consists of movie reviews and plot descriptions. Every plot description is guaranteed to come from a movie which was reviewed.
#
# The data set is from http://www.cs.cornell.edu/people/pabo/movie-review-data/
#
# References:
# * Bo Pang, Lillian Lee, and Shivakumar Vaithyanathan, Thumbs up? Sentiment Classification using Machine Learning Techniques, Proceedings of EMNLP 2002.
#
# * Bo Pang and Lillian Lee, A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts, Proceedings of ACL 2004.

# In[6]:

rdf = st.SampleCorpora.RottenTomatoes.get_data()
rdf['category_name'] = rdf['category'].apply(lambda x: {'plot': 'Plot',
                                                        'rotten': 'Negative',
                                                        'fresh': 'Positive'}[x])
print(rdf.category_name.value_counts())
rdf[['text', 'movie_name', 'category_name']].head()

# In[7]:

corpus = (st.CorpusFromPandas(rdf,
                              category_col='category_name',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
corpus.get_term_freq_df().to_csv('term_freqs.csv')
unigram_corpus = corpus.get_unigram_corpus()

# ### Let's visualize the corpus using Scattertext
#
# The x-axis indicates the frequency rank of a word or bigram within the set of positive reviews, and the y-axis its rank within the negative reviews.
#
# Ranks are determined using "dense" ranking: the most frequent terms, regardless of ties, are given rank 1, the next most frequent terms, regardless of ties, are given rank 2, and so on (see the short illustration below).
#
# Terms more associated with a class lie farther from the diagonal line running between the lower-left and upper-right corners, and terms are colored according to this distance. We'll return to this in a bit.
#
# Scattertext selectively labels points in such a way as to prevent labels from overlapping other elements of the graph. Mouse over points and term labels for a preview, and click for a keyword-in-context view.
#
# References:
# * Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. ACL System Demonstrations. 2017.
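# As a minimal illustration of "dense" ranking (the toy counts below are made up for this
# example), pandas' `rank(method='dense')` behaves exactly as described: tied frequencies
# share a rank, and the rank increases by one between distinct frequency values.

# In[ ]:

toy_counts = pd.Series({'film': 50, 'movie': 50, 'good': 30, 'bad': 30, 'luminous': 1})
print(toy_counts.rank(method='dense', ascending=False))
# film/movie share rank 1, good/bad share rank 2, luminous gets rank 3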
# In[5]:

html = st.produce_scattertext_explorer(
    corpus,
    category='Positive',
    not_categories=['Negative'],
    sort_by_dist=False,
    metadata=rdf['movie_name'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.percentile_dense
)
file_name = 'rotten_fresh_stdense.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# ### We can see more terms by breaking ties in ranking alphabetically.
# Lower-frequency terms are more prominent in this view, and more terms can be labeled.

# In[6]:

html = st.produce_scattertext_explorer(
    corpus,
    category='Positive',
    not_categories=['Negative'],
    sort_by_dist=False,
    metadata=rdf['movie_name'],
    term_scorer=st.RankDifference(),
)
file_name = 'rotten_fresh_st.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# ### Naive approach 1
# ### tf.idf difference (not recommended)
#
# $$ \mbox{Term Frequency}(\mbox{term}, \mbox{category}) = \#(\mbox{term}\in\mbox{category}) $$
#
# $$ \mbox{Inverse Document Frequency}(\mbox{term}) = \log \frac{\mbox{# of categories}}{\mbox{# of categories containing term}} $$
#
# $$ \mbox{tfidf}(\mbox{term}, \mbox{category}) = \mbox{Term Frequency}(\mbox{term}, \mbox{category}) \times \mbox{Inverse Document Frequency}(\mbox{term}) $$
#
# $$ \mbox{tfidf-difference}(\mbox{term}, \mbox{category}) = \mbox{tf.idf}(\mbox{term}, \mbox{category}_a) - \mbox{tf.idf}(\mbox{term}, \mbox{category}_b) $$
#
# Tf.idf gives a score of zero to terms used in every category. Since we only consider three categories (positive, negative, and plot descriptions), a large number of terms receive zero ($\log 1$) scores. Moreover, tf.idf doesn't weight how often a term is used in another category, which causes eccentric, brittle, low-frequency terms to be favored (a small sketch of the score follows below).
#
# This formulation does take into account data from a background corpus.
#
# $$ \#(\mbox{term}, \mbox{category}) \times \log \frac{\mbox{# of categories}}{\mbox{# of categories containing term}} $$
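# A minimal sketch of the tf.idf-difference score just described, treating each category as a
# single "document". The column names 'Positive freq' and 'Negative freq' are used later in
# this notebook; 'Plot freq' is assumed to follow the same naming convention.

# In[ ]:

tf = corpus.get_term_freq_df()[['Positive freq', 'Negative freq', 'Plot freq']]
idf = np.log(tf.shape[1] / (tf > 0).sum(axis=1))  # zero for terms appearing in all three categories
tfidf_difference = tf['Positive freq'] * idf - tf['Negative freq'] * idf
# the top-scoring terms tend to be eccentric, low-frequency terms missing from some category
tfidf_difference.sort_values(ascending=False).iloc[:10]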
# ## Scaled F-Score
# ### Associated terms have a *relatively* high category-specific precision and category-specific term frequency (i.e., the % of terms in the category which are that term)
# ### Take the harmonic mean of precision and frequency (both have to be high)
# ### We will make two adjustments to this method in order to arrive at the final formulation of Scaled F-Score
#
# Given a word $w_i \in W$ and a category $c_j \in C$, define the precision of the word $w_i$ with respect to a category as:
#
# $$ \mbox{prec}(i,j) = \frac{\#(w_i, c_j)}{\sum_{c \in C} \#(w_i, c)}. $$
#
# The function $\#(w_i, c_j)$ represents either the number of times $w_i$ occurs in a document labeled with the category $c_j$ or the number of documents labeled $c_j$ which contain $w_i$.
#
# Similarly, define the frequency with which a word occurs in the category as:
#
# $$ \mbox{freq}(i, j) = \frac{\#(w_i, c_j)}{\sum_{w \in W} \#(w, c_j)}. $$
#
# The harmonic mean of these two values is defined as:
#
# $$ \mathcal{H}_\beta(i,j) = (1 + \beta^2) \frac{\mbox{prec}(i,j) \cdot \mbox{freq}(i,j)}{\beta^2 \cdot \mbox{prec}(i,j) + \mbox{freq}(i,j)}. $$
#
# $\beta \in \mathcal{R}^+$ is a scaling factor: frequency is favored if $\beta < 1$, precision if $\beta > 1$, and both are weighted equally if $\beta = 1$. F-Score is equivalent to the harmonic mean where $\beta = 1$.

# In[8]:

from scipy.stats import hmean

term_freq_df = corpus.get_unigram_corpus().get_term_freq_df()[['Positive freq', 'Negative freq']]
term_freq_df = term_freq_df[term_freq_df.sum(axis=1) > 0]

term_freq_df['pos_precision'] = (term_freq_df['Positive freq'] * 1.
                                 / (term_freq_df['Positive freq'] + term_freq_df['Negative freq']))
term_freq_df['pos_freq_pct'] = (term_freq_df['Positive freq'] * 1.
                                / term_freq_df['Positive freq'].sum())
term_freq_df['pos_hmean'] = (term_freq_df
                             .apply(lambda x: (hmean([x['pos_precision'], x['pos_freq_pct']])
                                               if x['pos_precision'] > 0 and x['pos_freq_pct'] > 0
                                               else 0),
                                    axis=1))
term_freq_df.sort_values(by='pos_hmean', ascending=False).iloc[:10]

# In[9]:

term_freq_df.pos_freq_pct.describe()

# In[10]:

term_freq_df.pos_precision.describe()

# In[77]:

# The plot looks a bit better if you Anscombe transform the data, but it doesn't make a difference in SFS
#freq = 2*(np.sqrt(term_freq_df.pos_freq_pct.values)+3/8)
freq = term_freq_df.pos_freq_pct.values
prec = term_freq_df.pos_precision.values
html = st.produce_scattertext_explorer(
    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),
    category='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    x_label='Portion of words used in positive reviews',
    original_x=freq,
    x_coords=(freq - freq.min())/freq.max(),
    x_axis_values=[int(freq.min()*1000)/1000.,
                   int(freq.max()*1000)/1000.],
    y_label='Portion of documents containing word that are positive',
    original_y=prec,
    y_coords=(prec - prec.min())/prec.max(),
    y_axis_values=[int(prec.min()*1000)/1000.,
                   int((prec.max()/2.)*1000)/1000.,
                   int(prec.max()*1000)/1000.],
    scores=term_freq_df.pos_hmean.values,
    sort_by_dist=False,
    show_characteristic=False
)
file_name = 'not_normed_freq_prec.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# In[27]:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from scipy.stats import norm

fig, ax = plt.subplots(figsize=(15, 10))

freqs = term_freq_df.pos_freq_pct[term_freq_df.pos_freq_pct > 0]
log_freqs = np.log(freqs)
sns.distplot(log_freqs[:1000], kde=False, rug=True, hist=False, rug_kws={"color": "k"})
x = np.linspace(log_freqs.min(), log_freqs.max(), 100)
frozen_norm = norm(log_freqs.mean(), log_freqs.std())
y = frozen_norm.pdf(x)
plt.plot(x, y, color='k')

term = 'beauty'
word_freq = log_freqs.loc[term]
term_cdf = frozen_norm.cdf(word_freq)
plt.axvline(x=word_freq, color='red', label='Log frequency of "' + term + '"')
plt.fill_between(x[x < word_freq], y[x < word_freq], y[x < word_freq] * 0,
                 facecolor='blue', alpha=0.5,
                 label="Log-normal CDF of %s: $%0.3f \in [0,1]$" % (term, term_cdf))
ax.set_xlabel('Log term frequency')
ax.set_ylabel('Cumulative term probability')
plt.legend()
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label]
             + ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(20)
plt.rc('legend', fontsize=20)
plt.show()
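# Before formalizing the fix, a quick numeric check of why combining the raw scores is
# problematic (the numbers below are illustrative, not taken from the data): precision lies
# in [0, 1] while pos_freq_pct is typically tiny, so the harmonic mean tracks the tiny
# frequency value and is barely affected by precision.

# In[ ]:

from scipy.stats import hmean

print(hmean([0.9, 0.0001]))  # ~0.0002
print(hmean([0.5, 0.0001]))  # ~0.0002 -- nearly halving precision barely moves the score
print(hmean([0.9, 0.0002]))  # ~0.0004 -- doubling frequency roughly doubles it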
# ## Problem: harmonic means of the raw scores are dominated by the tiny frequency percentages
# ### Take the Normal CDF of the precision and frequency-percentage scores, which scales and standardizes both scores to fall between 0 and 1.
#
# Define the Normal CDF as:
#
# $$ \Phi(z) = \int_{-\infty}^z \mathcal{N}(x; \mu, \sigma^2)\ \mathrm{d}x.$$
#
# Here $\mathcal{N}$ is the PDF of the Normal distribution, $\mu$ is the mean, and $\sigma^2$ is the variance.
#
# $\Phi$ is used to scale and standardize the precisions and frequencies, placing them on the same $[0,1]$ scale.
#
# Now we can define Scaled F-Score as the harmonic mean of the Normal-CDF-transformed frequency and precision:
#
# $$ \mbox{S-CAT}_{\beta}(i, j) = \mathcal{H}_{\beta}(\Phi(\mbox{prec}(i, j)), \Phi(\mbox{freq}(i, j))).$$
#
# $\mu$ and $\sigma^2$ are estimated separately from the precisions and from the frequencies.
#
# A $\beta$ of 2 is recommended and is the default value in Scattertext.
#
# Note that any function with a range of $[0,1]$ (this includes the identity function) may be used in place of $\Phi$. Also, when the precision is very small (e.g., for a tiny minority class), normalization may be foregone.

# In[78]:

from scipy.stats import norm

def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())

term_freq_df['pos_precision_normcdf'] = normcdf(term_freq_df.pos_precision)
term_freq_df['pos_freq_pct_normcdf'] = normcdf(term_freq_df.pos_freq_pct.values)
term_freq_df['pos_scaled_f_score'] = hmean([term_freq_df['pos_precision_normcdf'],
                                            term_freq_df['pos_freq_pct_normcdf']])
term_freq_df.sort_values(by='pos_scaled_f_score', ascending=False).iloc[:10]

# In[79]:

term_freq_df.sort_values(by='pos_scaled_f_score', ascending=True).iloc[:10]

# In[81]:

freq = term_freq_df.pos_freq_pct_normcdf.values
prec = term_freq_df.pos_precision_normcdf.values
html = st.produce_scattertext_explorer(
    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),
    category='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    x_label='Portion of words used in positive reviews (norm-cdf)',
    original_x=freq,
    x_coords=(freq - freq.min())/freq.max(),
    x_axis_values=[int(freq.min()*1000)/1000.,
                   int(freq.max()*1000)/1000.],
    y_label='Portion of documents containing word that are positive (norm-cdf)',
    original_y=prec,
    y_coords=(prec - prec.min())/prec.max(),
    y_axis_values=[int(prec.min()*1000)/1000.,
                   int((prec.max()/2.)*1000)/1000.,
                   int(prec.max()*1000)/1000.],
    scores=term_freq_df.pos_scaled_f_score.values,
    sort_by_dist=False,
    show_characteristic=False
)
file_name = 'normed_freq_prec.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# ## A second problem: the lowest-scoring terms are merely low-frequency, brittle terms.
# ## Make the approach fair to negatively associated terms
# ### Solution: compute the SFS of the negative class. If that score has a higher magnitude than the positive SFS, keep it, but as a negative score.
#
# Define the Scaled F-Score for category $j$ as
#
# $$ \mbox{S-CAT}^{j} = \mbox{S-CAT}_{\beta}(i, j). $$
#
# Define a class $\neg j$ which includes all categories other than $j$, and the Scaled F-Score for all other categories as
#
# $$ \mbox{S-CAT}^{\neg j} = \mbox{S-CAT}_{\beta}(i, \neg j). $$
#
# Let the corrected version of Scaled F-Score be:
#
# $$\mathcal{S}_{\beta} = 2 \cdot \big(-0.5 + \begin{cases}
#   \mbox{S-CAT}^{j} & \text{if}\ \mbox{S-CAT}^{j} > \mbox{S-CAT}^{\neg j}, \\
#   1 - \mbox{S-CAT}^{\neg j} & \text{if}\ \mbox{S-CAT}^{j} < \mbox{S-CAT}^{\neg j}, \\
#   0 & \text{otherwise}.
# \end{cases} \big).$$
#
# Note that the range of $\mathcal{S}$ is now $[-1, 1]$: $\mathcal{S} < 0$ indicates a term less associated with the category in question than average, while a positive score indicates a term more associated with it.
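# A minimal scalar sketch of the piecewise correction above (the function name is hypothetical;
# the pandas cell that follows applies the same logic column-wise to the whole term list).

# In[ ]:

def corrected_scaled_f_score(sfs_cat, sfs_not_cat):
    # keep whichever association is stronger, flipping it below 0.5 when the term
    # is more associated with the other class, then rescale from [0, 1] to [-1, 1]
    if sfs_cat > sfs_not_cat:
        s = sfs_cat
    elif sfs_cat < sfs_not_cat:
        s = 1 - sfs_not_cat
    else:
        s = 0
    return 2 * (s - 0.5)

print(corrected_scaled_f_score(0.9, 0.2))  # 0.8: strongly positive-associated
print(corrected_scaled_f_score(0.2, 0.9))  # -0.8: strongly negative-associated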
# In[83]:

term_freq_df['neg_precision_normcdf'] = normcdf((term_freq_df['Negative freq'] * 1.
                                                 / (term_freq_df['Negative freq'] + term_freq_df['Positive freq'])))
term_freq_df['neg_freq_pct_normcdf'] = normcdf((term_freq_df['Negative freq'] * 1.
                                                / term_freq_df['Negative freq'].sum()))
term_freq_df['neg_scaled_f_score'] = hmean([term_freq_df['neg_precision_normcdf'],
                                            term_freq_df['neg_freq_pct_normcdf']])

term_freq_df['scaled_f_score'] = 0
term_freq_df.loc[term_freq_df['pos_scaled_f_score'] > term_freq_df['neg_scaled_f_score'],
                 'scaled_f_score'] = term_freq_df['pos_scaled_f_score']
term_freq_df.loc[term_freq_df['pos_scaled_f_score'] < term_freq_df['neg_scaled_f_score'],
                 'scaled_f_score'] = 1 - term_freq_df['neg_scaled_f_score']
term_freq_df['scaled_f_score'] = 2 * (term_freq_df['scaled_f_score'] - 0.5)
term_freq_df.sort_values(by='scaled_f_score', ascending=False).iloc[:10]

# In[84]:

term_freq_df.sort_values(by='scaled_f_score', ascending=True).iloc[:10]

# In[86]:

is_pos = term_freq_df.pos_scaled_f_score > term_freq_df.neg_scaled_f_score
freq = term_freq_df.pos_freq_pct_normcdf * is_pos - term_freq_df.neg_freq_pct_normcdf * ~is_pos
prec = term_freq_df.pos_precision_normcdf * is_pos - term_freq_df.neg_precision_normcdf * ~is_pos

def scale(ar):
    return (ar - ar.min()) / (ar.max() - ar.min())

def close_gap(ar):
    ar[ar > 0] -= ar[ar > 0].min()
    ar[ar < 0] -= ar[ar < 0].max()
    return ar

html = st.produce_scattertext_explorer(
    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),
    category='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    x_label='Frequency',
    original_x=freq,
    x_coords=scale(close_gap(freq)),
    x_axis_labels=['Frequent in Neg', 'Not Frequent', 'Frequent in Pos'],
    y_label='Precision',
    original_y=prec,
    y_coords=scale(close_gap(prec)),
    y_axis_labels=['Neg Precise', 'Imprecise', 'Pos Precise'],
    scores=(term_freq_df.scaled_f_score.values + 1) / 2,
    sort_by_dist=False,
    show_characteristic=False
)
file_name = 'sfs_explain.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# In[90]:

html = st.produce_frequency_explorer(
    corpus.remove_terms(set(corpus.get_terms()) - set(term_freq_df.index)),
    category='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    term_scorer=st.ScaledFScorePresets(beta=1, one_to_neg_one=True),
    metadata=rdf['movie_name'],
    grey_threshold=0
)
file_name = 'freq_sfs.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1300, height=700)

# In[ ]:
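# As a closing convenience note: the Scaled F-Scores built up step by step above can also be
# obtained directly from a Scattertext corpus via `get_scaled_f_scores`, following the usage
# shown in the Scattertext README (exact defaults and keyword arguments may differ by version).

# In[ ]:

sfs_df = unigram_corpus.get_term_freq_df()
sfs_df['Positive SFS'] = unigram_corpus.get_scaled_f_scores('Positive')
sfs_df.sort_values(by='Positive SFS', ascending=False).iloc[:10]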