Classy words: Exploring Term-Category Association Metrics

Jason S. Kessler (@jasonkessler)

This notebook makes heavy use of the library Scattertext (https://github.com/JasonKessler/scattertext) for language processing and visualizations.

The data used were scraped from Facebook by Max Woolf. Please see his original notebook at https://github.com/minimaxir/clickbait-cluster.

In [1]:

import pandas as pd
import numpy as np
import scattertext as st
import sys
import datetime
import nltk
import re
import scipy
import spacy
import altair as alt

from glob import glob
from scipy.stats import rankdata
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): recent spaCy releases dropped shortcut names such as 'en' and
# expect a full package name (e.g. 'en_core_web_sm') — confirm against the
# spaCy version installed for this notebook.
nlp = spacy.load('en')

In [3]:

# Read every tab-separated headline dump under ./fb_headlines and tag its rows
# with the publication name, i.e. the part of the file name that precedes the
# first underscore.
df = pd.concat(
    [
        pd.read_csv(path, sep='\t').assign(
            publication=path.split('/')[-1].split('_')[0]
        )
        for path in glob('./fb_headlines/*')
    ]
).reset_index()
# Parse the publication timestamps so that .year / .month are available later.
df['status_published'] = pd.to_datetime(df.status_published)
# Keep only the rows that actually carry a headline (non-null link_name).
df = df.loc[df.link_name.dropna().index]

In [4]:

# Run the spaCy pipeline over every headline and keep the result in a 'parse'
# column; the corpus built below reads that column via parsed_col='parse'.
df['parse'] = df.link_name.apply(nlp)

In [5]:

# Derive the time buckets from the publication timestamp: a zero-padded
# 'YYYY-MM' string per month and the plain calendar year.
df['year_month'] = df.status_published.apply(
    lambda published: '%s-%s' % (published.year, str(published.month).zfill(2))
)
df['year'] = df.status_published.apply(lambda published: published.year)

In [6]:

# Rank every post's reaction count within its (publication, year) group and
# scale the rank by the group size, giving a within-group percentile.
# `transform` is used because its result is always aligned to df's own index;
# the result index of `groupby(...).apply(...)` depends on the pandas version
# (the group keys may be prepended), which breaks the column assignment.
df['reaction_percentile'] = (
    df.groupby(['publication', 'year'])['num_reactions']
    .transform(lambda reactions: rankdata(reactions) / len(reactions))
)
# Split the percentile into thirds: top third 'Hi', bottom third 'Lo',
# everything else (including the exact boundaries) 'Mid'.
df['reaction_bin'] = (df.reaction_percentile
                      .apply(lambda pct: 'Hi' if pct > 2./3 else 'Lo' if pct < 1./3 else 'Mid'))

In [7]:

# Build a Scattertext corpus from the parsed headlines: the category of each
# document is its 'year_month' bucket, the spaCy parse is read from the 'parse'
# column, and features come from st.PhraseMachinePhrases().
year_month_corpus = st.CorpusFromParsedDocuments(df, 
                                                 category_col='year_month', 
                                                 parsed_col='parse', 
                                                 feats_from_spacy_doc=st.PhraseMachinePhrases()).build()

In [ ]:

# Compact the corpus with st.CompactTerms(minimum_term_count=3); the charts
# below are all built from this compacted corpus.
year_month_corpus_compact = year_month_corpus.compact(st.CompactTerms(minimum_term_count=3))

In [96]:

import numpy as np
import pandas as pd

from scattertext.termranking import AbsoluteFrequencyRanker
from scattertext.termscoring.RankDifference import RankDifference


class GanttChart(object):
    '''
    Gantt-style chart of the terms that score highest in each time step
    relative to the time steps immediately preceding it.

    Note: the Gantt charts listed here are inspired by
    Dustin Arendt and Svitlana Volkova. ESTEEM: A Novel Framework for Qualitatively Evaluating and
    Visualizing Spatiotemporal Embeddings in Social Media. ACL System Demonstrations. 2017.
    http://www.aclweb.org/anthology/P/P17/P17-4005.pdf
    '''
    def __init__(self,
                 term_doc_matrix,
                 category_to_timestep_func,
                 is_gap_between_sequences_func,
                 timesteps_to_lag=4,
                 num_top_terms_each_timestep=10,
                 num_terms_to_include=40,
                 starting_time_step=None,
                 term_ranker=AbsoluteFrequencyRanker,
                 term_scorer=RankDifference()):
        '''
        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
            Corpus whose categories are the time steps; categories are
            compared and sorted as plain values, so their sort order must be
            chronological.
        category_to_timestep_func : lambda
            Maps a category name to the time value plotted on the x axis.
        is_gap_between_sequences_func : lambda
            Called as f(previous_timestep, current_timestep); a true result
            ends the current bar and starts a new one.
        timesteps_to_lag : int
            Number of preceding categories each time step is scored against.
        num_top_terms_each_timestep : int
            Number of best-scoring terms recorded for each time step.
        num_terms_to_include : int
            Number of terms (most frequently recorded first) kept in the chart.
        starting_time_step : object
            First category to score.  Defaults to the category at sorted
            position ``timesteps_to_lag + 1``, or the last category when the
            corpus has exactly ``timesteps_to_lag + 1`` categories.
        term_ranker : TermRanker
            Class instantiated with the corpus; its get_ranks() frame supplies
            the per-category '<category> freq' columns.
        term_scorer : TermScorer
            Object whose get_scores(lagged_counts, current_counts) ranks terms.

        Raises
        ------
        Exception
            If the corpus has no more than ``timesteps_to_lag`` categories.
        '''
        self.corpus = term_doc_matrix
        self.timesteps_to_lag = timesteps_to_lag
        self.num_top_terms_each_timestep = num_top_terms_each_timestep
        self.num_terms_to_include = num_terms_to_include
        self.is_gap_between_sequences_func = is_gap_between_sequences_func
        self.category_to_timestep_func = category_to_timestep_func
        self.term_ranker = term_ranker
        self.term_scorer = term_scorer
        categories = list(sorted(self.corpus.get_categories()))
        if len(categories) <= timesteps_to_lag:
            raise Exception("The number of categories in the term doc matrix is <= "
                            + str(timesteps_to_lag))
        if starting_time_step is None:
            # The guard above only guarantees timesteps_to_lag + 1 categories,
            # so index timesteps_to_lag + 1 may not exist; clamp it to the last
            # category instead of raising IndexError.
            default_position = min(timesteps_to_lag + 1, len(categories) - 1)
            starting_time_step = categories[default_position]
        self.starting_time_step = starting_time_step

    def make_chart(self):
        '''
        Returns
        -------
        altair.Chart
            Bar chart with one bar per row of get_task_df(), spanning
            'start' to 'end' on the x axis with the term on the y axis.
        '''
        task_df = self.get_task_df()
        import altair as alt
        # One axis tick per distinct timestamp that starts or ends a bar.
        num_ticks = len(set(task_df.start) | set(task_df.end))
        # NOTE(review): the `axisconfig` keyword and `alt.AxisConfig` are kept
        # exactly as the notebook used them; confirm that the installed altair
        # version still accepts them.
        chart = alt.Chart(
            task_df,
            axisconfig=alt.AxisConfig(grid=True, ticks=num_ticks)
        ).mark_bar().encode(
            x='start',
            x2='end',
            y='term',
        )
        return chart

    def _get_term_time_df(self):
        '''
        Score every category from starting_time_step onwards against the sum
        of the (up to) timesteps_to_lag categories that sort before it, and
        record the best-scoring terms.

        Returns
        -------
        pd.DataFrame
            One row per (time step, top term) with columns 'time', 'term'
            and a constant 'top' of 1.
        '''
        data = []
        tdf = self.term_ranker(self.corpus).get_ranks()
        for cat in sorted(self.corpus.get_categories()):
            if cat < self.starting_time_step:
                continue
            # Columns are named '<category> freq'; every column that sorts
            # before the bare category name belongs to an earlier time step.
            lagged_columns = sorted(
                col for col in tdf.columns if col < cat
            )[-self.timesteps_to_lag:]
            scores = self.term_scorer.get_scores(
                tdf[lagged_columns].sum(axis=1),
                tdf[cat + ' freq'].astype(int)
            )
            top_terms = tdf.index[np.argsort(-scores)[:self.num_top_terms_each_timestep]]
            time_step = self.category_to_timestep_func(cat)
            for term in top_terms:
                data.append({'time': time_step,
                             'term': term,
                             'top': 1})
        return pd.DataFrame(data)

    def get_task_df(self):
        '''
        Returns
        -------
        pd.DataFrame
            Columns 'term', 'start', 'end': one row per uninterrupted run of
            time steps in which a retained term was among the top terms.
        '''
        term_time_df = self._get_term_time_df()
        # Keep the terms that were a top term in the most time steps.
        terms_to_include = (
            term_time_df
                .groupby('term')['top']
                .sum()
                .sort_values(ascending=False)
                .iloc[:self.num_terms_to_include].index
        )
        task_df = (
            term_time_df[term_time_df.term.isin(terms_to_include)][['time', 'term']]
                .groupby('term')
                .apply(lambda x: pd.Series(self._find_sequences(x['time'])))
                .reset_index()
                .rename({0: 'sequence'}, axis=1)
                .reset_index()
                .assign(start=lambda x: x['sequence'].apply(lambda x: x[0]))
                .assign(end=lambda x: x['sequence'].apply(lambda x: x[1]))
            [['term', 'start', 'end']]
        )
        return task_df

    def _find_sequences(self, time_steps):
        '''
        Collapse time steps into uninterrupted runs.

        Parameters
        ----------
        time_steps : iterable
            Time values; they are sorted before being scanned.

        Returns
        -------
        list
            [first, last] pairs, one per run.  Consecutive time steps belong
            to the same run unless is_gap_between_sequences_func(previous,
            current) is true.  An empty input yields an empty list.
        '''
        sequences = []
        run_open = False
        run_start = None
        last_timestep = None
        for cur_timestep in sorted(time_steps):
            if not run_open:
                run_start = cur_timestep
                run_open = True
            elif self.is_gap_between_sequences_func(last_timestep, cur_timestep):
                sequences.append([run_start, last_timestep])
                run_start = cur_timestep
            last_timestep = cur_timestep
        # The original test `len(cur_sequence) != []` was always true and
        # raised IndexError for empty input; only close a run that was opened.
        if run_open:
            sequences.append([run_start, last_timestep])
        return sequences

In [97]:

# Settings shared by the chart cells below.
chart_start_year = '2015-01'
timesteps_to_lag = 4
num_top_terms_each_timestep = 10
num_terms_to_include = 40


def category_to_datetime(category):
    '''Turn a dash-separated category such as '2015-01' into a Timestamp.

    The integer fields of the category are passed to datetime.datetime
    followed by a trailing 1 (the day of the month for 'YYYY-MM').
    '''
    date_fields = [int(field) for field in category.split('-')]
    date_fields.append(1)
    return pd.to_datetime(datetime.datetime(*date_fields))


def is_gap_between_sequences(t1, t2):
    '''Return whether the two timestamps lie more than 31 days apart.'''
    days_apart = np.abs((t2 - t1).days)
    return days_apart > 31
# Alias for the uncompacted corpus; a later cell passes `corpus` to
# generate_diachronic_chart.
corpus = year_month_corpus

In [ ]:

In [98]:

# Inspect the intermediate results for a single term: build the chart object,
# take its (time, term) table of top terms, and collapse the time steps at
# which 'north korea' appears into [first, last] runs.
gc = GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year)
ttd = gc._get_term_time_df()
gc._find_sequences(ttd[ttd['term'] == 'north korea'].time)

Out[98]:

[[Timestamp('2015-02-01 00:00:00'), Timestamp('2015-04-01 00:00:00')],
 [Timestamp('2016-06-01 00:00:00'), Timestamp('2016-07-01 00:00:00')]]

In [99]:

# List the time steps recorded for 'high school' in the top-term table.
list(ttd[ttd['term'] == 'high school'].time)

Out[99]:

[Timestamp('2015-07-01 00:00:00'), Timestamp('2016-07-01 00:00:00')]

In [100]:

# Count the distinct start/end timestamps across all bars.  The task table is
# built once and reused; the original constructed two identical GanttChart
# objects and recomputed the table for each side of the union.
task_df = GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences,
                     starting_time_step=chart_start_year).get_task_df()
len(set(task_df.start) | set(task_df.end))

Out[100]:

In [101]:

# Render the Gantt chart for the compacted corpus, scoring from chart_start_year onwards.
GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year).make_chart()

In [31]:

# Show the term/start/end table that the chart is drawn from.
GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year).get_task_df()

Out[31]:

	term	start	end
0	boston marathon	2013-05-01	2013-08-01
1	bucket challenge	2014-09-01	2014-12-01
2	buzzfeed news	2014-01-01	2015-10-01
3	buzzfeed video	2015-02-01	2015-07-01
4	charlie hebdo	2015-02-01	2015-05-01
5	climate change	2013-01-01	2016-02-01
6	cover photo	2013-01-01	2013-06-01
7	fashion week	2013-10-01	2015-01-01
8	fox news	2013-03-01	2013-09-01
9	game of thrones	2014-07-01	2016-08-01
10	gay marriage	2013-06-01	2014-07-01
11	golden globes	2015-02-01	2015-05-01
12	gun control	2013-03-01	2013-08-01
13	high school	2013-08-01	2016-07-01
14	hurricane sandy	2013-01-01	2013-12-01
15	ice bucket	2014-09-01	2014-12-01
16	ice bucket challenge	2014-09-01	2014-12-01
17	live blog	2014-08-01	2014-10-01
18	miley cyrus	2013-11-01	2014-02-01
19	minimum wage	2014-07-01	2015-06-01
20	new hampshire	2016-03-01	2016-06-01
21	new york city	2014-02-01	2015-05-01
22	new york times	2013-01-01	2014-03-01
23	new york today	2014-10-01	2015-06-01
24	north korea	2013-05-01	2016-07-01
25	paris attacks	2015-12-01	2016-03-01
26	photos from cnn	2015-06-01	2016-08-01
27	pope francis	2015-10-01	2016-01-01
28	robin williams	2014-09-01	2014-12-01
29	san bernardino	2016-01-01	2016-04-01
30	sandra bland	2015-08-01	2015-11-01
31	sex marriage	2013-04-01	2015-09-01
32	state of the union	2013-04-01	2016-05-01
33	super bowl	2013-03-01	2016-06-01
34	supreme court	2013-04-01	2016-07-01
35	syrian refugees	2016-01-01	2016-03-01
36	taylor swift	2015-03-01	2015-10-01
37	thanksgiving recipe	2014-12-01	2015-03-01
38	united states	2013-12-01	2014-09-01
39	world cup	2014-08-01	2015-10-01

In [196]:

# Term-by-category table produced by AbsoluteFrequencyRanker for the compacted
# corpus; its columns are addressed as '<category> freq' elsewhere in this notebook.
tdf = AbsoluteFrequencyRanker(year_month_corpus_compact).get_ranks()

In [91]:

# Show the per-term values for one category, largest first.
# NOTE(review): the category '1913-02' does not match the 2013-2016 range shown
# in the task table above — presumably this cell was run against a different
# corpus; confirm before relying on it.
tdf['1913-02 freq'].sort_values(ascending=False)

Out[91]:

term
times square                                   1
| upworthy                                     0
mom reacts                                     0
street art                                     0
mel gibson                                     0
cbs news                                       0
good use                                       0
joins twitter                                  0
oil spill                                      0
tech support                                   0
bill nye on                                    0
angry mom                                      0
hail storm                                     0
kung fu                                        0
insane clown posse                             0
kids reenact                                   0
olivia munn                                    0
goldman sachs                                  0
earth day                                      0
factory workers                                0
human trafficking                              0
tiger woods                                    0
warning signs                                  0
black parents                                  0
law degree                                     0
pie charts                                     0
gay men                                        0
action figure                                  0
10 songs                                       0
viral post                                     0
                                              ..
day in court                                   0
political talk                                 0
back pages                                     0
salmon roasted in butter recipe                0
home to roost                                  0
episode 3 recap                                0
carroll gardens                                0
u.n.c. football player                         0
medical bills                                  0
sells out                                      0
ukrainian military                             0
podcast | episode                              0
state dinner guest                             0
democratic debate takeaways                    0
tough path                                     0
hangs over                                     0
primary night                                  0
vietnamese steak with cucumber salad recipe    0
new york times fashion on instagram            0
rocket company                                 0
blue origin                                    0
contested republican convention                0
computer program                               0
bernie sanders wins michigan                   0
michigan primary                               0
right way to stretch before exercise           0
episode 7 recap                                0
star candidate                                 0
oregon occupier                                0
five years                                     0
Name: 1913-02 freq, Length: 23551, dtype: int64

In [85]:

# NOTE(review): `term_time_df` is not assigned in any cell visible in this
# notebook; it presumably survives from an earlier interactive session —
# confirm before re-running from a fresh kernel.
# Number of rows per term, largest first.  This value is discarded, because a
# cell only displays its last expression.
term_time_df.groupby('term').apply(lambda x: len(list(x['time']))).sort_values(ascending=False)
# Show the rows recorded for 'michigan primary'.
term_time_df[term_time_df.term == 'michigan primary']

Out[85]:

	term	time	top
8	michigan primary	1918-06-01	1
17	michigan primary	1919-06-01	1
26	michigan primary	1926-02-01	1
36	michigan primary	1926-05-01	1
46	michigan primary	1928-11-01	1
56	michigan primary	1933-04-01	1
65	michigan primary	1933-12-01	1
76	michigan primary	1934-01-01	1
86	michigan primary	1934-02-01	1
96	michigan primary	1940-04-01	1
105	michigan primary	1942-02-01	1
115	michigan primary	1942-08-01	1
125	michigan primary	1946-11-01	1
135	michigan primary	1947-04-01	1
146	michigan primary	1947-08-01	1
156	michigan primary	1948-01-01	1
166	michigan primary	1951-01-01	1
176	michigan primary	1954-01-01	1
185	michigan primary	1955-03-01	1
195	michigan primary	1958-01-01	1
206	michigan primary	1959-01-01	1
216	michigan primary	1960-09-01	1
226	michigan primary	1962-03-01	1
236	michigan primary	1963-08-01	1
245	michigan primary	1963-11-01	1
256	michigan primary	1964-01-01	1
266	michigan primary	1964-03-01	1
278	michigan primary	1965-11-01	1
288	michigan primary	1966-01-01	1
299	michigan primary	1967-02-01	1
...	...	...	...
789	michigan primary	1997-10-01	1
796	michigan primary	1998-04-01	1
806	michigan primary	1999-05-01	1
816	michigan primary	1999-12-01	1
825	michigan primary	2000-10-01	1
836	michigan primary	2001-01-01	1
848	michigan primary	2001-04-01	1
856	michigan primary	2001-09-01	1
869	michigan primary	2001-10-01	1
874	michigan primary	2002-04-01	1
885	michigan primary	2002-05-01	1
895	michigan primary	2002-06-01	1
905	michigan primary	2003-02-01	1
915	michigan primary	2003-03-01	1
926	michigan primary	2003-12-01	1
936	michigan primary	2004-12-01	1
946	michigan primary	2005-05-01	1
956	michigan primary	2006-08-01	1
966	michigan primary	2006-11-01	1
976	michigan primary	2007-04-01	1
986	michigan primary	2007-07-01	1
997	michigan primary	2007-09-01	1
1006	michigan primary	2007-11-01	1
1017	michigan primary	2007-12-01	1
1026	michigan primary	2008-01-01	1
1036	michigan primary	2008-02-01	1
1046	michigan primary	2008-11-01	1
1056	michigan primary	2008-12-01	1
1066	michigan primary	2009-03-01	1
1076	michigan primary	2009-04-01	1

107 rows × 3 columns

In [37]:

class GanttChart(object):
    '''
    Gantt-style chart of the terms that score highest in each time step
    relative to the time steps immediately preceding it.

    Note: this cell redefines ``GanttChart`` and therefore shadows the earlier
    definition in this notebook when the cells are run in order.
    '''
    def __init__(self,
                 corpus,
                 category_to_datetime_func,
                 is_gap_between_sequences_func,
                 timesteps_to_lag = 4,
                 num_top_terms_each_timestep = 10,
                 num_terms_to_include = 40,
                 term_ranker = st.AbsoluteFrequencyRanker,
                 term_scorer = st.RankDifference,
                 starting_time_step = None):
        '''
        Parameters
        ----------
        corpus : corpus whose categories are the time steps; categories are
            compared and sorted as plain values.
        category_to_datetime_func : callable mapping a category to the time
            value plotted on the x axis.
        is_gap_between_sequences_func : callable, called as
            f(previous_timestep, current_timestep); a true result ends the
            current bar and starts a new one.
        timesteps_to_lag : int, number of preceding categories each time step
            is scored against.
        num_top_terms_each_timestep : int, best-scoring terms recorded per
            time step.
        num_terms_to_include : int, number of terms kept in the chart.
        term_ranker : class instantiated with the corpus; its get_ranks()
            frame supplies the '<category> freq' columns.
        term_scorer : scorer class or instance exposing
            get_scores(lagged_counts, current_counts).
        starting_time_step : first category to score.  When None, the
            category at sorted position ``timesteps_to_lag + 1`` is used
            (clamped to the last category).
        '''
        self.corpus = corpus
        self.timesteps_to_lag = timesteps_to_lag
        self.num_top_terms_each_timestep = num_top_terms_each_timestep
        self.num_terms_to_include = num_terms_to_include
        self.is_gap_between_sequences_func = is_gap_between_sequences_func
        self.category_to_datetime_func = category_to_datetime_func
        # The original never stored these two although make_chart reads
        # self.term_ranker, so make_chart failed with AttributeError.
        self.term_ranker = term_ranker
        self.term_scorer = term_scorer
        self.starting_time_step = starting_time_step

    def _find_sequences(self, time_steps):
        '''
        Collapse time steps into uninterrupted runs.

        Returns a list of [first, last] pairs, one per run of sorted time
        steps not separated by a gap.  An empty input yields an empty list
        (the original referenced an unbound loop variable in that case).
        '''
        runs = []
        run_open = False
        run_start = None
        last_timestep = None
        for cur_timestep in sorted(time_steps):
            if not run_open:
                run_start = cur_timestep
                run_open = True
            elif self.is_gap_between_sequences_func(last_timestep, cur_timestep):
                runs.append([run_start, last_timestep])
                run_start = cur_timestep
            last_timestep = cur_timestep
        if run_open:
            runs.append([run_start, last_timestep])
        return runs

    def make_chart(self):
        '''
        Score each category against its preceding categories, keep the most
        persistent top terms and draw one bar per uninterrupted run.

        Returns an altair Chart with 'start'/'end' on x and 'term' on y.
        '''
        # Everything below reads instance state; the original read notebook
        # globals (corpus, chart_start_year, timesteps_to_lag, ...) and called
        # an undefined module-level find_sequences.
        tdf = self.term_ranker(self.corpus).get_ranks()
        categories = sorted(self.corpus.get_categories())
        start = self.starting_time_step
        if start is None and categories:
            start = categories[min(self.timesteps_to_lag + 1, len(categories) - 1)]
        # The default term_scorer is a class; accept either a class or an instance.
        scorer = self.term_scorer() if isinstance(self.term_scorer, type) else self.term_scorer

        data = []
        for cat in categories:
            if cat < start:
                continue
            # Columns are named '<category> freq'; those sorting before the
            # bare category name belong to earlier time steps.
            lagged_columns = sorted(
                col for col in tdf.columns if col < cat
            )[-self.timesteps_to_lag:]
            scores = scorer.get_scores(
                tdf[lagged_columns].sum(axis=1),
                tdf[cat + ' freq'].astype(int))
            top_terms = tdf.index[np.argsort(-scores)[:self.num_top_terms_each_timestep]]
            time_step = self.category_to_datetime_func(cat)
            for term in top_terms:
                data.append({'time': time_step,
                             'term': term,
                             'top': 1})

        term_time_df = pd.DataFrame(data)
        # Keep the terms that were a top term in the most time steps.
        terms_to_include = (term_time_df
                            .groupby('term')
                            ['top']
                            .sum()
                            .sort_values(ascending=False)
                            .iloc[:self.num_terms_to_include].index)
        task_df = (term_time_df[term_time_df
                                .term.isin(terms_to_include)][['time', 'term']]
                   .groupby('term')
                       .apply(lambda x: pd.Series(self._find_sequences(x['time'])))
                       .reset_index()
                       .rename({0:'sequence'}, axis=1)
                   .reset_index()
                   .assign(start=lambda x: x['sequence'].apply(lambda x: x[0]))
                   .assign(end=lambda x: x['sequence'].apply(lambda x: x[1]))
                   [['term', 'start', 'end']])
        chart = alt.Chart(task_df).mark_bar().encode(
            x = 'start',
            x2 = 'end',
            y = 'term',
        )
        return chart

  File "<ipython-input-37-57bf62f7605a>", line 6
    timesteps_to_lag = 4,
                   ^
SyntaxError: invalid syntax

In [20]:

# NOTE(review): `generate_diachronic_chart` is not defined in any cell visible
# in this notebook, and the recorded output below shows the call stopping
# inside a pdb session — presumably an earlier, function-based draft of the
# chart code; confirm before re-running from a fresh kernel.
chart = generate_diachronic_chart(category_to_datetime, 
                          timesteps_to_lag, 
                          num_top_terms_each_timestep, 
                          num_terms_to_include, 
                          is_gap_between_sequences,
                          corpus)

> <ipython-input-18-1571ba5eb391>(54)generate_diachronic_chart()
-> chart = alt.Chart(task_df).mark_bar().encode(
(Pdb) c

In [22]:

# Display the chart object assigned in the previous cell.
chart

In [ ]: