This notebook makes heavy use of the library Scattertext (https://github.com/JasonKessler/scattertext) for language processing and visualizations.
The data used were scraped from Facebook by Max Woolf. Please see his original notebook at https://github.com/minimaxir/clickbait-cluster.
import pandas as pd
import numpy as np
import scattertext as st
import sys
import datetime
import nltk
import re
import scipy
import spacy
import altair as alt
from glob import glob
from scipy.stats import rankdata
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt
%matplotlib inline
# Load the English spaCy pipeline used to parse every headline.
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases -- confirm against the installed version before re-running.
nlp = spacy.load('en')

# Read every tab-separated dump in ./fb_headlines/ into one frame. The
# publication name is the part of the file name before the first underscore.
df = pd.concat([pd.read_csv(fn, sep='\t')
                .assign(publication=fn.split('/')[-1].split('_')[0])
                for fn in glob('./fb_headlines/*')]).reset_index()
df['status_published'] = pd.to_datetime(df.status_published)

# Keep only posts that actually have a headline to parse.
df = df.dropna(subset=['link_name'])
df['parse'] = df.link_name.apply(nlp)

# Time buckets: 'YYYY-MM' strings (zero-padded month, so they sort
# chronologically as plain strings) and the bare year.
df['year_month'] = df.status_published.apply(lambda x: '%s-%s' % (x.year, str(x.month).zfill(2)))
df['year'] = df.status_published.apply(lambda x: x.year)

# Rank each post's reaction count within its (publication, year) group and
# scale to (0, 1]. `transform` keeps the result aligned to df's own index;
# a transform-like `groupby.apply` would have the group keys prepended to
# its index on current pandas and could not be assigned back as a column.
df['reaction_percentile'] = (
    df.groupby(['publication', 'year'])['num_reactions']
    .transform(lambda x: pd.Series(scipy.stats.rankdata(x) / len(x),
                                   index=x.index))
)

# Split the percentile into terciles.
df['reaction_bin'] = (df.reaction_percentile
                      .apply(lambda x: 'Hi' if x > 2./3 else 'Lo' if x < 1./3 else 'Mid'))

# One scattertext category per 'YYYY-MM' bucket, with phrase features taken
# from the stored spaCy parses.
year_month_corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='year_month',
    parsed_col='parse',
    feats_from_spacy_doc=st.PhraseMachinePhrases()
).build()

# Drop terms that occur fewer than three times.
year_month_corpus_compact = year_month_corpus.compact(st.CompactTerms(minimum_term_count=3))
import numpy as np
import pandas as pd
from scattertext.termranking import AbsoluteFrequencyRanker
from scattertext.termscoring.RankDifference import RankDifference
class GanttChart(object):
    '''
    Builds a Gantt-style chart of the time spans during which terms rank
    among the top-scoring terms of a time-stepped corpus.

    Note: the Gantt charts listed here are inspired by
    Dustin Arendt and Svitlana Volkova. ESTEEM: A Novel Framework for Qualitatively Evaluating and
    Visualizing Spatiotemporal Embeddings in Social Media. ACL System Demonstrations. 2017.
    http://www.aclweb.org/anthology/P/P17/P17-4005.pdf
    '''

    def __init__(self,
                 term_doc_matrix,
                 category_to_timestep_func,
                 is_gap_between_sequences_func,
                 timesteps_to_lag=4,
                 num_top_terms_each_timestep=10,
                 num_terms_to_include=40,
                 starting_time_step=None,
                 term_ranker=None,
                 term_scorer=None):
        '''
        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
            Corpus whose categories are the time steps; categories are
            ordered by plain sorting.
        category_to_timestep_func : lambda
            Maps a category name to the value plotted on the time axis.
        is_gap_between_sequences_func : lambda
            Called as f(previous_timestep, current_timestep); a truthy
            result ends the current run and starts a new one.
        timesteps_to_lag : int
            Number of preceding categories summed to form the background
            counts each category is scored against.
        num_top_terms_each_timestep : int
            Number of top-scoring terms kept per category.
        num_terms_to_include : int
            Number of terms (by how often they were a top term) charted.
        starting_time_step : object
            First category to score. Defaults to the category at sorted
            position timesteps_to_lag + 1, clamped to the last category.
        term_ranker : TermRanker
            Called with the corpus; its get_ranks() supplies the counts.
            Defaults to AbsoluteFrequencyRanker, resolved at call time.
        term_scorer : TermScorer
            Object exposing get_scores(background, foreground). Defaults
            to a new RankDifference() per chart, resolved at call time.

        Raises
        ------
        Exception
            If the corpus has no more than timesteps_to_lag categories.
        '''
        self.corpus = term_doc_matrix
        self.timesteps_to_lag = timesteps_to_lag
        self.num_top_terms_each_timestep = num_top_terms_each_timestep
        self.num_terms_to_include = num_terms_to_include
        self.is_gap_between_sequences_func = is_gap_between_sequences_func
        self.category_to_timestep_func = category_to_timestep_func
        # Defaults are resolved here rather than in the signature so that no
        # scorer instance is created at class-definition time and shared.
        self.term_ranker = AbsoluteFrequencyRanker if term_ranker is None else term_ranker
        self.term_scorer = RankDifference() if term_scorer is None else term_scorer
        categories = list(sorted(self.corpus.get_categories()))
        if len(categories) <= timesteps_to_lag:
            raise Exception("The number of categories in the term doc matrix is <= "
                            + str(timesteps_to_lag))
        if starting_time_step is None:
            # The guard above only guarantees timesteps_to_lag + 1 categories,
            # so clamp the index to avoid running off the end of the list.
            starting_time_step = categories[min(timesteps_to_lag + 1,
                                                len(categories) - 1)]
        self.starting_time_step = starting_time_step

    def make_chart(self):
        '''
        Returns
        -------
        altair.Chart
            Bar chart with one bar per (term, start, end) row of
            get_task_df().
        '''
        task_df = self.get_task_df()
        import altair as alt
        num_ticks = len(set(task_df.start) | set(task_df.end))
        chart = alt.Chart(
            task_df,
            axisconfig=alt.AxisConfig(grid=True, ticks=num_ticks)
        ).mark_bar().encode(
            x='start',
            x2='end',
            y='term',
        )
        return chart

    def _get_term_time_df(self):
        '''
        Scores every category from starting_time_step onward against the sum
        of the timesteps_to_lag categories before it.

        Returns
        -------
        pd.DataFrame
            One row per (category, top term) with columns 'time', 'term' and
            'top' (always 1).
        '''
        data = []
        tdf = self.term_ranker(self.corpus).get_ranks()
        sorted_columns = sorted(tdf.columns)
        for cat in sorted(self.corpus.get_categories()):
            if cat < self.starting_time_step:
                continue
            # Columns are '<category> freq'; any column sorting below `cat`
            # belongs to an earlier category. Keep the most recent few.
            negative_categories = [x for x in sorted_columns
                                   if x < cat][-self.timesteps_to_lag:]
            scores = self.term_scorer.get_scores(
                tdf[negative_categories].sum(axis=1),
                tdf[cat + ' freq'].astype(int)
            )
            for term in tdf.index[np.argsort(-scores)[:self.num_top_terms_each_timestep]]:
                data.append({'time': self.category_to_timestep_func(cat),
                             'term': term,
                             'top': 1})
        if not data:
            # Keep the expected columns so callers can still group/filter.
            return pd.DataFrame(columns=['time', 'term', 'top'])
        return pd.DataFrame(data)

    def get_task_df(self):
        '''
        Returns
        -------
        pd.DataFrame
            Columns 'term', 'start', 'end'; one row per uninterrupted run of
            time steps in which an included term was a top term.
        '''
        term_time_df = self._get_term_time_df()
        terms_to_include = (
            term_time_df
            .groupby('term')['top']
            .sum()
            .sort_values(ascending=False)
            .iloc[:self.num_terms_to_include].index
        )
        included = term_time_df[term_time_df.term.isin(terms_to_include)]
        # Build rows explicitly: the shape of groupby.apply's result depends
        # on whether every term happens to have the same number of runs.
        rows = []
        for term, time_steps in included.groupby('term')['time']:
            for start, end in self._find_sequences(time_steps):
                rows.append({'term': term, 'start': start, 'end': end})
        return pd.DataFrame(rows, columns=['term', 'start', 'end'])

    def _find_sequences(self, time_steps):
        '''
        Parameters
        ----------
        time_steps : iterable
            Sortable time steps, in any order.

        Returns
        -------
        list
            [start, end] pairs, one per run of consecutive time steps; a run
            ends wherever is_gap_between_sequences_func(previous, current)
            is truthy. Empty input yields an empty list.
        '''
        sequences = []
        sequence_start = None
        last_timestep = None
        for cur_timestep in sorted(time_steps):
            if sequence_start is None:
                sequence_start = cur_timestep
            elif self.is_gap_between_sequences_func(last_timestep, cur_timestep):
                sequences.append([sequence_start, last_timestep])
                sequence_start = cur_timestep
            last_timestep = cur_timestep
        if sequence_start is not None:
            sequences.append([sequence_start, last_timestep])
        return sequences
# Earliest 'YYYY-MM' category that gets scored and charted.
chart_start_year = '2015-01'


def category_to_datetime(category):
    """Convert a dash-separated category such as '2015-01' into a Timestamp.

    The dash-separated integers are passed positionally to
    datetime.datetime with a trailing 1 appended, so a 'YYYY-MM' category
    becomes the first day of that month.
    """
    date_parts = [int(piece) for piece in category.split('-')]
    date_parts.append(1)
    return pd.to_datetime(datetime.datetime(*date_parts))


# Chart parameters.
timesteps_to_lag = 4
num_top_terms_each_timestep = 10
num_terms_to_include = 40


def is_gap_between_sequences(t1, t2):
    """Return whether t1 and t2 are more than 31 days apart, in either order."""
    elapsed_days = (t2 - t1).days
    return np.abs(elapsed_days) > 31
# Alias the uncompacted year-month corpus under a shorter name.
corpus = year_month_corpus
# Build the chart helper on the compacted corpus, scoring from chart_start_year onward.
gc = GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year)
# One row per (time step, top term).
ttd = gc._get_term_time_df()
# Inspect the uninterrupted runs in which 'north korea' was a top term.
gc._find_sequences(ttd[ttd['term'] == 'north korea'].time)
[[Timestamp('2015-02-01 00:00:00'), Timestamp('2015-04-01 00:00:00')], [Timestamp('2016-06-01 00:00:00'), Timestamp('2016-07-01 00:00:00')]]
# Time steps at which 'high school' was among the top terms.
list(ttd.loc[ttd['term'] == 'high school', 'time'])
[Timestamp('2015-07-01 00:00:00'), Timestamp('2016-07-01 00:00:00')]
# Count the distinct start/end dates across all bars. The task table is built
# once and reused instead of constructing two identical charts and running
# the whole scoring pipeline twice.
_tick_task_df = GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year).get_task_df()
len(set(_tick_task_df.start) | set(_tick_task_df.end))
20
# Render the Gantt chart for the compacted corpus, scoring from chart_start_year onward.
GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year).make_chart()
# Show the (term, start, end) table that backs the chart.
GanttChart(year_month_corpus_compact, category_to_datetime, is_gap_between_sequences, starting_time_step=chart_start_year).get_task_df()
term | start | end | |
---|---|---|---|
0 | boston marathon | 2013-05-01 | 2013-08-01 |
1 | bucket challenge | 2014-09-01 | 2014-12-01 |
2 | buzzfeed news | 2014-01-01 | 2015-10-01 |
3 | buzzfeed video | 2015-02-01 | 2015-07-01 |
4 | charlie hebdo | 2015-02-01 | 2015-05-01 |
5 | climate change | 2013-01-01 | 2016-02-01 |
6 | cover photo | 2013-01-01 | 2013-06-01 |
7 | fashion week | 2013-10-01 | 2015-01-01 |
8 | fox news | 2013-03-01 | 2013-09-01 |
9 | game of thrones | 2014-07-01 | 2016-08-01 |
10 | gay marriage | 2013-06-01 | 2014-07-01 |
11 | golden globes | 2015-02-01 | 2015-05-01 |
12 | gun control | 2013-03-01 | 2013-08-01 |
13 | high school | 2013-08-01 | 2016-07-01 |
14 | hurricane sandy | 2013-01-01 | 2013-12-01 |
15 | ice bucket | 2014-09-01 | 2014-12-01 |
16 | ice bucket challenge | 2014-09-01 | 2014-12-01 |
17 | live blog | 2014-08-01 | 2014-10-01 |
18 | miley cyrus | 2013-11-01 | 2014-02-01 |
19 | minimum wage | 2014-07-01 | 2015-06-01 |
20 | new hampshire | 2016-03-01 | 2016-06-01 |
21 | new york city | 2014-02-01 | 2015-05-01 |
22 | new york times | 2013-01-01 | 2014-03-01 |
23 | new york today | 2014-10-01 | 2015-06-01 |
24 | north korea | 2013-05-01 | 2016-07-01 |
25 | paris attacks | 2015-12-01 | 2016-03-01 |
26 | photos from cnn | 2015-06-01 | 2016-08-01 |
27 | pope francis | 2015-10-01 | 2016-01-01 |
28 | robin williams | 2014-09-01 | 2014-12-01 |
29 | san bernardino | 2016-01-01 | 2016-04-01 |
30 | sandra bland | 2015-08-01 | 2015-11-01 |
31 | sex marriage | 2013-04-01 | 2015-09-01 |
32 | state of the union | 2013-04-01 | 2016-05-01 |
33 | super bowl | 2013-03-01 | 2016-06-01 |
34 | supreme court | 2013-04-01 | 2016-07-01 |
35 | syrian refugees | 2016-01-01 | 2016-03-01 |
36 | taylor swift | 2015-03-01 | 2015-10-01 |
37 | thanksgiving recipe | 2014-12-01 | 2015-03-01 |
38 | united states | 2013-12-01 | 2014-09-01 |
39 | world cup | 2014-08-01 | 2015-10-01 |
# Per-category term counts for the compacted corpus.
tdf = AbsoluteFrequencyRanker(year_month_corpus_compact).get_ranks()
# Counts in the '1913-02' category, most frequent first.
tdf['1913-02 freq'].sort_values(ascending=False)
term times square 1 | upworthy 0 mom reacts 0 street art 0 mel gibson 0 cbs news 0 good use 0 joins twitter 0 oil spill 0 tech support 0 bill nye on 0 angry mom 0 hail storm 0 kung fu 0 insane clown posse 0 kids reenact 0 olivia munn 0 goldman sachs 0 earth day 0 factory workers 0 human trafficking 0 tiger woods 0 warning signs 0 black parents 0 law degree 0 pie charts 0 gay men 0 action figure 0 10 songs 0 viral post 0 .. day in court 0 political talk 0 back pages 0 salmon roasted in butter recipe 0 home to roost 0 episode 3 recap 0 carroll gardens 0 u.n.c. football player 0 medical bills 0 sells out 0 ukrainian military 0 podcast | episode 0 state dinner guest 0 democratic debate takeaways 0 tough path 0 hangs over 0 primary night 0 vietnamese steak with cucumber salad recipe 0 new york times fashion on instagram 0 rocket company 0 blue origin 0 contested republican convention 0 computer program 0 bernie sanders wins michigan 0 michigan primary 0 right way to stretch before exercise 0 episode 7 recap 0 star candidate 0 oregon occupier 0 five years 0 Name: 1913-02 freq, Length: 23551, dtype: int64
# NOTE(review): `term_time_df` is not assigned at module level anywhere in the
# visible notebook -- presumably left over from an earlier cell; confirm.
# Number of time steps at which each term was a top term, largest first.
term_time_df.groupby('term').apply(lambda x: len(list(x['time']))).sort_values(ascending=False)
# All rows for the term 'michigan primary'.
term_time_df[term_time_df.term == 'michigan primary']
term | time | top | |
---|---|---|---|
8 | michigan primary | 1918-06-01 | 1 |
17 | michigan primary | 1919-06-01 | 1 |
26 | michigan primary | 1926-02-01 | 1 |
36 | michigan primary | 1926-05-01 | 1 |
46 | michigan primary | 1928-11-01 | 1 |
56 | michigan primary | 1933-04-01 | 1 |
65 | michigan primary | 1933-12-01 | 1 |
76 | michigan primary | 1934-01-01 | 1 |
86 | michigan primary | 1934-02-01 | 1 |
96 | michigan primary | 1940-04-01 | 1 |
105 | michigan primary | 1942-02-01 | 1 |
115 | michigan primary | 1942-08-01 | 1 |
125 | michigan primary | 1946-11-01 | 1 |
135 | michigan primary | 1947-04-01 | 1 |
146 | michigan primary | 1947-08-01 | 1 |
156 | michigan primary | 1948-01-01 | 1 |
166 | michigan primary | 1951-01-01 | 1 |
176 | michigan primary | 1954-01-01 | 1 |
185 | michigan primary | 1955-03-01 | 1 |
195 | michigan primary | 1958-01-01 | 1 |
206 | michigan primary | 1959-01-01 | 1 |
216 | michigan primary | 1960-09-01 | 1 |
226 | michigan primary | 1962-03-01 | 1 |
236 | michigan primary | 1963-08-01 | 1 |
245 | michigan primary | 1963-11-01 | 1 |
256 | michigan primary | 1964-01-01 | 1 |
266 | michigan primary | 1964-03-01 | 1 |
278 | michigan primary | 1965-11-01 | 1 |
288 | michigan primary | 1966-01-01 | 1 |
299 | michigan primary | 1967-02-01 | 1 |
... | ... | ... | ... |
789 | michigan primary | 1997-10-01 | 1 |
796 | michigan primary | 1998-04-01 | 1 |
806 | michigan primary | 1999-05-01 | 1 |
816 | michigan primary | 1999-12-01 | 1 |
825 | michigan primary | 2000-10-01 | 1 |
836 | michigan primary | 2001-01-01 | 1 |
848 | michigan primary | 2001-04-01 | 1 |
856 | michigan primary | 2001-09-01 | 1 |
869 | michigan primary | 2001-10-01 | 1 |
874 | michigan primary | 2002-04-01 | 1 |
885 | michigan primary | 2002-05-01 | 1 |
895 | michigan primary | 2002-06-01 | 1 |
905 | michigan primary | 2003-02-01 | 1 |
915 | michigan primary | 2003-03-01 | 1 |
926 | michigan primary | 2003-12-01 | 1 |
936 | michigan primary | 2004-12-01 | 1 |
946 | michigan primary | 2005-05-01 | 1 |
956 | michigan primary | 2006-08-01 | 1 |
966 | michigan primary | 2006-11-01 | 1 |
976 | michigan primary | 2007-04-01 | 1 |
986 | michigan primary | 2007-07-01 | 1 |
997 | michigan primary | 2007-09-01 | 1 |
1006 | michigan primary | 2007-11-01 | 1 |
1017 | michigan primary | 2007-12-01 | 1 |
1026 | michigan primary | 2008-01-01 | 1 |
1036 | michigan primary | 2008-02-01 | 1 |
1046 | michigan primary | 2008-11-01 | 1 |
1056 | michigan primary | 2008-12-01 | 1 |
1066 | michigan primary | 2009-03-01 | 1 |
1076 | michigan primary | 2009-04-01 | 1 |
107 rows × 3 columns
class GanttChart(object):
    '''
    Charts the time spans during which terms rank among the top-scoring
    terms of a corpus whose categories are sortable time-step labels.
    All configuration is read from the instance, not from notebook globals.
    '''

    def __init__(self,
                 corpus,
                 category_to_datetime_func,
                 is_gap_between_sequences_func,
                 timesteps_to_lag=4,
                 num_top_terms_each_timestep=10,
                 num_terms_to_include=40,
                 term_ranker=None,
                 term_scorer=None,
                 starting_time_step=None):
        '''
        Parameters
        ----------
        corpus : scattertext corpus
            Categories are the time steps, ordered by plain sorting.
        category_to_datetime_func : callable
            Maps a category name to the value plotted on the time axis.
        is_gap_between_sequences_func : callable
            Called as f(previous_timestep, current_timestep); a truthy
            result ends the current run and starts a new one.
        timesteps_to_lag : int
            Number of preceding categories summed as the background.
        num_top_terms_each_timestep : int
            Number of top-scoring terms kept per category.
        num_terms_to_include : int
            Number of terms (by how often they were a top term) charted.
        term_ranker : class, optional
            Called with the corpus; its get_ranks() supplies the counts.
            Defaults to st.AbsoluteFrequencyRanker, resolved at call time.
        term_scorer : class or instance, optional
            Scorer exposing get_scores(background, foreground); a class is
            instantiated with no arguments. Defaults to st.RankDifference.
        starting_time_step : object, optional
            Categories sorting below this value are not scored. None (the
            default) scores every category.
        '''
        self.corpus = corpus
        self.timesteps_to_lag = timesteps_to_lag
        self.num_top_terms_each_timestep = num_top_terms_each_timestep
        self.num_terms_to_include = num_terms_to_include
        self.is_gap_between_sequences_func = is_gap_between_sequences_func
        self.category_to_datetime_func = category_to_datetime_func
        self.term_ranker = st.AbsoluteFrequencyRanker if term_ranker is None else term_ranker
        self.term_scorer = st.RankDifference if term_scorer is None else term_scorer
        self.starting_time_step = starting_time_step

    def _find_sequences(self, time_steps):
        '''
        Returns a list of [start, end] pairs, one per run of time steps not
        separated by a gap. Empty input yields an empty list.
        '''
        sequences = []
        sequence_start = None
        last_timestep = None
        for cur_timestep in sorted(time_steps):
            if sequence_start is None:
                sequence_start = cur_timestep
            elif self.is_gap_between_sequences_func(last_timestep, cur_timestep):
                sequences.append([sequence_start, last_timestep])
                sequence_start = cur_timestep
            last_timestep = cur_timestep
        if sequence_start is not None:
            sequences.append([sequence_start, last_timestep])
        return sequences

    def _get_term_time_df(self):
        '''
        Returns a DataFrame with one row per (category, top term) and the
        columns 'time', 'term' and 'top' (always 1).
        '''
        # Accept either a scorer class or an already-built scorer.
        scorer = self.term_scorer() if isinstance(self.term_scorer, type) else self.term_scorer
        tdf = self.term_ranker(self.corpus).get_ranks()
        sorted_columns = sorted(tdf.columns)
        data = []
        for cat in sorted(self.corpus.get_categories()):
            if self.starting_time_step is not None and cat < self.starting_time_step:
                continue
            # Ranker columns are named '<category> freq'; columns sorting
            # below `cat` belong to earlier categories.
            background_columns = [x for x in sorted_columns
                                  if x < cat][-self.timesteps_to_lag:]
            scores = scorer.get_scores(
                tdf[background_columns].sum(axis=1),
                tdf[cat + ' freq'].astype(int))
            for term in tdf.index[np.argsort(-scores)[:self.num_top_terms_each_timestep]]:
                data.append({'time': self.category_to_datetime_func(cat),
                             'term': term,
                             'top': 1})
        if not data:
            return pd.DataFrame(columns=['time', 'term', 'top'])
        return pd.DataFrame(data)

    def get_task_df(self):
        '''
        Returns a DataFrame with columns 'term', 'start', 'end': one row per
        uninterrupted run in which an included term was a top term.
        '''
        term_time_df = self._get_term_time_df()
        terms_to_include = (term_time_df
                            .groupby('term')['top']
                            .sum()
                            .sort_values(ascending=False)
                            .iloc[:self.num_terms_to_include].index)
        included = term_time_df[term_time_df.term.isin(terms_to_include)]
        rows = []
        for term, time_steps in included.groupby('term')['time']:
            for start, end in self._find_sequences(time_steps):
                rows.append({'term': term, 'start': start, 'end': end})
        return pd.DataFrame(rows, columns=['term', 'start', 'end'])

    def make_chart(self):
        '''
        Returns an altair bar chart with one bar per row of get_task_df().
        '''
        task_df = self.get_task_df()
        chart = alt.Chart(task_df).mark_bar().encode(
            x='start',
            x2='end',
            y='term',
        )
        return chart
File "<ipython-input-37-57bf62f7605a>", line 6 timesteps_to_lag = 4, ^ SyntaxError: invalid syntax
# NOTE(review): `generate_diachronic_chart` is not defined anywhere in the
# visible notebook -- presumably an earlier function-based version of the
# chart code kept alive in the kernel; confirm before re-running this cell.
chart = generate_diachronic_chart(category_to_datetime,
timesteps_to_lag,
num_top_terms_each_timestep,
num_terms_to_include,
is_gap_between_sequences,
corpus)
> <ipython-input-18-1571ba5eb391>(54)generate_diachronic_chart() -> chart = alt.Chart(task_df).mark_bar().encode( (Pdb) c
# Display the chart as the cell's output.
chart