#!/usr/bin/env python
# coding: utf-8

# # Using Scattertext to Explore the Effectiveness of Headlines
# ### Jason S. Kessler ([@jasonkessler](http://www.twitter.com/JasonKessler))
#
# The code in this notebook shows how you can use the Python package Scattertext to explore how the language used in headlines correlates with social engagement.
#
# For background on the term-class association scores and semiotic squares used here, please see https://github.com/JasonKessler/PuPPyTalk and https://github.com/JasonKessler/SemioticSquaresTalk
#
# This notebook makes heavy use of the library Scattertext (https://github.com/JasonKessler/scattertext) for language processing and visualization.
#
# The data used here were scraped from Facebook by Max Woolf. Please see his original notebook at https://github.com/minimaxir/clickbait-cluster.

# In[2]:

import pandas as pd
import numpy as np
import sys
import umap
import spacy
import scattertext as st
from gensim.models import word2vec
import re
from glob import glob
from scipy.stats import rankdata
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML(""))
import matplotlib.pyplot as plt

# In[3]:

# You need a recent version of Scattertext to run this notebook
assert st.__version__ >= '0.0.2.20'

# In[4]:

nlp = spacy.load('en')

# ### Ingest and explore data showing headlines and their Facebook reaction counts

# In[5]:

df = pd.concat([pd.read_csv(fn, sep='\t')
                  .assign(publication=fn.split('/')[-1].split('_')[0])
                for fn in glob('./fb_headlines/*')]).reset_index()
df['status_published'] = pd.to_datetime(df.status_published)

# In[6]:

df.iloc[0]

# In[7]:

df.publication.value_counts()

# In[8]:

df.status_published.apply(lambda x: x.year).value_counts().sort_index(ascending=False).head()

# In[9]:

df.status_published.max()

# ### We'll restrict the data to headlines published in 2016

# In[10]:

df_2016 = df[df.status_published.apply(lambda x: x.year >= 2016)].drop_duplicates()
df_2016 = df_2016.loc[df_2016['link_name'].dropna().index]
df_2016.publication.value_counts()

# In[11]:

df_2016['parse'] = df_2016['link_name'].apply(nlp)

# In[12]:

# Restrict to headlines longer than two words
df_2016 = df_2016[df_2016['parse'].apply(len) > 2]

# ### Bin headlines by their publication-specific reaction percentile

# In[15]:

df_2016['reaction_percentile'] = df_2016.groupby('publication')['num_reactions'].apply(
    lambda x: pd.Series(rankdata(x) / len(x), index=x.index))
df_2016['reaction_bin'] = df_2016.reaction_percentile.apply(
    lambda x: 'Hi' if x > 2. / 3 else 'Lo' if x < 1. / 3 else 'Mid')

# In[13]:

reaction_corpus = (st.CorpusFromParsedDocuments(df_2016,
                                                parsed_col='parse',
                                                category_col='reaction_bin')
                   .build()
                   .compact(st.ClassPercentageCompactor(term_count=6))
                   .compact(st.CompactTerms(slack=3)))

# In[21]:

def get_metadata_from_corpus(corpus):
    df = corpus.get_df()
    return (df.page_id
            + ', ' + df.reaction_percentile.apply(lambda x: str(int(x * 100)) + '%')
            + ', ' + df.status_published.apply(lambda x: str(x.date())))

# ## Use the dense rank difference to identify highly engaging language
# * Headlines in the top 33% of their publication's reaction counts are labeled "Hi"
# * Those in the bottom 33% are labeled "Lo"; the rest are "Mid"
# * A minimal sketch of the score is shown below
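# The dense rank difference score itself isn't spelled out above, so here is a minimal,
# illustrative sketch of the idea on made-up term counts. It mirrors what st.RankDifference()
# is used for below (terms are dense-ranked by frequency within each class, the ranks are
# scaled to [0, 1], and the difference is taken), but it is a sketch of the idea rather than
# Scattertext's exact implementation, and the counts are hypothetical.

hi_counts = np.array([50, 10, 3, 40])   # made-up frequencies of four terms in "Hi" headlines
lo_counts = np.array([5, 12, 30, 40])   # made-up frequencies of the same terms in "Lo" headlines

def dense_rank_difference(cat_counts, not_cat_counts):
    # Dense-rank each term within its class, scale to [0, 1], and subtract:
    # scores near +1 are characteristic of "Hi", scores near -1 of "Lo".
    cat_ranks = rankdata(cat_counts, method='dense')
    not_cat_ranks = rankdata(not_cat_counts, method='dense')
    return cat_ranks / cat_ranks.max() - not_cat_ranks / not_cat_ranks.max()

dense_rank_difference(hi_counts, lo_counts)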
# In[15]:

html = st.produce_frequency_explorer(reaction_corpus,
                                     category='Hi',
                                     not_categories=['Lo'],
                                     neutral_categories=['Mid'],
                                     minimum_term_frequency=0,
                                     pmi_filter_thresold=0,
                                     use_full_doc=True,
                                     term_scorer=st.RankDifference(),
                                     grey_threshold=0,
                                     width_in_pixels=1000,
                                     metadata=get_metadata_from_corpus(reaction_corpus))
file_name = 'output/reaction_freq.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

# ## Semiotic Squares
# * Visualize the dataset in four quadrants
# * Y-axis: high engagement vs. low engagement
# * X-axis: New York Times vs. BuzzFeed (i.e., highbrow vs. lowbrow)

# In[16]:

# Eliminate other categories from the dataset (e.g., Upworthy or mid-engagement headlines)
df_2016['category'] = df_2016.publication + ' ' + df_2016.reaction_bin
df_2016_four_square = df_2016[df_2016.publication.isin(['BuzzFeed', 'NYTimes'])
                              & df_2016.reaction_bin.isin(['Hi', 'Lo'])]

# Create corpus and filter terms
four_square_corpus = (st.CorpusFromParsedDocuments(df_2016_four_square,
                                                   category_col='category',
                                                   parsed_col='parse')
                      .build()
                      .compact(st.CompactTerms(minimum_term_count=2, slack=5))
                      .compact(st.ClassPercentageCompactor(term_count=2)))
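# Optional sanity check, added for illustration (not part of the original analysis):
# confirm that each of the four publication-by-engagement quadrants feeding the
# semiotic square actually contains headlines.

pd.crosstab(df_2016_four_square.publication, df_2016_four_square.reaction_bin)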

# In[22]:

# Set up chart structure
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['NYTimes Hi'],
    category_b_list=['BuzzFeed Hi'],
    not_category_a_list=['BuzzFeed Lo'],
    not_category_b_list=['NYTimes Lo'],
    scorer=st.RankDifference(),
    labels={'a': 'Highbrow Engagement',
            'b': 'Lowbrow Engagement',
            'not_a_and_not_b': 'Few Facebook Reactions',
            'a_and_b': 'Many Facebook Reactions',
            'a_and_not_b': 'NYTimes',
            'b_and_not_a': 'BuzzFeed',
            'not_a': 'Lowbrow Ignored',
            'not_b': 'Highbrow Ignored'})

# In[23]:

html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='NYTimes-Buzz',
                                       y_label='Hi-Low',
                                       use_full_doc=True,
                                       pmi_threshold_coefficient=0,
                                       metadata=get_metadata_from_corpus(four_square_corpus))

# In[24]:

file_name = 'output/reaction_semiotic.html'
open(file_name, 'wb').write(
    '<h2>The Semiotics of Clickbait: Buzzfeed vs. The New York Times, High vs. Low Engagement</h2>'.encode('utf-8')
    + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)

# ## Publication-specific engagement
# * So far, the categories have corresponded to the corners of the chart.
# * Here, the categories instead correspond to the axes: each axis is its own high-vs.-low engagement comparison within a single publication (a toy sketch of this axis scoring follows the chart below).

# In[27]:

four_square_axes = st.FourSquareAxes(four_square_corpus,
                                     ['NYTimes Hi'],
                                     ['NYTimes Lo'],
                                     ['BuzzFeed Hi'],
                                     ['BuzzFeed Lo'],
                                     labels={'a': 'Appeals to all',
                                             'a_and_not_b': 'NY Times: ↑ Engagement',
                                             'b_and_not_a': 'NY Times: ↓ Engagement',
                                             'a_and_b': 'BuzzFeed: ↑ Engagement',
                                             'not_a_and_not_b': 'BuzzFeed: ↓ Engagement',
                                             'not_a': 'Ignored by all',
                                             'b': 'Ignored by elite, appeals to masses',
                                             'not_b': 'Appeals to elite, ignored by masses'})

html = st.produce_four_square_axes_explorer(
    four_square_axes=four_square_axes,
    x_label='NYT: Hi-Lo',
    y_label='Buzz: Hi-Lo',
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    metadata=get_metadata_from_corpus(four_square_corpus))

# In[28]:

file_name = 'output/reaction_semiotic_axes.html'
open(file_name, 'wb').write(
    '<h2>The Semiotics of Clickbait: Publication-Specific Engagement</h2>'.encode('utf-8')
    + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)
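# Rough, illustrative sketch of the axis scoring above (not the library's exact computation):
# each term gets an x coordinate from the NYTimes Hi-vs.-Lo comparison and a y coordinate from
# the BuzzFeed Hi-vs.-Lo comparison, reusing the dense_rank_difference helper sketched earlier.
# This assumes get_term_freq_df() names its columns '<category> freq'; adjust if your
# Scattertext version differs.

term_freq_df = four_square_corpus.get_term_freq_df()
term_coordinates = pd.DataFrame(
    {'nyt_hi_vs_lo': dense_rank_difference(term_freq_df['NYTimes Hi freq'].values,
                                           term_freq_df['NYTimes Lo freq'].values),
     'buzzfeed_hi_vs_lo': dense_rank_difference(term_freq_df['BuzzFeed Hi freq'].values,
                                                term_freq_df['BuzzFeed Lo freq'].values)},
    index=term_freq_df.index)
# Terms most characteristic of highly engaging NYTimes headlines
term_coordinates.sort_values('nyt_hi_vs_lo', ascending=False).head()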

# In[33]:

# View chart with multiple terms visible
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='NYTimes-Buzz',
                                       y_label='Hi-Low',
                                       use_full_doc=True,
                                       pmi_threshold_coefficient=0,
                                       metadata=get_metadata_from_corpus(four_square_corpus),
                                       censor_points=False)
file_name = 'output/reaction_semiotic_censor.html'
open(file_name, 'wb').write(
    '<h2>The Semiotics of Clickbait: Buzzfeed vs. The New York Times</h2>'.encode('utf-8')
    + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)
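# The title-then-write-then-embed pattern above is repeated for each chart. Purely as an
# optional convenience, it could be wrapped in a small helper like the hypothetical
# save_with_title() below; this helper is not part of the original notebook.

def save_with_title(html, title, file_name, width=1600, height=900):
    # Prepend an <h2> title to the generated chart HTML, write it out, and return an IFrame.
    open(file_name, 'wb').write(('<h2>' + title + '</h2>').encode('utf-8')
                                + html.encode('utf-8'))
    return IFrame(src=file_name, width=width, height=height)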

# ### Uniform Manifold Approximation and Projection (UMAP) visualization
# * Goal: plot terms so that terms which are semantically similar (as represented by word embeddings) appear close to one another
# * Uses cosine distance as the similarity metric
# * Requires very little tuning (relative to t-SNE)
# * A standalone sketch of the embed-then-project idea appears after the chart below
#
# Python package: https://github.com/lmcinnes/umap

# In[32]:

html = st.produce_projection_explorer(reaction_corpus,
                                      category='Hi',
                                      not_categories=['Lo'],
                                      neutral_categories=['Mid'],
                                      term_scorer=st.RankDifference(),
                                      neutral_category_name='Mid',
                                      width_in_pixels=1000,
                                      use_full_doc=True,
                                      projection_model=umap.UMAP(metric='cosine'),
                                      term_acceptance_re=re.compile(''),
                                      metadata=get_metadata_from_corpus(reaction_corpus))
file_name = 'output/reaction_umap_projection.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
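# A standalone, illustrative version of what the projection explorer does under the hood:
# embed terms, then project the embedding to two dimensions with UMAP using cosine distance.
# This is a sketch, not Scattertext's exact pipeline; the Word2Vec parameters below are
# arbitrary, and the `size` keyword assumes a gensim 3.x-style API (it is `vector_size`
# in gensim 4+).

sentences = [[token.lower_ for token in parse] for parse in df_2016['parse']]
w2v_model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, seed=0)

terms = list(w2v_model.wv.vocab)                    # gensim 3.x vocabulary access
vectors = w2v_model.wv[terms]
projection = umap.UMAP(metric='cosine', random_state=0).fit_transform(vectors)

plt.figure(figsize=(10, 8))
plt.scatter(projection[:, 0], projection[:, 1], s=2)
plt.title('UMAP projection of headline-term word2vec embeddings (illustrative)')
plt.show()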