Exploring your TroveHarvester data¶

Under construction

In [ ]:

import os
import pandas as pd # makes manipulating the data easier
# import plotly.offline as py # for charts
# import plotly.graph_objs as go
import altair as alt
import wordcloud

# py.init_notebook_mode() # initialise plotly
# Switch Altair to its 'notebook' renderer so the charts built in later cells are drawn in the notebook
# NOTE(review): presumably this targets the classic Jupyter Notebook front end — confirm for JupyterLab
alt.renderers.enable('notebook')

# Make sure data directory exists
# os.makedirs('../../data/TroveHarvester', exist_ok=True)

In [ ]:

def get_latest_harvest(data_dir='data'):
    '''
    Get the timestamp of the most recent harvest.

    Each harvest is expected to live in its own sub-directory of `data_dir`.
    Plain files are ignored, and the sub-directory name that sorts last
    (string comparison) is returned.

    Parameters:
        data_dir -- directory containing the harvest sub-directories
                    (defaults to 'data', relative to the working directory)

    Returns the name of the last-sorting sub-directory as a string.

    Raises:
        FileNotFoundError -- if `data_dir` does not exist (from os.listdir)
        IndexError -- if `data_dir` contains no sub-directories
    '''
    harvests = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    if not harvests:
        # Keep the exception type an empty list lookup would raise, but say what went wrong
        raise IndexError('No harvest directories found in {}'.format(data_dir))
    # Only the last-sorting name is needed, so there is no reason to sort the whole list
    return max(harvests)

In [ ]:

def open_harvest_data(timestamp=None):
    '''
    Load the results of a harvest into a DataFrame.

    If no timestamp is given (or it is empty), the most recent harvest is used.
    The timestamp in use is printed, then data/<timestamp>/results.csv is read
    with the 'date' column parsed as dates.

    Returns a DataFrame.
    '''
    harvest = timestamp or get_latest_harvest()
    print(harvest)
    csv_path = os.path.join('data', harvest, 'results.csv')
    return pd.read_csv(csv_path, parse_dates=['date'])

In [ ]:

# No timestamp is passed, so this loads the most recent harvest; the cells below all work from `df`
df = open_harvest_data()

Show the most common newspapers¶

In [ ]:

# Bar chart of the newspapers with the most articles in the harvest.
# The transforms count the articles per newspaper, rank the newspapers by that count,
# and keep only those ranked 25 or better.
alt.Chart(df).mark_bar().encode(
    x=alt.X('count:Q', title='Number of articles'),
    # Order the bars by article count, largest first
    y=alt.Y('newspaper_title:N', title='Newspaper', sort=alt.EncodingSortField(field='count', order='descending', op='sum')),
    tooltip=[alt.Tooltip('newspaper_title:N', title='Newspaper'), alt.Tooltip('count:Q', title='Articles')]
).transform_aggregate(
    # Collapse to one row per newspaper, with the number of articles stored as 'count'
    count='count()',
    groupby=['newspaper_title']
).transform_window(
    # Rank the newspapers from most to fewest articles
    window=[{'op': 'rank', 'as': 'rank'}],
    sort=[{'field': 'count', 'order': 'descending'}]
).transform_filter('datum.rank <= 25')

Show when the articles were published¶

In [ ]:

# Line chart of the number of articles published in each year
alt.Chart(df).mark_line().encode(
    # Group the publication dates by year
    x='year(date):T',
    # Number of articles falling in each year
    y='count()',
    tooltip=[alt.Tooltip('year(date):T', title='Year'), alt.Tooltip('count()', title='Articles')]
).properties(width=600)

Find the longest article¶

In [ ]:

# Select every row whose word count equals the largest word count in the harvest
# (if several articles tie for longest, all of them are shown)
df.loc[df['words'].eq(df['words'].max())]

In [ ]:

# Show the articles whose title contains 'protest' in any letter case; rows without a title are left out
df[df['title'].str.contains('protest', case=False, na=False)]

Make a simple word cloud¶

In [ ]:

# Leave out the records whose title is exactly one of the generic placeholders
df_titles = df[~df['title'].isin(['No Title', 'Advertising'])]
# Lower-case the remaining titles and join them into a single space-separated string,
# then strip any placeholder phrases that still occur inside longer titles
title_text = (
    df_titles['title']
    .str.lower()
    .str.cat(sep=' ')
    .replace('advertising', '')
    .replace('no title', '')
)

In [ ]:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate a word cloud image from the combined title text.
# The result is bound to `title_cloud` rather than `wordcloud` so that it does not
# overwrite the `wordcloud` module imported at the top of the notebook.
title_cloud = WordCloud(width=1200, height=800).generate(title_text)

# Display the generated image the matplotlib way
plt.figure(figsize=(12, 8))
plt.imshow(title_cloud, interpolation='bilinear')
plt.axis("off")
# Finish with show() so the cell displays only the figure, not the value returned by plt.axis()
plt.show()

Using TextBlob¶

In [ ]:

from operator import itemgetter

import nltk
from textblob import TextBlob

# Download the NLTK resources used below: the stopword lists and the 'punkt' data
# (presumably needed for TextBlob's tokenisation — confirm against the TextBlob docs)
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords, to be left out of the word counts
stopwords = nltk.corpus.stopwords.words('english')
# Wrap the combined title text so TextBlob can count its words
blob = TextBlob(title_text)

In [ ]:

# Build a set once so that each stopword check is a hash lookup rather than a scan of the whole list
stopword_set = set(stopwords)
# Pair each word in the titles with its frequency, leaving out the stopwords
word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopword_set]
# Keep the 25 most frequent words
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
# Show them as a table: counts (column 1) get thousands separators and an in-cell bar
pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})

Analyse text files¶

So far we've only looked at the metadata, but we can also explore the content of the individual text files.

Created by Tim Sherratt (@wragge) as part of the OzGLAM workbench.

If you think this project is worthwhile you can support it on Patreon.

In [ ]: