import os
import pandas as pd # makes manipulating the data easier
# import plotly.offline as py # for charts
# import plotly.graph_objs as go
import altair as alt
import wordcloud
# py.init_notebook_mode() # initialise plotly
# Switch Altair to its 'notebook' renderer.
# NOTE(review): presumably needed so charts display inline in the classic
# Jupyter Notebook; other front ends may need a different renderer -- confirm.
alt.renderers.enable('notebook')
# Make sure data directory exists
# os.makedirs('../../data/TroveHarvester', exist_ok=True)
def get_latest_harvest(data_dir='data'):
    '''
    Get the timestamp of the most recent harvest.

    Each harvest is expected to live in its own sub-directory of `data_dir`.
    The sub-directory whose name sorts last is returned, so this assumes the
    directory names (timestamps) sort chronologically as strings.

    Parameters:
        data_dir: directory that contains the harvest sub-directories.
            Defaults to 'data', relative to the current working directory.

    Returns:
        The name of the sub-directory that sorts last.

    Raises:
        IndexError: if `data_dir` contains no sub-directories.
        OSError: if `data_dir` cannot be listed (for example, it is missing).
    '''
    # Only directories count as harvests; stray files in data_dir are ignored.
    harvests = [
        name for name in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, name))
    ]
    if not harvests:
        # Same exception type as indexing an empty list, but with a message
        # that tells the user what is actually wrong.
        raise IndexError('No harvest directories found in {!r}'.format(data_dir))
    return max(harvests)
def open_harvest_data(timestamp=None):
    '''
    Load the results of a harvest into a DataFrame.

    When no timestamp is supplied, the most recent harvest (as reported by
    get_latest_harvest()) is used. The timestamp that ends up being used is
    printed, and data/<timestamp>/results.csv is read with the 'date' column
    parsed as dates.

    Returns a DataFrame.
    '''
    # A falsy timestamp (None, '') falls back to the latest harvest.
    timestamp = timestamp or get_latest_harvest()
    print(timestamp)
    results_path = os.path.join('data', timestamp, 'results.csv')
    return pd.read_csv(results_path, parse_dates=['date'])
# Load the results of the most recent harvest (no timestamp given) into a DataFrame.
df = open_harvest_data()
# Bar chart of article counts per newspaper, limited to the top-ranked titles.
alt.Chart(df).mark_bar().encode(
x=alt.X('count:Q', title='Number of articles'),
y=alt.Y('newspaper_title:N', title='Newspaper', sort=alt.EncodingSortField(field='count', order='descending', op='sum')),
tooltip=[alt.Tooltip('newspaper_title:N', title='Newspaper'), alt.Tooltip('count:Q', title='Articles')]
).transform_aggregate(
# Collapse the rows to one per newspaper, with the number of rows as 'count'.
count='count()',
groupby=['newspaper_title']
).transform_window(
# Rank the newspapers by that count, largest first.
window=[{'op': 'rank', 'as': 'rank'}],
sort=[{'field': 'count', 'order': 'descending'}]
# Keep only newspapers whose rank is 25 or better.
).transform_filter('datum.rank <= 25')
# Line chart of the number of articles per year (year taken from the 'date' column).
alt.Chart(df).mark_line().encode(
x='year(date):T',
y='count()',
tooltip=[alt.Tooltip('year(date):T', title='Year'), alt.Tooltip('count()', title='Articles')]
).properties(width=600)
# Which article(s) are the longest?
# Selects every row whose 'words' value equals the column maximum
# (presumably 'words' is the article's word count -- confirm against the harvest schema).
df[df['words'] == df['words'].max()]
# Rows whose title contains 'protest', ignoring case; missing titles count as non-matches.
df.loc[df['title'].str.contains('protest', case=False, na=False)]
# Drop rows whose title is exactly 'No Title' or 'Advertising'.
df_titles = df[(df['title'] != 'No Title') & (df['title'] != 'Advertising')]
# Get all the article titles and turn them into a single lower-cased string,
# joined with spaces. The two replace() calls then strip the substrings
# 'advertising' and 'no title' wherever they still occur in that string
# (including inside longer titles or words).
title_text = df_titles['title'].str.lower().str.cat(sep=' ').replace('advertising', '').replace('no title', '')
from wordcloud import WordCloud
# Generate a 1200x800 word cloud image from the combined title text.
# NOTE(review): this rebinds the name `wordcloud`, which the earlier
# `import wordcloud` bound to the module; after this line it refers to the
# WordCloud instance instead.
wordcloud = WordCloud(width=1200, height=800).generate(title_text)
# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
# 12x8 figure, matching the 1200x800 aspect ratio of the cloud.
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation='bilinear')
# Hide the axes; they carry no meaning for an image.
plt.axis("off")
from textblob import TextBlob
from operator import itemgetter
import nltk
# Fetch the NLTK resources used below: 'stopwords' for the stop-word list;
# 'punkt' is presumably required by TextBlob's tokenizer -- confirm.
nltk.download('stopwords')
nltk.download('punkt')
blob = TextBlob(title_text)
stopwords = nltk.corpus.stopwords.words('english')
# [word, count] pairs for every word in the titles that is not an English stop word.
word_counts = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]
# Keep the 25 most frequent words, highest count first.
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
# Show the result as a styled table: column 1 (the count) is formatted with
# thousands separators, drawn as an in-cell bar, and given a 300px width.
pd.DataFrame(word_counts).style.format({1: '{:,}'}).bar(subset=[1], color='#d65f5f').set_properties(subset=[1], **{'width': '300px'})
So far we've only looked at the metadata, but we can also explore the content of the individual text files.
Created by Tim Sherratt (@wragge) as part of the OzGLAM workbench.
If you think this project is worthwhile you can support it on Patreon.