import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import nltk
import numpy as np
import altair as alt
# Are you using Jupyter Lab?
# If so either don't run this cell or comment out the line below
# alt.renderers.enable('notebook')
# If you forget, run this cell, and then get strange warnings when you make a chart,
# uncomment the following line and run this cell to reset the chart renderer
# alt.renderers.enable('default')
# alt.data_transformers.enable('json')
#nltk.download('stopwords')
#nltk.download('punkt')
#stopwords = nltk.corpus.stopwords.words('english')
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell
import zipfile
for zipped in sorted(Path('data').glob('*.zip')):
    print(f'Unzipping {zipped}...')
    with zipfile.ZipFile(zipped, 'r') as zip_file:
        zip_file.extractall(Path(f'data/{zipped.stem}'))
def get_latest_harvest():
    '''
    Get the directory of the most recent harvest.
    '''
    harvests = sorted([d for d in Path('data').iterdir() if d.is_dir() and not d.name.startswith('.')])
    try:
        harvest = harvests[-1]
    except IndexError:
        print('No harvests!')
        harvest = None
    return harvest
def get_docs(harvest):
    '''
    Yield the text of each file in the harvest.
    '''
    docs_path = get_docs_path(harvest)
    for p in docs_path:
        yield p.read_text(encoding='utf-8').strip()

def get_docs_path(harvest):
    '''
    Get a sorted list of paths to the text files in the harvest.
    '''
    path = Path(harvest, 'text')
    docs_path = sorted(path.glob('*.txt'))
    return docs_path

def get_file_names(harvest):
    '''
    Get the file names (minus extensions) of the text files in the harvest.
    '''
    return [p.stem for p in get_docs_path(harvest)]
harvest = get_latest_harvest()
vectorizer = CountVectorizer(stop_words='english', max_features=10000, ngram_range=(1,1))
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
X_freq = np.asarray(vectorizer.fit_transform(get_docs(harvest)).todense())
df_freq = pd.DataFrame(X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest))
df_freq.sum().nlargest(20)
# The number of words you want to show
num_words = 20
top_words = pd.DataFrame({n: df_freq.T[col].nlargest(num_words).index.tolist() for n, col in enumerate(df_freq.T)}).T
top_words.index = get_file_names(harvest)
top_words.head()
Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can extract the year by slicing the first four characters from the file name.
years = [int(p[:4]) for p in get_file_names(harvest)]
df_freq_years = df_freq.assign(year=years)
# Top words per year
year_groups = df_freq_years.groupby(by='year')
year_group_totals = year_groups.sum()
df_years = pd.DataFrame({n: year_group_totals.T[col].nlargest(10).index.tolist() for n, col in enumerate(year_group_totals.T)}).T
df_years.index = [name for name, _ in year_groups]
df_years.head()
compound_chart = alt.vconcat()
years = df_years.index.tolist()
# Number of columns
cols = 4
start = 0
while start < len(years):
    row = alt.hconcat()
    for year in years[start:start + cols]:
        df_year_word_count = pd.DataFrame([{'word': w, 'count': year_group_totals.loc[year][w]} for w in df_years.loc[year].tolist()])
        chart = alt.Chart(df_year_word_count).mark_bar().encode(
            y='word:N',
            x='count:Q',
        ).properties(width=120, height=120, title=str(year))
        row |= chart
    compound_chart &= row
    start += cols
compound_chart
def words_by_year(df, words):
    '''
    Get the frequency of each of the supplied words per year.
    '''
    dfs = []
    for word in words:
        try:
            df_word = df.groupby(by='year').sum()[word].to_frame().reset_index().rename({word: 'count'}, axis=1)
        except KeyError:
            print(f"'{word}' not found")
        else:
            df_word['word'] = word
            dfs.append(df_word)
    # DataFrame.append was removed in pandas 2, so collect the frames and concat them
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
df_words = words_by_year(df_freq_years, ['racism', 'racial', 'white race', 'chinese', 'kangaroo'])
alt.Chart(df_words.loc[df_words['year'] > 0]).mark_line().encode(
    x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),
    y='count:Q',
    color='word:N',
    facet='word:N'
).properties(width=700, height=100, columns=1)
# Create a chart
alt.Chart(df_words.loc[df_words['year'] > 0]).mark_circle(
    # Style the circles
    opacity=0.8,
    stroke='black',
    strokeWidth=1
).encode(
    # Year on the X axis
    x=alt.X('year:O', axis=alt.Axis(format='c', title='Year', labelAngle=0)),
    # Word on the Y axis
    y=alt.Y('word:N', title='Word'),
    # Size of the circles represents the word's frequency
    size=alt.Size('count:Q',
        scale=alt.Scale(range=[0, 2000]),
        legend=alt.Legend(title='Frequency')
    ),
    # Color the circles by word
    color=alt.Color('word:N', legend=None),
    # More details on hover
    tooltip=[alt.Tooltip('word:N', title='Word'), alt.Tooltip('year:O', title='Year'), alt.Tooltip('count:Q', title='Frequency', format=',')]
).properties(
    width=700,
    height=300
)
The file names also include the id of the newspaper in which the article was published. For example, 18601224-13-5696044 was published in the newspaper with the id of 13, which happens to be The Argus.
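As with the year, we can slice the newspaper id out of the file name. Here's a minimal sketch (assuming all file names follow the date-newspaper-article pattern shown above) that counts the number of articles from each newspaper in the harvest.

# The newspaper id is the segment between the first and second hyphens,
# e.g. '13' in '18601224-13-5696044'
newspaper_ids = [name.split('-')[1] for name in get_file_names(harvest)]

# Count the number of articles from each newspaper
pd.Series(newspaper_ids).value_counts()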