Created by: SmirkyGraphs. Code: GitHub. Source: ri.gov.
This notebook contains code used to get a word count of every word used in a State of the State address. All words are converted to lowercase, with punctuation and special characters removed. The dataset also marks specific categories of words using the word lists found in `../files/lists/`.
import re
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
# Count how often each word appears in every address transcript and
# write one raw (word, count) CSV per year.
files = list(Path('../input/').rglob('*.txt'))
for file in files:
    # Read the transcript. The original bare `except:` swallowed read
    # errors without skipping the file, so `text` kept the PREVIOUS
    # file's contents (or was undefined on the first file) and wrong
    # counts were written under this file's year. Catch narrowly and
    # skip the file instead.
    try:
        with open(file) as f:
            text = f.read()
    except (UnicodeDecodeError, OSError):
        print(file.name)
        continue

    # Normalize: strip whitespace, lowercase, drop blank lines, then
    # re-join everything into one space-separated string.
    lines = [line.strip().lower() for line in text.splitlines()]
    text = ' '.join(line for line in lines if line)

    # Keep only letters and spaces (removes punctuation, digits, etc.).
    text = re.sub(r'[^A-Za-z ]+', '', text)

    # Filenames are expected to start with the 4-digit year, e.g. "2018_...".
    year = file.name[:4]

    # Tally occurrences of each word.
    wds = {}
    for w in text.split():
        wds[w] = wds.get(w, 0) + 1

    # Save the per-year counts as a two-column CSV.
    df = pd.DataFrame(list(wds.items()))
    df.columns = ['words', 'count']
    df.to_csv(f'../output/word_count/raw/{year}_word_count.csv', index=False)
# Merge the per-year raw CSVs into one wide table: one row per word,
# one column per year, plus year-over-year difference columns.
files = list(Path('../output/word_count/raw/').rglob('*.csv'))
combined = []
for file in files:
    df = pd.read_csv(file)
    # Year comes from the filename prefix ("2018_word_count.csv" -> "2018").
    df['year'] = file.name[:4]
    combined.append(df)
df = pd.concat(combined)

# Pivot to words x years; a word absent from a year's address gets 0.
df = df.pivot(index='words', columns='year', values='count')
df = df.fillna(0)

# Year-over-year differences for every consecutive pair of years present.
# (Generalizes the original hard-coded 2017-2020 columns: produces the
# same "2017-2018_dif" etc. columns, but keeps working when a year is
# added or missing.)
years = sorted(df.columns)
for prev, curr in zip(years, years[1:]):
    df[f'{prev}-{curr}_dif'] = df[curr] - df[prev]
# Mark each word as a stopword or not: NLTK's English stopwords plus
# address-specific terms ("rhode island", etc.) that carry no signal.
stop = stopwords.words('english')
stop.extend(['rhode', 'island', 'islander', 'islanders', 'lets'])

# Single vectorized column assignment with O(1) set membership, replacing
# the original iterrows()/df.at loop with O(len(stop)) list lookups per row.
stop_set = set(stop)
df['stopword'] = [
    'Stopword' if word in stop_set else 'Not Stopword' for word in df.index
]
# Tag each word with a topic category. Each .txt file in ../files/lists/
# holds one word per line; the file stem is the category name.
categories = {}
files = list(Path('../files/lists/').rglob('*.txt'))
for path in files:
    # read_text() closes the file — the original `open(f)` leaked handles.
    cat = path.name[:-4]
    categories[cat] = {line.rstrip() for line in path.read_text().splitlines()}

# First match wins — same precedence order as the original if/elif chain.
_CATEGORY_ORDER = ['economy', 'education', 'health', 'jobs', 'politics']

def _categorize(word):
    """Return the first category whose word list contains *word*,
    or 'no category' if none match."""
    for cat in _CATEGORY_ORDER:
        if word in categories[cat]:
            return cat
    return 'no category'

df['category'] = [_categorize(word) for word in df.index]
# Turn the 'words' index back into an ordinary column, clear the pivot's
# residual column-axis name ('year'), and write the cleaned dataset.
df = df.reset_index().rename_axis(columns='')
df.to_csv('../output/word_count/clean/word_counts_combined.csv', index=False)