Created by: SmirkyGraphs. Code: GitHub. Source: ri.gov.
This notebook contains code used to get a word count of every word used in a State of the State address. All words are converted to lowercase, with punctuation and special characters removed. The dataset also marks specific categories of words using the word lists found in `../files/lists/`.
import re
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
# Count how often each word appears in every address transcript and
# write one raw (word, count) CSV per year.
files = list(Path('../input/').rglob('*.txt'))
for file in files:
    # Read the transcript. The original bare `except:` swallowed read
    # errors without skipping the file, so `text` kept the PREVIOUS
    # file's contents (or was undefined on the first file) and wrong
    # counts were written under this file's year. Catch narrowly and
    # skip the file instead.
    try:
        with open(file) as f:
            text = f.read()
    except (UnicodeDecodeError, OSError):
        print(file.name)
        continue

    # Normalize: strip whitespace, lowercase, drop blank lines, then
    # re-join everything into one space-separated string.
    lines = [line.strip().lower() for line in text.splitlines()]
    text = ' '.join(line for line in lines if line)

    # Keep only letters and spaces (removes punctuation, digits, etc.).
    text = re.sub(r'[^A-Za-z ]+', '', text)

    # Filenames are expected to start with the 4-digit year, e.g. "2018_...".
    year = file.name[:4]

    # Tally occurrences of each word.
    wds = {}
    for w in text.split():
        wds[w] = wds.get(w, 0) + 1

    # Save the per-year counts as a two-column CSV.
    df = pd.DataFrame(list(wds.items()))
    df.columns = ['words', 'count']
    df.to_csv(f'../output/word_count/raw/{year}_word_count.csv', index=False)
# Merge the per-year raw CSVs into one wide table: one row per word,
# one column per year, plus year-over-year difference columns.
files = list(Path('../output/word_count/raw/').rglob('*.csv'))
combined = []
for file in files:
    df = pd.read_csv(file)
    # Year comes from the filename prefix ("2018_word_count.csv" -> "2018").
    df['year'] = file.name[:4]
    combined.append(df)
df = pd.concat(combined)

# Pivot to words x years; a word absent from a year's address gets 0.
df = df.pivot(index='words', columns='year', values='count')
df = df.fillna(0)

# Year-over-year differences for every consecutive pair of years present.
# (Generalizes the original hard-coded 2017-2020 columns: produces the
# same "2017-2018_dif" etc. columns, but keeps working when a year is
# added or missing.)
years = sorted(df.columns)
for prev, curr in zip(years, years[1:]):
    df[f'{prev}-{curr}_dif'] = df[curr] - df[prev]
# Mark each word as a stopword or not: NLTK's English stopwords plus
# address-specific terms ("rhode island", etc.) that carry no signal.
stop = stopwords.words('english')
stop.extend(['rhode', 'island', 'islander', 'islanders', 'lets'])

# Single vectorized column assignment with O(1) set membership, replacing
# the original iterrows()/df.at loop with O(len(stop)) list lookups per row.
stop_set = set(stop)
df['stopword'] = [
    'Stopword' if word in stop_set else 'Not Stopword' for word in df.index
]
# Tag each word with a topic category. Each .txt file in ../files/lists/
# holds one word per line; the file stem is the category name.
categories = {}
files = list(Path('../files/lists/').rglob('*.txt'))
for path in files:
    # read_text() closes the file — the original `open(f)` leaked handles.
    cat = path.name[:-4]
    categories[cat] = {line.rstrip() for line in path.read_text().splitlines()}

# First match wins — same precedence order as the original if/elif chain.
_CATEGORY_ORDER = ['economy', 'education', 'health', 'jobs', 'politics']

def _categorize(word):
    """Return the first category whose word list contains *word*,
    or 'no category' if none match."""
    for cat in _CATEGORY_ORDER:
        if word in categories[cat]:
            return cat
    return 'no category'

df['category'] = [_categorize(word) for word in df.index]
# Turn the 'words' index back into an ordinary column, clear the pivot's
# residual column-axis name ('year'), and write the cleaned dataset.
df = df.reset_index().rename_axis(columns='')
df.to_csv('../output/word_count/clean/word_counts_combined.csv', index=False)