Collecting and Cleaning School Attendance Data¶

Using Selenium to automatically collect data on attendance of schools from RIDE's online tool. The data is then cleaned with pandas to compare attendance pre vs. post COVID-19 remote learning. Previously I used the data to create an online tool allowing you to explore attendance and compare it easily with all other schools in Rhode Island.

In [1]:

import json
import time
from datetime import date
from selenium import webdriver

def chrome_options(download_dir):
    options = webdriver.ChromeOptions() 
    download_dir = download_dir.replace('/', '\\')

    prefs = {
        'download.prompt_for_download' : False,
        'download.default_directory' : f'{download_dir}',
    }

    #options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--start-maximized')
    options.add_experimental_option('prefs', prefs)

    return options

if __name__ == "__main__":
    
    with open('config.json', 'r') as f:
        config = json.load(f)
        save_path = config['save_path']
        chromedriver = config['chromedriver']
        
    url = 'https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx'
    chrome_options = chrome_options(save_path)
    browser = webdriver.Chrome(chromedriver, options=chrome_options)
    browser.get(url)

    # get current month & month xpath
    month_xpath = '//*[@id="calDateSelection"]/tbody/tr[1]/td/table/tbody/tr/td[2]'
    current_month = browser.find_element_by_xpath(month_xpath)

    # creating the loop for each month getting all schooldays
    while current_month.text != 'August 2008':
        # button to click to go to previous month
        prev_btn = '#calDateSelection > tbody > tr:nth-child(1) > td > table > tbody > tr > td:nth-child(1) > a'
        prev_month = browser.find_element_by_css_selector(prev_btn)
        
        # loop for each file to download
        for ele in browser.find_elements_by_xpath("//input[contains(@id,'_imgBtnXls')]"):
            # click button to download file
            ele.click()   
            time.sleep(1)

        # after loop move to next month and wait
        prev_month.click()
        time.sleep(3)
        
        # check newly selected month
        current_month = browser.find_element_by_xpath(month_xpath)

In [2]:

import glob
import csv
import pandas as pd
import numpy as np

# creating region dict
region_map = dict(csv.reader(open("./files/region_map.csv")))

def school_year(date):
    if date.month > 7:
        return date.year
    elif date.month < 7:
        return date.year + 1
    else:
        return 'summer'

def clean_data(files):
    frames = []
    for f in files:
        df = pd.read_csv(f)
        frames.append(df)
    df = pd.concat(frames)

    # replacing "NR" (not reported) with nulls
    df = df.replace('NR', np.nan)

    # convert types
    df['Enrollment'] = df['Enrollment'].astype(float)
    df['Absent'] = df['Absent'].astype(float)
    df['PercentageAbsent'] = (df['PercentageAbsent'].astype(float))/100
    df['AttendanceDate'] = pd.to_datetime(df['AttendanceDate'])

    # setting school year dates
    df['school_year'] = df['AttendanceDate'].apply(school_year)
    df['school_year'] = df['school_year'].astype(str)

    # setting region codes
    df['region'] = df['LEA_NAME'].map(region_map)

    return df

if __name__ == "__main__":
    
    # filepath of all individual date csvs
    files = glob.glob('./data/raw/*.csv')
    data = clean_data(files)
    data.to_csv('./data/clean/ri_attendance_clean.csv', index=False)

In [ ]: