Created by SmirkyGraphs. Code: Github. Source: RIDE.
Using Selenium to automatically collect data on attendance of schools from RIDE's online tool. The data is then cleaned with pandas to compare attendance pre vs. post COVID-19 remote learning. Previously I used the data to create an online tool allowing you to explore attendance and compare it easily with all other schools in Rhode Island.
import json
import time
from datetime import date
from selenium import webdriver
def chrome_options(download_dir):
options = webdriver.ChromeOptions()
download_dir = download_dir.replace('/', '\\')
prefs = {
'download.prompt_for_download' : False,
'download.default_directory' : f'{download_dir}',
}
#options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--start-maximized')
options.add_experimental_option('prefs', prefs)
return options
if __name__ == "__main__":
with open('config.json', 'r') as f:
config = json.load(f)
save_path = config['save_path']
chromedriver = config['chromedriver']
url = 'https://www.eride.ri.gov/eride2K5/AggregateAttendance/AttendanceReports.aspx'
chrome_options = chrome_options(save_path)
browser = webdriver.Chrome(chromedriver, options=chrome_options)
browser.get(url)
# get current month & month xpath
month_xpath = '//*[@id="calDateSelection"]/tbody/tr[1]/td/table/tbody/tr/td[2]'
current_month = browser.find_element_by_xpath(month_xpath)
# creating the loop for each month getting all schooldays
while current_month.text != 'August 2008':
# button to click to go to previous month
prev_btn = '#calDateSelection > tbody > tr:nth-child(1) > td > table > tbody > tr > td:nth-child(1) > a'
prev_month = browser.find_element_by_css_selector(prev_btn)
# loop for each file to download
for ele in browser.find_elements_by_xpath("//input[contains(@id,'_imgBtnXls')]"):
# click button to download file
ele.click()
time.sleep(1)
# after loop move to next month and wait
prev_month.click()
time.sleep(3)
# check newly selected month
current_month = browser.find_element_by_xpath(month_xpath)
import glob
import csv
import pandas as pd
import numpy as np
# creating region dict
region_map = dict(csv.reader(open("./files/region_map.csv")))
def school_year(date):
if date.month > 7:
return date.year
elif date.month < 7:
return date.year + 1
else:
return 'summer'
def clean_data(files):
frames = []
for f in files:
df = pd.read_csv(f)
frames.append(df)
df = pd.concat(frames)
# replacing "NR" (not reported) with nulls
df = df.replace('NR', np.nan)
# convert types
df['Enrollment'] = df['Enrollment'].astype(float)
df['Absent'] = df['Absent'].astype(float)
df['PercentageAbsent'] = (df['PercentageAbsent'].astype(float))/100
df['AttendanceDate'] = pd.to_datetime(df['AttendanceDate'])
# setting school year dates
df['school_year'] = df['AttendanceDate'].apply(school_year)
df['school_year'] = df['school_year'].astype(str)
# setting region codes
df['region'] = df['LEA_NAME'].map(region_map)
return df
if __name__ == "__main__":
# filepath of all individual date csvs
files = glob.glob('./data/raw/*.csv')
data = clean_data(files)
data.to_csv('./data/clean/ri_attendance_clean.csv', index=False)