Created by: SmirkyGraphs. Code: GitHub. Source: RIDE.
using pandas to clean and combine multiple different datasets covering PRACC, SAT, NECAP, Graduation, Attendance
import pandas as pd
import numpy as np
import glob
import csv
import os
necap_dict = csv.reader(open("./dicts/necap.csv"))
map_necap = dict(necap_dict)
pracc_dict = csv.reader(open("./dicts/pracc.csv"))
map_pracc = dict(pracc_dict)
middle_dict = csv.reader(open("./dicts/middle_schools.csv"))
middle_map = dict(middle_dict)
region_dict = csv.reader(open("./dicts/region_map.csv"))
region_map = dict(region_dict)
#NECAP Cleaning
fp = './raw/NECAP'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=3)
df['filename'] = os.path.basename(csv)
df['category'] = 'NECAP'
df = df.iloc[1:]
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'year'})
df['year'] = df.year.str[13:-3]
df = df.rename(index=str, columns={'filename': 'subject'})
df['subject'] = df.subject.str[7:-12]
df['level'] = df.subject.map(map_necap)
df = df.replace('no data', np.nan)
df = df.replace('too few data', np.nan)
df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
necap = frame
necap.to_csv('./clean/necap_clean.csv', index=False)
necap.head()
District | School | level | category | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Barrington High School | High School | NECAP | 11th Grade Math | 2008 | 0.71 | 1.0 | 1.000000 |
1 | Bristol Warren | Mt. Hope High School | High School | NECAP | 11th Grade Math | 2008 | 0.40 | 9.0 | 0.827586 |
2 | Burrillville | Burrillville High School | High School | NECAP | 11th Grade Math | 2008 | 0.30 | 16.0 | 0.672414 |
3 | Central Falls | Central Falls High School | High School | NECAP | 11th Grade Math | 2008 | 0.04 | 32.0 | 0.206897 |
4 | Chariho | Chariho Regional High School | High School | NECAP | 11th Grade Math | 2008 | 0.31 | 15.0 | 0.724138 |
#PRACC Cleaning
fp = './raw/PRACC'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=3)
df['filename'] = os.path.basename(csv)
df['category'] = 'PRACC'
df = df.iloc[1:]
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'year'})
df['year'] = df.year.str[29:-3]
df = df.rename(index=str, columns={'filename': 'subject'})
df['subject'] = df.subject.str[7:-12]
df['level'] = df.subject.map(map_pracc)
df['level2'] = df.School.map(middle_map)
df['level3'] = np.where((df['subject'] == 'Geometry Mathematics')
& (df['level2'] == 'Middle School')
| (df['subject'] == 'Algebra I Mathematics')
& (df['level2'] == 'Middle School'),'Middle School', None)
df.level3.fillna(df.level, inplace=True)
df = df.drop(['level'], axis=1)
df = df.drop(['level2'], axis=1)
df = df.rename(index=str, columns={'level3': 'level'})
df = df.replace('no data', np.nan)
df = df.replace('too few data', np.nan)
df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
pracc = frame
pracc.to_csv('./clean/pracc_clean.csv', index=False)
pracc.head()
District | School | level | category | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Barrington High School | High School | PRACC | 10th Grade ELA_Literacy | 2014 | 0.7510 | 2.0 | 0.983871 |
1 | Bristol Warren | Mt. Hope High School | High School | PRACC | 10th Grade ELA_Literacy | 2014 | 0.2418 | 31.0 | 0.516129 |
2 | Burrillville | Burrillville High School | High School | PRACC | 10th Grade ELA_Literacy | 2014 | 0.2636 | 29.0 | 0.548387 |
3 | Central Falls | Central Falls High School | High School | PRACC | 10th Grade ELA_Literacy | 2014 | NaN | 52.0 | 0.096774 |
4 | Chariho | Chariho Regional High School | High School | PRACC | 10th Grade ELA_Literacy | 2014 | 0.4220 | 15.0 | 0.774194 |
#SAT Cleaning
fp = './raw/SAT'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=3)
df['filename'] = os.path.basename(csv)
df['category'] = 'SAT'
df = df.iloc[1:]
df = df.replace('no data',np.nan)
df = df.replace('too few data',np.nan)
col_list = ['Mathematics Average', 'Reading Average', 'Writing Average']
df['Total Average'] = df[col_list].astype(float).sum(axis=1)
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'subject'})
df['subject'] = df.subject.str[:-7]
df['subject'] = 'SAT ' + df['subject'].astype(str) + 'Avg'
df['level'] = 'High School'
df = df.rename(index=str, columns={'filename': 'year'})
df['year'] = df.year.str[:-9]
df['value'] = df['value'].astype('float')
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
sat = frame
sat.to_csv('./clean/sat_clean.csv', index=False)
sat.head()
District | School | level | category | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Barrington High School | High School | SAT | SAT Mathematics Avg | 2010 | 576.0 | 2.0 | 0.982456 |
1 | Bristol Warren | Mt. Hope High School | High School | SAT | SAT Mathematics Avg | 2010 | 484.0 | 19.0 | 0.622807 |
2 | Burrillville | Burrillville High School | High School | SAT | SAT Mathematics Avg | 2010 | 488.0 | 18.0 | 0.649123 |
3 | Central Falls | Central Falls High School | High School | SAT | SAT Mathematics Avg | 2010 | 374.0 | 39.0 | 0.263158 |
4 | Chariho | Chariho Regional High School | High School | SAT | SAT Mathematics Avg | 2010 | 520.0 | 9.0 | 0.842105 |
# Graduation Rate
fp = './raw/Graduation Rate'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=4)
df['filename'] = os.path.basename(csv)
df['category'] = 'GRAD'
df = df.iloc[1:, [0,1,2,6,7]]
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'subject'})
df['subject'] = 'Pct. Graduated 4 Yrs'
df['level'] = 'High School'
df = df.rename(index=str, columns={'filename': 'year'})
df['year'] = df.year.str[0:4]
df = df.replace('no data', np.nan)
df = df.replace('too few data', np.nan)
df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
grad = frame
grad.to_csv('./clean/grad_clean.csv', index=False)
grad.head()
District | School | category | level | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Barrington High School | GRAD | High School | Pct. Graduated 4 Yrs | 2010 | 0.966 | 2.0 | 0.982456 |
1 | Bristol Warren | Mt. Hope High School | GRAD | High School | Pct. Graduated 4 Yrs | 2010 | 0.881 | 14.0 | 0.754386 |
2 | Burrillville | Burrillville High School | GRAD | High School | Pct. Graduated 4 Yrs | 2010 | 0.868 | 16.0 | 0.719298 |
3 | Central Falls | Central Falls High School | GRAD | High School | Pct. Graduated 4 Yrs | 2010 | 0.708 | 37.0 | 0.315789 |
4 | Chariho | Chariho Regional High School | GRAD | High School | Pct. Graduated 4 Yrs | 2010 | 0.893 | 10.0 | 0.824561 |
# Attendance
fp = './raw/Attendance'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=4)
df['filename'] = os.path.basename(csv)
df['category'] = 'Attend'
df = df.iloc[1:]
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'subject'})
df['subject'] = 'Attendance Rate'
df['level'] = df.filename.str[-7:-5]
df = df.replace('ES', 'Elementary School')
df = df.replace('MS', 'Middle School')
df = df.replace('HS', 'High School')
df = df.rename(index=str, columns={'filename': 'year'})
df['year'] = df.year.str[0:4]
df = df.replace('no data', np.nan)
df = df.replace('too few data', np.nan)
df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'level', 'category', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
attend = frame
attend.to_csv('./clean/attend_clean.csv', index=False)
attend.head()
District | School | level | category | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Hampden Meadows School | Elementary School | Attend | Attendance Rate | 2010 | 0.9659 | 21.0 | 0.887179 |
1 | Barrington | Nayatt School | Elementary School | Attend | Attendance Rate | 2010 | 0.9585 | 50.0 | 0.712821 |
2 | Barrington | Primrose Hill School | Elementary School | Attend | Attendance Rate | 2010 | 0.9620 | 36.0 | 0.810256 |
3 | Barrington | Sowams Elementary School | Elementary School | Attend | Attendance Rate | 2010 | 0.9638 | 26.0 | 0.861538 |
4 | Bristol Warren | Colt Andrews School | Elementary School | Attend | Attendance Rate | 2010 | 0.9494 | 92.0 | 0.425641 |
# Chronic Absence
fp = './raw/Chronic Absenteeism'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv, skiprows=4)
df['filename'] = os.path.basename(csv)
df['category'] = 'Chronic'
df = df.iloc[1:]
# Unpivot the columns
x = ['District', 'School', 'category', 'filename']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.rename(index=str, columns={'variable': 'subject'})
df['subject'] = 'Chronic Absence'
df['level'] = df.filename.str[-7:-5]
df = df.replace('ES', 'Elementary School')
df = df.replace('MS', 'Middle School')
df = df.replace('HS', 'High School')
df = df.rename(index=str, columns={'filename': 'year'})
df['year'] = df.year.str[0:4]
df = df.replace('no data', np.nan)
df = df.replace('too few data', np.nan)
df['value'] = df['value'].str.rstrip('%').astype('float') / 100.0
df['rank'] = df.groupby(['subject', 'year'])['value'].rank(na_option='bottom', method='dense', ascending=0)
df['percentile'] = df.groupby(['subject', 'year'])['value'].rank(na_option='top', pct=True)
data.append(df)
frame = pd.concat(data)
cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
chronic = frame
chronic.to_csv('./clean/chronic_clean.csv', index=False)
chronic.head()
District | School | category | level | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Hampden Meadows School | Chronic | Elementary School | Chronic Absence | 2012 | 0.0175 | 177.0 | 0.056995 |
1 | Barrington | Nayatt School | Chronic | Elementary School | Chronic Absence | 2012 | 0.0746 | 126.0 | 0.341969 |
2 | Barrington | Primrose Hill School | Chronic | Elementary School | Chronic Absence | 2012 | 0.0785 | 121.0 | 0.367876 |
3 | Barrington | Sowams Elementary School | Chronic | Elementary School | Chronic Absence | 2012 | 0.0560 | 144.0 | 0.243523 |
4 | Bristol Warren | Colt Andrews School | Chronic | Elementary School | Chronic Absence | 2012 | 0.1162 | 73.0 | 0.616580 |
school_region_map = {
'Academy for Career Exploration' : 'Charter',
"NE Laborers'/Cranston Public Schools Construction Career Academy" : 'Charter',
'Times2 Academy' : 'Charter'}
x = region_map
# Combine ALl
fp = './clean/'
allFiles = glob.glob(fp + "/*.csv")
frame = pd.DataFrame()
data = []
for csv in allFiles:
df = pd.read_csv(csv)
data.append(df)
frame = pd.concat(data, sort=True)
cols = ['District', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']
frame = frame[cols]
final = frame
final['region'] = final.District.map(region_map)
final['region2'] = final.School.map(school_region_map)
final.region2.fillna(final.region, inplace=True)
final = final.drop(['region'], axis=1)
final['subject'] = final['subject'].str.replace('Mathematics', 'Math')
final['subject'] = final['subject'].str.replace('ELA_Literacy', 'English')
final = final.rename(columns={'region2': 'region'})
final = final[final.District != 'Regional Collaborative']
cols = ['District', 'region', 'School', 'category', 'level', 'subject', 'year', 'value', 'rank', 'percentile']
final = final[cols]
final.to_csv('./clean/final_clean.csv', index=False)
final.head()
District | region | School | category | level | subject | year | value | rank | percentile | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Barrington | Suburban | Hampden Meadows School | Attend | Elementary School | Attendance Rate | 2010 | 0.9659 | 21.0 | 0.887179 |
1 | Barrington | Suburban | Nayatt School | Attend | Elementary School | Attendance Rate | 2010 | 0.9585 | 50.0 | 0.712821 |
2 | Barrington | Suburban | Primrose Hill School | Attend | Elementary School | Attendance Rate | 2010 | 0.9620 | 36.0 | 0.810256 |
3 | Barrington | Suburban | Sowams Elementary School | Attend | Elementary School | Attendance Rate | 2010 | 0.9638 | 26.0 | 0.861538 |
4 | Bristol Warren | Regional | Colt Andrews School | Attend | Elementary School | Attendance Rate | 2010 | 0.9494 | 92.0 | 0.425641 |