Created by SmirkyGraphs. Code: Github. Source: RIDE.
Using Xlwings to pull info from locked excel sheets
import pandas as pd
import xlwings as xw
import numpy as np
import os
import glob
path = './data/raw/excluding-capital-and-debt-service'
year = '2016'
wb = xw.Book(path + '/' + year + '.xlsx')
sht = wb.sheets['FY ' + str(year[2:]) + ' Data']
location_summary = sht.range('B14:BN40').options(pd.DataFrame).value
function_summary = sht.range('B49:BN54').options(pd.DataFrame).value
function_intermediate = sht.range('B64:BN80').options(pd.DataFrame).value
function_detailed = sht.range('B89:BN124').options(pd.DataFrame).value
program_summary = sht.range('B132:BN142').options(pd.DataFrame).value
subject_summary = sht.range('B151:BN182').options(pd.DataFrame).value
object_summary = sht.range('B192:BN201').options(pd.DataFrame).value
object_intermediate = sht.range('B211:BN260').options(pd.DataFrame).value
object_detailed = sht.range('B270:BN566').options(pd.DataFrame).value
job_summary = sht.range('B575:BN603').options(pd.DataFrame).value
frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,
subject_summary, object_summary, object_intermediate, object_detailed, job_summary]
str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',
'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']
i = 0
for df in frames:
df['class'] = str_frames[i]
df['year'] = str(year)
i += 1
df = pd.concat(frames)
df.shape
df = df.drop(['Row Match?', 'Row Labels'], axis=1)
df = df.reset_index()
x = ['Row Labels', 'class', 'year']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.fillna(0)
df['type'] = df['Row Labels'].str.replace('\d+', '')
df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')
df = df.drop('Row Labels', axis=1)
df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})
df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)
# 2016 School Attributes
sht = wb.sheets['Master Table']
df = sht.range('C2:X64').options(pd.DataFrame).value
# All 0 values are listed as 0.00001
df[df == 0.00001] = np.nan
df = df.fillna(0)
x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']
df = df.drop(x, axis=1)
df = df.drop('Other', axis=1)
df = df.reset_index()
df['year'] = year
df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)
year = '2015'
wb = xw.Book(path + '/' + year + '.xlsx')
sht = wb.sheets['FY ' + str(year[2:]) + ' Data']
location_summary = sht.range('B14:BM39').options(pd.DataFrame).value
function_summary = sht.range('B48:BM53').options(pd.DataFrame).value
function_intermediate = sht.range('B63:BM79').options(pd.DataFrame).value
function_detailed = sht.range('B88:BM123').options(pd.DataFrame).value
program_summary = sht.range('B131:BM141').options(pd.DataFrame).value
subject_summary = sht.range('B150:BM181').options(pd.DataFrame).value
object_summary = sht.range('B191:BM200').options(pd.DataFrame).value
object_intermediate = sht.range('B210:BM258').options(pd.DataFrame).value
object_detailed = sht.range('B268:BM561').options(pd.DataFrame).value
job_summary = sht.range('B570:BM598').options(pd.DataFrame).value
frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,
subject_summary, object_summary, object_intermediate, object_detailed, job_summary]
str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',
'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']
i = 0
for df in frames:
df['class'] = str_frames[i]
df['year'] = str(year)
i += 1
df = pd.concat(frames)
df.shape
df = df.drop(['Row Match?', 'Row Labels'], axis=1)
df = df.reset_index()
x = ['Row Labels', 'class', 'year']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.fillna(0)
df['type'] = df['Row Labels'].str.replace('\d+', '')
df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')
df = df.drop('Row Labels', axis=1)
df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})
df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)
# 2015 School Attributes
sht = wb.sheets['Master Table']
df = sht.range('C2:X63').options(pd.DataFrame).value
# All 0 values are listed as 0.00001
df[df == 0.0001] = np.nan
df = df.fillna(0)
x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']
df = df.drop(x, axis=1)
df = df.drop('Other', axis=1)
df = df.reset_index()
df['year'] = year
df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)
year = '2014'
wb = xw.Book(path + '/' + year + '.xlsx')
sht = wb.sheets['FY ' + str(year[2:]) + ' Data']
location_summary = sht.range('B14:BK38').options(pd.DataFrame).value
function_summary = sht.range('B47:BK52').options(pd.DataFrame).value
function_intermediate = sht.range('B62:BK78').options(pd.DataFrame).value
function_detailed = sht.range('B87:BK122').options(pd.DataFrame).value
program_summary = sht.range('B130:BK140').options(pd.DataFrame).value
subject_summary = sht.range('B149:BK180').options(pd.DataFrame).value
object_summary = sht.range('B190:BK199').options(pd.DataFrame).value
object_intermediate = sht.range('B209:BK253').options(pd.DataFrame).value
object_detailed = sht.range('B263:BK555').options(pd.DataFrame).value
job_summary = sht.range('B564:BK592').options(pd.DataFrame).value
frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,
subject_summary, object_summary, object_intermediate, object_detailed, job_summary]
str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',
'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']
i = 0
for df in frames:
df['class'] = str_frames[i]
df['year'] = str(year)
i += 1
df = pd.concat(frames)
df.shape
df = df.drop(['Row Match?', 'Row Labels'], axis=1)
df = df.reset_index()
x = ['Row Labels', 'class', 'year']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.fillna(0)
df['type'] = df['Row Labels'].str.replace('\d+', '')
df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')
df = df.drop('Row Labels', axis=1)
df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})
df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)
# 2014 School Attributes
sht = wb.sheets['Master Table']
df = sht.range('C2:X61').options(pd.DataFrame).value
# All 0 values are listed as 0.00001
df[df == 0.0001] = np.nan
df = df.fillna(0)
x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']
df = df.drop(x, axis=1)
df = df.drop('Other', axis=1)
df = df.reset_index()
df['year'] = year
df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)
year = '2013'
wb = xw.Book(path + '/' + year + '.xlsx')
sht = wb.sheets['FY ' + str(year[2:]) + ' Data']
location_summary = sht.range('B14:BG38').options(pd.DataFrame).value
function_summary = sht.range('B47:BG52').options(pd.DataFrame).value
function_intermediate = sht.range('B62:BG78').options(pd.DataFrame).value
function_detailed = sht.range('B87:BG122').options(pd.DataFrame).value
program_summary = sht.range('B130:BG140').options(pd.DataFrame).value
subject_summary = sht.range('B149:BG180').options(pd.DataFrame).value
object_summary = sht.range('B190:BG199').options(pd.DataFrame).value
object_intermediate = sht.range('B209:BG253').options(pd.DataFrame).value
object_detailed = sht.range('B263:BG555').options(pd.DataFrame).value
job_summary = sht.range('B564:BG592').options(pd.DataFrame).value
frames = [location_summary, function_summary, function_intermediate, function_detailed, program_summary,
subject_summary, object_summary, object_intermediate, object_detailed, job_summary]
str_frames = ['location_summary', 'function_summary', 'function_intermediate', 'function_detailed', 'program_summary',
'subject_summary', 'object_summary', 'object_intermediate', 'object_detailed', 'job_summary']
i = 0
for df in frames:
df['class'] = str_frames[i]
df['year'] = str(year)
i += 1
df = pd.concat(frames)
df.shape
df = df.drop(['Row Match?', 'Row Labels'], axis=1)
df = df.reset_index()
x = ['Row Labels', 'class', 'year']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
df = df.fillna(0)
df['type'] = df['Row Labels'].str.replace('\d+', '')
df['type_id'] = df['Row Labels'].str.replace('[^0-9]','')
df = df.drop('Row Labels', axis=1)
df = df.rename(columns={'variable': 'District', 'value': 'Expenditures'})
df.to_csv('./data/clean/UCOA/UCOA_' + str(year) + '_cleaned.csv', index=False)
# 2013 School Attributes
sht = wb.sheets['Master Table']
df = sht.range('C2:X57').options(pd.DataFrame).value
# All 0 values are listed as 0.00001
df[df == 0.00001] = np.nan
df = df.fillna(0)
x = ['Ref Key', 'Charter', 'State', 'Suburban', 'Urban', 'Urban Ring', 'All Table Loc']
df = df.drop(x, axis=1)
df = df.drop('Diff', axis=1)
df = df.reset_index()
df['year'] = year
df.to_csv('./data/clean/Attributes/Attributes_' + str(year) + '_cleaned.csv', index=False)
# Set File Path
path = './data/clean/UCOA/'
allFiles = glob.glob(path + "/*.csv")
# Load Data into Dataframe
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
list_.append(df)
frame = pd.concat(list_)
df = frame
df.to_csv('./data/clean/UCOA/UCOA_All.csv')
# Set File Path
path = './data/clean/Attributes/'
allFiles = glob.glob(path + "/*.csv")
# Load Data into Dataframe
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
list_.append(df)
frame = pd.concat(list_)
df = frame
df.to_csv('./data/clean/Attributes/Attributes_All.csv')