Created by: SmirkyGraphs. Code: GitHub. Source: CMS.gov.
using pandas to compile multiple csv files from different reporting periods into one
import pandas as pd
import numpy as np
import glob as glob
import csv
# get data set with only wanted columns and state of RI
files = glob.glob('./data/raw/*.csv')
nursing_staff = []
non_nursing_staff = []
for f in files:
part = f[22:-4]
df = pd.read_csv(f, low_memory=False)
df = df[df['STATE'] == 'RI']
df = df[[x for x in df if not x.strip().endswith('_emp')]]
df = df[[x for x in df if not x.strip().endswith('_ctr')]]
df['job_type'] = part
if part == 'part1':
nursing_staff.append(df)
else:
non_nursing_staff.append(df)
nursing = pd.concat(nursing_staff, sort=False)
non_nursing = pd.concat(non_nursing_staff, sort=False)
nursing.to_csv('./data/clean/nursing_clean.csv', index=False)
non_nursing.to_csv('./data/clean/non_nursing_clean.csv', index=False)
files = glob.glob('./data/clean/*clean.csv')
nursing_staff = []
non_nursing_staff = []
for f in files:
part = f[13:-10]
# cleaning data
df = pd.read_csv(f, low_memory=False)
df = df.reset_index(drop=True)
df = df.rename(columns=lambda x: x.strip())
cols = ['COUNTY_FIPS', 'CY_Qtr', 'Location 1']
df = df.drop(columns=cols)
df = df.loc[:,~df.columns.duplicated()]
x = ['PROVNUM', 'PROVNAME', 'CITY', 'STATE', 'COUNTY_NAME', 'WorkDate', 'MDScensus', 'job_type']
hours_cols = df.columns.drop(x).tolist()
df['total_hrs'] = df[hours_cols].sum(axis=1)
hours_cols.append('total_hrs')
df = pd.melt(df, id_vars=x, value_vars=hours_cols)
if part == 'nursing':
nursing_staff.append(df)
else:
non_nursing_staff.append(df)
nursing = pd.concat(nursing_staff, sort=False)
non_nursing = pd.concat(non_nursing_staff, sort=False)
nursing.to_csv('./data/clean/nursing_final.csv', index=False)
non_nursing.to_csv('./data/clean/non_nursing_final.csv', index=False)
# merging files
nursing = pd.read_csv('./data/clean/nursing_final.csv')
non_nursing = pd.read_csv('./data/clean/non_nursing_final.csv')
df = pd.concat([nursing, non_nursing])
# grabbing provider info
providers = pd.read_csv('./data/extra/provider_info.csv')
# getting lat/lng of facilities
providers['lat_lon'] = providers['Location'].apply(lambda x: x[x.find("(")+1:x.find(")")]).str.split(', ')
providers['lat'] = providers['lat_lon'].apply(lambda x: x[0])
providers['lng'] = providers['lat_lon'].apply(lambda x: x[1])
# drop list
f = './files/provider_info_drop.txt'
drop = [line.rstrip() for line in open(f)]
providers = providers.drop(columns=drop)
# getting number of beds and lat/lng
max_beds = providers[['Federal Provider Number', 'Number of Certified Beds', 'lat', 'lng']]
max_beds = max_beds.set_index('Federal Provider Number')
df = df.merge(max_beds, how='left', left_on='PROVNUM', right_on='Federal Provider Number')
# removing homes lacking max num of beds
df = df.dropna(subset=['Number of Certified Beds'])
# getting capacity
df['capacity'] = df['MDScensus']/df['Number of Certified Beds']
# replacing part1/part2
df['job_type'] = df['job_type'].replace('part1' ,'nursing_staff')
df['job_type'] = df['job_type'].replace('part2' ,'non_nursing_staff')
# clean up variable names
df['variable'] = df['variable'].str.lower()
def clean(x):
if x.lower().startswith('hrs'):
return x[4:]
else:
return x
df['variable'] = df['variable'].apply(clean)
df['variable'] = df['variable'].str.replace('_', '')
var_names = csv.reader(open('./files/variable_names.csv'))
var_names = dict(var_names)
df['variable'] = df.variable.map(var_names)
df.to_csv('./data/final/ri_pbj_staffing_final.csv')