Created by SmirkyGraphs. Code: GitHub.
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
import requests
import json
import csv
import glob
import os
import re
facility_url = 'https://ri.healthinspections.us/ri/API/index.cfm/facilities/'
inspection_url = 'https://ri.healthinspections.us/ri/API/index.cfm/inspectionsData/'
report_url = 'https://ri.healthinspections.us/'
# Download all locations json to get facility ID
for i in range(9999):
try:
file = './raw/Facility/json/json' + str(i) + '.json'
facility_url2 = facility_url + str(i) + '/0.json'
response = json.loads(requests.get(facility_url2).text)
if not response:
print('finished')
break
else:
with open(file, 'w') as outfile:
json.dump(response, outfile)
except:
pass
finished
# Transforming into csv and turning json into columns
for i in range(9999):
try:
file = './raw/Facility/json/json' + str(i) + '.json'
f = open(file)
data = json.load(f)
df = pd.DataFrame.from_dict(data)
df_json = df.to_json(orient='records')
df = json_normalize(json.loads(df_json), meta=['key'])
output = './raw/Facility/csv/csv' + str(i) + '.csv'
df.to_csv(output)
except:
pass
# Combine all csv files into 1
# Set File Path
path = './raw/Facility/csv/'
allFiles = glob.glob(path + "/*.csv")
# Load Data into Dataframe
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
list_.append(df)
frame = pd.concat(list_)
df = frame
# Data Cleaning
def trim_type(column):
x = column[14:]
return x
def trim_inspection(column):
x = column[21:]
return x
def trim_license(column):
x = column[11:]
return x
df['License Type'] = df['columns.2'].apply(trim_type)
df['Last Inspection Date'] = df['columns.1'].apply(trim_inspection)
df['License Number'] = df['columns.3'].apply(trim_license)
df = df.reset_index()
df = df.drop(['Unnamed: 0','bookmarked', 'index', 'columns.0', 'columns.1', 'columns.2', 'columns.3'], axis=1)
df.to_csv('./cleaned/Facilities.csv', index=False)
# Downloading all inspections
List = df['id'].tolist()
for ids in List:
try:
file = './raw/Inspection/json/' + ids + '.json'
inspection_url2 = inspection_url + ids + '.json'
response = json.loads(requests.get(inspection_url2).text)
with open(file, 'w') as outfile:
json.dump(response, outfile)
except:
pass
# Transforming into csv and turning json into columns
for ids in List:
file = './raw/Inspection/json/' + ids + '.json'
f = open(file)
data = json.load(f)
df = pd.DataFrame.from_dict(data)
df_json = df.to_json(orient='records')
df = json_normalize(json.loads(df_json), meta=['key'])
output = './raw/Inspection/csv/' + ids + '.csv'
df.to_csv(output)
# Combine all csv files into 1
# Set File Path
path = './raw/Inspection/csv/'
allFiles = glob.glob(path + "/*.csv")
# Load Data into Dataframe
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
list_.append(df)
frame = pd.concat(list_)
df = frame
# Unpivot the columns
x = ['Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'facilityId', 'inspectionId', 'printablePath']
df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())
# Remove nulls
df = df[(df['variable'] == 'violations.0') | (df['value'].notnull())]
# Data Cleaning
def trim_date(column):
x = column[16:]
x = x.replace('-','/')
return x
def trim_purpose(column):
x = column.replace('Inspection Purpose: ','')
return x
def trim_num(column):
x = column[11:]
return x
def trim_violation(column):
x = column[2:-2]
return x
def violation_id(column):
x = column[:2]
x = re.sub('[^0-9]','', x)
x = x.strip()
return x
def violation_desc(column):
y = column.find('-') + 2
x = column[y:]
return x
def clean_url(column):
x = column.replace('../', 'https://ri.healthinspections.us/')
return x
df['Inspection Date'] = df['columns.0'].apply(trim_date)
df['Inspection Purpose'] = df['columns.1'].astype(str).apply(trim_purpose)
df['Violation Num'] = df['variable'].apply(trim_num)
df['value'] = df['value'].astype(str).apply(trim_violation)
df['Report URL'] = df['printablePath'].astype(str).apply(clean_url)
# splitting value
df['Violation Id'] = df['value'].astype(str).apply(violation_id)
df['Violation Desc'] = df['value'].astype(str).apply(violation_desc)
df = df.reset_index()
df = df.drop(['index', 'Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'printablePath',
'variable', 'value'], axis=1)
df.to_csv('./cleaned/Inspections.csv', index=False)