import pandas as pd
inspections = pd.read_csv("../data/inspections.csv", index_col=0)
inspections.head(3)
inspections['INSPECTION DATE'] = pd.to_datetime(inspections['INSPECTION DATE'])
# random_camis = inspections.sample().iloc[0]['CAMIS']
# inspections[inspections['CAMIS'] == random_camis].sort_values(by='INSPECTION DATE')['INSPECTION DATE']
Get the initial inspection date.
# For each establishment (keyed by CAMIS), take its earliest inspection row.
# The group passed to apply already contains just that CAMIS's rows, so there
# is no need to refilter the full frame.
inspections_f = inspections.groupby('CAMIS')\
    .apply(lambda df: df.sort_values(by="INSPECTION DATE").iloc[0].drop('CAMIS'))\
    .rename(columns={'INSPECTION DATE': 'INITIAL INSPECTION DATE'})
inspections_f.sample(1)
Not sure why, but this operation drops the DBA field. No other fields are affected. Maybe something with the indexing?
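A quick diagnostic (a sketch) to confirm exactly which columns went missing:
# Columns from the raw frame that didn't survive the groupby-apply. CAMIS
# (now the index) and the renamed INSPECTION DATE will show up here too.
set(inspections.columns) - set(inspections_f.columns)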
Let's take the most recent inspection date.
inspections_ff = inspections_f.copy()
inspections_ff['LATEST INSPECTION DATE'] = inspections_ff\
.apply(lambda srs: inspections[inspections['CAMIS'] == srs.name]\
.sort_values(by='INSPECTION DATE')\
.iloc[-1]\
['INSPECTION DATE'],
axis='columns')
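As an aside, a vectorized groupby should compute the same column in one pass, without refiltering the full frame for every row; a sketch, not verified against the output here:
# Equivalent formulation: latest inspection date per CAMIS. The result is
# indexed by CAMIS, so it aligns with inspections_ff's index on assignment.
# inspections_ff['LATEST INSPECTION DATE'] = \
#     inspections.groupby('CAMIS')['INSPECTION DATE'].max()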
Rename INSPECTION TYPE to INITIAL INSPECTION TYPE (this distinguishes new establishments from pre-existing ones, in the context of our dataset).
inspections_ff = inspections_ff.rename(columns={'INSPECTION TYPE': 'INITIAL INSPECTION TYPE'})
Checking the flags:
inspections_ff['INITIAL INSPECTION TYPE'].value_counts()
A lot of them are null.
inspections_ff['INITIAL INSPECTION TYPE'].isnull().astype(int).sum()
Remember, these are new establishments that have not been inspected yet. That the numbers match up here is encouraging: it suggests each never-inspected establishment contributes exactly one placeholder row to the raw data.
inspections['INSPECTION TYPE'].isnull().astype(int).sum()
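A stricter cross-check (a sketch): if those really are never-inspected establishments, each such CAMIS should appear exactly once in the raw data.
# Expect 1: every null-type CAMIS contributes a single placeholder row.
inspections[inspections['INSPECTION TYPE'].isnull()]['CAMIS'].value_counts().max()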
We reattach the lost DBA column.
inspections_ff.head(1)
inspections_fff = inspections_ff.copy()
inspections_fff['DBA'] = inspections_fff.apply(lambda srs:
inspections[inspections['CAMIS'] == srs.name]\
.iloc[0]['DBA'],
axis='columns')
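(The same row-wise refiltering pattern again; a one-pass groupby would likely do, with the caveat that first() skips nulls while iloc[0] does not:)
# One-pass alternative, index-aligned on assignment.
# inspections_fff['DBA'] = inspections.groupby('CAMIS')['DBA'].first()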
Prepend a descriptor to each column name, to more easily distinguish the two data sources down the road.
inspections_fff.columns = ["DOHMH " + col for col in inspections_fff.columns]
inspections_fff.sample()
Now we get information from Yelp!
from yelp.client import Client
from yelp.oauth1_authenticator import Oauth1Authenticator
from yelp.errors import BusinessUnavailable
import os
import json
def import_credentials(filename='../data/yelp_credentials.json'):
    try:
        with open(filename) as f:
            return json.load(f)
    except FileNotFoundError:
        raise IOError('This API requires Yelp credentials to work. Did you forget to define them?')
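For reference, ../data/yelp_credentials.json is expected to be a JSON object holding the four OAuth1 fields used below (placeholder values, not real keys):
{"consumer_key": "...", "consumer_secret": "...", "token": "...", "token_secret": "..."}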
credentials = import_credentials()
auth = Oauth1Authenticator(
consumer_key=credentials['consumer_key'],
consumer_secret=credentials['consumer_secret'],
token=credentials['token'],
token_secret=credentials['token_secret']
)
client = Client(auth)
from tqdm import tqdm
from httplib2 import HttpLib2Error
from urllib.error import HTTPError
import yelp
def yelp_phone_fetch(num):
"""
Performs the phone search described in notebook 02 to fetch information on the entity associated with a number.
"""
if not num:
return None
else:
try:
business = client.phone_search(num).businesses[0]
if business and business.location and business.location.coordinate:
return {
'Yelp ID': business.id,
'Yelp Is Claimed': business.is_claimed,
'Yelp Is Closed': business.is_closed,
'Yelp Name': business.name,
'Yelp URL': business.url,
'Yelp Review Count': business.review_count,
'Yelp Categories': business.categories,
'Yelp Rating': business.rating,
'Yelp Address': business.location.address,
'Yelp Neighborhoods': business.location.neighborhoods,
'Yelp Latitude': business.location.coordinate.latitude,
'Yelp Longitude': business.location.coordinate.longitude,
}
else: # Partial information, skip.
return None
except IndexError: # Phone search failed!
return None
except yelp.errors.InvalidParameter: # Invalid number!
return None
After some testing there appears to be a very significant miss rate. Remember, we're fuzzy-matching phone numbers from one dataset against phone numbers in another. It's far from guaranteed that we'll get something back out. I'm interested in what percentage of the time we're successful.
def random_number_dba_tuple():
random_number, random_dba = inspections_fff.sample().iloc[0][['DOHMH PHONE', 'DOHMH DBA']]
random_number = str(int(random_number))
return random_number, random_dba
random_number_dba_tuple()
yelp_phone_fetch(random_number_dba_tuple()[0])
Hey I've been there! Nice. More rigorously:
one_hundred_randoms = [random_number_dba_tuple() for i in range(100)]
testset = [yelp_phone_fetch(num) for num, placename in tqdm(one_hundred_randoms)]
import numpy as np
np.array([entity is None for entity in testset]).astype(int).sum()
19 misses in 100 randoms, so we're able to match roughly 80% of the time. Not bad as far as fuzzy matches go.
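For a sense of the uncertainty on that figure, a back-of-the-envelope normal approximation on 81 hits out of 100:
import math
p_hat = 81 / 100
se = math.sqrt(p_hat * (1 - p_hat) / 100)  # standard error of a sample proportion
(p_hat - 1.96 * se, p_hat + 1.96 * se)     # 95% CI, roughly (0.73, 0.89)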
If our data is missing at random (MAR), then we are happy, because this makes for a statistically valid sample of all restaurants in New York City: we can simply drop the unmatched records.
But there's a high likelihood, in my professional opinion, that the data is missing not at random (MNAR). I expect that a restaurant or eatery in a poor neighborhood is more likely to be missing its phone number information on Yelp! than one in a richer, whiter community. That would be a severe under-reporting issue, one that would invalidate any conclusions we try to draw from this data with regard to "gentrification" and whatnot.
We'll need to validate the data geospatially. We'll do that next.
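One cheap first check, sketched under the assumption (not verified here) that the DOHMH data carries a borough column, surfaced as 'DOHMH BORO' after the renaming above: once the full fetch below is done, compare the miss rate across boroughs. A large spread would be evidence for MNAR.
# Miss rate by borough; assumes a 'DOHMH BORO' column exists and that the
# full Yelp fetch has populated the 'Yelp ID' column.
# inspections_ffff['Yelp ID'].isnull().groupby(inspections_ffff['DOHMH BORO']).mean()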
Run the full Yelp! API data through, one slice at a time.
len(inspections_fff)
inspections_fff.to_csv("../data/inspections_flattened_initial.csv", encoding="utf-8")
Oy vey. This will require two days of processing: the dataset has more than 25,000 rows, and the API limit is 25,000 calls per day.
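The arithmetic, for the record:
import math
math.ceil(len(inspections_fff) / 25000)  # days of API quota needed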
from tqdm import tqdm_notebook
del inspections
del inspections_f
del inspections_ff
inspections_fff['DOHMH PHONE'].iloc[2702:2705]
Uh, ok.
raw_yelp_5000 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(num) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][:5000])]
raw_yelp_5000_2 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(num) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][5000:10000])]
raw_yelp_5000_3 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(num) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][10000:15000])]
import pickle
with open("../data/raw_yelp_list.pkl", "wb") as f:
    pickle.dump(raw_yelp_5000 + raw_yelp_5000_2 + raw_yelp_5000_3, f)
import pickle
with open("../data/raw_yelp_list.pkl", "rb") as f:
raw_yelp_1_to_3 = pickle.load(f)
len(raw_yelp_1_to_3)
raw_yelp_5000_4 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(str(int(num))) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][15000:20000])]
inspections_fff['DOHMH PHONE'].iloc[20000 + 3270]
This appears to be an invalid phone number. Parsing it through Yelp raises a reliable error (InvalidParameter), which I've now coded into yelp_phone_fetch.
yelp_phone_fetch(int(inspections_fff['DOHMH PHONE'].iloc[20000 + 3270]))
There seem to be multiple such numbers in there.
yelp_phone_fetch(int(inspections_fff['DOHMH PHONE'].iloc[20000 + 4065]))
raw_yelp_5000_5 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(str(int(num))) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][20000:25000])]
raw_yelp_5000_6 = [yelp_phone_fetch(int(num)) if pd.notnull(num) and str.isdigit(str(int(num))) else None for num in tqdm_notebook(inspections_fff['DOHMH PHONE'][25000:])]
all_raws = raw_yelp_1_to_3 + raw_yelp_5000_4 + raw_yelp_5000_5 + raw_yelp_5000_6
len(all_raws)
With the data thus acquired, we can now assign it to our dataframe. But first we need to clean up what we have.
inspections_fff.sample(1)
all_raws[0] = {'Yelp Address': ['1007 Morris Park Avenue'],
'Yelp Categories': ["Category(name='Bakeries', alias='bakeries')",
"Category(name='Desserts', alias='desserts')"],
'Yelp ID': 'morris-pk-bake-shop-bronx',
'Yelp Is Claimed': False,
'Yelp Is Closed': False,
'Yelp Latitude': 40.848445892334,
'Yelp Longitude': -73.8560791015625,
'Yelp Name': 'Morris Pk Bake Shop',
'Yelp Neighborhoods': ['Morris Park'],
'Yelp Rating': 4.5,
'Yelp Review Count': 27,
'Yelp URL': 'https://www.yelp.com/biz/morris-pk-bake-shop-bronx?adjust_creative=dkJPGu_jtTyHwsEgZIZN6g&utm_campaign=yelp_api&utm_medium=api_v2_phone_search&utm_source=dkJPGu_jtTyHwsEgZIZN6g'}
all_raws[0]
import re
import copy
re_cat_word = re.compile(r"name='[-\w\s&()/,']+(?=')")
def safe_group(match_obj):
try:
return match_obj.group().replace("name='", "")
except AttributeError:
return None
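A quick sanity check of the pattern against one of the raw category strings from the hand-written record above:
# The lookahead stops the match at the closing quote, so only the display
# name survives once the "name='" prefix is stripped.
safe_group(re.search(re_cat_word, "Category(name='Bakeries', alias='bakeries')"))  # 'Bakeries'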
def format_data(yelp_dict):
if yelp_dict:
ret = copy.deepcopy(yelp_dict)
if ret['Yelp Address']:
ret['Yelp Address'] = ret['Yelp Address'][0]
cats = ret['Yelp Categories']
# print(cats)
if cats:
try:
parsed_cats = [re.search(re_cat_word, str(cat)).group().replace("name='", "") for cat in cats]
            except AttributeError:  # re.search found no match in this category string
print(cats)
parsed_cats = []
# print(parsed_cats)
ret['Yelp Categories'] = "|".join(parsed_cats)
neighborhoods = ret['Yelp Neighborhoods']
if neighborhoods:
ret['Yelp Neighborhoods'] = "|".join(ret['Yelp Neighborhoods'])
del ret['Yelp URL']
return ret
else:
return None
formatted_yelp_data = []
for raw in tqdm_notebook(all_raws):
formatted_yelp_data.append(format_data(raw))
formatted_yelp_data[0]
Now assign. (The formatted list was built by iterating over inspections_fff's rows in order, so plain positional assignment lines up.)
inspections_ffff = inspections_fff.copy()
for key in formatted_yelp_data[0].keys():
inspections_ffff[key] = [s[key] if s else None for s in formatted_yelp_data]
inspections_ffff.columns = [col.upper() for col in inspections_ffff.columns]
inspections_ffff
inspections_ffff.to_csv("../data/yelp_dohmh_agg_data.csv", encoding='utf-8')