import pandas as pd
import numpy as np
import json
import csv
import seaborn as sns
from glob import glob
# Root directory of the local Yelp data (Windows path, trailing separator
# included so that names can be appended directly).
PATH = "d:\\data\\yelpdata\\"

# Sub-directory holding the JSON dumps that are read, and the CSVs that are
# written, by the rest of this script.
DATA = f"{PATH}dataset\\"

# Sibling "photos" sub-directory.
PHOT = f"{PATH}photos\\"
# Column names given to the values parsed from business.json, in order.
business_cols = [
    'business_id', 'name', 'neighborhood', 'address', 'city', 'state',
    'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
    'is_open', 'attributes', 'categories', 'hours',
]

# business.json is read as line-delimited JSON: one object per line, one row
# per object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``business_cols`` -- confirm against the dataset.
with open(f'{DATA}business.json', encoding='utf-8') as business_file:
    business_rows = [list(json.loads(line).values()) for line in business_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
business_df = pd.DataFrame(business_rows, columns=business_cols)
del business_rows  # the DataFrame holds the data now

# Keep only businesses whose state is 'ON' and renumber the rows from 0.
business_on = business_df[business_df['state'] == 'ON'].copy()
business_on.reset_index(drop=True, inplace=True)
business_on.to_csv(f'{DATA}business_on.csv')

# Business IDs of the Ontario subset, used to filter the other dataset files.
ontario_business_ids = set(business_on['business_id'])

# Release the full table; only the Ontario subset is used from here on.
del business_df
# Each review includes its date, which is great, because it lets us bring in
# the weather data for that day. We can then look at the effects of weather on
# both individual and aggregate user review scores. A simple hypothesis we can
# test is that people tend to give better reviews in "good" weather. It would
# be even better if we had the exact time of day (which I'm sure Yelp has),
# but we can't have everything.
# Column names given to the values parsed from review.json, in order.
review_cols = [
    'review_id', 'user_id', 'business_id', 'stars', 'date', 'text',
    'useful', 'funny', 'cool',
]

# review.json is read as line-delimited JSON: one object per line, one row per
# object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``review_cols`` -- confirm against the dataset.
with open(f'{DATA}review.json', encoding='utf-8') as review_file:
    review_rows = [list(json.loads(line).values()) for line in review_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
review_df = pd.DataFrame(review_rows, columns=review_cols)
del review_rows  # the DataFrame holds the data now
# Keep only the reviews of Ontario/GTA businesses.
# Select the reviews whose business is in the Ontario ID set and renumber the
# rows from 0.
review_on = review_df.loc[review_df['business_id'].isin(ontario_business_ids)].copy()
review_on = review_on.reset_index(drop=True)

# Put every review on a single line and strip double quotes and backslashes,
# the same characters passed to to_csv below as quote and escape characters.
review_on['text'] = review_on['text'].map(
    lambda text: text.replace('\n', ' ').replace('"', '').replace('\\', '')
)
review_on.to_csv(f'{DATA}review_on.csv', quotechar='"', escapechar="\\")

# IDs of every user who wrote at least one of the retained reviews.
ontario_user_ids = set(review_on['user_id'])

# Release the full review table; only the Ontario subset is used afterwards.
del review_df
# Column names given to the values parsed from user.json, in order.
user_cols = [
    'user_id', 'name', 'review_count', 'yelping_since', 'friends', 'useful',
    'funny', 'cool', 'fans', 'elite', 'average_stars', 'compliment_hot',
    'compliment_more', 'compliment_profile', 'compliment_cute',
    'compliment_list', 'compliment_note', 'compliment_plain',
    'compliment_cool', 'compliment_funny', 'compliment_writer',
    'compliment_photos',
]

# user.json is read as line-delimited JSON: one object per line, one row per
# object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``user_cols`` -- confirm against the dataset.
with open(f'{DATA}user.json', encoding='utf-8') as user_file:
    user_rows = [list(json.loads(line).values()) for line in user_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
user_df = pd.DataFrame(user_rows, columns=user_cols)
del user_rows  # the DataFrame holds the data now

# Keep only users whose ID is in ``ontario_user_ids`` and renumber from 0.
user_on = user_df[user_df['user_id'].isin(ontario_user_ids)].copy()
user_on.reset_index(drop=True, inplace=True)
user_on.to_csv(f'{DATA}user_on.csv')

# Release the full user table; only the filtered subset is used afterwards.
del user_df
# Column names given to the values parsed from photos.json, in order.
photos_cols = ['caption', 'photo_id', 'business_id', 'label']

# photos.json is read as line-delimited JSON: one object per line, one row per
# object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``photos_cols`` -- confirm against the dataset.
with open(f'{DATA}photos.json', encoding='utf-8') as photos_file:
    photos_rows = [list(json.loads(line).values()) for line in photos_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
photos_df = pd.DataFrame(photos_rows, columns=photos_cols)
del photos_rows  # the DataFrame holds the data now

# Keep only photos attached to an Ontario business and renumber from 0.
photos_on = photos_df[photos_df['business_id'].isin(ontario_business_ids)].copy()
photos_on.reset_index(drop=True, inplace=True)
photos_on.to_csv(f'{DATA}photos_on.csv')

# Release the full photo table; only the Ontario subset was needed.
del photos_df
# Column names given to the values parsed from tip.json, in order.
tip_cols = ['text', 'date', 'likes', 'business_id', 'user_id']

# tip.json is read as line-delimited JSON: one object per line, one row per
# object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``tip_cols`` -- confirm against the dataset.
with open(f'{DATA}tip.json', encoding='utf-8') as tip_file:
    tip_rows = [list(json.loads(line).values()) for line in tip_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
tip_df = pd.DataFrame(tip_rows, columns=tip_cols)
del tip_rows  # the DataFrame holds the data now

# Keep only tips left for an Ontario business and renumber from 0.
tip_on = tip_df[tip_df['business_id'].isin(ontario_business_ids)].copy()
tip_on.reset_index(drop=True, inplace=True)
tip_on.to_csv(f'{DATA}tip_on.csv')

# Release the full tip table; only the Ontario subset was needed.
del tip_df
# Column names given to the values parsed from checkin.json, in order.
checkin_cols = ['time', 'business_id']

# checkin.json is read as line-delimited JSON: one object per line, one row
# per object.  The ``with`` block guarantees the file handle is closed once
# parsing finishes (or fails).
# NOTE(review): values are taken positionally, so every record must list its
# keys in the same order as ``checkin_cols`` -- confirm against the dataset.
with open(f'{DATA}checkin.json', encoding='utf-8') as checkin_file:
    checkin_rows = [list(json.loads(line).values()) for line in checkin_file]

# Passing ``columns`` here (rather than assigning them afterwards) also yields
# a correctly-labelled empty frame when the input file has no records.
checkin_df = pd.DataFrame(checkin_rows, columns=checkin_cols)
del checkin_rows  # the DataFrame holds the data now

# Keep only check-ins recorded for an Ontario business and renumber from 0.
checkin_on = checkin_df[checkin_df['business_id'].isin(ontario_business_ids)].copy()
checkin_on.reset_index(drop=True, inplace=True)
checkin_on.to_csv(f'{DATA}checkin_on.csv')

# Release the full check-in table; only the Ontario subset was needed.
del checkin_df
# Make a separate file containing only the users who have friends on Yelp.
# Reload the Ontario users from disk and keep only the rows whose 'friends'
# value is not the literal string '[]'.
friends = pd.read_csv(f'{DATA}user_on.csv')
friends = friends.loc[friends['friends'] != '[]']
friends = friends.reset_index(drop=True)

# 'Unnamed: 0' is the index column that user_on.csv was written with; drop it
# so that it is not duplicated when the frame is written out again.
friends = friends.drop(columns='Unnamed: 0')
friends.to_csv(f'{DATA}user_on_friends.csv')
# Write a slimmed-down copy of the Ontario reviews that keeps only the ID
# columns and the star rating.
smallrev = pd.read_csv(f'{DATA}review_on.csv').loc[
    :, ['review_id', 'user_id', 'business_id', 'stars']
]
smallrev.to_csv(f'{DATA}review_on_small.csv')