#!/usr/bin/env python
# coding: utf-8

# # Analysis of Gas Prices in the GTA

# We obtain gas station price data for the Greater Toronto Area from yellowpages.ca and gasbuddy.com's ontariogasprices.com, and cluster the stations by location and price per litre of regular gas. We also test the price effects of having another gas station or a Starbucks location within a 100 m radius of a gas station.

# In[1]:


from bs4 import BeautifulSoup
import urllib.request
import requests
import math
import re
import pandas as pd
import numpy as np
import datetime
import sys
import time
import json
import ast
from geopy.geocoders import GoogleV3
import seaborn as sns
from time import sleep

geolocator = GoogleV3()
get_ipython().run_line_magic('matplotlib', 'inline')
today = datetime.datetime.now()


# In[2]:


# Get the names and addresses of gas stations listed in Yellow Pages in the Toronto region
def get_addresses(page_num):
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    url = 'https://www.yellowpages.ca/search/si/' + str(page_num) + '/Gas%20Stations/Toronto+ON'
    try:
        html = urllib.request.urlopen(url).read()
    except:
        return page_num
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.findAll('div', {'class': 'listing__content__wrap'})
    for n in range(len(content)):
        try:
            name = content[n].findAll('a')[0].get('title').split('-')[0]
        except:
            name = '#NO NAME'
        try:
            addr_ele = content[n].findAll('span', {'class': 'listing__address--full'})[0].findAll('span')
            addr = ','.join(e.string for e in addr_ele)
        except:
            addr = '#NO ADDRESS#'
        gas_dict[addr] = {'name': name}
    with open('gas_dict.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
    return page_num + 1


def address_main():
    # If the file doesn't exist, create it
    try:
        with open('gas_dict.json', 'r') as infile:
            gas_dict = json.load(infile)
    except:
        with open('gas_dict.json', 'w') as outfile:
            gas_dict = {}
            json.dump(gas_dict, outfile)
    page_num = 1
    while page_num < 22:
        print('Fetching page', page_num)
        page_num = get_addresses(page_num)
        sleep(np.random.random() * 2 + 2)


# In[3]:


# Get rid of incomplete or empty addresses
def clean_addresses():
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    gas_dict.pop('#NO ADDRESS#', None)
    gas_dict.pop('ON', None)
    gas_dict.pop('12001 Hwy 400,Maple,ON,L7B 1A8', None)  # This entry is both redundant and the wrong city ...
    with open('gas_dict.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
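# As a quick sanity check (a sketch added here, not part of the original run), we can peek at what the scraper writes: gas_dict.json maps an address string to a dict with a 'name' key. This assumes address_main() and clean_addresses() have already been run.

# In[ ]:


with open('gas_dict.json', 'r') as infile:
    gas_dict = json.load(infile)
print(len(gas_dict), 'addresses scraped')
for addr in list(gas_dict)[:3]:
    # Show a few sample entries mapping address -> {'name': ...}
    print(addr, '->', gas_dict[addr])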
# In[4]:


# For ontariogasprices.com scraping, we need a list of postal codes to search with.
# The site focuses on finding the lowest price in an area and does not prioritize specific addresses,
# but this works to our advantage since we can potentially find stations not included in the yellowpages data.
# Gas types are
# A: regular, B: mid-grade, C: premium, D: diesel
def get_area_prices(gas_type, postal_code):
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
        gas_stations = json.load(infile)
    url = 'http://www.ontariogasprices.com/GasPriceSearch.aspx?fuel=' + gas_type + '&qsrch=' + postal_code[0] + '%20' + postal_code[1]
    print('Fetching from', url)
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    if len(soup.findAll('tbody')) > 0:
        data_rows = soup.findAll('tbody')[0].findAll('tr')
        for i in range(len(data_rows)):
            try:
                name = data_rows[i].findAll('td')[0].find('a').string
                st_addr = data_rows[i].findAll('td')[0].find('dd').string.split('&')[0].split('near')[0]
                city = data_rows[i].findAll('td')[2].find('a').string
                addr = st_addr + ', ' + city + ', Ontario'
                price = float(data_rows[i].find('a').string)
                if addr not in gas_stations:
                    gas_stations[addr] = {'name': name}
                gas_stations[addr]['type_' + gas_type] = price
            except:
                # The "other stations in the area" row can just be ignored
                pass
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
        json.dump(gas_stations, outfile)


def price_main():
    # First get postal codes
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    try:
        with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
            gas_stations = json.load(infile)
    except:
        with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
            gas_stations = {}
            json.dump(gas_stations, outfile)
    postal_list = []
    for k, v in gas_dict.items():
        postal_list.append(k.split(',')[-1].split())
    # If an entry has no postal code, just ignore it for now ...
    postal_list = [p for p in postal_list if len(p) == 2]
    break_point = 0
    while break_point < len(postal_list):
        postal_code = postal_list[break_point]
        try:
            for gas_type in list('ABCD'):
                get_area_prices(gas_type, postal_code)
                sleep(np.random.random() * 2 + 0.5)
            break_point += 1
        except:
            # Try again
            print('Trying again ...')
            sleep(np.random.random() * 2 + 2)
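# To make the request format concrete, here is an illustrative sketch of the search URL that get_area_prices builds, using a hypothetical postal code split into its two halves. No request is made; we only print the URL.

# In[ ]:


sample_postal = ['M5S', '2V1']  # hypothetical postal code, already split in two
sample_url = ('http://www.ontariogasprices.com/GasPriceSearch.aspx?fuel=A'
              '&qsrch=' + sample_postal[0] + '%20' + sample_postal[1])
print(sample_url)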
# In[5]:


# Get not only the coordinates but also the Google Maps address, which will likely clean up any messes left over
# from the ontariogasprices.com splits
def get_coord(addr):
    # Splitting Toronto addresses into "Toronto - [area]" confuses Google Maps, so get rid of that when querying
    if 'Toronto' in addr.split(',')[1]:
        addr = addr.split(',')[0] + ', Toronto, ' + addr.split(',')[2]
    print('Getting', addr)
    geoloc = geolocator.geocode(addr)
    sleep(np.random.random() * 2 + 2)
    print('Got ', geoloc, '\n')
    return geoloc


def coords_main():
    # Start at the beginning of gas_dict
    break_point = 0
    # Open gas_stations, and get the list of all the addresses
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
        gas_dict = json.load(infile)
    addr_list = list(gas_dict)
    # Loop while there are still addresses to process
    while break_point < len(addr_list):
        # Run get_coord on the address at the current break_point; on failure, retry the same address
        addr = addr_list[break_point]
        try:
            geoloc = get_coord(addr)
            if geoloc is None:
                # GoogleV3 can't get a hold of it. Need to look at it later. Most likely it's a highway.
                break_point += 1
                sleep(np.random.random() * 2 + 2)
            else:
                gas_dict[addr]['address'] = geoloc.address
                gas_dict[addr]['longitude'] = geoloc.longitude
                gas_dict[addr]['latitude'] = geoloc.latitude
                # If it works, move on to the next one. Once break_point reaches the length of the list,
                # the loop condition is no longer satisfied, and the loop ends.
                break_point += 1
                with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
                    json.dump(gas_dict, outfile)
                sleep(np.random.random() * 2 + 2)
        except:
            # If it doesn't work, don't increase break_point and try again
            print('Could not get ', addr)
            sleep(np.random.random() * 2 + 2)
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
    #return gas_dict


# In[6]:


def xy_distance(xlat, xlong, ylat, ylong):
    # Great-circle distance via the spherical law of cosines.
    # The radius of the planet is approx. 6371.01 km
    xlat = math.radians(xlat)
    xlong = math.radians(xlong)
    ylat = math.radians(ylat)
    ylong = math.radians(ylong)
    cos_angle = math.sin(xlat) * math.sin(ylat) + math.cos(xlat) * math.cos(ylat) * math.cos(xlong - ylong)
    # Clamp to [-1, 1] so floating-point rounding on near-identical points can't push acos out of its domain
    dist = 6371.01 * math.acos(max(-1.0, min(1.0, cos_angle)))
    return dist


def xy_closest(coords, x):
    # coords is an iterable of (longitude, latitude) pairs; x is a single (longitude, latitude) pair
    return min(coords, key=lambda y: xy_distance(x[1], x[0], y[1], y[0]))
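# A quick sanity check on xy_distance (a sketch, not in the original run): the great-circle distance between two downtown Toronto landmarks, using approximate coordinates, should come out to roughly 0.6 km.

# In[ ]:


# Union Station (~43.6453, -79.3806) to the CN Tower (~43.6426, -79.3871); coordinates are approximate
print(xy_distance(43.6453, -79.3806, 43.6426, -79.3871))  # roughly 0.6 km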
# In[7]:


def enter_address(df, addr, max_dist):
    standard_addr = get_coord(addr)
    within_range = pd.DataFrame()
    for i in range(len(df)):
        dist = xy_distance(standard_addr.latitude, standard_addr.longitude,
                           df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        if dist <= max_dist:
            within_range = pd.concat((within_range, df.iloc[i]), axis=1)
    within_range = within_range.transpose().sort_values('type_A')
    # The entered address is passed along to folmap so it can be shown on the map too
    print('Gas stations within range: ')
    print(within_range[['name', 'type_A']])
    folmap(within_range, 15, custom=1, extralat=standard_addr.latitude, extralong=standard_addr.longitude)


# In[8]:


def folmap(df, zoom=10, custom=None, extralat=None, extralong=None):
    k = 7
    colorscale = branca.colormap.linear.YlGnBu.scale(0, 60)
    cluster_colors = ['red', 'green', 'blue', 'yellow', 'cadetblue', 'purple', 'black']
    if custom == 1:
        m = folium.Map(
            location=[extralat, extralong],
            zoom_start=zoom
        )
    else:
        # Center the map on the median coordinates
        m = folium.Map(
            location=[np.median(df.latitude), np.median(df.longitude)],
            zoom_start=zoom
        )
    marker_cluster = MarkerCluster().add_to(m)
    for j in range(k):
        for r in df.iterrows():
            if r[1]['label'] == j:
                pop = ["Price: ", str(r[1]['type_A'])]
                folium.Marker(
                    location=[r[1]['latitude'], r[1]['longitude']],
                    popup=''.join(pop),
                    icon=folium.Icon(color=cluster_colors[j], icon='ok-sign'),
                ).add_to(marker_cluster)
    if custom == 1:
        folium.Marker(location=[extralat, extralong],
                      popup='You are here',
                      icon=folium.Icon(color='pink', icon='cloud')).add_to(m)
    display(m)


# Run the four main functions to get a JSON file of addresses and prices, and a list of coordinates.

# In[ ]:


#address_main()


# In[ ]:


#clean_addresses()


# In[ ]:


#price_main()


# In[ ]:


#coords_main()


# In[9]:


with open('gas_stations_2017-12-13.json', 'r') as infile:
    gas_stations = json.load(infile)
len(gas_stations)
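# Not every address ends up with a price attached (the yellowpages and ontariogasprices records don't overlap perfectly), so as an added check we can count how many loaded stations carry a regular-gas (type_A) price.

# In[ ]:


sum(1 for v in gas_stations.values() if 'type_A' in v)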
# ### Clean up
#
# For some reason, there's a station near Vaughan that only registers in Google Maps if we input it as King City. There are also a few duplicate stations listed under different names or address variants, so we get rid of those.

# In[10]:


# Special treatment for 12001 Hwy 400 NB: gasbuddy lists it in Vaughan, but Google recognizes it as being in King City
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']


# In[11]:


geoloc = geolocator.geocode('12001 Hwy 400 NB , King City, Ontario')


# In[12]:


gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['address'] = geoloc.address
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['longitude'] = geoloc.longitude
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['latitude'] = geoloc.latitude


# In[13]:


gas_df = pd.DataFrame.from_dict(gas_stations, orient='index')


# In[14]:


gas_df.head()


# In[15]:


gas_df[gas_df['longitude'] == -79.347831]


# In[16]:


gas_df[gas_df['longitude'] == -79.7736453]


# In[17]:


# Get rid of redundancies
gas_df.drop_duplicates(subset=['longitude', 'latitude'], inplace=True)


# In[18]:


gas_df[gas_df['longitude'] == -79.347831]


# In[19]:


gas_df[gas_df['longitude'] == -79.7736453]


# ### Mean prices by brand
# The highest-priced entries may be unreliable: they are not major chains, or their stations may be out of the way, so user-reported data on their prices is scarce. Not surprisingly, the cheapest brand is Costco. It's a members-only service and in some sense doesn't really count; its prices are so low that in the k-means clustering below, the price discrepancy largely overpowers any distance effects.

# In[20]:


gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=False)[:8]


# In[21]:


gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=True)[:8]


# In[22]:


gas_df.iloc[:5]


# In[23]:


gas_df.to_csv('gas_stations_2017-12-13.csv')


# In[24]:


with open('gas_stations_2017-12-13.json', 'w') as outfile:
    json.dump(gas_stations, outfile)


# In[25]:


coords_df = gas_df[['longitude', 'latitude']].copy()


# In[26]:


coords_df.head()


# In[27]:


coords_df['long_rad'] = coords_df['longitude'].apply(math.radians)
coords_df['lat_rad'] = coords_df['latitude'].apply(math.radians)


# In[28]:


coords_df.head()


# In[29]:


coords_df['combined'] = list(zip(coords_df['longitude'], coords_df['latitude']))


# In[30]:


coords_df.groupby('combined').count().sort_values('longitude', ascending=False)[:5]


# In[31]:


gas_df[coords_df['combined'] == (-79.7736453, 43.7408885)]


# ### Checking for gas stations with competitors within 100 m
#
# While competition pushes prices down, especially when there is almost 100% price transparency (all gas stations advertise their regular gas prices prominently), multiple gas stations are usually found at very busy intersections. In that case, can the extra demand counteract the competitive effect? We can check.

# In[32]:


within_100 = []
for i in range(len(coords_df)):
    closest_coord = xy_closest(coords_df['combined'][coords_df.index != coords_df.index[i]],
                               coords_df.iloc[i]['combined'])
    # combined is (longitude, latitude), and xy_distance takes latitudes first
    within_100.append(xy_distance(coords_df.iloc[i]['combined'][1], coords_df.iloc[i]['combined'][0],
                                  closest_coord[1], closest_coord[0]) <= 0.1)


# In[33]:


sum(within_100)


# In[34]:


coords_df['within_100'] = np.asarray(within_100).astype(int)


# In[35]:


coords_df.head()


# In[36]:


gas_df['within_100'] = coords_df['within_100']


# In[37]:


gas_df.head()
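# As a quick follow-up (added as a sketch), the share of stations flagged as having a competitor within 100 m:

# In[ ]:


gas_df['within_100'].mean()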
# For general clustering, use longitude and latitude. For gasoline-specific clustering, also use the price of the specific type of gas; below we do this with k-means for regular gas (type_A).

# In[38]:


cluster_df = gas_df[['name', 'type_A', 'longitude', 'latitude']].copy()


# In[39]:


cluster_df.head()


# ### Exploring the Starbucks Metric
# Use Starbucks' location research to our benefit. As with station-to-station proximities, we use the "as the crow flies" distance.
#
# Starbucks location data from https://opendata.socrata.com/Business/All-Starbucks-Locations-in-the-World/xy4y-c4mk

# In[40]:


starbucks_df = pd.read_csv('All_Starbucks_Locations_in_the_World.csv', delimiter=';')
starbucks_df = starbucks_df[(starbucks_df['Country'] == 'CA') & (starbucks_df['Country Subdivision'] == 'ON')]
starbucks_df.reset_index(inplace=True)
starbucks_df.head()


# In[41]:


starbucks_df.info()


# In[42]:


# Actually, we only need the coordinates.
starbucks_on_coords = starbucks_df['Coordinates'].tolist()


# In[43]:


# Convert from a list of strings that look like Python tuples to an actual list of tuples.
# Also change the order from (lat, long) to (long, lat)
starbucks_on_coords = [ast.literal_eval(c) for c in starbucks_on_coords]
starbucks_on_coords = [(c[1], c[0]) for c in starbucks_on_coords]


# In[44]:


nearest_starbucks = []
for i in range(len(coords_df)):
    closest_coord = xy_closest(starbucks_on_coords, coords_df.iloc[i]['combined'])
    nearest_starbucks.append(xy_distance(coords_df.iloc[i]['combined'][1], coords_df.iloc[i]['combined'][0],
                                         closest_coord[1], closest_coord[0]))


# In[45]:


# Reuse the distances computed above; within 100 m means within 0.1 km
starbucks_within_100m = [d <= 0.1 for d in nearest_starbucks]


# In[46]:


gas_df['nearest_starbucks'] = np.asarray(nearest_starbucks)
gas_df['starbucks_within_100m'] = np.asarray(starbucks_within_100m).astype(int)


# In[47]:


gas_df.head()
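# Before building a metric on top of these distances, it helps to see their scale. A quick illustrative summary of the distance (in km) from each station to its nearest Starbucks:

# In[ ]:


gas_df['nearest_starbucks'].describe()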
# Let's invent a Starbucks proximity metric: a sigmoid applied to the log of $(2/d)^2$, where $d$ is the distance in km to the nearest Starbucks:
# $\frac{1}{1 + \exp\left(-\log\left((2/d)^2\right)\right)}$
# which simplifies to $\frac{4}{4 + d^2}$. (This is not actually used below, but keep it here just in case.)

# In[48]:


gas_df['starbucks_metric'] = gas_df['nearest_starbucks'].apply(lambda d: 1 / (1 + np.exp(-np.log((2 / d) ** 2))))


# In[49]:


gas_df.head()


# In[50]:


gas_df.to_csv('gas_stations_2017-12-13.csv')


# In[51]:


regular_gas = gas_df[['type_A', 'within_100', 'starbucks_within_100m']].dropna().copy()


# In[52]:


regular_gas.columns = ['Price', 'Competition_Nearby', 'Starbucks_Nearby']


# In[53]:


regular_gas['Relative_Price'] = regular_gas['Price'] / regular_gas['Price'].mean()
regular_gas['Price_Difference'] = regular_gas['Price'] - regular_gas['Price'].mean()


# In[54]:


regular_gas.head()


# In[55]:


import statsmodels.api as sm


# In[56]:


Y = regular_gas['Price']
X = sm.add_constant(regular_gas[['Competition_Nearby', 'Starbucks_Nearby']])
lm = sm.OLS(Y, X)


# In[57]:


results = lm.fit()


# In[58]:


results.params


# In[59]:


results.summary()


# Gas stations within 100 m of a Starbucks location charge 0.62 cents more per litre for regular gas, and this is statistically significant at over 99% confidence.
#
# Proximity to competition also seems to increase prices, likely due to increased demand at busy intersections (as mentioned above). However, the p-value is far too high for us to attribute any effect with confidence.

# ### Clustering and Mapping
#
# Let's see if we can find any meaningful clusters.

# In[60]:


cluster_df.dropna(inplace=True)


# In[61]:


cluster_df.info()


# In[62]:


cluster_df['type_A_norm'] = cluster_df['type_A'] / np.mean(cluster_df['type_A'])
# The leading minus keeps east > west after dividing by the (negative) mean longitude
cluster_df['longitude_norm'] = -cluster_df['longitude'] / np.mean(cluster_df['longitude'])
cluster_df['latitude_norm'] = cluster_df['latitude'] / np.mean(cluster_df['latitude'])


# In[63]:


cluster_df.head()


# In[64]:


from scipy import linalg
from sklearn.cluster import KMeans
from sklearn import svm
from scipy.spatial.distance import cdist
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')


# In[65]:


SSE = []
coords = list(cluster_df[['type_A_norm', 'longitude_norm', 'latitude_norm']].values)
for k in range(1, 13):
    km = KMeans(n_clusters=k)
    km.fit(coords)
    labels = km.labels_
    centroids = km.cluster_centers_
    # Mean distance to the nearest centroid, an elbow-curve proxy for the SSE
    SSE.append(sum(np.min(cdist(coords, centroids, 'euclidean'), axis=1)) / len(coords))
plt.plot(range(1, 13), SSE, 'bo--')
plt.xlabel('k clusters')
plt.ylabel('SSE')
plt.title('SSE vs k')


# 4 seems like a decent choice, but interestingly so does 7. Let's try 7.

# In[66]:


k = 7
km = KMeans(n_clusters=k)
km.fit(coords)
labels = km.labels_
centroids = km.cluster_centers_
colors = ['r', 'g', 'b', 'y', 'c', 'm', 'k']
plt.figure(figsize=(8, 8))
plt.title('Clustered Locations')
for j in range(k):
    current_coords = np.asarray(coords)[np.where(labels == j)]
    j_longs = current_coords[:, 1]
    j_lats = current_coords[:, 2]
    plt.plot(j_longs, j_lats, '%so' % colors[j], label=j)
plt.legend()


# 7 is interesting indeed. Mississauga and Toronto are separated, and all but one of the Costco stations are grouped together regardless of distance.

# In[67]:


cluster_df['label'] = km.labels_


# In[68]:


cluster_df.groupby('label')['type_A'].describe().sort_values('mean', ascending=False)['mean']


# In[74]:


cluster_df[cluster_df['label'] == 5]
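# As an added sanity check on the clustering (a sketch, not in the original run), we can look at cluster sizes alongside mean regular-gas prices; a small cluster with an outlying mean is likely the Costco group mentioned above.

# In[ ]:


cluster_df.groupby('label')['type_A'].agg(['size', 'mean'])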
# ### Folium
#
# Let's put it on a Toronto map with Folium.
#
# Then run our function for getting the nearest gas stations within a specified distance in kilometers.

# In[70]:


import branca
import folium
from folium.plugins import MarkerCluster


# In[71]:


cluster_df.head()


# In[72]:


folmap(cluster_df, zoom=10)


# Enter the address and distance threshold to see nearby stations.

# In[73]:


enter_address(cluster_df, '80 Bloor St W, Toronto, ON M5S 2V1', 2)
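# The same lookup works for any address and radius; for example (left commented out, like the main functions above), a wider 5 km search around the same corner:

# In[ ]:


#enter_address(cluster_df, '80 Bloor St W, Toronto, ON M5S 2V1', 5)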