We obtain gas station price data for the Greater Toronto Area with data from yellowpages.ca and gasbuddy.com and cluster them based on location and price per litre of regular gas. We also test the price effects of having another gas station or Starbucks location within a 100m radius of a gas station.
from bs4 import BeautifulSoup
import urllib
import requests
import math
import re
import pandas as pd
import numpy as np
import datetime
import sys
import time
import json
import ast
from geopy.geocoders import GoogleV3
import seaborn as sns
from time import sleep
geolocator = GoogleV3()
%matplotlib inline
today = datetime.datetime.now()
# Get the names and addressess of gas stations listed in Yellow Pages in the Toronto region
def get_addresses (page_num):
with open('gas_dict.json', 'r') as infile:
gas_dict = json.load(infile)
url = 'https://www.yellowpages.ca/search/si/'+str(page_num)+'/Gas%20Stations/Toronto+ON'
try:
html = urllib.request.urlopen(url).read()
except:
return page_num
soup = BeautifulSoup(html, 'html.parser')
content = soup.findAll('div',{'class':'listing__content__wrap'})
for n in range(len(content)):
try:
name = content[n].findAll('a')[0].get('title').split('-')[0]
except:
name = '#NO NAME'
try:
addr_ele = content[n].findAll('span',{'class':'listing__address--full'})[0].findAll('span')
addr = ','.join([addr_ele[i].string for i,e in enumerate(addr_ele)])
except:
addr = '#NO ADDRESS#'
gas_dict[addr] = {'name':name}
with open('gas_dict.json', 'w') as outfile:
json.dump(gas_dict,outfile)
return page_num + 1
def address_main():
# If file doesn't exist, create it
try:
with open('gas_dict.json', 'r') as infile:
gas_dict = json.load(infile)
except:
with open('gas_dict.json', 'w') as outfile:
gas_dict={}
json.dump(gas_dict,outfile)
page_num = 1
while page_num < 22:
print('Fetching page ',page_num)
page_num = get_addresses(page_num)
sleep(np.random.random(1)*2+2)
# Get rid of incomplete or empty addresses
def clean_addresses():
with open('gas_dict.json', 'r') as infile:
gas_dict = json.load(infile)
gas_dict.pop('#NO ADDRESS#',None)
gas_dict.pop('ON',None)
gas_dict.pop('12001 Hwy 400,Maple,ON,L7B 1A8',None) # This is actually both redundant and the wrong city ...
with open('gas_dict.json', 'w') as outfile:
json.dump(gas_dict,outfile)
# for ontariogasprices.com scraping, we need a list of postal codes to search with
# the site focuses on finding the lowest price in an area and does not prioritize towards specific addresses
# but this works to our advantage since we can potentially find stations not included in the yellowpages data
# gas types are
# A: regular, B: mid, C: premium, D: diesel
def get_area_prices(gas_type, postal_code):
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json', 'r') as infile:
gas_stations = json.load(infile)
url = 'http://www.ontariogasprices.com/GasPriceSearch.aspx?fuel='+gas_type+'&qsrch='+postal_code[0]+'%20'+postal_code[1]
print ('Fetching from ', url)
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
if len(soup.findAll('tbody')) == 0:
pass
else:
data_rows = soup.findAll('tbody')[0].findAll('tr')
for i in range(len(data_rows)):
try:
name = data_rows[i].findAll('td')[0].find('a').string
st_addr = data_rows[i].findAll('td')[0].find('dd').string.split('&')[0].split('near')[0]
city = data_rows[i].findAll('td')[2].find('a').string
addr = st_addr + ', '+ city + ', Ontario'
price = float(data_rows[i].find('a').string)
if addr not in gas_stations:
gas_stations[addr] = {'name':name}
else:
pass
gas_stations[addr]['type_'+gas_type] = price
except:
# The "other stations in the area" row can just be ignored
pass
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json', 'w') as outfile:
json.dump(gas_stations,outfile)
def price_main():
# first get postal codes
with open('gas_dict.json', 'r') as infile:
gas_dict = json.load(infile)
try:
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json', 'r') as infile:
gas_stations = json.load(infile)
except:
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json', 'w') as outfile:
gas_stations={}
json.dump(gas_stations,outfile)
postal_list=[]
for k,v in gas_dict.items():
postal_list.append(k.split(',')[-1].split())
# If no postal code, just ignore for now...
postal_list = [postal_list[i] for i,p in enumerate(postal_list) if len(p) == 2]
break_point = 0
while (break_point < len(postal_list)):
postal_code = postal_list[break_point]
try:
for gas_type in list('ABCD'):
get_area_prices(gas_type, postal_code)
sleep(np.random.random(1)*2+0.5)
break_point += 1
except:
# Try again
print('Trying again ...')
sleep(np.random.random(1)*2+2)
# Get not only the coordinates but also the Google Maps address, which will likely clean up any messes left over
# from the ontariogasprices.com splits
def get_coord(addr):
# splitting Toronto addresses into "Toronto - [area]" messes up google maps, so get rid of that when querying
if 'Toronto' in addr.split(',')[1]:
addr = addr.split(',')[0] + ', Toronto, '+addr.split(',')[2]
print ('Getting' , addr)
geoloc = geolocator.geocode(addr)
sleep(np.random.random(1)*2+2)
print ('Got ', geoloc, '\n')
return geoloc
def coords_main():
# Start at the beginning of gas_dict
break_point = 0
# open gas_stations, and get the list of all the addresses
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json', 'r') as infile:
gas_dict = json.load(infile)
addr_list = [k for k,v in gas_dict.items()]
# do while there are still addresses to process
while(break_point < len(addr_list)):
# run collect_coords starting with the current breakpoint, and gets the
# address that it's stuck on
addr = addr_list[break_point]
try:
geoloc = get_coord(addr)
if geoloc == None:
# GoogleV3 can't get a hold of it. Need to look at it later. Most likely it's a highway.
break_point += 1
sleep(np.random.random(1)*2+2)
else:
gas_dict[addr]['address'] = geoloc.address
gas_dict[addr]['longitude'] = geoloc.longitude
gas_dict[addr]['latitude'] = geoloc.latitude
# If it works, get to the next one. Once break_point gets to the length of the list, the conditions of
# the loop will no longer be satisfied, and the loop ends.
break_point +=1
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json','w') as outfile:
json.dump(gas_dict,outfile)
sleep(np.random.random(1)*2+2)
except:
# If it doesn't work, don't increase the break_point and try again
print ('Could not get ', addr)
sleep(np.random.random(1)*2+2)
with open('gas_stations_'+datetime.datetime.now().strftime('%Y-%m-%d')+'.json','w') as outfile:
json.dump(gas_dict,outfile)
#return gas_dict
def xy_distance(xlat,xlong,ylat,ylong):
# The radius of the planet is approx. 6371.01 km
xlat = math.radians(xlat)
xlong = math.radians(xlong)
ylat = math.radians(ylat)
ylong = math.radians(ylong)
dist = 6371.01 * math.acos(math.sin(xlat)*math.sin(ylat) + math.cos(xlat)*math.cos(ylat)*math.cos(xlong - ylong))
return dist
def xy_closest(coords, x):
return (min(coords, key=lambda y: xy_distance(x[1],x[0],y[1],y[0])))
def enter_address(df, addr, max_dist):
standard_addr = get_coord(addr)
within_range = pd.DataFrame()
for i in range(len(df)):
dist = xy_distance(standard_addr.latitude,standard_addr.longitude,df.iloc[i]['latitude'],df.iloc[i]['longitude'])
if dist <= max_dist:
within_range = pd.concat((within_range,df.iloc[i]),axis=1)
within_range = within_range.transpose().sort_values('type_A')
# Hide the entered address in the dataframe too so we can show it on the map
print ('Gas stations within range: ')
print (within_range[['name','type_A']])
folmap(within_range,15,custom=1,extralat=standard_addr.latitude,extralong=standard_addr.longitude)
def folmap (df,zoom=10,custom=None,extralat=None,extralong=None):
k=7
colorscale = branca.colormap.linear.YlGnBu.scale(0, 60)
cluster_colors = ['red', 'green', 'blue', 'yellow', 'cadetblue', 'purple', 'black']
if custom == 1:
m = folium.Map(
location=[extralat, extralong],
zoom_start=zoom
)
else:
# center the map on the median coordinates
m = folium.Map(
location=[np.median(df.latitude), np.median(df.longitude)],
zoom_start=zoom
)
marker_cluster = MarkerCluster().add_to(m)
for j in range(k):
for r in df[:-1].iterrows():
if r[1]['label'] == j:
pop = ["Price: ",str(r[1]['type_A'])]
folium.Marker(
location = [r[1]['latitude'],r[1]['longitude']],
popup=''.join(pop),
icon=folium.Icon(color=cluster_colors[j], icon='ok-sign'),
).add_to(marker_cluster)
if custom == 1:
folium.Marker(location = [extralat,extralong],
popup = 'You are here',
icon=folium.Icon(color='pink', icon='cloud')).add_to(m)
display(m)
Run the 4 main functions to get a json file of addresses and prices, and a list of coordinates.
#address_main()
#clean_addresses()
#price_main()
#coords_main()
with open('gas_stations_2017-12-13.json', 'r') as infile:
gas_stations = json.load(infile)
len(gas_stations)
712
For some reason, there's a station near Vaughn that only registers in Google Maps if we input it as King City. There are also a few duplicate stations that are listed under different names or address variants, so we get rid of those.
# Special treatment for 12001 hwy 400 nb, since gasbuddy has it as in Vaughan but Google recognizes it as being in King City
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']
{'address': '12001 ON-400 #1S1, King City, ON L7B 1A8, Canada', 'latitude': 43.8950319, 'longitude': -79.5574951, 'name': 'Canadian Tire', 'type_A': 125.3, 'type_D': 119.7}
geoloc = geolocator.geocode('12001 Hwy 400 NB , King City, Ontario')
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['address'] = geoloc.address
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['longitude'] = geoloc.longitude
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['latitude'] =geoloc.latitude
gas_df = pd.DataFrame.from_dict(gas_stations, orient='index')
gas_df.head()
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | 129.6 | 132.6 | 115.9 | 12731 ON-48, Whitchurch-Stouffville, ON L4A 7X... | -79.282378 | 43.976346 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | 136.9 | 144.9 | NaN | 1610 Keele St, York, ON M6M 3V9, Canada | -79.471988 | 43.682320 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | 132.9 | 142.9 | 118.9 | 6897 Finch Ave W, Etobicoke, ON M9W 0A6, Canada | -79.617364 | 43.734632 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | 123.4 | 130.9 | 118.6 | 1 Harwood Ave S, Ajax, ON L1S 2C1, Canada | -79.025100 | 43.861027 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | NaN | NaN | 118.9 | 1 Thornhill Woods Dr, Thornhill, ON L4J 8Y2, C... | -79.463886 | 43.826660 |
gas_df[gas_df['longitude']==-79.347831]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
995 Pape Ave , East York, Ontario | Cocomile Service Centre | 115.9 | NaN | NaN | NaN | 995 Pape Ave, East York, ON M4K 3V8, Canada | -79.347831 | 43.687761 |
995 Pape Ave , Toronto - Central, Ontario | National Gas & Variety | 115.9 | NaN | NaN | NaN | 995 Pape Ave, East York, ON M4K 3V8, Canada | -79.347831 | 43.687761 |
gas_df[gas_df['longitude']==-79.7736453]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
490 Great Lakes Dr , Brampton, ON, Ontario | Shell | 118.9 | 132.9 | 140.9 | 118.9 | 490 Great Lakes Dr, Brampton, ON L6R, Canada | -79.773645 | 43.740888 |
490 Great Lakes Dr , Brampton, Ontario | Shell | 118.9 | 132.9 | 140.9 | 118.9 | 490 Great Lakes Dr, Brampton, ON L6R, Canada | -79.773645 | 43.740888 |
# Get rid of redundancies
gas_df.drop_duplicates(subset=['longitude','latitude'], inplace=True)
gas_df[gas_df['longitude']==-79.347831]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
995 Pape Ave , East York, Ontario | Cocomile Service Centre | 115.9 | NaN | NaN | NaN | 995 Pape Ave, East York, ON M4K 3V8, Canada | -79.347831 | 43.687761 |
gas_df[gas_df['longitude']==-79.7736453]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
490 Great Lakes Dr , Brampton, ON, Ontario | Shell | 118.9 | 132.9 | 140.9 | 118.9 | 490 Great Lakes Dr, Brampton, ON L6R, Canada | -79.773645 | 43.740888 |
The top prices may be unreliable, since they are not major chains or might be out of the way, and so user data on their prices is scarce. Not surprisingly, the cheapest brand is Costco. It's a members only service, and in some sense doesn't really count. It's prices are so low that in the K Means clustering below, the price discrepancy largely overpowers any distance effects.
gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=False)[:8]
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
name | ||||||||
Speedy Gas | 1.0 | 125.900000 | NaN | 125.9 | 125.9 | 125.9 | 125.9 | 125.9 |
Good Guys Gas Bar | 1.0 | 122.900000 | NaN | 122.9 | 122.9 | 122.9 | 122.9 | 122.9 |
Falcon | 1.0 | 122.900000 | NaN | 122.9 | 122.9 | 122.9 | 122.9 | 122.9 |
Superstore | 1.0 | 122.600000 | NaN | 122.6 | 122.6 | 122.6 | 122.6 | 122.6 |
Fas Gas | 2.0 | 121.900000 | 0.000000 | 121.9 | 121.9 | 121.9 | 121.9 | 121.9 |
Petro-Canada | 188.0 | 121.862766 | 2.145533 | 113.6 | 122.7 | 122.9 | 122.9 | 123.3 |
Esso | 173.0 | 121.712717 | 2.215133 | 113.2 | 121.3 | 122.9 | 122.9 | 123.9 |
Shell | 117.0 | 121.341026 | 2.771970 | 112.4 | 119.9 | 122.9 | 122.9 | 123.9 |
gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=True)[:8]
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
name | ||||||||
Costco | 7.0 | 111.471429 | 1.511858 | 110.9 | 110.90 | 110.9 | 110.9 | 114.9 |
Danforth Gas & Wash | 1.0 | 112.900000 | NaN | 112.9 | 112.90 | 112.9 | 112.9 | 112.9 |
Star Gas | 1.0 | 113.000000 | NaN | 113.0 | 113.00 | 113.0 | 113.0 | 113.0 |
Mobil | 1.0 | 113.900000 | NaN | 113.9 | 113.90 | 113.9 | 113.9 | 113.9 |
Cocomile Service Centre | 1.0 | 115.900000 | NaN | 115.9 | 115.90 | 115.9 | 115.9 | 115.9 |
OP | 1.0 | 116.500000 | NaN | 116.5 | 116.50 | 116.5 | 116.5 | 116.5 |
Santak Pump | 1.0 | 116.500000 | NaN | 116.5 | 116.50 | 116.5 | 116.5 | 116.5 |
Race Trac | 3.0 | 116.666667 | 2.926317 | 114.2 | 115.05 | 115.9 | 117.9 | 119.9 |
gas_df.iloc[:5]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | 129.6 | 132.6 | 115.9 | 12731 ON-48, Whitchurch-Stouffville, ON L4A 7X... | -79.282378 | 43.976346 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | 136.9 | 144.9 | NaN | 1610 Keele St, York, ON M6M 3V9, Canada | -79.471988 | 43.682320 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | 132.9 | 142.9 | 118.9 | 6897 Finch Ave W, Etobicoke, ON M9W 0A6, Canada | -79.617364 | 43.734632 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | 123.4 | 130.9 | 118.6 | 1 Harwood Ave S, Ajax, ON L1S 2C1, Canada | -79.025100 | 43.861027 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | NaN | NaN | 118.9 | 1 Thornhill Woods Dr, Thornhill, ON L4J 8Y2, C... | -79.463886 | 43.826660 |
gas_df.to_csv('gas_stations_2017-12-13.csv')
with open('gas_stations_2017-12-13.json','w') as outfile:
json.dump(gas_stations,outfile)
coords_df = gas_df[['longitude','latitude']].copy()
coords_df.head()
longitude | latitude | |
---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | -79.282378 | 43.976346 |
1610 Keele St , Toronto - Central, Ontario | -79.471988 | 43.682320 |
6897 Finch Ave W , Toronto - West, Ontario | -79.617364 | 43.734632 |
1 Harwood Ave S , Ajax, Ontario | -79.025100 | 43.861027 |
1 Thornhill Woods Dr , Vaughan, Ontario | -79.463886 | 43.826660 |
coords_df['long_rad'] = coords_df['longitude'].apply(lambda l:math.radians(l))
coords_df['lat_rad'] = coords_df['latitude'].apply(lambda l:math.radians(l))
coords_df.head()
longitude | latitude | long_rad | lat_rad | |
---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | -79.282378 | 43.976346 | -1.383739 | 0.767532 |
1610 Keele St , Toronto - Central, Ontario | -79.471988 | 43.682320 | -1.387048 | 0.762400 |
6897 Finch Ave W , Toronto - West, Ontario | -79.617364 | 43.734632 | -1.389585 | 0.763313 |
1 Harwood Ave S , Ajax, Ontario | -79.025100 | 43.861027 | -1.379248 | 0.765519 |
1 Thornhill Woods Dr , Vaughan, Ontario | -79.463886 | 43.826660 | -1.386906 | 0.764920 |
coords_df['combined'] = list(zip(coords_df['longitude'],coords_df['latitude']))
coords_df.groupby('combined').count().sort_values('longitude',ascending=False)[:5]
longitude | latitude | long_rad | lat_rad | |
---|---|---|---|---|
combined | ||||
(-79.877743, 43.6495942) | 1 | 1 | 1 | 1 |
(-79.4275737, 43.6709778) | 1 | 1 | 1 | 1 |
(-79.41025359999999, 43.8575578) | 1 | 1 | 1 | 1 |
(-79.4099732, 43.75732610000001) | 1 | 1 | 1 | 1 |
(-79.4098422, 43.8571536) | 1 | 1 | 1 | 1 |
gas_df[coords_df['combined']==(-79.7736453, 43.7408885)]
name | type_A | type_B | type_C | type_D | address | longitude | latitude | |
---|---|---|---|---|---|---|---|---|
490 Great Lakes Dr , Brampton, ON, Ontario | Shell | 118.9 | 132.9 | 140.9 | 118.9 | 490 Great Lakes Dr, Brampton, ON L6R, Canada | -79.773645 | 43.740888 |
While competition pushes prices down, especially when there is almost 100% price transparency (all gas stations advertise their regular gas prices prominently), multiple gas stations are usually at very busy intersections. In that case, can the extra demand counteract the competitive effect? We can check.
within_100 = []
for i in range(len(coords_df)):
closest_coord = xy_closest(coords_df['combined'][coords_df.index != coords_df.index[i]], coords_df.iloc[i]['combined'])
within_100.append(xy_distance(coords_df.iloc[i]['combined'][0],coords_df.iloc[i]['combined'][1], closest_coord[0], closest_coord[1]) <= 0.1)
sum(within_100)
133
coords_df['within_100'] = np.asarray(within_100).astype(int)
coords_df.head()
longitude | latitude | long_rad | lat_rad | combined | within_100 | |
---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | -79.282378 | 43.976346 | -1.383739 | 0.767532 | (-79.282378, 43.9763464) | 0 |
1610 Keele St , Toronto - Central, Ontario | -79.471988 | 43.682320 | -1.387048 | 0.762400 | (-79.4719879, 43.6823199) | 0 |
6897 Finch Ave W , Toronto - West, Ontario | -79.617364 | 43.734632 | -1.389585 | 0.763313 | (-79.6173637, 43.7346325) | 0 |
1 Harwood Ave S , Ajax, Ontario | -79.025100 | 43.861027 | -1.379248 | 0.765519 | (-79.0251004, 43.8610274) | 1 |
1 Thornhill Woods Dr , Vaughan, Ontario | -79.463886 | 43.826660 | -1.386906 | 0.764920 | (-79.46388639999999, 43.8266602) | 0 |
gas_df['within_100'] = coords_df['within_100']
gas_df.head()
name | type_A | type_B | type_C | type_D | address | longitude | latitude | within_100 | |
---|---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | 129.6 | 132.6 | 115.9 | 12731 ON-48, Whitchurch-Stouffville, ON L4A 7X... | -79.282378 | 43.976346 | 0 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | 136.9 | 144.9 | NaN | 1610 Keele St, York, ON M6M 3V9, Canada | -79.471988 | 43.682320 | 0 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | 132.9 | 142.9 | 118.9 | 6897 Finch Ave W, Etobicoke, ON M9W 0A6, Canada | -79.617364 | 43.734632 | 0 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | 123.4 | 130.9 | 118.6 | 1 Harwood Ave S, Ajax, ON L1S 2C1, Canada | -79.025100 | 43.861027 | 1 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | NaN | NaN | 118.9 | 1 Thornhill Woods Dr, Thornhill, ON L4J 8Y2, C... | -79.463886 | 43.826660 | 0 |
For general clustering, use longitude and latitude. For gasoline-specific clustering, also use the price of the specific type of gas. We can just use k means for general clustering, but when looking at only certain types of gas (such as regular type_A).
cluster_df = gas_df[['name','type_A','longitude','latitude']].copy()
cluster_df.head()
name | type_A | longitude | latitude | |
---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | -79.282378 | 43.976346 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | -79.471988 | 43.682320 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | -79.617364 | 43.734632 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | -79.025100 | 43.861027 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | -79.463886 | 43.826660 |
Use Starbucks' location research to our benefit. As with station proximities to other stations, we will use the "as the crow flies" distance.
Starbucks location data from https://opendata.socrata.com/Business/All-Starbucks-Locations-in-the-World/xy4y-c4mk
starbucks_df = pd.read_csv('All_Starbucks_Locations_in_the_World.csv', delimiter=';')
starbucks_df = starbucks_df[(starbucks_df['Country']=='CA') & (starbucks_df['Country Subdivision']=='ON')]
starbucks_df.reset_index(inplace=True)
starbucks_df.head()
index | Store ID | Name | Brand | Store Number | Phone Number | Ownership Type | Street Combined | Street 1 | Street 2 | ... | Country Subdivision | Country | Postal Code | Coordinates | Latitude | Longitude | Timezone | Current Timezone Offset | Olson Timezone | First Seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 108 | 323 | King & Brant | Starbucks | 4636-99652 | 416-596-0101 | CO | 527-529 King Street West | 527-529 King Street West | NaN | ... | ON | CA | M5V 1K4 | (43.6446495056152, -79.3982620239258) | 43.644650 | -79.398262 | Eastern Standard Time | -300 | GMT-05:00 America/Toronto | 12/08/2013 10:41:59 PM |
1 | 122 | 335 | 3035 Argentia Rd | Starbucks | 4665-100880 | 905-785-7667 | CO | 3035 Argentia Road, unIT 7 | 3035 Argentia Road | unIT 7 | ... | ON | CA | L5N 8P7 | (43.5956649780273, -79.7863235473633) | 43.595665 | -79.786324 | Eastern Standard Time | -300 | GMT-05:00 America/Toronto | 12/08/2013 10:41:59 PM |
2 | 124 | 337 | 631 Commissioners Rd E | Starbucks | 4674-100799 | 519-649-1444 | CO | 631 Commissioners Road East | 631 Commissioners Road East | NaN | ... | ON | CA | N6C 2V1 | (42.957763671875, -81.2328872680664) | 42.957764 | -81.232887 | Eastern Standard Time | -300 | GMT-05:00 America/Toronto | 12/08/2013 10:41:59 PM |
3 | 133 | 354 | 599 Taylor Kidd Blvd | Starbucks | 4694-102072 | 613-634-1509 | CO | 599 Taylor Kidd Blvd, Unit 6 | 599 Taylor Kidd Blvd | Unit 6 | ... | ON | CA | K7M 0A2 | (44.2504081726074, -76.5673141479492) | 44.250408 | -76.567314 | Eastern Standard Time | -300 | GMT-05:00 America/Toronto | 12/08/2013 10:41:59 PM |
4 | 141 | 364 | King & Shaw | Starbucks | 4646-98483 | 416-979-1953 | CO | 1005 King St W, Unit 7 | 1005 King St W | Unit 7 | ... | ON | CA | M6G 1B9 | (43.6412811279297, -79.4147644042969) | 43.641281 | -79.414764 | Eastern Standard Time | -300 | GMT-05:00 America/Toronto | 12/08/2013 10:41:59 PM |
5 rows × 22 columns
starbucks_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 530 entries, 0 to 529 Data columns (total 22 columns): index 530 non-null int64 Store ID 530 non-null object Name 530 non-null object Brand 530 non-null object Store Number 530 non-null object Phone Number 486 non-null object Ownership Type 530 non-null object Street Combined 530 non-null object Street 1 530 non-null object Street 2 191 non-null object Street 3 110 non-null object City 530 non-null object Country Subdivision 530 non-null object Country 530 non-null object Postal Code 530 non-null object Coordinates 530 non-null object Latitude 530 non-null float64 Longitude 530 non-null float64 Timezone 530 non-null object Current Timezone Offset 530 non-null int64 Olson Timezone 530 non-null object First Seen 530 non-null object dtypes: float64(2), int64(2), object(18) memory usage: 91.2+ KB
# Actually we only need the coordinates.
starbucks_on_coords = starbucks_df['Coordinates'].tolist()
# Convert from a list of strings that look like Python lists to an actual list of lists
# Also need to change it from (lat,long) to (long,lat)
starbucks_on_coords = [ast.literal_eval(starbucks_on_coords[i]) for i in range(len(starbucks_on_coords))]
starbucks_on_coords = [(starbucks_on_coords[i][1], starbucks_on_coords[i][0]) for i in range(len(starbucks_on_coords))]
nearest_starbucks = []
for i in range(len(coords_df)):
closest_coord = xy_closest(starbucks_on_coords, coords_df.iloc[i]['combined'])
nearest_starbucks.append(xy_distance(coords_df.iloc[i]['combined'][1],coords_df.iloc[i]['combined'][0], closest_coord[1], closest_coord[0]))
starbucks_within_100m = []
for i in range(len(coords_df)):
closest_coord = xy_closest(starbucks_on_coords, coords_df.iloc[i]['combined'])
starbucks_within_100m.append(xy_distance(coords_df.iloc[i]['combined'][1],coords_df.iloc[i]['combined'][0], closest_coord[1], closest_coord[0])<=1)
gas_df['nearest_starbucks'] = np.asarray(nearest_starbucks)
gas_df['starbucks_within_100m'] = np.asarray(starbucks_within_100m).astype(int)
gas_df.head()
name | type_A | type_B | type_C | type_D | address | longitude | latitude | within_100 | nearest_starbucks | starbucks_within_100m | |
---|---|---|---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | 129.6 | 132.6 | 115.9 | 12731 ON-48, Whitchurch-Stouffville, ON L4A 7X... | -79.282378 | 43.976346 | 0 | 11.208629 | 0 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | 136.9 | 144.9 | NaN | 1610 Keele St, York, ON M6M 3V9, Canada | -79.471988 | 43.682320 | 0 | 1.178252 | 0 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | 132.9 | 142.9 | 118.9 | 6897 Finch Ave W, Etobicoke, ON M9W 0A6, Canada | -79.617364 | 43.734632 | 0 | 1.031603 | 0 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | 123.4 | 130.9 | 118.6 | 1 Harwood Ave S, Ajax, ON L1S 2C1, Canada | -79.025100 | 43.861027 | 1 | 0.381652 | 1 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | NaN | NaN | 118.9 | 1 Thornhill Woods Dr, Thornhill, ON L4J 8Y2, C... | -79.463886 | 43.826660 | 0 | 1.444566 | 0 |
Let's invent a Starbucks proximity metric, a sigmoid with the log of the inverse of the nearest starbucks in km:
$\frac{1}{(1+exp(log(-\frac{1}{d})^2))}$
(This is not actually used, but keep this here just in case)
gas_df['starbucks_metric'] = gas_df['nearest_starbucks'].apply(lambda d:1/(1+np.exp(-np.log((2/d)**2))))
gas_df.head()
name | type_A | type_B | type_C | type_D | address | longitude | latitude | within_100 | nearest_starbucks | starbucks_within_100m | starbucks_metric | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | 129.6 | 132.6 | 115.9 | 12731 ON-48, Whitchurch-Stouffville, ON L4A 7X... | -79.282378 | 43.976346 | 0 | 11.208629 | 0 | 0.030856 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | 136.9 | 144.9 | NaN | 1610 Keele St, York, ON M6M 3V9, Canada | -79.471988 | 43.682320 | 0 | 1.178252 | 0 | 0.742352 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | 132.9 | 142.9 | 118.9 | 6897 Finch Ave W, Etobicoke, ON M9W 0A6, Canada | -79.617364 | 43.734632 | 0 | 1.031603 | 0 | 0.789857 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | 123.4 | 130.9 | 118.6 | 1 Harwood Ave S, Ajax, ON L1S 2C1, Canada | -79.025100 | 43.861027 | 1 | 0.381652 | 1 | 0.964865 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | NaN | NaN | 118.9 | 1 Thornhill Woods Dr, Thornhill, ON L4J 8Y2, C... | -79.463886 | 43.826660 | 0 | 1.444566 | 0 | 0.657163 |
gas_df.to_csv('gas_stations_2017-12-13.csv')
regular_gas = gas_df[['type_A','within_100','starbucks_within_100m']].dropna().copy()
regular_gas.columns=['Price','Competition_Nearby','Starbucks_Nearby']
regular_gas['Relative_Price'] = regular_gas['Price'].apply(lambda p: p/np.mean(regular_gas['Price']))
regular_gas['Price_Difference'] = regular_gas['Price'].apply(lambda p: p-np.mean(regular_gas['Price']))
regular_gas.head()
Price | Competition_Nearby | Starbucks_Nearby | Relative_Price | Price_Difference | |
---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | 115.6 | 0 | 0 | 0.956242 | -5.289926 |
1610 Keele St , Toronto - Central, Ontario | 122.9 | 0 | 0 | 1.016627 | 2.010074 |
6897 Finch Ave W , Toronto - West, Ontario | 121.9 | 0 | 0 | 1.008355 | 1.010074 |
1 Harwood Ave S , Ajax, Ontario | 122.6 | 1 | 1 | 1.014146 | 1.710074 |
1 Thornhill Woods Dr , Vaughan, Ontario | 122.9 | 0 | 0 | 1.016627 | 2.010074 |
import statsmodels.api as sm
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
Y = regular_gas['Price']
X = sm.add_constant(regular_gas[['Competition_Nearby','Starbucks_Nearby']])
lm = sm.OLS(Y,X)
results = lm.fit()
results.params
const 120.606227 Competition_Nearby 0.182607 Starbucks_Nearby 0.621326 dtype: float64
results.summary()
Dep. Variable: | Price | R-squared: | 0.011 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.008 |
Method: | Least Squares | F-statistic: | 3.873 |
Date: | Sat, 06 Jan 2018 | Prob (F-statistic): | 0.0213 |
Time: | 21:37:31 | Log-Likelihood: | -1681.8 |
No. Observations: | 675 | AIC: | 3370. |
Df Residuals: | 672 | BIC: | 3383. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 120.6062 | 0.155 | 777.316 | 0.000 | 120.302 | 120.911 |
Competition_Nearby | 0.1826 | 0.286 | 0.638 | 0.523 | -0.379 | 0.744 |
Starbucks_Nearby | 0.6213 | 0.230 | 2.699 | 0.007 | 0.169 | 1.073 |
Omnibus: | 132.421 | Durbin-Watson: | 1.985 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 211.951 |
Skew: | -1.304 | Prob(JB): | 9.45e-47 |
Kurtosis: | 3.855 | Cond. No. | 2.90 |
Gas stations within 100m of a Starbucks location charge 0.62 cents more for regular gas, and this is statistically significant with over 99% confidence.
Proximity to competition also seems to increase prices, likely due to increased demand at busy intersections (as mentioned above). However, the p value is far too high so we can not be certain of its effect with any confidence.
Let's see if we can find any meaningful clusters.
cluster_df.dropna(inplace=True)
cluster_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 675 entries, 12731 Hwy 48 , Whitchurch-Stouffville, Ontario to Taymall Ave , Toronto - West, Ontario Data columns (total 4 columns): name 675 non-null object type_A 675 non-null float64 longitude 675 non-null float64 latitude 675 non-null float64 dtypes: float64(3), object(1) memory usage: 26.4+ KB
cluster_df['type_A_norm'] = cluster_df['type_A']/np.mean(cluster_df['type_A'])
cluster_df['longitude_norm'] = -cluster_df['longitude']/np.mean(cluster_df['longitude'])
cluster_df['latitude_norm'] = cluster_df['latitude']/np.mean(cluster_df['latitude'])
cluster_df.head()
name | type_A | longitude | latitude | type_A_norm | longitude_norm | latitude_norm | |
---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | -79.282378 | 43.976346 | 0.956242 | -0.997435 | 1.005572 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | -79.471988 | 43.682320 | 1.016627 | -0.999820 | 0.998849 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | -79.617364 | 43.734632 | 1.008355 | -1.001649 | 1.000045 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | -79.025100 | 43.861027 | 1.014146 | -0.994198 | 1.002935 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | -79.463886 | 43.826660 | 1.016627 | -0.999718 | 1.002149 |
from scipy import linalg
from sklearn.cluster import KMeans
from sklearn import svm
from scipy.spatial.distance import cdist
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
SSE = []
coords = list(cluster_df[['type_A_norm','longitude_norm','latitude_norm']].values)
for k in range(1,13):
km = KMeans(n_clusters=k)
km.fit(coords)
labels = km.labels_
centroids = km.cluster_centers_
# Get the SSE
SSE.append(sum(np.min(cdist(coords, centroids, 'euclidean'),axis=1))/len(coords))
plt.plot(range(1,13),SSE, 'bo--')
plt.xlabel('k clusters')
plt.ylabel('SSE')
plt.title('SSE vs k')
Text(0.5,1,'SSE vs k')
4 seems like a decent choice, but interestingly so is 7. Let's try 7.
k = 7
km = KMeans(n_clusters=k)
km.fit(coords)
labels = km.labels_
centroids = km.cluster_centers_
colors = ['r', 'g', 'b', 'y', 'c', 'm', 'k']
plt.figure(figsize=(8,8))
plt.title('Clustered Locations')
for j in range(k):
current_coords = np.asarray(coords)[np.where(labels==j)]
j_longs = current_coords[:,1]
j_lats = current_coords[:,2]
plt.plot(j_longs, j_lats, '%so' % colors[j], label=j)
plt.legend()
<matplotlib.legend.Legend at 0x205df8d8c18>
7 is interesting indeed. Mississauga and Toronto are separated, and all but one of the Costco stations are grouped together regardless of distance.
cluster_df['label'] = km.labels_
cluster_df.groupby('label')['type_A'].describe().sort_values('mean',ascending=False)['mean']
label 2 122.913025 0 122.780663 6 120.150000 3 118.368056 1 116.464407 4 113.891667 5 111.114286 Name: mean, dtype: float64
cluster_df[cluster_df['label']==5]
name | type_A | longitude | latitude | type_A_norm | longitude_norm | latitude_norm | label | |
---|---|---|---|---|---|---|---|---|
1411 Warden Ave , Toronto - East, Ontario | Costco | 110.9 | -79.297736 | 43.759429 | 0.917363 | -0.997628 | 1.000612 | 5 |
150 Kingston Rd E , Ajax, Ontario | Costco | 110.9 | -79.018454 | 43.862950 | 0.917363 | -0.994114 | 1.002979 | 5 |
1570 Dundas St E , Mississauga, Ontario | Costco | 110.9 | -79.575776 | 43.610304 | 0.917363 | -1.001126 | 0.997202 | 5 |
55 New Huntington Rd , Vaughan, Ontario | Costco | 110.9 | -79.639189 | 43.769937 | 0.917363 | -1.001924 | 1.000852 | 5 |
71 Colossus Dr , Vaughan, Ontario | Costco | 110.9 | -79.541590 | 43.785631 | 0.917363 | -1.000696 | 1.001211 | 5 |
935 Liverpool Rd , Pickering, Ontario | Shell | 112.4 | -79.086912 | 43.828548 | 0.929771 | -0.994976 | 1.002192 | 5 |
Taymall Ave , Toronto - West, Ontario | Costco | 110.9 | -79.506616 | 43.623887 | 0.917363 | -1.000256 | 0.997512 | 5 |
Let's put it on a Toronto map with Folium.
Then run our function for getting the nearest gas stations within a specified distance in kilometers.
import branca
import folium
from folium.plugins import MarkerCluster
cluster_df.head()
name | type_A | longitude | latitude | type_A_norm | longitude_norm | latitude_norm | label | |
---|---|---|---|---|---|---|---|---|
12731 Hwy 48 , Whitchurch-Stouffville, Ontario | Ultramar | 115.6 | -79.282378 | 43.976346 | 0.956242 | -0.997435 | 1.005572 | 1 |
1610 Keele St , Toronto - Central, Ontario | Shell | 122.9 | -79.471988 | 43.682320 | 1.016627 | -0.999820 | 0.998849 | 0 |
6897 Finch Ave W , Toronto - West, Ontario | Esso | 121.9 | -79.617364 | 43.734632 | 1.008355 | -1.001649 | 1.000045 | 0 |
1 Harwood Ave S , Ajax, Ontario | Pioneer | 122.6 | -79.025100 | 43.861027 | 1.014146 | -0.994198 | 1.002935 | 2 |
1 Thornhill Woods Dr , Vaughan, Ontario | Esso | 122.9 | -79.463886 | 43.826660 | 1.016627 | -0.999718 | 1.002149 | 2 |
folmap(cluster_df,zoom=10)
Enter the address and distance threshold to see nearby stations.
enter_address(cluster_df,'80 Bloor St W, Toronto, ON M5S 2V1',2)
Getting 80 Bloor St W, Toronto, ON M5S 2V1 Got 80 Bloor St W, Toronto, ON M5S 2V1, Canada Gas stations within range: name type_A 835 Yonge St , Toronto - Central, Ontario Canadian Tire 119.3 1077 Yonge St , Toronto - Central, Ontario Shell 122.9 150 Dupont St , Toronto - Central, Ontario Esso 122.9 241 Church St , Toronto - South, Ontario Esso 122.9 505 Jarvis St , Toronto - South, Ontario Petro-Canada 122.9 581 Parliament St , Toronto - South, Ontario Esso 122.9