#!/usr/bin/env python
# coding: utf-8

# # Analysis of Gas Prices in the GTA

# We obtain gas station price data for the Greater Toronto Area from yellowpages.ca and gasbuddy.com's ontariogasprices.com, and cluster the stations by location and price per litre of regular gas. We also test the price effects of having another gas station or a Starbucks location within a 100 m radius of a gas station.

# In[1]:


from bs4 import BeautifulSoup
import urllib.request
import requests
import math
import re
import pandas as pd
import numpy as np
import datetime
import sys
import time
import json
import ast
from geopy.geocoders import GoogleV3
import seaborn as sns
from time import sleep

geolocator = GoogleV3()
get_ipython().run_line_magic('matplotlib', 'inline')
today = datetime.datetime.now()


# In[2]:


# Get the names and addresses of gas stations listed in Yellow Pages in the Toronto region
def get_addresses(page_num):
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    url = 'https://www.yellowpages.ca/search/si/' + str(page_num) + '/Gas%20Stations/Toronto+ON'
    try:
        html = urllib.request.urlopen(url).read()
    except:
        return page_num
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.findAll('div', {'class': 'listing__content__wrap'})
    for n in range(len(content)):
        try:
            name = content[n].findAll('a')[0].get('title').split('-')[0]
        except:
            name = '#NO NAME'
        try:
            addr_ele = content[n].findAll('span', {'class': 'listing__address--full'})[0].findAll('span')
            addr = ','.join(e.string for e in addr_ele)
        except:
            addr = '#NO ADDRESS#'
        gas_dict[addr] = {'name': name}
    with open('gas_dict.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
    return page_num + 1


def address_main():
    # If the file doesn't exist, create it
    try:
        with open('gas_dict.json', 'r') as infile:
            gas_dict = json.load(infile)
    except:
        with open('gas_dict.json', 'w') as outfile:
            gas_dict = {}
            json.dump(gas_dict, outfile)
    page_num = 1
    while page_num < 22:
        print('Fetching page', page_num)
        page_num = get_addresses(page_num)
        sleep(np.random.random() * 2 + 2)


# In[3]:


# Get rid of incomplete or empty addresses
def clean_addresses():
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    gas_dict.pop('#NO ADDRESS#', None)
    gas_dict.pop('ON', None)
    gas_dict.pop('12001 Hwy 400,Maple,ON,L7B 1A8', None)  # This entry is both redundant and the wrong city ...
    with open('gas_dict.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
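# As a quick sanity check (a sketch added here, not part of the original run), we can peek at what the scraper writes: gas_dict.json maps an address string to a dict with a 'name' key. This assumes address_main() and clean_addresses() have already been run.

# In[ ]:


with open('gas_dict.json', 'r') as infile:
    gas_dict = json.load(infile)
print(len(gas_dict), 'addresses scraped')
for addr in list(gas_dict)[:3]:
    # Show a few sample entries mapping address -> {'name': ...}
    print(addr, '->', gas_dict[addr])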
# In[4]:


# For ontariogasprices.com scraping, we need a list of postal codes to search with.
# The site focuses on finding the lowest price in an area and does not prioritize specific addresses,
# but this works to our advantage since we can potentially find stations not included in the yellowpages data.
# Gas types are
# A: regular, B: mid-grade, C: premium, D: diesel
def get_area_prices(gas_type, postal_code):
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
        gas_stations = json.load(infile)
    url = 'http://www.ontariogasprices.com/GasPriceSearch.aspx?fuel=' + gas_type + '&qsrch=' + postal_code[0] + '%20' + postal_code[1]
    print('Fetching from', url)
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    if len(soup.findAll('tbody')) > 0:
        data_rows = soup.findAll('tbody')[0].findAll('tr')
        for i in range(len(data_rows)):
            try:
                name = data_rows[i].findAll('td')[0].find('a').string
                st_addr = data_rows[i].findAll('td')[0].find('dd').string.split('&')[0].split('near')[0]
                city = data_rows[i].findAll('td')[2].find('a').string
                addr = st_addr + ', ' + city + ', Ontario'
                price = float(data_rows[i].find('a').string)
                if addr not in gas_stations:
                    gas_stations[addr] = {'name': name}
                gas_stations[addr]['type_' + gas_type] = price
            except:
                # The "other stations in the area" row can just be ignored
                pass
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
        json.dump(gas_stations, outfile)


def price_main():
    # First get postal codes
    with open('gas_dict.json', 'r') as infile:
        gas_dict = json.load(infile)
    try:
        with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
            gas_stations = json.load(infile)
    except:
        with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
            gas_stations = {}
            json.dump(gas_stations, outfile)
    postal_list = []
    for k, v in gas_dict.items():
        postal_list.append(k.split(',')[-1].split())
    # If an entry has no postal code, just ignore it for now ...
    postal_list = [p for p in postal_list if len(p) == 2]
    break_point = 0
    while break_point < len(postal_list):
        postal_code = postal_list[break_point]
        try:
            for gas_type in list('ABCD'):
                get_area_prices(gas_type, postal_code)
                sleep(np.random.random() * 2 + 0.5)
            break_point += 1
        except:
            # Try again
            print('Trying again ...')
            sleep(np.random.random() * 2 + 2)
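# To make the request format concrete, here is an illustrative sketch of the search URL that get_area_prices builds, using a hypothetical postal code split into its two halves. No request is made; we only print the URL.

# In[ ]:


sample_postal = ['M5S', '2V1']  # hypothetical postal code, already split in two
sample_url = ('http://www.ontariogasprices.com/GasPriceSearch.aspx?fuel=A'
              '&qsrch=' + sample_postal[0] + '%20' + sample_postal[1])
print(sample_url)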
# In[5]:


# Get not only the coordinates but also the Google Maps address, which will likely clean up any messes left over
# from the ontariogasprices.com splits
def get_coord(addr):
    # Splitting Toronto addresses into "Toronto - [area]" confuses Google Maps, so get rid of that when querying
    if 'Toronto' in addr.split(',')[1]:
        addr = addr.split(',')[0] + ', Toronto, ' + addr.split(',')[2]
    print('Getting', addr)
    geoloc = geolocator.geocode(addr)
    sleep(np.random.random() * 2 + 2)
    print('Got ', geoloc, '\n')
    return geoloc


def coords_main():
    # Start at the beginning of gas_dict
    break_point = 0
    # Open gas_stations, and get the list of all the addresses
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'r') as infile:
        gas_dict = json.load(infile)
    addr_list = list(gas_dict)
    # Loop while there are still addresses to process
    while break_point < len(addr_list):
        # Run get_coord on the address at the current break_point; on failure, retry the same address
        addr = addr_list[break_point]
        try:
            geoloc = get_coord(addr)
            if geoloc is None:
                # GoogleV3 can't get a hold of it. Need to look at it later. Most likely it's a highway.
                break_point += 1
                sleep(np.random.random() * 2 + 2)
            else:
                gas_dict[addr]['address'] = geoloc.address
                gas_dict[addr]['longitude'] = geoloc.longitude
                gas_dict[addr]['latitude'] = geoloc.latitude
                # If it works, move on to the next one. Once break_point reaches the length of the list,
                # the loop condition is no longer satisfied, and the loop ends.
                break_point += 1
                with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
                    json.dump(gas_dict, outfile)
                sleep(np.random.random() * 2 + 2)
        except:
            # If it doesn't work, don't increase break_point and try again
            print('Could not get ', addr)
            sleep(np.random.random() * 2 + 2)
    with open('gas_stations_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.json', 'w') as outfile:
        json.dump(gas_dict, outfile)
    #return gas_dict


# In[6]:


def xy_distance(xlat, xlong, ylat, ylong):
    # Great-circle distance via the spherical law of cosines.
    # The radius of the planet is approx. 6371.01 km
    xlat = math.radians(xlat)
    xlong = math.radians(xlong)
    ylat = math.radians(ylat)
    ylong = math.radians(ylong)
    cos_angle = math.sin(xlat) * math.sin(ylat) + math.cos(xlat) * math.cos(ylat) * math.cos(xlong - ylong)
    # Clamp to [-1, 1] so floating-point rounding on near-identical points can't push acos out of its domain
    dist = 6371.01 * math.acos(max(-1.0, min(1.0, cos_angle)))
    return dist


def xy_closest(coords, x):
    # coords is an iterable of (longitude, latitude) pairs; x is a single (longitude, latitude) pair
    return min(coords, key=lambda y: xy_distance(x[1], x[0], y[1], y[0]))
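# A quick sanity check on xy_distance (a sketch, not in the original run): the great-circle distance between two downtown Toronto landmarks, using approximate coordinates, should come out to roughly 0.6 km.

# In[ ]:


# Union Station (~43.6453, -79.3806) to the CN Tower (~43.6426, -79.3871); coordinates are approximate
print(xy_distance(43.6453, -79.3806, 43.6426, -79.3871))  # roughly 0.6 km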
# In[7]:


def enter_address(df, addr, max_dist):
    standard_addr = get_coord(addr)
    within_range = pd.DataFrame()
    for i in range(len(df)):
        dist = xy_distance(standard_addr.latitude, standard_addr.longitude,
                           df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        if dist <= max_dist:
            within_range = pd.concat((within_range, df.iloc[i]), axis=1)
    within_range = within_range.transpose().sort_values('type_A')
    # The entered address is passed along to folmap so it can be shown on the map too
    print('Gas stations within range: ')
    print(within_range[['name', 'type_A']])
    folmap(within_range, 15, custom=1, extralat=standard_addr.latitude, extralong=standard_addr.longitude)


# In[8]:


def folmap(df, zoom=10, custom=None, extralat=None, extralong=None):
    k = 7
    colorscale = branca.colormap.linear.YlGnBu.scale(0, 60)
    cluster_colors = ['red', 'green', 'blue', 'yellow', 'cadetblue', 'purple', 'black']
    if custom == 1:
        m = folium.Map(
            location=[extralat, extralong],
            zoom_start=zoom
        )
    else:
        # Center the map on the median coordinates
        m = folium.Map(
            location=[np.median(df.latitude), np.median(df.longitude)],
            zoom_start=zoom
        )
    marker_cluster = MarkerCluster().add_to(m)
    for j in range(k):
        for r in df.iterrows():
            if r[1]['label'] == j:
                pop = ["Price: ", str(r[1]['type_A'])]
                folium.Marker(
                    location=[r[1]['latitude'], r[1]['longitude']],
                    popup=''.join(pop),
                    icon=folium.Icon(color=cluster_colors[j], icon='ok-sign'),
                ).add_to(marker_cluster)
    if custom == 1:
        folium.Marker(location=[extralat, extralong],
                      popup='You are here',
                      icon=folium.Icon(color='pink', icon='cloud')).add_to(m)
    display(m)


# Run the four main functions to get a JSON file of addresses and prices, and a list of coordinates.

# In[ ]:


#address_main()


# In[ ]:


#clean_addresses()


# In[ ]:


#price_main()


# In[ ]:


#coords_main()


# In[9]:


with open('gas_stations_2017-12-13.json', 'r') as infile:
    gas_stations = json.load(infile)
len(gas_stations)
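# Not every address ends up with a price attached (the yellowpages and ontariogasprices records don't overlap perfectly), so as an added check we can count how many loaded stations carry a regular-gas (type_A) price.

# In[ ]:


sum(1 for v in gas_stations.values() if 'type_A' in v)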
# ### Clean up
#
# For some reason, there's a station near Vaughan that only registers in Google Maps if we input it as King City. There are also a few duplicate stations listed under different names or address variants, so we get rid of those.

# In[10]:


# Special treatment for 12001 Hwy 400 NB: gasbuddy lists it in Vaughan, but Google recognizes it as being in King City
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']


# In[11]:


geoloc = geolocator.geocode('12001 Hwy 400 NB , King City, Ontario')


# In[12]:


gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['address'] = geoloc.address
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['longitude'] = geoloc.longitude
gas_stations['12001 Hwy 400 NB , Vaughan, Ontario']['latitude'] = geoloc.latitude


# In[13]:


gas_df = pd.DataFrame.from_dict(gas_stations, orient='index')


# In[14]:


gas_df.head()


# In[15]:


gas_df[gas_df['longitude'] == -79.347831]


# In[16]:


gas_df[gas_df['longitude'] == -79.7736453]


# In[17]:


# Get rid of redundancies
gas_df.drop_duplicates(subset=['longitude', 'latitude'], inplace=True)


# In[18]:


gas_df[gas_df['longitude'] == -79.347831]


# In[19]:


gas_df[gas_df['longitude'] == -79.7736453]


# ### Mean prices by brand
# The highest-priced entries may be unreliable: they are not major chains, or their stations may be out of the way, so user-reported data on their prices is scarce. Not surprisingly, the cheapest brand is Costco. It's a members-only service and in some sense doesn't really count; its prices are so low that in the k-means clustering below, the price discrepancy largely overpowers any distance effects.

# In[20]:


gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=False)[:8]


# In[21]:


gas_df.groupby('name')['type_A'].describe().sort_values(by='mean', ascending=True)[:8]


# In[22]:


gas_df.iloc[:5]


# In[23]:


gas_df.to_csv('gas_stations_2017-12-13.csv')


# In[24]:


with open('gas_stations_2017-12-13.json', 'w') as outfile:
    json.dump(gas_stations, outfile)


# In[25]:


coords_df = gas_df[['longitude', 'latitude']].copy()


# In[26]:


coords_df.head()


# In[27]:


coords_df['long_rad'] = coords_df['longitude'].apply(math.radians)
coords_df['lat_rad'] = coords_df['latitude'].apply(math.radians)


# In[28]:


coords_df.head()


# In[29]:


coords_df['combined'] = list(zip(coords_df['longitude'], coords_df['latitude']))


# In[30]:


coords_df.groupby('combined').count().sort_values('longitude', ascending=False)[:5]


# In[31]:


gas_df[coords_df['combined'] == (-79.7736453, 43.7408885)]


# ### Checking for gas stations with competitors within 100 m
#
# While competition pushes prices down, especially when there is almost 100% price transparency (all gas stations advertise their regular gas prices prominently), multiple gas stations are usually found at very busy intersections. In that case, can the extra demand counteract the competitive effect? We can check.

# In[32]:


within_100 = []
for i in range(len(coords_df)):
    closest_coord = xy_closest(coords_df['combined'][coords_df.index != coords_df.index[i]],
                               coords_df.iloc[i]['combined'])
    # combined is (longitude, latitude), and xy_distance takes latitudes first
    within_100.append(xy_distance(coords_df.iloc[i]['combined'][1], coords_df.iloc[i]['combined'][0],
                                  closest_coord[1], closest_coord[0]) <= 0.1)


# In[33]:


sum(within_100)


# In[34]:


coords_df['within_100'] = np.asarray(within_100).astype(int)


# In[35]:


coords_df.head()


# In[36]:


gas_df['within_100'] = coords_df['within_100']


# In[37]:


gas_df.head()
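# As a quick follow-up (added as a sketch), the share of stations flagged as having a competitor within 100 m:

# In[ ]:


gas_df['within_100'].mean()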
# For general clustering, use longitude and latitude. For gasoline-specific clustering, also use the price of the specific type of gas; below we do this with k-means for regular gas (type_A).

# In[38]:


cluster_df = gas_df[['name', 'type_A', 'longitude', 'latitude']].copy()


# In[39]:


cluster_df.head()


# ### Exploring the Starbucks Metric
# Use Starbucks' location research to our benefit. As with station-to-station proximities, we use the "as the crow flies" distance.
#
# Starbucks location data from https://opendata.socrata.com/Business/All-Starbucks-Locations-in-the-World/xy4y-c4mk

# In[40]:


starbucks_df = pd.read_csv('All_Starbucks_Locations_in_the_World.csv', delimiter=';')
starbucks_df = starbucks_df[(starbucks_df['Country'] == 'CA') & (starbucks_df['Country Subdivision'] == 'ON')]
starbucks_df.reset_index(inplace=True)
starbucks_df.head()


# In[41]:


starbucks_df.info()


# In[42]:


# Actually, we only need the coordinates.
starbucks_on_coords = starbucks_df['Coordinates'].tolist()


# In[43]:


# Convert from a list of strings that look like Python tuples to an actual list of tuples.
# Also change the order from (lat, long) to (long, lat)
starbucks_on_coords = [ast.literal_eval(c) for c in starbucks_on_coords]
starbucks_on_coords = [(c[1], c[0]) for c in starbucks_on_coords]


# In[44]:


nearest_starbucks = []
for i in range(len(coords_df)):
    closest_coord = xy_closest(starbucks_on_coords, coords_df.iloc[i]['combined'])
    nearest_starbucks.append(xy_distance(coords_df.iloc[i]['combined'][1], coords_df.iloc[i]['combined'][0],
                                         closest_coord[1], closest_coord[0]))


# In[45]:


# Reuse the distances computed above; within 100 m means within 0.1 km
starbucks_within_100m = [d <= 0.1 for d in nearest_starbucks]


# In[46]:


gas_df['nearest_starbucks'] = np.asarray(nearest_starbucks)
gas_df['starbucks_within_100m'] = np.asarray(starbucks_within_100m).astype(int)


# In[47]:


gas_df.head()
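# Before building a metric on top of these distances, it helps to see their scale. A quick illustrative summary of the distance (in km) from each station to its nearest Starbucks:

# In[ ]:


gas_df['nearest_starbucks'].describe()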
# Let's invent a Starbucks proximity metric: a sigmoid applied to the log of $(2/d)^2$, where $d$ is the distance in km to the nearest Starbucks:
# $\frac{1}{1 + \exp\left(-\log\left((2/d)^2\right)\right)}$
# which simplifies to $\frac{4}{4 + d^2}$. (This is not actually used below, but keep it here just in case.)

# In[48]:


gas_df['starbucks_metric'] = gas_df['nearest_starbucks'].apply(lambda d: 1 / (1 + np.exp(-np.log((2 / d) ** 2))))


# In[49]:


gas_df.head()


# In[50]:


gas_df.to_csv('gas_stations_2017-12-13.csv')


# In[51]:


regular_gas = gas_df[['type_A', 'within_100', 'starbucks_within_100m']].dropna().copy()


# In[52]:


regular_gas.columns = ['Price', 'Competition_Nearby', 'Starbucks_Nearby']


# In[53]:


regular_gas['Relative_Price'] = regular_gas['Price'] / regular_gas['Price'].mean()
regular_gas['Price_Difference'] = regular_gas['Price'] - regular_gas['Price'].mean()


# In[54]:


regular_gas.head()


# In[55]:


import statsmodels.api as sm


# In[56]:


Y = regular_gas['Price']
X = sm.add_constant(regular_gas[['Competition_Nearby', 'Starbucks_Nearby']])
lm = sm.OLS(Y, X)


# In[57]:


results = lm.fit()


# In[58]:


results.params


# In[59]:


results.summary()


# Gas stations within 100 m of a Starbucks location charge 0.62 cents more per litre for regular gas, and this is statistically significant at over 99% confidence.
#
# Proximity to competition also seems to increase prices, likely due to increased demand at busy intersections (as mentioned above). However, the p-value is far too high for us to attribute any effect with confidence.

# ### Clustering and Mapping
#
# Let's see if we can find any meaningful clusters.

# In[60]:


cluster_df.dropna(inplace=True)


# In[61]:


cluster_df.info()


# In[62]:


cluster_df['type_A_norm'] = cluster_df['type_A'] / np.mean(cluster_df['type_A'])
# The leading minus keeps east > west after dividing by the (negative) mean longitude
cluster_df['longitude_norm'] = -cluster_df['longitude'] / np.mean(cluster_df['longitude'])
cluster_df['latitude_norm'] = cluster_df['latitude'] / np.mean(cluster_df['latitude'])


# In[63]:


cluster_df.head()


# In[64]:


from scipy import linalg
from sklearn.cluster import KMeans
from sklearn import svm
from scipy.spatial.distance import cdist
from scipy.spatial import ConvexHull
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')


# In[65]:


SSE = []
coords = list(cluster_df[['type_A_norm', 'longitude_norm', 'latitude_norm']].values)
for k in range(1, 13):
    km = KMeans(n_clusters=k)
    km.fit(coords)
    labels = km.labels_
    centroids = km.cluster_centers_
    # Mean distance to the nearest centroid, an elbow-curve proxy for the SSE
    SSE.append(sum(np.min(cdist(coords, centroids, 'euclidean'), axis=1)) / len(coords))
plt.plot(range(1, 13), SSE, 'bo--')
plt.xlabel('k clusters')
plt.ylabel('SSE')
plt.title('SSE vs k')


# 4 seems like a decent choice, but interestingly so does 7. Let's try 7.

# In[66]:


k = 7
km = KMeans(n_clusters=k)
km.fit(coords)
labels = km.labels_
centroids = km.cluster_centers_
colors = ['r', 'g', 'b', 'y', 'c', 'm', 'k']
plt.figure(figsize=(8, 8))
plt.title('Clustered Locations')
for j in range(k):
    current_coords = np.asarray(coords)[np.where(labels == j)]
    j_longs = current_coords[:, 1]
    j_lats = current_coords[:, 2]
    plt.plot(j_longs, j_lats, '%so' % colors[j], label=j)
plt.legend()


# 7 is interesting indeed. Mississauga and Toronto are separated, and all but one of the Costco stations are grouped together regardless of distance.

# In[67]:


cluster_df['label'] = km.labels_


# In[68]:


cluster_df.groupby('label')['type_A'].describe().sort_values('mean', ascending=False)['mean']


# In[74]:


cluster_df[cluster_df['label'] == 5]
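# As an added sanity check on the clustering (a sketch, not in the original run), we can look at cluster sizes alongside mean regular-gas prices; a small cluster with an outlying mean is likely the Costco group mentioned above.

# In[ ]:


cluster_df.groupby('label')['type_A'].agg(['size', 'mean'])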
# ### Folium
#
# Let's put it on a Toronto map with Folium.
#
# Then run our function for getting the nearest gas stations within a specified distance in kilometers.

# In[70]:


import branca
import folium
from folium.plugins import MarkerCluster


# In[71]:


cluster_df.head()


# In[72]:


folmap(cluster_df, zoom=10)


# Enter the address and distance threshold to see nearby stations.

# In[73]:


enter_address(cluster_df, '80 Bloor St W, Toronto, ON M5S 2V1', 2)
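# The same lookup works for any address and radius; for example (left commented out, like the main functions above), a wider 5 km search around the same corner:

# In[ ]:


#enter_address(cluster_df, '80 Bloor St W, Toronto, ON M5S 2V1', 5)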