In this blog, we will cover:
Putting it all together, we will be able to identify the best location to open a new shop of a given brand.
Nothing very special here.
# --- Imports and module-level setup --------------------------------------
#needed for general operations
import os, operator, time, random
#needed for mathematical operations or drawing
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
#needed for http dialog with RESTful API
import json
import urllib3
# shared connection pool, reused by every OSRM table request below
http = urllib3.PoolManager()
#needed for browser emulation (dynamic page scrapping) in hidden window
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from pyvirtualdisplay import Display
#needed for web page mining
from bs4 import BeautifulSoup
#needed for address to coordinate conversions and distance computations
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
# module-level geocoder instance (Nominatim is OSM's free geocoding service)
geolocator = Nominatim()
#needed to display map widget in notebook
from ipyleaflet import Map,DrawControl, Circle, Rectangle, CircleMarker, LayerGroup, Marker
As the web pages are dynamically generated via JavaScript, we use Selenium to emulate a Firefox browser while navigating the Pages d'Or website. Also, only the first 20 results are loaded on the page by default, so we need to press a button repeatedly to load the following results until everything is loaded. The rest of the code is traditional BeautifulSoup parsing.
def getPagesdorData(driver, topic, city):
    """Scrape shop names and addresses from the Pages d'Or directory.

    Parameters
    ----------
    driver : selenium webdriver
        An already-started browser instance.
    topic : str
        Search keyword (e.g. "super marché").
    city : str
        Postal code or city name to search in.

    Returns
    -------
    list of dict
        One ``{"name": ..., "address": ...}`` entry per search result.
    """
    url = "http://www.pagesdor.be/qn/business/advanced/where/"+city+"/what/"+topic+"/"
    print("processing %s" % url)
    driver.get(url)
    # The page only shows the first 20 results; keep clicking the "next"
    # button until it is hidden or gone so that everything gets loaded.
    while True:
        try:
            nextElemButton = driver.find_element_by_css_selector(".-next-result")
            if not nextElemButton.is_displayed():
                break
            nextElemButton.click()
            print(" - load next results")
        except Exception:  # button absent / stale element -> all results loaded
            break
    # Hand the fully-loaded results container over to BeautifulSoup.
    resultsElem = driver.find_element_by_css_selector(".cst-results")
    resultsHTML = resultsElem.get_attribute("innerHTML")
    resultsBS = BeautifulSoup(resultsHTML, "lxml")
    # One block per search result.
    results = resultsBS.body.find_all(class_="-result list-item", recursive=True)
    data = []
    for result in results:
        fields = result.find_all(class_="row-fluid", recursive=True)
        Name = fields[0].find_all("a")[1].get_text("", strip=True)
        Address = fields[1].get_text("", strip=True)
        data.append({"name": Name, "address": Address})
    return data
# --- Collect supermarket data for every Brussels postal code --------------
# Make the geckodriver binary reachable by Selenium.
os.environ["PATH"] += ":/home/loicus/Data/Soft/geckodriver"
# Run Firefox inside a hidden virtual display (headless scraping).
display = Display(visible=0, size=(1024, 768))
display.start()
driver = webdriver.Firefox()
driver.implicitly_wait(10) # seconds
# Brussels postal codes, plus 1700 to add Dilbeek.
postalCodes = ["1000", "1030", "1040", "1050", "1060", "1070", "1080",
               "1081", "1082", "1083", "1090", "1140", "1150", "1160",
               "1170", "1180", "1190", "1200", "1210", "1700"]
data = []
try:
    for code in postalCodes:
        data += getPagesdorData(driver, "super marché", code)
finally:
    # Always release the browser and the virtual display, even on failure.
    driver.quit()
    display.stop()
processing http://www.pagesdor.be/qn/business/advanced/where/1000/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1030/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1040/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1050/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1060/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1070/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1080/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1081/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1082/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1083/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1090/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1140/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1150/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1160/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1170/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1180/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1190/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1200/what/super marché/ - load next results processing http://www.pagesdor.be/qn/business/advanced/where/1210/what/super marché/ - load next results 
processing http://www.pagesdor.be/qn/business/advanced/where/1700/what/super marché/
<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1024x768x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1024x768x24', ':1001'] oserror=None return_code=0 stdout="" stderr="" timeout_happened=False>
# Brand -> display colour; the keys also define which brands we keep.
colorMap = {"Delhaize":"red", "Carrefour":"blue", "Colruyt":"DarkOrange", "Lidl":"yellow"}
cleanedData = []
for entry in data:
    # Tag the entry with its brand; skip shops of any other brand.
    lowered = entry["name"].lower()
    if "delhaize" in lowered:
        entry["brand"] = "Delhaize"
    elif "lidl" in lowered:
        entry["brand"] = "Lidl"
    elif "colruyt" in lowered:
        entry["brand"] = "Colruyt"
    elif "carrefour" in lowered:
        entry["brand"] = "Carrefour"
    else:
        continue
    # Geocode the postal address; drop entries Nominatim cannot resolve.
    location = geolocator.geocode(entry["address"])
    if not location:
        continue
    entry["coord"] = [location.latitude, location.longitude]
    # Distance (metres) to the nearest shop kept so far; huge sentinel when
    # nothing has been kept yet.
    nearest = min(
        (great_circle(entry["coord"], other["coord"]).meters for other in cleanedData),
        default=1e99,
    )
    # Only keep shops that aren't too close to an already-kept one, to
    # avoid duplicated directory entries.
    if nearest > 10:
        cleanedData.append(entry)
data = cleanedData
len(data)
107
# Centre the maps on the barycentre of all shop coordinates.
averageCnt = len(data)
averageLat = sum(d["coord"][0] for d in data) / averageCnt
averageLon = sum(d["coord"][1] for d in data) / averageCnt
# Three identical base maps: raw shops (m0), brand territories (m1),
# travel-time heatmap (m2).
m0 = Map(center=[averageLat, averageLon], zoom=13)
m0.scroll_wheel_zoom = False
m1 = Map(center=[averageLat, averageLon], zoom=13)
m1.scroll_wheel_zoom = False
m2 = Map(center=[averageLat, averageLon], zoom=13)
m2.scroll_wheel_zoom = False
# Draw one coloured dot per supermarket on each of the three maps.
for shop in data:
    if "coord" not in shop or shop["coord"] is None:
        continue
    fill = colorMap.get(shop["brand"], "black")
    dot = Circle(location=shop["coord"], radius=50, weight=1, fill_opacity=1.0,
                 fill_color=fill, color="black")
    # NOTE(review): the same Circle widget instance is shared by all three
    # maps, exactly as in the original code — confirm ipyleaflet supports
    # attaching one layer widget to several Map widgets.
    for mapWidget in (m0, m1, m2):
        mapWidget.add_layer(dot)
m0
%matplotlib inline
labels = list(colorMap.keys())
colors = list(colorMap.values())
plt.figure(figsize=(8, 0.5))
legpatches = [ patches.Patch(color=color, label=label) for label, color in zip(labels, colors)]
plt.legend(legpatches, labels, loc='center', frameon=False, ncol=4, prop={'size':20})
plt.axis('off')
plt.show()
def getRoutesChunk(data, cells):
    """Query the public OSRM "table" service for one chunk of grid cells.

    Builds a single table request whose coordinate list holds every shop in
    *data* followed by the centre of every cell in *cells*; the cell centres
    are declared as ``sources``, so each row of the returned duration matrix
    is the driving time from one cell to every coordinate.

    Parameters
    ----------
    data : list of dict
        Shop entries; only those carrying a "coord" key are used.
    cells : list
        Grid cells as ``[(lat, lon), (lat, lon)]`` corner pairs.

    Returns
    -------
    list of tuple
        One ``(cell, min_time, closest_shop)`` per cell, or ``[]`` when
        OSRM answers with a non-200 status.
    """
    coords = []      # "lon,lat" strings: shops first, then cell centres
    goodData = []    # shops that actually have a coordinate
    for d in data:
        if "coord" not in d:
            continue
        # OSRM expects longitude,latitude ordering.
        coords.append("%.8f,%.8f" % (d["coord"][1], d["coord"][0]))
        goodData.append(d)
    sources = []
    for cell in cells:
        # Cell centre, stored as (lon, lat) to match the shop coordinates.
        center = ((cell[1][1] + cell[0][1]) * 0.5, (cell[1][0] + cell[0][0]) * 0.5)
        sources.append(len(coords))  # index of this cell in the coordinate list
        coords.append("%.8f,%.8f" % (center[0], center[1]))
    # join() also fixes the original's malformed leading ";" when the first
    # shop had no coordinate, and avoids quadratic string concatenation.
    url = ("http://router.project-osrm.org/table/v1/driving/"
           + ";".join(coords)
           + "?sources=" + ";".join(str(s) for s in sources))
    request = http.request('GET', url)
    try:
        if request.status != 200:
            print("bad response: %s" % str(request.status))
            return []
        response = json.loads(request.data.decode('utf8'))
    finally:
        # Return the connection to the pool on every exit path (the original
        # leaked it on a bad status).
        request.release_conn()
    #print( json.dumps(response, sort_keys=True, indent=3) )
    toReturn = []
    # One row of durations per source (cell); columns cover ALL coordinates,
    # so keep only the first len(goodData) columns (the shops).
    for index, cellTime in enumerate(response["durations"]):
        timeToData = cellTime[:len(goodData)]
        min_index, min_time = min(enumerate(timeToData), key=operator.itemgetter(1))
        toReturn += [(cells[index], min_time, goodData[min_index])]
    return toReturn
def getRoutes(data, cells, chunkSize=50):
    """Route every cell against every shop, batching cells by *chunkSize*.

    The public OSRM server limits table sizes, so the cells are processed
    in chunks and the per-chunk results are concatenated.
    """
    routes = []
    for start in range(0, len(cells), chunkSize):
        routes.extend(getRoutesChunk(data, cells[start:start + chunkSize]))
    return routes
# Build a regular NedgesX x NedgesY lat/lon grid centred on the shops'
# barycentre; each cell is stored as [(south, west), (north, east)].
NedgesX = 150
NedgesY = 75
boundX = 0.1700  # total longitude span of the grid (degrees)
boundY = 0.0500  # total latitude span of the grid (degrees)
edgeX = boundX / NedgesX
edgeY = boundY / NedgesY
cells = []
for row in range(NedgesY):
    south = averageLat - boundY * 0.5 + row * edgeY
    for col in range(NedgesX):
        west = averageLon - boundX * 0.5 + col * edgeX
        cells.append([(south, west), (south + edgeY, west + edgeX)])
results = getRoutes(data, cells, 50)
# Paint each grid cell on the brand map (m1) and the travel-time map (m2).
# NOTE: the loop variable was renamed from `time` to `travelTime` — the
# original shadowed the stdlib `time` module imported at the top of the file.
for (cell, travelTime, closest) in results:
    # m1: colour the cell by the brand of its closest supermarket.
    brandColor = colorMap.get(closest["brand"], "black")
    r1 = Rectangle(bounds=cell, weight=0, fill_opacity=0.40, fill_color=brandColor, color=brandColor)
    m1.add_layer(r1)
    # m2: red (0 s) -> white (>= 300 s) gradient on the driving time.
    timeColor = matplotlib.colors.rgb2hex(
        np.array([1.0, 0.0, 0.0]) + max(0, min(1, travelTime / 300.0)) * np.array([0.0, 1.0, 1.0]))
    r2 = Rectangle(bounds=cell, weight=0, fill_opacity=0.60, fill_color=timeColor, color=timeColor)
    m2.add_layer(r2)
m1
m2
Feel free to contact me — I'd be happy to help.
Many other fun data science examples are available on my blog: https://indepthdata.wordpress.com/