It is just a way of getting data from the web.
There are the following ways we can collect data:
The third one above is web scraping. It simply means writing scripts (perhaps Python scripts) to grab the data from web pages. It is more fun than a direct download.
Many different Python libraries are available for web scraping. I used pattern and requests here. BeautifulSoup and Scrapy are other libraries for the same purpose.
# requests is for downloading data(text) from the web.
import requests
# pattern and beautifulsoup are for navigating through DOM
from pattern import web
import csv
# function 3
# NOTE(review): this is an early stub that only holds a docstring and
# implicitly returns None; it is shadowed by the full implementation of
# get_food_truck_link defined further down in this file (step 3).
def get_food_truck_link(url):
    '''For a given food-truck link inside the main_url, find the real
    link of the food truck and return it. (Stub -- superseded later.)'''
def get_city_url(url):
    '''Fetch *url* and return a dictionary mapping city name -> relative link.

    Only the sub-footer of the main page is parsed, because it contains
    the list of all cities and their links.
    Step 1.

    Returns an empty dict when the expected sub-footer markup is absent
    (page layout changed), instead of silently slicing garbage.
    '''
    # Make a GET request to the given url.
    r = requests.get(url)
    html = r.text
    # Locate the sub-footer marker; html.find returns -1 when missing,
    # which the original code did not guard against.
    subfooter_index = html.find('id="subfooter"')
    if subfooter_index == -1:
        return {}
    # Slice from the marker up to the closing </ul></div> of the footer.
    # find(needle, start) replaces the original double-slice arithmetic.
    end = html.find('</ul></div>', subfooter_index)
    footer = html[subfooter_index:end] if end != -1 else html[subfooter_index:]
    dom = web.Element(footer)
    city_link = {}
    for li in dom.by_tag('li'):
        anchor = li.by_tag('a')[0]
        # Missing href defaults to '' (same as original).
        city_link[anchor.content] = anchor.attributes.get('href', '')
    return city_link
def get_food_trucks(url):
    '''For a given city *url*, find all food trucks listed on that city's
    page and return a dict mapping truck name -> resolved truck link.

    Step 2.  Each listing links to an intermediate page; step 3
    (get_food_truck_link) is called to resolve the real link.

    Relies on the module-level ``main_url`` constant.
    '''
    r = requests.get(url)
    dom = web.Element(r.text)
    truck_name_link = {}
    for li in dom.by_tag('li.squarelisting'):
        # href looks like "/truck-name"; strip the leading slash.
        # BUGFIX: default to '' -- the original .get('href',) returned
        # None for a missing href, and None[1:] raises TypeError.
        truck_name = li.by_tag('a')[0].attributes.get('href', '')[1:]
        if not truck_name:
            # Listing without a usable href -- nothing to resolve.
            continue
        # Step 3 for finding the link of each truck is called from here.
        truck_name_link[truck_name] = get_food_truck_link(main_url + truck_name)
    return truck_name_link
def get_food_truck_link(url):
    '''For a given food-truck page inside the main_url, find the real
    link of the food truck and return it.

    Step 3.  Scans the page's 'div.widget cf' elements; as in the
    original, the last matching 'a.drawbutton' anchor wins.  Returns ''
    when no link is found (the original could return None or raise).
    '''
    r = requests.get(url)
    dom = web.Element(r.text)
    truck_link = ''
    for div in dom.by_tag('div.widget cf'):
        anchors = div.by_tag('a.drawbutton')
        # BUGFIX: guard the [0] index -- a widget div without a
        # drawbutton anchor raised IndexError in the original.
        if anchors:
            truck_link = anchors[0].attributes.get('href', '')
    return truck_link
main_url = 'http://roaminghunger.com/'

if __name__ == "__main__":
    # Step 1: get a dict of all cities and their link address on
    # roaminghunger.com.
    city_link = get_city_url(main_url)
    # Step 2: for each of these cities find its food trucks
    # (steps 2 and 3 happen inside get_food_trucks).
    city_food_trucks = {}
    for city, link in city_link.items():
        city_food_trucks[city] = get_food_trucks(main_url + link)
    # Step 4: write the whole data set to a tab-separated CSV file.
    # BUGFIX: newline='' is required by the csv module so it controls
    # line endings itself (otherwise blank rows appear on Windows).
    with open('data_food_truck.csv', 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter='\t',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['Food Truck', 'City', 'link'])
        for city, trucks in city_food_trucks.items():
            for food_truck, link in trucks.items():
                filewriter.writerow([food_truck, city, link])
    print('Done!!')