# requests is for downloading data (text) from the web.
import requests
# pattern's web module is for navigating the DOM.
from pattern import web
import csv


def get_city_url(url):
    '''Fetch and return a dictionary of all cities and their links
    found on the main page. Step 1.'''
    # Make a GET request to the given url.
    r = requests.get(url)
    html = r.text
    # The sub-footer of the main page contains the list of all cities and
    # their links, so only the sub-footer is parsed. The slice is assumed
    # to end at the sub-footer's closing </div>.
    subfooter_index = html.find('id="subfooter"')
    footer = html[subfooter_index:
                  html[subfooter_index:].find('</div>') + subfooter_index]
    dom = web.Element(footer)
    city_link = {}
    for li in dom.by_tag('li'):
        city = li.by_tag('a')[0].content
        inner_link = li.by_tag('a')[0].attributes.get('href', '')
        city_link[city] = inner_link
    return city_link


def get_food_trucks(url):
    '''For a given city url, find all food trucks listed on that city's
    page. Step 2.'''
    r = requests.get(url)
    dom = web.Element(r.text)
    truck_name_link = {}
    for li in dom.by_tag('li.squarelisting'):
        # The href starts with '/', which is stripped so the name can be
        # appended to main_url.
        truck_name = li.by_tag('a')[0].attributes.get('href', '')[1:]
        # Step 3 (finding the real link of each truck) is called from here.
        truck_link = get_food_truck_link(main_url + truck_name)
        truck_name_link[truck_name] = truck_link
    return truck_name_link


def get_food_truck_link(url):
    '''For a given food truck page inside main_url, find and return the
    truck's real (external) link. Step 3.'''
    r = requests.get(url)
    dom = web.Element(r.text)
    truck_link = ''
    for div in dom.by_tag('div.widget cf'):
        truck_link = div.by_tag('a.drawbutton')[0].attributes.get('href', '')
    return truck_link


main_url = 'http://roaminghunger.com/'

if __name__ == "__main__":
    # Step 1: get a dict of all cities and their link addresses on roaminghunger.com.
    city_link = get_city_url(main_url)

    # Step 2: for each of these cities, find its food trucks.
    city_food_trucks = {}
    for city in city_link:
        food_trucks = get_food_trucks(main_url + city_link[city])
        city_food_trucks[city] = food_trucks

    # Step 4: write the whole data set to a csv file.
    with open('data_food_truck.csv', 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter='\t', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['Food Truck', 'City', 'link'])
        for city in city_food_trucks:
            for food_truck, link in city_food_trucks[city].items():
                filewriter.writerow([food_truck, city, link])
    print('Done!!')
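
# --------------------------------------------------------------------------
# A minimal offline sketch (kept in comments so it does not run with the
# scraper) of how the web.Element / by_tag navigation used above behaves.
# The HTML snippet and the city name are made up for illustration, not
# taken from roaminghunger.com:
#
#   from pattern import web
#
#   snippet = '<ul><li><a href="/food-trucks/example-city">Example City</a></li></ul>'
#   dom = web.Element(snippet)
#   for li in dom.by_tag('li'):
#       a = li.by_tag('a')[0]
#       print(a.content, a.attributes.get('href', ''))
#   # prints: Example City /food-trucks/example-city
# --------------------------------------------------------------------------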