"""Scrape Upworthy headline text from its paginated listing pages.

Cleaned up from an interactive (IPython, Python 2) session: exploratory
duplicate statements removed, Python 3 print()/encoding used, the
garbled HTML-entity replacement restored to ``&#x27;`` -> ``'``, and the
shell escape ``!cat`` dropped.
"""
import re
from time import sleep

# Headline anchors carry rel="analytic_event"; capture the link text.
# Compiled once instead of per page.
HEADLINE_RE = re.compile(r'"analytic_event">(.*?)<')

# The first 12 matches on every page are sidebar / "most popular" links,
# not that page's own headlines — skip them (matches the original's [12:]).
SIDEBAR_LINKS = 12


def fetch_headlines(page_number):
    """Return the headline strings found on one Upworthy listing page.

    Network I/O happens here; ``requests`` is imported lazily so the rest
    of the module (regex extraction, file writing) works without it.
    """
    import requests  # third-party; only needed when actually scraping
    url = 'http://www.upworthy.com/page/' + str(page_number)
    response = requests.get(url)
    return HEADLINE_RE.findall(response.text)[SIDEBAR_LINKS:]


def scrape(first_page=2, last_page=79, delay=1.0):
    """Collect headlines from listing pages ``first_page``..``last_page``.

    Sleeps ``delay`` seconds between requests to be polite to the server.
    Defaults reproduce the original run: pages 2 through 79.
    """
    headlines = []
    for number in range(first_page, last_page + 1):
        headlines.extend(fetch_headlines(number))
        sleep(delay)
    return headlines


def save_headlines(headlines, path='upworthy_titles.txt'):
    """Write one headline per line to ``path`` (UTF-8).

    Decodes the one HTML entity the original handled: ``&#x27;`` (the
    apostrophe). NOTE(review): the original replacement literal was
    garbled in transit; ``&#x27;`` is the standard hex entity for ``'``.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        for title in headlines:
            title = title.replace('&#x27;', "'")
            outfile.write(title + '\n')


if __name__ == '__main__':
    titles = scrape()
    print(len(titles), 'headlines scraped')
    print(titles[:10])
    print(titles[-10:])
    save_headlines(titles)