"""Scrape Upworthy headline text from its paginated listing pages.

Cleaned up from an interactive (IPython, Python 2) session: exploratory
duplicate statements removed, Python 3 print()/encoding used, the
garbled HTML-entity replacement restored to ``&#x27;`` -> ``'``, and the
shell escape ``!cat`` dropped.
"""
import re
from time import sleep

# Headline anchors carry rel="analytic_event"; capture the link text.
# Compiled once instead of per page.
HEADLINE_RE = re.compile(r'"analytic_event">(.*?)<')

# The first 12 matches on every page are sidebar / "most popular" links,
# not that page's own headlines — skip them (matches the original's [12:]).
SIDEBAR_LINKS = 12


def fetch_headlines(page_number):
    """Return the headline strings found on one Upworthy listing page.

    Network I/O happens here; ``requests`` is imported lazily so the rest
    of the module (regex extraction, file writing) works without it.
    """
    import requests  # third-party; only needed when actually scraping
    url = 'http://www.upworthy.com/page/' + str(page_number)
    response = requests.get(url)
    return HEADLINE_RE.findall(response.text)[SIDEBAR_LINKS:]


def scrape(first_page=2, last_page=79, delay=1.0):
    """Collect headlines from listing pages ``first_page``..``last_page``.

    Sleeps ``delay`` seconds between requests to be polite to the server.
    Defaults reproduce the original run: pages 2 through 79.
    """
    headlines = []
    for number in range(first_page, last_page + 1):
        headlines.extend(fetch_headlines(number))
        sleep(delay)
    return headlines


def save_headlines(headlines, path='upworthy_titles.txt'):
    """Write one headline per line to ``path`` (UTF-8).

    Decodes the one HTML entity the original handled: ``&#x27;`` (the
    apostrophe). NOTE(review): the original replacement literal was
    garbled in transit; ``&#x27;`` is the standard hex entity for ``'``.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        for title in headlines:
            title = title.replace('&#x27;', "'")
            outfile.write(title + '\n')


if __name__ == '__main__':
    titles = scrape()
    print(len(titles), 'headlines scraped')
    print(titles[:10])
    print(titles[-10:])
    save_headlines(titles)