First, let's test out the scraping.
from bs4 import BeautifulSoup
import urllib2
url = "https://www.fanfiction.net/book/Harry-Potter/?&srt=1&lan=1&r=10&p=1"
# First pull the search page
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'html.parser')
# Look at the links (peeked at the HTML using web inspector)
links = soup.find_all('a', {'class': 'stitle'})
links[:2]
# See what the links look like
links[0]['href']
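# The hrefs are site-relative and look like /s/<story_id>/<chapter>/<Story-Title>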
# Form the full urls from the links
urls = ["https://www.fanfiction.net" + link['href'] for link in links]
urls[:10]
# Looks fine. Now paginate across the search result pages
all_urls = []
# Note: range(x,y) is inclusive of the first number, exclusive of the second number
# So range(1,3) will include 1 and 2 but not 3
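# For example:
range(1, 3)  # -> [1, 2]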
for i in range(1,26):
    print "Page " + str(i) + ", up to " + str(len(all_urls))
    url = "https://www.fanfiction.net/book/Harry-Potter/?&srt=1&lan=1&r=10&p=" + str(i)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    links = soup.find_all('a', {'class': 'stitle'})
    urls = ["https://www.fanfiction.net" + link['href'] for link in links]
    # To add one list's elements to another list, use .extend instead of .append
    all_urls.extend(urls)
len(all_urls)
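# Hammering the site with hundreds of back-to-back requests can trip
# rate limits or transient network errors. A minimal retry helper is
# sketched below; the name fetch() and the 3-retries / 5-second-pause
# values are my own choices, not anything the site specifies:
import time

def fetch(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url).read()
        except urllib2.URLError:
            time.sleep(delay)
    raise IOError("Failed to fetch " + url)

# Usage would be e.g. soup = BeautifulSoup(fetch(url), 'html.parser')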
# Now let's take a look at a single page
url = all_urls[0]
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'html.parser')
div = soup.find('div', {'class': 'storytext'})
text = div.get_text()
# get_text() returns unicode; drop any non-ASCII characters so it writes cleanly as plain text
text = text.encode("ascii", "ignore")
text[:500]
# Now let's write it to a file
story_id = url.split("/")[4]
filename = "hp/" + story_id + ".txt"
with open(filename, 'w') as f:
    f.write(text)
# That was pretty easy. Let's grab all of them, keep the texts in a list,
# and save each one to a file named after its story id from the URL
all_texts = []
for url in all_urls:
    story_id = url.split("/")[4]
    filename = "hp/" + story_id + ".txt"
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    div = soup.find('div', {'class': 'storytext'})
    text = div.get_text()
    text = text.encode("ascii", "ignore")
    all_texts.append(text)
    with open(filename, 'w') as f:
        f.write(text)
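# A few of the stories may have been deleted or hidden since the search
# pages were fetched; in that case find() returns None and get_text()
# raises an AttributeError. A more defensive version of the same loop
# would skip those entries (skip-and-continue is my choice here, not
# anything the site documents):
for url in all_urls:
    story_id = url.split("/")[4]
    filename = "hp/" + story_id + ".txt"
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    div = soup.find('div', {'class': 'storytext'})
    if div is None:
        print "No story text at " + url + ", skipping"
        continue
    with open(filename, 'w') as f:
        f.write(div.get_text().encode("ascii", "ignore"))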