First, let's test out the scraping.
from bs4 import BeautifulSoup
import urllib2
url = "https://www.fanfiction.net/book/Harry-Potter/?&srt=1&lan=1&r=10&p=1"
# First pull the search page
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'html.parser')
# Look at the links (peeked at the HTML using web inspector)
links = soup.find_all('a', {'class': 'stitle'})
links[:2]
# See what the links look like
links[0]['href']
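# The hrefs are site-relative and look like /s/<story_id>/<chapter>/<Story-Title>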
# Form the full urls from the links
urls = ["https://www.fanfiction.net" + link['href'] for link in links]
urls[:10]
# Looks fine. Now paginate across the search result pages
all_urls = []
# Note: range(x,y) is inclusive of the first number, exclusive of the second number
# So range(1,3) will include 1 and 2 but not 3
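# For example:
range(1, 3)  # -> [1, 2]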
for i in range(1,26):
    print "Page " + str(i) + ", up to " + str(len(all_urls))
    url = "https://www.fanfiction.net/book/Harry-Potter/?&srt=1&lan=1&r=10&p=" + str(i)
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    links = soup.find_all('a', {'class': 'stitle'})
    urls = ["https://www.fanfiction.net" + link['href'] for link in links]
    # To add one list's elements to another list, use .extend instead of .append
    all_urls.extend(urls)
len(all_urls)
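# Hammering the site with hundreds of back-to-back requests can trip
# rate limits or transient network errors. A minimal retry helper is
# sketched below; the name fetch() and the 3-retries / 5-second-pause
# values are my own choices, not anything the site specifies:
import time

def fetch(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url).read()
        except urllib2.URLError:
            time.sleep(delay)
    raise IOError("Failed to fetch " + url)

# Usage would be e.g. soup = BeautifulSoup(fetch(url), 'html.parser')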
# Now let's take a look at a single page
url = all_urls[0]
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'html.parser')
div = soup.find('div', {'class': 'storytext'})
text = div.get_text()
# get_text() returns unicode; drop any non-ASCII characters so it writes cleanly as plain text
text = text.encode("ascii", "ignore")
text[:500]
# Now let's write it to a file
story_id = url.split("/")[4]
filename = "hp/" + story_id + ".txt"
with open(filename, 'w') as f:
    f.write(text)
# That was pretty easy. Let's grab all of them, keep the texts in a list,
# and save each one to a file named after its story id from the URL
all_texts = []
for url in all_urls:
    story_id = url.split("/")[4]
    filename = "hp/" + story_id + ".txt"
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    div = soup.find('div', {'class': 'storytext'})
    text = div.get_text()
    text = text.encode("ascii", "ignore")
    all_texts.append(text)
    with open(filename, 'w') as f:
        f.write(text)
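# A few of the stories may have been deleted or hidden since the search
# pages were fetched; in that case find() returns None and get_text()
# raises an AttributeError. A more defensive version of the same loop
# would skip those entries (skip-and-continue is my choice here, not
# anything the site documents):
for url in all_urls:
    story_id = url.split("/")[4]
    filename = "hp/" + story_id + ".txt"
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page.read(), 'html.parser')
    div = soup.find('div', {'class': 'storytext'})
    if div is None:
        print "No story text at " + url + ", skipping"
        continue
    with open(filename, 'w') as f:
        f.write(div.get_text().encode("ascii", "ignore"))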