#!/usr/bin/env python
# coding: utf-8

# # Web Scraping with the Art of Literary Text Analysis
#
# Our objective here is to create a local copy of all the articles in the journal _Digital Humanities Quarterly_. As always, there are multiple ways to accomplish this, but this will be our plan:
#
# * find a page that lists all the articles (since one conveniently exists)
# * fetch its contents
# * create a list of URLs for the titles
# * loop through each URL and save the contents locally
#
# We'll begin by fetching the URL, something we already saw in the [Getting Started](GettingStarted.ipynb) notebook: we import the built-in [urllib.request](https://docs.python.org/3/library/urllib.request.html#module-urllib.request) library, define the URL to fetch, and fetch it (as HTML source code, so plain text).

# In[1]:

import urllib.request

urlRoot = "http://digitalhumanities.org"
titlesUrl = "http://digitalhumanities.org/dhq/index/title.html" # define the URL
titlesSource = urllib.request.urlopen(titlesUrl).read() # fetch the source from the URL
titlesSource # preview the contents from the URL


# We have our source code. The next step is to find the list of titles in the document.

# In[105]:

from bs4 import BeautifulSoup

titlesSoup = BeautifulSoup(titlesSource, "html.parser") # parse the source document (naming the parser explicitly avoids a warning)
titleIndex = titlesSoup.find(id="titleIndex") # find the tag with id="titleIndex"


# The next step will be to compile a set of URLs, but before we do that we'll take a detour to explain the concepts of lists and loops in Python.
#
# A list in Python is an ordered set of elements, sometimes called an array in other languages. A list can contain just about any data type, including strings, numbers, objects, and even other lists.
#
# One very easy way to create a list in Python is with square brackets.

# In[106]:

numbers = [0, 1, 2, 3, 4, 5] # new list of numbers
numbers # preview the list


# Now let's say we want to do something with each element in the list; that's called iterating or looping. In Python, as with many languages, there's the concept of a [for loop](https://docs.python.org/3/tutorial/controlflow.html?highlight=loop#for-statements). In this example we'll create tens from our single digits.

# In[107]:

tens = [] # create an empty list for our tens
for number in numbers: # for each number in our numbers list
    tens.append(number*10) # add to our tens list the number times ten
tens # preview the output


# Another useful concept is the [conditional structure](http://en.wikibooks.org/wiki/Python_Programming/Conditional_Statements) in Python, where we test for a boolean value (true or false).
#
# Python uses a colon and indentation to indicate the parts of the conditional block. If we want to execute a block when a condition evaluates to true (like ```1 < 5```, one _is_ smaller than five):
#
#     if condition:
#         block
#
# Or if a condition is not true (like ```1 > 5```, one _is not_ smaller than five):
#
#     if not condition:
#         block
#
# Using this syntax we can determine if one string is in another.

# In[108]:

if "dhq" in titlesUrl:
    print("dhq is in ", titlesUrl)

if not "qhd" in titlesUrl:
    print("qhd is not in ", titlesUrl)


# Now that we've introduced lists, conditionals and the `in` operator we can understand the code below more easily. This code uses the BeautifulSoup [find_all](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all) function to create a list of all the `<a>` (anchor or link) tags in our document; we'll warm up with a toy example first.
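# Here's that warm-up: a minimal sketch of `find_all` and attribute subscripting using a made-up HTML snippet (the snippet and the `snippet`/`snippetSoup` names are illustrative only, not actual DHQ markup):

# In[ ]:

from bs4 import BeautifulSoup # already imported above; repeated so this cell stands alone

snippet = '<p><a href="/dhq/vol/1/1/000001.html">An article</a> and <a href="#top">a page anchor</a></p>' # toy HTML
snippetSoup = BeautifulSoup(snippet, "html.parser") # parse the toy snippet
for a in snippetSoup.find_all("a"): # find_all returns a list of matching tags
    print(a['href']) # subscripting a tag by attribute name gives that attribute's value


# Note how the second link is an internal #anchor rather than an article URL, which is why the real code below filters for "/dhq/vol/".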
# In[109]:

urls = [] # create a list of URLs
anchors = titleIndex.find_all("a") # create a list by finding all <a> (link) tags
for a in anchors: # loop through each element of the anchors list
    href = a['href'] # determine what the href (URL) is for the <a> tag using subscripting (explained later)
    if "/dhq/vol/" in href: # ensure that we have a proper article URL (not an internal #anchor)
        urls.append(href) # add the href to the urls list
urls # preview


# As a digression, there's a more "Pythonic" way of doing things, which is to use a [list comprehension](https://docs.python.org/3/tutorial/datastructures.html?highlight=list%20comprehension#list-comprehensions). This syntax is very compact and convenient (though arguably not as legible). The entire looping block above can be accomplished in one line. The two approaches are identical (in this case) and you should be familiar with both syntaxes.

# In[110]:

urls = [a['href'] for a in titleIndex.find_all("a") if "/dhq/vol/" in a['href']]
urls


# Before saving a local copy of our articles we'll create a data directory to which we can save our files.

# In[111]:

import os

directory = "data" # data directory
if not os.path.exists(directory): # if the directory doesn't exist
    os.makedirs(directory) # create it


# Now we can go ahead and fetch the URLs, writing out the results as we go along. Some concepts won't be explained immediately, such as [regular expressions](https://docs.python.org/3/library/re.html) (`re`) and the `with open` syntax.

# In[113]:

import re
import time

for url in set(urls): # for each URL in our de-duplicated set of URLs
    clean_url = re.sub(r'\s+', "", url) # clean our URL of superfluous whitespace
    filename = os.path.basename(clean_url) # determine the filename
    path = os.path.join(directory, filename) # construct a full path
    if os.path.exists(path): # if we already have the file locally
        print("already fetched:", clean_url) # let the user know we're skipping this one
    else: # otherwise we need to fetch it
        print("fetching:", clean_url) # let the user know we're fetching this one
        contents = urllib.request.urlopen(urlRoot+clean_url).read() # grab the contents of the URL
        with open(path, "wb") as f: # open the file for (binary) writing
            f.write(contents) # write the contents
        time.sleep(1) # be nice to the server by waiting between requests


# And there we are, we now have a local copy of all the articles from _Digital Humanities Quarterly_!
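# As a quick sanity check (a minimal sketch, not part of the original recipe; the `saved` name is just illustrative), we can count the files we saved and preview the first few bytes of one of them:

# In[ ]:

saved = sorted(os.listdir(directory)) # list the files written to our data directory
print(len(saved), "files in", directory) # how many articles we have locally
if saved: # if at least one file was saved
    with open(os.path.join(directory, saved[0]), "rb") as f: # open the first file for (binary) reading
        print(f.read(50)) # preview its first 50 bytes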
# ---
# [CC BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) From [The Art of Literary Text Analysis](ArtOfLiteraryTextAnalysis.ipynb) by [Stéfan Sinclair](http://stefansinclair.name) & [Geoffrey Rockwell](http://geoffreyrockwell.com). Created January 24, 2019 (Jupyter 5), last updated January 31, 2019.