#!/usr/bin/env python
# coding: utf-8

# ## How to download and store lots of files
#
# [**Neal Caren**](mailto:neal.caren@gmail.com)
# University of North Carolina, Chapel Hill
#
# Web scraping frequently involves analyzing hundreds or thousands of files. Figuring out what data you need from a page and the best way to extract it is often an iterative process. After processing all the documents, you might discover that your scraper did not work the way you thought it did. The format might be different for a subsample of the pages, and your scraper returns nothing for those. Six months later, you might want to get one more piece of information from each page. Downloading web pages or using an API takes time and costs the provider money. The content of pages and their availability also change over time. All of this means that it makes the most sense to gather your web data and store it locally before you start analyzing it.
#
# This lesson walks through the basic steps for downloading and storing data. It assumes that you have some basic knowledge of Python and a list of URLs you want to download.
#
# I begin by importing the standard library for accessing web pages or APIs, [requests](http://docs.python-requests.org/en/master/).

# In[1]:

import requests

# No matter how many URLs I eventually need to download, I start with a sample one to make sure the process works before creating a loop. In this case, I'm interested in Fox News opinion pieces:

# In[2]:

url = 'https://www.foxnews.com/opinion/gutfeld-on-hiring-combat-vets-to-defend-schools'

# Using `requests`, I access the page and retrieve the HTML.

# In[3]:

r = requests.get(url)
html = r.text

# Sometimes this works, but the actual content is not what you get when you access the web page in a browser. To confirm that things went as planned, I display the contents of the file the way a browser would.

# In[4]:

from IPython.display import HTML

HTML(html)

# Since that worked, I want to save the file. I create a directory to store what will eventually be numerous files.

# In[5]:

import os

os.mkdir('fox-html')

# If the directory already exists, this command will produce an error.
#
# Before the file is saved, it needs a name. URLs often contain characters that cause trouble in file names, such as `\`, `:` or `?`. I use slugify to create an operating-system-safe name for the file. The package may need to be added to your Python installation.

# In[6]:

get_ipython().system('pip install python-slugify')

# slugify creates a filename that is both usable by your computer and human readable.

# In[7]:

from slugify import slugify

slugify(url)

# Saving text files is one of the less intuitive aspects of Python. In this case, we also need to create the file name and make sure the file is put in the correct directory.

# In[8]:

file_name = slugify(url)
directory = "fox-html"
location = os.path.join(directory, file_name)

with open(location, "w") as outfile:
    outfile.write(html)
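# As a quick sanity check that is not part of the original lesson, you can also confirm that the request itself succeeded before relying on the saved HTML. The sketch below uses the response's `status_code` attribute; a 200 status does not guarantee the page matches what a browser shows, but it catches obvious failures such as 404s and server errors.

# In[ ]:

# A minimal sketch: confirm the server returned the page before trusting r.text.
if r.status_code == 200:
    print("Request succeeded; page length:", len(html))
else:
    print("Request failed with status code", r.status_code)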
# I'll use this process of downloading and saving many times, so I put it in a function. It takes a single parameter, the URL, which is used both to download the page and to create the filename for the saved HTML. I put the construction of the file location in a separate function because I will need that piece later in a different context. I also have the script pause for three seconds so the web server is not bombarded by our program.

# In[26]:

from time import sleep


def locate(url):
    '''Create the file name and place it in the directory.'''
    file_name = slugify(url)
    location = os.path.join("fox-html", file_name)
    return location


def get_html(url):
    """Download and save the HTML text, using the URL to create the filename."""
    sleep(3)
    r = requests.get(url)
    html = r.text
    location = locate(url)
    with open(location, "w") as outfile:
        outfile.write(html)

# When it works, it does not return or display anything.

# In[10]:

get_html(url)

# Note that the project-specific "fox-html" directory is hard-coded into the function. If you copy and paste this function, be sure to update it to whatever directory name you are using.
#
# The function can now be used to download many URLs. For example, if you had a list of six URLs, you could loop over them.

# In[11]:

urls = [
    "https://www.foxnews.com/opinion/newt-gingrich-i-saw-the-south-korean-miracle-for-myself-it-was-incredible",
    "https://www.foxnews.com/opinion/gutfeld-on-hiring-combat-vets-to-defend-schools",
    "https://www.foxnews.com/opinion/doug-schoen-amazons-cancellation-of-a-move-to-nyc-is-bad-news-and-could-hurt-far-left-dems-at-polls",
    "https://www.foxnews.com/opinion/amazon-quits-new-york-victory-by-progressives-helps-city-wave-good-bye-to-25000-jobs",
    "https://www.foxnews.com/opinion/william-barr-is-our-new-attorney-general-here-are-four-things-he-should-focus-on-right-away",
    "https://www.foxnews.com/opinion/karl-rove-trumps-approval-numbers-are-going-up-three-things-could-keep-his-momentum-going",
]

# In[12]:

for url in urls:
    get_html(url)

# If something went wrong, the loop would stop with an error message, but we can confirm that it worked by listing the files in the html directory.

# In[13]:

os.listdir('fox-html')

# If you had a modest-sized list of files to download, you are done. You now have a directory with all your files, ready for subsequent analysis.
#
# If you have a larger list of files, however, you would want the loop to be robust to errors and to be able to restart without re-downloading files you have already collected.
#
# The `get_html` code can be modified so that if something goes wrong with the download, it prints out an error message and then pauses for ten seconds. Since the problem may be related to the server, you want to slow down the process when things are not going well.

# In[27]:

def get_html(url):
    """Download and save the HTML text, using the URL to create the filename."""
    sleep(3)
    try:
        r = requests.get(url)
        html = r.text
    except:
        print("Problem with", url)
        sleep(10)
    else:
        location = locate(url)
        with open(location, "w") as outfile:
            outfile.write(html)

# The loop can also be modified so that each file is downloaded from the internet only once. If you are downloading thousands of files, something will go wrong, and you don't want all your work to go to waste.
#
# The Python way to include this check in the loop is to first confirm whether or not the file exists. If it doesn't exist, because our attempt to open it fails, then, and only then, do we download it.
#
# I first write a helper function that attempts to load the file based on the URL name. This takes advantage of our previously defined `locate` function.

# In[28]:

def confirm_file(url):
    """Attempt to open a file based on the url."""
    location = locate(url)
    with open(location, "r") as infile:
        html = infile.read()
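# As a side note, an alternative to catching the exception is to check for the file directly. The sketch below is not part of the original lesson; it uses `os.path.exists` together with the `locate` helper to ask the same question as `confirm_file`. The helper name is hypothetical.

# In[ ]:

def already_downloaded(url):
    """Sketch of an alternative check: True if the file for this URL is already on disk."""
    return os.path.exists(locate(url))

# Either check works; the lesson sticks with the try/except pattern in the loop below.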
# I now update the loop: first it tries to open the file, and only when that fails does it download the page.

# In[31]:

for url in urls:
    try:
        confirm_file(url)
    except:
        get_html(url)

# When you are running a loop like this on a modest number of URLs, you might add a `print` statement for each URL just to make sure that things are progressing. For 100,000 URLs, however, this will clog your screen and your computer's memory, so make sure to remove it before letting the script run wild.
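# If you still want some sense of progress on a large run, one option (sketched below; not part of the original lesson) is to print a status line only occasionally, for example once every hundred URLs.

# In[ ]:

# A sketch of occasional progress reporting: one line every 100 URLs
# instead of one line per page.
for i, url in enumerate(urls):
    try:
        confirm_file(url)
    except:
        get_html(url)
    if i % 100 == 0:
        print("Processed", i, "of", len(urls), "URLs")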