#!/usr/bin/env python
# coding: utf-8

# # Scraping And Parsing A Wikipedia List into Pandas
#
# - **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)
# - **Date:** -
# - **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)
# - **Note:** -

# ### Preliminaries

# Import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd

# ### Create a beautiful soup object from the website

# URL of the Wikipedia list to scrape
url = 'http://en.wikipedia.org/wiki/List_of_airship_accidents'

# Scrape the HTML at the url.
# timeout prevents an indefinite hang; raise_for_status surfaces HTTP errors
# instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Turn the HTML into a Beautiful Soup object.
# Name the parser explicitly — omitting it triggers GuessedAtParserWarning
# and makes the chosen parser depend on what happens to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

# ### Parse the html into a list

# The structure of the page is such that if we take all the *li* items
# **without** a class, and then ignore the last three, we will have a clean
# list of all the disasters.
disasters = [row.text for row in soup.find_all('li', class_=False)[:-3]]

# ### Data wrangle the list into a dataframe

# Create a dataframe from the list
df = pd.DataFrame(disasters, columns=['raw'])

# Take everything before the ":" and call that the date variable.
# Raw string avoids invalid-escape warnings; expand=False pins a Series
# return so the column assignment is unambiguous across pandas versions.
df['date'] = df['raw'].str.extract(r'(^[^_]+(?=:))', expand=False)

# Take everything after the ":" and call that the description variable
df['description'] = df['raw'].str.extract(r':(.*)', expand=False)

# Parse the date strings into datetimes.
# errors='coerce' turns any unparseable scraped text into NaT instead of
# aborting the whole pipeline on one bad row.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Set the date variable to be the dataframe's index
df.index = df['date']

# Drop the working columns we no longer need
df = df.drop(['raw', 'date'], axis=1)

# ### View the results

# View the top of the dataframe
df.head()