# Import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
# URL of the Wikipedia page that lists airship accidents
url = 'http://en.wikipedia.org/wiki/List_of_airship_accidents'
# Download the page. The timeout stops the script from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Turn the HTML into a Beautiful Soup object. Naming the parser explicitly
# avoids the "no parser was explicitly specified" warning and keeps the parse
# tree the same regardless of which optional parsers are installed.
soup = BeautifulSoup(r.text, 'html.parser')
# Create a list for the scraping results
disasters = []
The structure of the page is such that if we take all the li items that have no class attribute, and then ignore the last three, we will have a clean list of all the disasters.
# Collect the text of every <li> element that has no class attribute,
# skipping the last three matches, and append each one to disasters.
# NOTE(review): the [:-3] cut-off depends on the page's current layout —
# confirm against the live page if the results look wrong.
disasters.extend(
    item.text for item in soup.find_all('li', class_=False)[:-3]
)
# Create a dataframe with one row per scraped list item
df = pd.DataFrame(disasters, columns=['raw'])
# The date is everything before the FIRST ":". Excluding ":" from the capture
# keeps the match from running on to a later colon inside the description.
# expand=False makes extract() return a Series for the single capture group.
df['date'] = df['raw'].str.extract(r'^([^:]+):', expand=False)
# The description is everything after the first ":" (raw string, so the
# pattern contains no invalid escape sequence)
df['description'] = df['raw'].str.extract(r':(.*)', expand=False)
# Convert the date strings to datetime values
df['date'] = pd.to_datetime(df['date'])
# Use the date as the dataframe's index (this also removes the date column)
# and drop the raw text, leaving only the description
df = df.set_index('date').drop(columns=['raw'])
# View the top of the dataframe
df.head()
| date | description |
|---|---|
| 2 May 1902 | Semi-rigid airship Pax explodes over Paris, k... |
| 13 October 1902 | Separation of gondola from envelope over Pari... |
| 30 November 1907 | Loss of the French Army's Patrie - no fatalit... |
| 23 May 1908 | Morrell airship falls over Berkeley, Californ... |
| 4 August 1908 | Zeppelin LZ 4 caught fire near Echterdingen a... |