#!/usr/bin/env python
# coding: utf-8

# # Scraping And Parsing A Wikipedia List into Pandas
#
# - **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)
# - **Date:** -
# - **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)
# - **Note:** -

# ### Preliminaries

# Import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd

# ### Create a beautiful soup object from the website

# URL of the Wikipedia list to scrape
url = 'http://en.wikipedia.org/wiki/List_of_airship_accidents'

# Scrape the HTML at the url.
# timeout prevents an indefinite hang; raise_for_status surfaces HTTP errors
# instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Turn the HTML into a Beautiful Soup object.
# Name the parser explicitly — omitting it triggers GuessedAtParserWarning
# and makes the chosen parser depend on what happens to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

# ### Parse the html into a list

# The structure of the page is such that if we take all the *li* items
# **without** a class, and then ignore the last three, we will have a clean
# list of all the disasters.
disasters = [row.text for row in soup.find_all('li', class_=False)[:-3]]

# ### Data wrangle the list into a dataframe

# Create a dataframe from the list
df = pd.DataFrame(disasters, columns=['raw'])

# Take everything before the ":" and call that the date variable.
# Raw string avoids invalid-escape warnings; expand=False pins a Series
# return so the column assignment is unambiguous across pandas versions.
df['date'] = df['raw'].str.extract(r'(^[^_]+(?=:))', expand=False)

# Take everything after the ":" and call that the description variable
df['description'] = df['raw'].str.extract(r':(.*)', expand=False)

# Parse the date strings into datetimes.
# errors='coerce' turns any unparseable scraped text into NaT instead of
# aborting the whole pipeline on one bad row.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Set the date variable to be the dataframe's index
df.index = df['date']

# Drop the working columns we no longer need
df = df.drop(['raw', 'date'], axis=1)

# ### View the results

# View the top of the dataframe
df.head()