#!/usr/bin/env python
# coding: utf-8

# # Queensland State Archives, Naturalisations, 1851 to 1904
#
# ## Add series information to index

# The [Naturalisations, 1851 to 1904](https://data.qld.gov.au/dataset/naturalisations-1851-to-1904) index is available from the Queensland Government data portal. The notes explain:
#
# > This index was created from various records detailing the names of those who took oaths of allegiance to be naturalised as created by the Supreme Court across Queensland as well as the Colonial Secretary's Office and the Government Residents Office.
#
# What isn't clear from this description, however, is that the index collates name entries from a number of different series, with a separate row for each name reference. This means that there can be multiple rows referring to the naturalisation of a single individual. This is important to keep in mind if you're trying to analyse aggregate data relating to naturalisations in Queensland.
#
# This notebook adds series information to the original index so that you can filter the data by series.

# In[1]:

import pandas as pd
import altair as alt
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from IPython.display import display, HTML, FileLink

alt.renderers.enable('notebook')

# ### Load the data

# In[2]:

# The encoding option is necessary to avoid unicode errors
df = pd.read_csv('https://data.qld.gov.au/dataset/91970fa7-d3c3-4171-a89d-410481cb90e9/resource/7b5ddae5-78ef-4d8e-b800-56e6f30d26d5/download/naturalisations-1851-1908.csv', encoding='ISO-8859-1', keep_default_na=False)

# In[3]:

df.head()

# In[4]:

# How many rows?
len(df)

# ### Add series information

# One way of removing duplicates is to filter the results by series. However, while each entry includes an `Item ID`, it doesn't include a series identifier. To get the series information we have to request the item details web page, and scrape the series information from it.
#
# Rather than loop through the whole dataset, we'll grab all the unique `Item ID` values first, get the series information for each, then merge this data back into the original dataset.

# In[5]:

def get_series(row):
    '''
    Get the series id and title for the item identifier in the given row.
    '''
    response = requests.get('http://www.archivessearch.qld.gov.au/Search/ItemDetails.aspx', params={'ItemId': row['item_id']})
    soup = BeautifulSoup(response.text, 'html.parser')
    series_id = soup.find(id='ctl00_cphMain_RecordDetailsView_SeriesFormView_SERIES_IDLabel').string
    series_title = soup.find('a', id='ctl00_cphMain_RecordDetailsView_SeriesFormView_TitleHyperLink').string
    return pd.Series([series_id, series_title])


tqdm.pandas(desc="Progress")

# Get the unique item ids
item_ids = pd.DataFrame(df['Item ID'].unique())
item_ids.columns = ['item_id']

# Get series data for each item id
item_ids[['series_id', 'series_title']] = item_ids.progress_apply(get_series, axis=1)

# Now that we have a dataframe linking item ids to series information, we can find out something about the series represented in the original dataset.

# In[6]:

# Display long series titles in full
pd.set_option('display.max_colwidth', None)

# List the indexed series
item_ids[['series_id', 'series_title']].drop_duplicates()

# Merge the series data back into the original dataset.

# In[7]:

pd.set_option('display.max_colwidth', 50)

# Merge the series data into the original dataset
qld_df = pd.merge(df, item_ids, left_on='Item ID', right_on='item_id', how='left')
qld_df.head()
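# Because the merge uses `how='left'`, any rows whose `Item ID` wasn't matched in the scraped data will end up with missing series values. As a quick sanity check (just a sketch, assuming the merge above has run), we can count the unmatched rows:

# In[ ]:

# Count rows that didn't pick up any series information in the merge
# (should be zero if every Item ID was resolved successfully)
qld_df['series_id'].isnull().sum()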
# ### Save as CSV
#
# Save the enriched dataset as a CSV file.

# In[8]:

qld_df.to_csv('qsa_naturalisations_index_with_series.csv', index=False)
display(FileLink('qsa_naturalisations_index_with_series.csv'))

# ### Explore by series
#
# Now that we've associated each entry in the original dataset with a series, we can break the data down by series to better understand the content of the index.

# In[9]:

# Number of entries per series
qld_df['series_id'].value_counts()

# In[10]:

def filter_by_series(series_id):
    '''
    Filter dataset by series id.
    '''
    filtered_df = qld_df.loc[qld_df['series_id'] == series_id].copy()
    return filtered_df


def get_counts_by_year(df):
    '''
    Aggregate data by year and prepare for charting.
    '''
    # Count the number of entries in each year
    counts = df['Year'].groupby([df['Year']]).agg('count').to_frame()
    counts.columns = ['count']
    counts = counts.reset_index()
    # Filter out date errors
    counts = counts.loc[(counts['Year'] > 0) & (counts['Year'] < 1910)].copy()
    # Create a datetime field for charting
    counts['date'] = pd.to_datetime(counts['Year'], format='%Y')
    return counts


# In[11]:

df_5177 = filter_by_series('5177')
df_5741 = filter_by_series('5741')

c1 = alt.Chart(get_counts_by_year(df_5177)).mark_bar(size=10).encode(
    x=alt.X('year(date):T', title='Year'),
    y='count:Q'
).properties(
    width=700,
    title='Series 5177'
)

c2 = alt.Chart(get_counts_by_year(df_5741)).mark_bar(size=10).encode(
    x=alt.X('year(date):T', title='Year'),
    y='count:Q'
).properties(
    width=700,
    title='Series 5741'
)

c1 & c2
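# If your analysis needs (roughly) one row per individual, one option is to work with a single series rather than the whole index. Here's a minimal sketch following the save-to-CSV pattern above; series 5177 and the output filename are just examples, and which series best suits your research question will depend on what you're analysing.

# In[ ]:

# Save one series to its own CSV file (series 5177 used as an example)
df_5177.to_csv('qsa_naturalisations_series_5177.csv', index=False)
display(FileLink('qsa_naturalisations_series_5177.csv'))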