#!/usr/bin/env python
# coding: utf-8

# # Create an index to the harvested files
# 
# The XML files contain embedded metadata that includes the name of the prime minister, and the title and date of the transcript. This notebook extracts that metadata from the harvested files and creates a CSV formatted spreadsheet for easy analysis. It also demonstrates some ways of summarising and visualising the metadata.

# In[113]:


import os
from bs4 import BeautifulSoup
import arrow
import pandas as pd
import altair as alt

# Set up Altair
#alt.renderers.enable('notebook')
alt.renderers.enable('default')
alt.data_transformers.enable('json')


# ## Extract the metadata as save as a CSV

# In[114]:


def get_tag(soup, tag):
    '''
    Given some Soup, find the specified tag and return its value.
    '''
    try:
        value = soup.find(tag).string.strip()
    except AttributeError:
        value = ''
    return value

# Create a list to put the metadata in
all_details = []

# Get the file names of all the harvested files
files = [f for f in os.listdir('transcripts') if f[-4:] == '.xml']

# Loop through the harvested files
for filename in files:
    
    # Open the file
    with open(os.path.join('transcripts', filename), 'rb') as xml_file:
        
        # Create a dict to put this file's metadata in
        details = {}
        
        # Load the file contents into Soup and then get the desired tags
        soup = BeautifulSoup(xml_file.read())
        details['id'] = get_tag(soup, 'transcript-id')
        details['title'] = get_tag(soup, 'title')
        details['pm'] = get_tag(soup, 'prime-minister')
        
        # We're going to reformat the date into the ISO standard, so first get the value
        release_date = get_tag(soup, 'release-date')
        try:
            # Then try to parse the date and reformat as ISO
            iso_date = arrow.get(release_date, 'DD/MM/YYYY').format('YYYY-MM-DD')
        except:
            # If something goes wrong...
            iso_date = ''
        details['date'] = iso_date
        details['release_type'] = get_tag(soup, 'release-type')
        details['subjects'] = get_tag(soup, 'subjects')
        details['pdf'] = get_tag(soup, 'document')
        
        # Add the metadata for this file to the list
        all_details.append(details)
        

# In[115]:


# Convert the metadata to a Pandas dataframe
df = pd.DataFrame(all_details)
df.head()


# Save the index as a CSV formatted file.

# In[116]:


# Save the metadata as a CSV file
df.to_csv('index.csv', index=False)


# ## Analyse the metadata

# In[117]:


# This loads the metadata from the CSV
# This is not necessary if you've just generated the df
df = pd.read_csv('index.csv', keep_default_na=False)


# Let's have a look at the number of transcripts for each Prime Minister.

# In[118]:


df['pm'].value_counts()


# Note that there are 73 transcripts that have no Prime Minister associated with them. We'll exclude those and chart the rest.

# In[112]:


# We're exlusing those transcripts where the 'pm' value is empty - df.loc[df['pm'] != '']
alt.Chart(df.loc[df['pm'] != '']).mark_bar().encode(
    
    # Show number of transcripts on X axis
    x=alt.X('count():Q', title='Number of transcripts'),
    
    # Show the name of the PM on the Y axis
    y=alt.Y('pm:N', title='Prime Minister'),
    
    # Show details when you hover
    tooltip=[alt.Tooltip('pm:N', title='Prime Minister'), alt.Tooltip('count():Q', title='Transcripts')]
)


# Let's have a look at the number of transcripts over time.

# In[119]:


alt.Chart(df.loc[df['pm'] != '']).mark_bar(size=6).encode(
    
    # Year on the X axis
    x=alt.X('year(date):T', title='Year'),
    
    # Number of transcripts on the Y axis
    y=alt.Y('count():Q', title='Number of transcripts'),
    
    # Color denotes the PM (names will be in the legend)
    color=alt.Color('pm:N', scale=alt.Scale(scheme='tableau20'), legend=alt.Legend(title='Prime Minister')),
    
    # Show details on hover
    tooltip=[alt.Tooltip('pm:N', title='Prime Minister'), alt.Tooltip('year(date):T', title='Year'), alt.Tooltip('count():Q', title='Transcripts')]
).properties(width=700)


# ## Release types and subjects
# 
# The `release_type` field should tell us what sort of text this transcript represents – interview, media release, etc. Let's see how it's used.

# In[96]:


df['release_type'].value_counts()


# Hmmm, so it looks like there's not a lot of control over the values in this field – is a 'Press Statement' the same as a 'Media Release'? It also looks like some subjects have been incorrectly entered here.
# 
# By combining `groupby()` and `value_counts()` it's easy to look at the release types for each PM.

# In[76]:


df.groupby('pm')['release_type'].value_counts()


# Because the `release_type` field is a bit of a mess, let's get the top ten release types and visualise them for each PM.

# In[94]:


# The index returned by value_counts() are the names of the release types, so we can convert to a list and slice off the first 10.
top_ten_types = df.loc[df['release_type'] != '']['release_type'].value_counts().index.to_list()[:10]


# In[95]:


# We're exluding those transcripts where the 'pm' value is empty - df.loc[df['pm'] != '']
alt.Chart(df.loc[(df['pm'] != '') & (df['release_type'].isin(top_ten_types))]).mark_bar().encode(
    x='count():Q',
    y='pm:N',
    color='release_type:N'
)


# In[99]:


df.loc[df['subjects'] != ''].shape


# In[101]:


len(pd.unique(df['subjects']))


# In[98]:


df['subjects'].value_counts()


# In[ ]:


# In[ ]: