#!/usr/bin/env python
# coding: utf-8

# Exploring the Arts Council National Portfolio 2018-2022
# 
# On 27 June 2017, the [Arts Council England](http://www.artscouncil.org.uk/) announced the [National Portfolio Organisations](http://www.artscouncil.org.uk/NPO) for the next four years. They also provided [a factsheet](http://www.artscouncil.org.uk/sites/default/files/download-file/Investment__factsheet.pdf) and [a dataset](http://www.artscouncil.org.uk/sites/default/files/download-file/NPO_successful_20182022_July_27.xlsx)! 
# 
# Now we've explored and had a stab at [tidying the dataset](https://github.com/ekoner/Arts-Council-National-Portfolio/blob/master/notebook/Arts%20Council%20National%20Portfolio%202018-2022.ipynb), it's time to revisit it systematically.

# Author: Edafe Onerhime
# 
# Created: 2018-08-04
# 
# Last Updated: 2018-08-05
# 
# Description: Systematic exploration of the [original dataset](http://www.artscouncil.org.uk/sites/default/files/download-file/NPO_successful_20182022_July_27.xlsx) and the [tidy datasets](http://vita.had.co.nz/papers/tidy-data.pdf): [funding](https://github.com/ekoner/Arts-Council-National-Portfolio/blob/master/data/output/NPO_successful_20182022_July_6_funding.csv) & [portfolio](https://github.com/ekoner/Arts-Council-National-Portfolio/blob/master/data/output/NPO_successful_20182022_July_6_portfolio.csv).
# 
# Contents: Notebook folder with this Jupyter Notebook, data folder, research folder (not sync'd to GitHub)
# 
# Notes:
# 
# 2018-08-05 - Changed direction with this notebook. Initially it was a loose exploration tool, now I'm using it to explore processes and artefacts that speed up systematic data analysis. This notebook will chop and change frequently!

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import os.path
import pandas as pd
import numpy as np
import missingno as msno

from IPython.core.display import display, HTML
from urllib.request import urlretrieve
from tabulate import tabulate


# In[2]:


"""
    Making 'global' variables we'll re-use later easier to set
"""
originalFile = {'fname': 'NPO_successful_20182022_July_6.xlsx',
                'url': 'http://www.artscouncil.org.uk/sites/default/files/download-file/NPO_successful_20182022_July_6.xlsx'}

dataFolderOut = os.path.join(os.pardir,'data','output')
dataFolderIn = os.path.join(os.pardir,'data','input')


# In[3]:


"""
    All the functions we'll need
"""

def customInfo(dfIn):
    """
        Summary text and table based on Dataframe.info()
    """
    
    df = dfIn.copy(deep=True)
    df.df_name = dfIn.df_name
    
    CI = 'The dataset ' + "'" + df.df_name + "'" + \
         ' has ' + str(df.shape[1]) + ' columns and ' + str(df.shape[0]) + ' rows. Columns: '
    s = df.get_dtype_counts()
    for ind, val in s.iteritems():
        CI = CI + str(val) + ' ' + ind +', '            
    CI = CI[:-2]+'.'

    dfCI = pd.DataFrame()
    df.reset_index(inplace=True)
    dfCI['count'] = df.count()
    dfCI['unique'] = df.T.apply(lambda x: x.nunique(), axis=1)
    dfCI['min'] = df.min()
    dfCI['max'] = df.max()
    dfCI['median'] = df.median()
    dfCI['range'] = dfCI['max'].apply(pd.to_numeric, errors='coerce') - dfCI['min'].apply(pd.to_numeric, errors='coerce')
    try:
        dfCI['iqr'] = df.quantile(0.75) - df.quantile(0.25)    
    except:
        dfCI['iqr'] = np.nan
    dfCI.sort_index(inplace=True)
    return dfCI, CI

def portfolioTransform(dfIn, ColumnName, Year, Period, Type):
    """ 
        Returns a dataframe with Portfolio Funding amount, Year e.g. 2015 and Period e.g. 2015-2018
    """
    df = dfIn[[ColumnName]].copy(deep=True)
    df['Year'] = Year
    df['Period'] = Period
    df['Type'] = Type
    df.rename(columns={ColumnName:'Amount'}, inplace=True)
    return df

def describeCategory(dfIn, measure=''):
    """
        Returns an array of dataframes showing count and proportion of each category
        (1 dataframe per category column)
        When a measure is provided, use the measure instead of the count
    """
    
    df = dfIn.copy(deep=True).reset_index()
    dataframes = []
    i = 0
    
    if len(measure)==0:
        for catCol in list(df.select_dtypes(include=['category']).columns):
            dataframes.append(pd.DataFrame(df[catCol].value_counts()).reset_index().rename(columns={catCol: 'Count', 'index': catCol}))
            dataframes[i]['%'] = dataframes[i][['Count']]/df.shape[0]
            dataframes[i].df_name = catCol
            i+= 1
    else:
        for catCol in list(df.select_dtypes(include=['category']).columns):
            dataframes.append(pd.DataFrame(df.reset_index().groupby(catCol)[measure].agg('sum').reset_index()))
            dataframes[i]['%'] = dataframes[i][[measure]]/df[measure].sum() 
            dataframes[i].df_name = catCol
            i+= 1
    return dataframes


# In[4]:


"""
    Load datasets we'll use later
"""

# Fetch/Load Original
fname = os.path.join(dataFolderIn,originalFile['fname'])
if not os.path.isfile(fname):
    print('Downloading:',fname)
    url = originalFile['url']
    urlretrieve(url, fname)

df = pd.read_excel(open(fname,'rb'), sheetname=0)
df.df_name = 'Original'

"""
    Get summary information
"""
dfInfo, infoText = customInfo(df)

print(infoText)
dfInfo


# In[5]:


"""
    Tidy and enrich the original dataset
"""

# Strip out carriage returns in column names
df = df.rename(columns={col: col.replace('\n','').replace('  ',' ') for col in df.columns})
df.df_name = 'All'

# Strip out text value 'NEW' so column is all numbers
df.loc[df['% Cash change between 17/18 and 18/19'].str.contains('NEW', na=False), '% Cash change between 17/18 and 18/19'] = np.NaN

# Make these columns categories
catCols = ['Funding Band', 'Area', 'ACE Region', 'ONS Region', 'Discipline', 'Funding Source (GIA or LOT)']
df[catCols] = df[catCols].apply(pd.Categorical)

# Make this column true or false (Bool)
df.loc[df['Portfolio funded in 2015-18?'].str.contains('Yes', na=False), 'Portfolio funded in 2015-18?'] = True
df.loc[df['Portfolio funded in 2015-18?'].str.contains('No', na=False), 'Portfolio funded in 2015-18?'] = False
df['Portfolio funded in 2015-18?'] = df['Portfolio funded in 2015-18?'].astype('bool')

# Make these columns ints
intCols = ['Portfolio funding 15/16 - £', 'Portfolio funding 16/17 - £', 'Portfolio funding 17/18 - £', 
           'TOTAL Portfolio funding 15/18 - £']
df[intCols] = df[intCols].astype('int')

# Enrich: Funding Source (GIA or LOT) using descriptions here: https://www.artsprofessional.co.uk/news/npo-scheme-merge-ace-grant-aid-and-lottery-funding
df['Funding Source Description'] = df['Funding Source (GIA or LOT)'].astype(str)
df.loc[df['Funding Source Description'].str.contains('GIA', na=False), 'Funding Source Description'] = 'Grant in Aid'
df.loc[df['Funding Source Description'].str.contains('LOT', na=False), 'Funding Source Description'] = 'National Lottery'

# Enrich: Funding Bands using descriptions here: http://www.artscouncil.org.uk/sites/default/files/download-file/Briefing_current_NPOs_MPMs_FINAL.pptx
df['Funding Band Description'] = df['Funding Band'].astype(str)
df.loc[~df['Funding Band Description'].str.contains('SSO'), 'Funding Band Description'] = 'Band ' + df['Funding Band Description'].astype(str)
df.loc[df['Funding Band Description'].str.contains('SSO'), 'Funding Band Description'] = 'Sector Support Organisation'

# Pull information out of notes: Bridge Organisation, Uplift, Reduction, Museum Development
df['Bridge Organisation'] = False
df['Technical Uplift'] = False
df['Technical Reduction'] = False
df['Museum Development'] = False
df.loc[df['Notes'].str.lower().str.contains('bridge organisation', na=False), 'Bridge Organisation'] = True
df.loc[df['Notes'].str.lower().str.contains('uplift', na=False), 'Technical Uplift'] = True
df.loc[df['Notes'].str.lower().str.contains('reduction', na=False), 'Technical Reduction'] = True
df.loc[df['Notes'].str.lower().str.contains('museum development', na=False), 'Museum Development'] = True

# Add Index (Looks like Applicant Name & Funding Band are unique)
df.set_index(['Applicant Name', 'Funding Band'], inplace=True)

"""
    Split into Applicant, Classification, Portfolio and save as csv
"""

dfApplicant = df[['Alternative Name', 'Web address', 'Discipline', 'Notes', 'Area', 'ACE Region', 'ONS Region', 'Local Authority']]
dfApplicant.df_name = 'Applicant'

dfClassification = df[['Funding Source (GIA or LOT)', 'Funding Source Description', 'Funding Band Description', 'Portfolio funded in 2015-18?', 'Bridge Organisation', 'Technical Uplift', 'Technical Reduction', 'Museum Development']]
dfClassification.df_name = 'Classification'

dfPortfolio = portfolioTransform(df,'Portfolio funding 15/16 - £','2015','2015-2018','Fund')
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio funding 16/17 - £','2016','2015-2018','Fund'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio funding 17/18 - £','2017','2015-2018','Fund'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 18/19 - £','2018','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 19/20 - £','2019','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 20/21 - £','2020','2018-2022','Grant'))
dfPortfolio = dfPortfolio.append(portfolioTransform(df,'Portfolio grant 21/22 - £','2021','2018-2022','Grant'))

# Make Period, Type categories then add Year to index
catCols = ['Year','Period','Type']
dfPortfolio[catCols] = dfPortfolio[catCols].apply(pd.Categorical)
dfPortfolio.set_index('Year', append=True, inplace=True)
dfPortfolio.df_name = 'Portfolio'

# Save csv
filePrefix = 'NPO_2018-22_'
df.to_csv(os.path.join(dataFolderOut,filePrefix+'all.csv'))
dfApplicant.to_csv(os.path.join(dataFolderOut,filePrefix+'applicant.csv'))
dfPortfolio.to_csv(os.path.join(dataFolderOut,filePrefix+'portfolio.csv'))
dfClassification.to_csv(os.path.join(dataFolderOut,filePrefix+'classification.csv'))


# In[6]:


"""
    Get summary information
"""
dfInfo, infoText = customInfo(df)
print(infoText)
dfInfo, infoText = customInfo(dfApplicant)
print(infoText)
dfInfo, infoText = customInfo(dfPortfolio)
print(infoText)
dfInfo, infoText = customInfo(dfClassification)
print(infoText)


# In[7]:


"""
    Landscape: Check for missing values using MissingNo
    https://github.com/ResidentMario/missingno
"""

msno.matrix(df)
msno.heatmap(df)
msno.bar(df)
msno.dendrogram(df)


# In[8]:


print('\n', 'Applicant')
lstDesc = describeCategory(dfApplicant)
for dfDesc in lstDesc:
    print('\n',dfDesc.df_name)
    display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))
    
print('\n', 'Portfolio')
lstDesc = describeCategory(dfPortfolio)
for dfDesc in lstDesc:
    print('\n',dfDesc.df_name)
    display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))    

print('\n', 'Classification')
lstDesc = describeCategory(dfClassification)
for dfDesc in lstDesc:
    print('\n',dfDesc.df_name)
    display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))


# In[9]:


print('\n', 'Portfolio')
lstDesc = describeCategory(dfPortfolio,'Amount')
for dfDesc in lstDesc:
    print('\n',dfDesc.df_name)
    display(HTML(tabulate(dfDesc,headers='keys', tablefmt='html', showindex=False, )))    


# In[ ]: