#!/usr/bin/env python # coding: utf-8 # # Quick look at NY Phil concert program data # # Eamonn Bell, Columbia University # # --- # # # Work in progress. Some interesting questions worth asking are bolded if there's anything in the notebook that moves towards a solution. # # - What composers tend to get programmed together? # - **Where did the orchestra play?** # - **What does 'composer discovery' look like?** Can we spot faddish composers by the shape of their performance frequency? # - **Who played with whom over the course of the existence of the orchestra?** The social network of performers. # - What are the significant differences between tour and subscription concert programs in general? # - What conductors prefer which works? # - What are the genres of the most-programmed works? # - What time do concerts tend to start at? # # --- # ## Acknowledgements # # Thanks to https://github.com/bmcfee for the parsing code. The dataset this notebook is based on was released under CC0 1.0 Universal. 
# In[3]:
import lxml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
import collections
import glob
from pprint import pprint
from IPython.display import HTML, Image
from lxml import etree, objectify


# In[4]:
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:
get_ipython().system('git clone https://github.com/nyphilarchive/PerformanceHistory.git')


# In[13]:
# Author: https://github.com/bmcfee/nycphil (Brian McFee)

def parse_programs(programs):
    """Parse every <program> element under the document root."""
    return [parse_program(x) for x in programs]


def parse_program(program):
    """Parse one <program> element into a dict.

    Known structured children (concertInfo, worksInfo) are handed to their
    dedicated parsers; every other child is stored as its text content.
    """
    dispatch = dict(concertInfo=parse_concertInfo, worksInfo=parse_worksInfo)
    data = dict()
    for child in program.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
    return data


def parse_concertInfo(concertInfo):
    """Flatten a <concertInfo> element into a tag -> text dict."""
    data = dict()
    for child in concertInfo.getchildren():
        data[child.tag] = child.text
    return data


def parse_worksInfo(worksInfo):
    """Parse a <worksInfo> element into a list of per-work dicts."""
    data = list()
    for child in worksInfo.getchildren():
        data.append(parse_work(child))
    return data


def parse_work(work):
    """Parse one <work> element; the <soloists> child is parsed recursively."""
    dispatch = dict(soloists=parse_soloists)
    data = dict()
    for child in work.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
    return data


def parse_soloists(soloists):
    """Parse a <soloists> element into a list of soloist dicts."""
    data = list()
    for child in soloists.getchildren():
        data.append(parse_soloist(child))
    return data


def parse_soloist(soloist):
    """Flatten one <soloist> element into a tag -> text dict."""
    data = dict()
    for child in soloist.getchildren():
        data[child.tag] = child.text
    return data


def flatten(d):
    """Explode one parsed program dict into one record per work performed.

    Each record carries the concert info, the program-level fields, and the
    work-level fields; soloist names are collapsed into one tab-separated
    string under 'soloists_tsv'.
    """
    works = d.pop('worksInfo', [])
    concertInfo = d.pop('concertInfo', [])
    out = []
    for w in works:
        out.append(concertInfo.copy())
        # Added this to get soloist's names in. Dirty.
        soloists = w.get('soloists', None)
        if soloists is not None:
            soloists_names = [s.get('soloistName') for s in soloists
                              if s.get('soloistName') is not None]
            soloists_tsv = "\t".join(soloists_names)
            out[-1].update({'soloists_tsv': soloists_tsv})
        w.pop('soloists', [])
        out[-1].update(d)
        out[-1].update(w)
    return out


def load_programs():
    """Parse every program XML file into one DataFrame of performed works.

    Returns a DataFrame with one row per (concert, work) pair; 'Date' holds
    the parsed datetime and the raw string is kept in 'oldDate'.
    """
    # We need this to handle badly formatted &'s in strings
    parser = etree.XMLParser(recover=True)
    fd = []
    globbed = sorted(glob.glob('./PerformanceHistory/Programs/*.xml'))
    for xmlfile in globbed:
        obj = objectify.parse(xmlfile, parser=parser)
        dix = parse_programs(obj.getroot())
        for prog in dix:
            # Debug trace for a program known to carry bad season data.
            # FIX: this was a Python 2 `print` statement -- a SyntaxError
            # under Python 3 -- now a print() call.
            if prog['programID'] == '11451':
                print(prog['programID'])
            fd.extend(flatten(prog))
    df = pd.DataFrame.from_records(fd)
    df['oldDate'] = df['Date']
    df['Date'] = pd.to_datetime(df['Date'])
    del df['worksInfo']
    del df['work']
    del df['concertInfo']
    return df


# In[14]:
df = load_programs()

# In[15]:
df.head()

# ---
# ## Number of works performed by composers over time

# In[16]:
# Get, e.g., top 5 composers by performances of all time
# NOTE(review): [1:5] actually yields ranks 2-5 (four composers), presumably
# skipping a placeholder top entry -- confirm against value_counts() output.
sample_list = list(df.composerName.value_counts()[1:5].index)

# In[17]:
sample = df[df.composerName.isin(sample_list)]

# In[18]:
all_works = df.groupby(df['Date'].map(lambda x: x.year)).count()
yearly_counts = pd.Series(all_works['id'], index=all_works.index)

# In[19]:
yearly_counts.describe()

# ### Raw counts

# In[20]:
yearly_counts.plot()
plt.title('# works performed in the NY Phil Program Archives')

# In[21]:
all_programs = df.groupby(df['Date'].map(lambda x: x.year)).programID.nunique()
all_programs.plot()

# What's the deal with 1956? There's a bunch of phoney data somewhere.
# That peak of 900 or so should be distributed over the decade.

# In[22]:
# FIX: .ix is deprecated and removed in modern pandas; use label-based .loc.
all_programs.loc[1950:1970].plot()

# In[23]:
# FIX: attribute assignment (df.Year = ...) does not create a DataFrame
# column (pandas only sets an instance attribute); use item assignment.
df['Year'] = df.Date.map(lambda x: x.year)

# In[24]:
df.dtypes

# In[25]:
# See if the issue is with the datetime parsing from before. Seems like it's not.
# Sanity check: parsed datetimes should round-trip to the original ISO
# strings once the 'T' and 'Z' markers are normalised away.
df['Sanity'] = (df.Date.map(lambda x: str(x)) ==
                df.oldDate.map(lambda x: str(x).replace('Z', '').replace('T', ' ')))
df.Sanity.value_counts()

# In[26]:
del df['Sanity']

# In[27]:
# Now, check if the application of x.year is OK
# First do a kludgy year parse on the string rep of the datetime, i.e. oldDate
oldDate_example = '1842-12-07T05:00:00Z'


def year_from_oldDate(oldDate):
    """Extract the year from an ISO-ish date string like '1842-12-07T05:00:00Z'."""
    return int(oldDate.split('-')[0])


assert year_from_oldDate('1842-12-07T05:00:00Z') == 1842

# In[28]:
df['oldYear'] = df['oldDate'].map(year_from_oldDate)

# In[29]:
# Same spike!
all_programs = df.groupby(df['oldYear']).programID.nunique()
all_programs.plot()

# Here's one I know is messed up. See the season field.

# In[33]:
df[df.programID == '11451']

# Here's the salient part of the original .xml. I've no idea what the hell is going on.

# In[38]:
get_ipython().system("cat './PerformanceHistory/Programs/1955-56_TO_1962-63.xml' | grep -C 10 '11451'")

# In[22]:
for composer in sample_list:
    one_composer = df[df.composerName == composer]
    aggregate = one_composer.groupby(one_composer['Date'].map(lambda x: x.year)).count()
    composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=composer)
    composer_counts.plot(legend=True, label=composer, alpha=0.7)
plt.ylabel('Number of works performed that year')
plt.xlabel('Year of performance')

# ### As a proportion of all works played that year

# In[54]:
for name in sample_list:
    one_composer = df[df.composerName == name]
    aggregate = one_composer.groupby(one_composer['Date'].map(lambda x: x.year)).count()
    composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
    composer_counts_prop = composer_counts.divide(yearly_counts) * 100
    composer_counts_prop.plot(legend=True, label=name)
plt.ylabel('% of works performed that year')
plt.xlabel('Year of performance')

# ### Discovery of new composers

# In[20]:
df.composerName.value_counts()[50:60]

# Who on earth is `Hadley, Henry Kimball`

# In[61]:
def composer_counts_by_name(name):
    """Return a Series of works-performed-per-year for the given composer."""
    composer = df[df.composerName == name]
    aggregate = composer.groupby(composer['Date'].map(lambda x: x.year)).count()
    annual_composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
    return annual_composer_counts


def plot_composer_by_name(name):
    """Plot the annual performance counts for one composer."""
    composer_counts_by_name(name).plot(legend=True, label=name)


# In[62]:
plot_composer_by_name('Milhaud, Darius')
plot_composer_by_name('Gould, Morton')

# In[63]:
plot_composer_by_name('Ravel, Maurice')

# In[64]:
plot_composer_by_name('Hadley, Henry Kimball')

# In[113]:
hadley = df[df.composerName == 'Hadley, Henry Kimball']
# FIX: Series.order() was removed from pandas; sort_values() is the modern
# replacement. Group by the column name rather than a Series sliced from the
# full frame (same grouping, no length mismatch in modern pandas).
hadley.groupby('workTitle', sort=True).count()['id'].sort_values(ascending=False).head(10)

# ---
# ## The social network of NY Philharmonic Soloists

# In[115]:
soloists = df.soloists_tsv[df.soloists_tsv.notnull()]

# In[116]:
soloist_list = list(soloists)

# In[117]:
len([s for s in soloists if ('\t' in s) and (';' in s)])

# In[118]:
tab_separated = [t.split('\t') for t in [s for s in soloists if '\t' in s]]
semicolon_separated = [t.split(';') for t in [s for s in soloists if ';' in s]]

# In[119]:
soloists_split = tab_separated + semicolon_separated

# In[120]:
len(soloists_split)

# In[121]:
from itertools import combinations

# Every unordered pair of soloists appearing on the same program is an edge.
played_with_pairs = []
for collection in soloists_split:
    for pair in combinations(collection, 2):
        played_with_pairs.append(pair)

# In[122]:
from collections import Counter

cnt = Counter(played_with_pairs)

# In[123]:
top_ten_thou = cnt.most_common(n=10000)

# In[124]:
with open('edges.txt', 'w') as f:
    f.write('source;target;weight\n')
    for edge, weight in top_ten_thou:
        # FIX: the bare `except:` silently swallowed every exception
        # (including KeyboardInterrupt); only encoding trouble on odd
        # soloist names is worth skipping here.
        try:
            f.write("{}".format(";".join(edge)) + ";{}\n".format(weight))
        except UnicodeEncodeError:
            continue

# In[125]:
get_ipython().system('wc edges.txt')

# In[126]:
get_ipython().system('head edges.txt')

# Process this with something like `networkx` or Gephi to get something like this:
#
# [pending]

# ---
# ## NY Phil on Tour

# In[26]:
df.Location.value_counts().head(10)


# In[60]:
def get_state(location_str):
    """Return the two-letter US state code from a 'City, ST' string, else None."""
    splitted = location_str.split(', ')
    if len(splitted) != 2:
        return None
    elif len(splitted[1]) == 2:
        return splitted[1].strip()
    else:
        return None


def test_get_state():
    assert get_state('Manhattan, NY') == 'NY'
    assert get_state('Dublin, IRELAND') is None
    assert get_state('foobar,,') is None
    return True


def run_tests():
    assert test_get_state()
    return True


assert run_tests()

# In[28]:
df['State'] = df.Location.apply(get_state)

# In[29]:
in_usa = df[df.State.notnull()]
# FIX: filtering df directly kept rows with a null State (non-US locations),
# since NaN != 'NY' evaluates True; filter within in_usa, which was otherwise
# unused.
out_of_state = in_usa[in_usa.State != 'NY']

# In[30]:
decade_state = out_of_state.groupby([(out_of_state.Date.apply(lambda x: x.year) // 10) * 10,
                                     out_of_state.State]).count()

# In[31]:
decade_state.head(10)

# In[32]:
nineties = decade_state.loc[1990]
# Move index to column for use in plotting package later
nineties.reset_index(level=0, inplace=True)

# In[33]:
twenties = decade_state.loc[1920]
twenties.reset_index(level=0, inplace=True)

# Use `folium` for chloropleth visualization

# In[ ]:
get_ipython().system('wget https://raw.githubusercontent.com/python-visualization/folium/master/examples/us-states.json')


# In[34]:
def inline_map(m, width=650, height=500):
    """Takes a folium instance and embed HTML."""
    m._build_map()
    # FIX: the notebook export mangled this cell -- the HTML entity escaping
    # and the <iframe> template string were stripped, leaving a no-op
    # replace() and an empty format string. Restored to the standard folium
    # inline-embedding recipe (quote map HTML into the iframe's srcdoc).
    srcdoc = m.HTML.replace('"', '&quot;')
    embed = HTML('<iframe srcdoc="{0}" '
                 'style="width: {1}px; height: {2}px; '
                 'border: none"></iframe>'.format(srcdoc, width, height))
    return embed


# In[35]:
def state_concert_counts(state_data):
    """Render an inline choropleth of concerts played per US state."""
    state_geo = r'us-states.json'
    f = folium.Map(location=[48, -102], zoom_start=3, max_zoom=4, min_zoom=3)
    f.geo_json(geo_path=state_geo, data=state_data, data_out='data.json',
               columns=['State', 'programID'],
               key_on='feature.id',
               fill_color='YlGn', fill_opacity=0.7, line_opacity=0.2,
               legend_name='Concerts played')
    return inline_map(f)


# In[36]:
state_concert_counts(nineties)

# In[37]:
state_concert_counts(twenties)

# For those of you following along with e.g. GitHub/`ipynbviewer`, this won't
# render because it depends on some hosted .json files. Screenshots below:

# In[38]:
Image('1920s.png')

# In[39]:
Image('1990s.png')