#!/usr/bin/env python # coding: utf-8 # # Quick look at NY Phil concert program data # # Eamonn Bell, Columbia University # # --- # # # Work in progress. Some interesting questions worth asking are bolded if there's anything in the notebook that moves towards a solution. # # - What composers tend to get programmed together? # - **Where did the orchestra play?** # - **What does 'composer discovery' look like?** Can we spot faddish composers by the shape of their performance frequency? # - **Who played with whom over the course of the existence of the orchestra?** The social network of performers. # - What are the significant differences between tour and subscription concert programs in general? # - What conductors prefer which works? # - What are the genres of the most-programmed works? # - What time do concerts tend to start at? # # --- # ## Acknowledgements # # Thanks to https://github.com/bmcfee for the parsing code. The dataset this notebook is based on was released under CC0 1.0 Universal. 
# In[3]:
import lxml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import folium
import collections
import glob
from pprint import pprint
from IPython.display import HTML, Image
from lxml import etree, objectify


# In[4]:
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:
get_ipython().system('git clone https://github.com/nyphilarchive/PerformanceHistory.git')


# In[13]:
# Author: https://github.com/bmcfee/nycphil (Brian McFee)

def parse_programs(programs):
    """Parse every <program> element under the document root."""
    return [parse_program(x) for x in programs]


def parse_program(program):
    """Parse one <program> element into a dict.

    Known structured children (concertInfo, worksInfo) are handed to their
    dedicated parsers; every other child is stored as its text content.
    """
    dispatch = dict(concertInfo=parse_concertInfo, worksInfo=parse_worksInfo)
    data = dict()
    for child in program.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
    return data


def parse_concertInfo(concertInfo):
    """Flatten a <concertInfo> element into a tag -> text dict."""
    data = dict()
    for child in concertInfo.getchildren():
        data[child.tag] = child.text
    return data


def parse_worksInfo(worksInfo):
    """Parse a <worksInfo> element into a list of per-work dicts."""
    data = list()
    for child in worksInfo.getchildren():
        data.append(parse_work(child))
    return data


def parse_work(work):
    """Parse one <work> element; the <soloists> child is parsed recursively."""
    dispatch = dict(soloists=parse_soloists)
    data = dict()
    for child in work.getchildren():
        if child.tag in dispatch:
            data[child.tag] = dispatch[child.tag](child)
        else:
            data[child.tag] = child.text
    return data


def parse_soloists(soloists):
    """Parse a <soloists> element into a list of soloist dicts."""
    data = list()
    for child in soloists.getchildren():
        data.append(parse_soloist(child))
    return data


def parse_soloist(soloist):
    """Flatten one <soloist> element into a tag -> text dict."""
    data = dict()
    for child in soloist.getchildren():
        data[child.tag] = child.text
    return data


def flatten(d):
    """Explode one parsed program dict into one record per work performed.

    Each record carries the concert info, the program-level fields, and the
    work-level fields; soloist names are collapsed into one tab-separated
    string under 'soloists_tsv'.
    """
    works = d.pop('worksInfo', [])
    concertInfo = d.pop('concertInfo', [])
    out = []
    for w in works:
        out.append(concertInfo.copy())
        # Added this to get soloist's names in. Dirty.
        soloists = w.get('soloists', None)
        if soloists is not None:
            soloists_names = [s.get('soloistName') for s in soloists
                              if s.get('soloistName') is not None]
            soloists_tsv = "\t".join(soloists_names)
            out[-1].update({'soloists_tsv': soloists_tsv})
        w.pop('soloists', [])
        out[-1].update(d)
        out[-1].update(w)
    return out


def load_programs():
    """Parse every program XML file into one DataFrame of performed works.

    Returns a DataFrame with one row per (concert, work) pair; 'Date' holds
    the parsed datetime and the raw string is kept in 'oldDate'.
    """
    # We need this to handle badly formatted &'s in strings
    parser = etree.XMLParser(recover=True)
    fd = []
    globbed = sorted(glob.glob('./PerformanceHistory/Programs/*.xml'))
    for xmlfile in globbed:
        obj = objectify.parse(xmlfile, parser=parser)
        dix = parse_programs(obj.getroot())
        for prog in dix:
            # Debug trace for a program known to carry bad season data.
            # FIX: this was a Python 2 `print` statement -- a SyntaxError
            # under Python 3 -- now a print() call.
            if prog['programID'] == '11451':
                print(prog['programID'])
            fd.extend(flatten(prog))
    df = pd.DataFrame.from_records(fd)
    df['oldDate'] = df['Date']
    df['Date'] = pd.to_datetime(df['Date'])
    del df['worksInfo']
    del df['work']
    del df['concertInfo']
    return df


# In[14]:
df = load_programs()

# In[15]:
df.head()

# ---
# ## Number of works performed by composers over time

# In[16]:
# Get, e.g., top 5 composers by performances of all time
# NOTE(review): [1:5] actually yields ranks 2-5 (four composers), presumably
# skipping a placeholder top entry -- confirm against value_counts() output.
sample_list = list(df.composerName.value_counts()[1:5].index)

# In[17]:
sample = df[df.composerName.isin(sample_list)]

# In[18]:
all_works = df.groupby(df['Date'].map(lambda x: x.year)).count()
yearly_counts = pd.Series(all_works['id'], index=all_works.index)

# In[19]:
yearly_counts.describe()

# ### Raw counts

# In[20]:
yearly_counts.plot()
plt.title('# works performed in the NY Phil Program Archives')

# In[21]:
all_programs = df.groupby(df['Date'].map(lambda x: x.year)).programID.nunique()
all_programs.plot()

# What's the deal with 1956? There's a bunch of phoney data somewhere.
# That peak of 900 or so should be distributed over the decade.

# In[22]:
# FIX: .ix is deprecated and removed in modern pandas; use label-based .loc.
all_programs.loc[1950:1970].plot()

# In[23]:
# FIX: attribute assignment (df.Year = ...) does not create a DataFrame
# column (pandas only sets an instance attribute); use item assignment.
df['Year'] = df.Date.map(lambda x: x.year)

# In[24]:
df.dtypes

# In[25]:
# See if the issue is with the datetime parsing from before. Seems like it's not.
# Sanity check: parsed datetimes should round-trip to the original ISO
# strings once the 'T' and 'Z' markers are normalised away.
df['Sanity'] = (df.Date.map(lambda x: str(x)) ==
                df.oldDate.map(lambda x: str(x).replace('Z', '').replace('T', ' ')))
df.Sanity.value_counts()

# In[26]:
del df['Sanity']

# In[27]:
# Now, check if the application of x.year is OK
# First do a kludgy year parse on the string rep of the datetime, i.e. oldDate
oldDate_example = '1842-12-07T05:00:00Z'


def year_from_oldDate(oldDate):
    """Extract the year from an ISO-ish date string like '1842-12-07T05:00:00Z'."""
    return int(oldDate.split('-')[0])


assert year_from_oldDate('1842-12-07T05:00:00Z') == 1842

# In[28]:
df['oldYear'] = df['oldDate'].map(year_from_oldDate)

# In[29]:
# Same spike!
all_programs = df.groupby(df['oldYear']).programID.nunique()
all_programs.plot()

# Here's one I know is messed up. See the season field.

# In[33]:
df[df.programID == '11451']

# Here's the salient part of the original .xml. I've no idea what the hell is going on.

# In[38]:
get_ipython().system("cat './PerformanceHistory/Programs/1955-56_TO_1962-63.xml' | grep -C 10 '11451'")

# In[22]:
for composer in sample_list:
    one_composer = df[df.composerName == composer]
    aggregate = one_composer.groupby(one_composer['Date'].map(lambda x: x.year)).count()
    composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=composer)
    composer_counts.plot(legend=True, label=composer, alpha=0.7)
plt.ylabel('Number of works performed that year')
plt.xlabel('Year of performance')

# ### As a proportion of all works played that year

# In[54]:
for name in sample_list:
    one_composer = df[df.composerName == name]
    aggregate = one_composer.groupby(one_composer['Date'].map(lambda x: x.year)).count()
    composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
    composer_counts_prop = composer_counts.divide(yearly_counts) * 100
    composer_counts_prop.plot(legend=True, label=name)
plt.ylabel('% of works performed that year')
plt.xlabel('Year of performance')

# ### Discovery of new composers

# In[20]:
df.composerName.value_counts()[50:60]

# Who on earth is `Hadley, Henry Kimball`

# In[61]:
def composer_counts_by_name(name):
    """Return a Series of works-performed-per-year for the given composer."""
    composer = df[df.composerName == name]
    aggregate = composer.groupby(composer['Date'].map(lambda x: x.year)).count()
    annual_composer_counts = pd.Series(aggregate['id'], index=aggregate.index, name=name)
    return annual_composer_counts


def plot_composer_by_name(name):
    """Plot the annual performance counts for one composer."""
    composer_counts_by_name(name).plot(legend=True, label=name)


# In[62]:
plot_composer_by_name('Milhaud, Darius')
plot_composer_by_name('Gould, Morton')

# In[63]:
plot_composer_by_name('Ravel, Maurice')

# In[64]:
plot_composer_by_name('Hadley, Henry Kimball')

# In[113]:
hadley = df[df.composerName == 'Hadley, Henry Kimball']
# FIX: Series.order() was removed from pandas; sort_values() is the modern
# replacement. Group by the column name rather than a Series sliced from the
# full frame (same grouping, no length mismatch in modern pandas).
hadley.groupby('workTitle', sort=True).count()['id'].sort_values(ascending=False).head(10)

# ---
# ## The social network of NY Philharmonic Soloists

# In[115]:
soloists = df.soloists_tsv[df.soloists_tsv.notnull()]

# In[116]:
soloist_list = list(soloists)

# In[117]:
len([s for s in soloists if ('\t' in s) and (';' in s)])

# In[118]:
tab_separated = [t.split('\t') for t in [s for s in soloists if '\t' in s]]
semicolon_separated = [t.split(';') for t in [s for s in soloists if ';' in s]]

# In[119]:
soloists_split = tab_separated + semicolon_separated

# In[120]:
len(soloists_split)

# In[121]:
from itertools import combinations

# Every unordered pair of soloists appearing on the same program is an edge.
played_with_pairs = []
for collection in soloists_split:
    for pair in combinations(collection, 2):
        played_with_pairs.append(pair)

# In[122]:
from collections import Counter

cnt = Counter(played_with_pairs)

# In[123]:
top_ten_thou = cnt.most_common(n=10000)

# In[124]:
with open('edges.txt', 'w') as f:
    f.write('source;target;weight\n')
    for edge, weight in top_ten_thou:
        # FIX: the bare `except:` silently swallowed every exception
        # (including KeyboardInterrupt); only encoding trouble on odd
        # soloist names is worth skipping here.
        try:
            f.write("{}".format(";".join(edge)) + ";{}\n".format(weight))
        except UnicodeEncodeError:
            continue

# In[125]:
get_ipython().system('wc edges.txt')

# In[126]:
get_ipython().system('head edges.txt')

# Process this with something like `networkx` or Gephi to get something like this:
#
# [pending]

# ---
# ## NY Phil on Tour

# In[26]:
df.Location.value_counts().head(10)


# In[60]:
def get_state(location_str):
    """Return the two-letter US state code from a 'City, ST' string, else None."""
    splitted = location_str.split(', ')
    if len(splitted) != 2:
        return None
    elif len(splitted[1]) == 2:
        return splitted[1].strip()
    else:
        return None


def test_get_state():
    assert get_state('Manhattan, NY') == 'NY'
    assert get_state('Dublin, IRELAND') is None
    assert get_state('foobar,,') is None
    return True


def run_tests():
    assert test_get_state()
    return True


assert run_tests()

# In[28]:
df['State'] = df.Location.apply(get_state)

# In[29]:
in_usa = df[df.State.notnull()]
# FIX: filtering df directly kept rows with a null State (non-US locations),
# since NaN != 'NY' evaluates True; filter within in_usa, which was otherwise
# unused.
out_of_state = in_usa[in_usa.State != 'NY']

# In[30]:
decade_state = out_of_state.groupby([(out_of_state.Date.apply(lambda x: x.year) // 10) * 10,
                                     out_of_state.State]).count()

# In[31]:
decade_state.head(10)

# In[32]:
nineties = decade_state.loc[1990]
# Move index to column for use in plotting package later
nineties.reset_index(level=0, inplace=True)

# In[33]:
twenties = decade_state.loc[1920]
twenties.reset_index(level=0, inplace=True)

# Use `folium` for chloropleth visualization

# In[ ]:
get_ipython().system('wget https://raw.githubusercontent.com/python-visualization/folium/master/examples/us-states.json')


# In[34]:
def inline_map(m, width=650, height=500):
    """Takes a folium instance and embed HTML."""
    m._build_map()
    # FIX: the notebook export mangled this cell -- the HTML entity escaping
    # and the <iframe> template string were stripped, leaving a no-op
    # replace() and an empty format string. Restored to the standard folium
    # inline-embedding recipe (quote map HTML into the iframe's srcdoc).
    srcdoc = m.HTML.replace('"', '&quot;')
    embed = HTML('<iframe srcdoc="{0}" '
                 'style="width: {1}px; height: {2}px; '
                 'border: none"></iframe>'.format(srcdoc, width, height))
    return embed


# In[35]:
def state_concert_counts(state_data):
    """Render an inline choropleth of concerts played per US state."""
    state_geo = r'us-states.json'
    f = folium.Map(location=[48, -102], zoom_start=3, max_zoom=4, min_zoom=3)
    f.geo_json(geo_path=state_geo, data=state_data, data_out='data.json',
               columns=['State', 'programID'],
               key_on='feature.id',
               fill_color='YlGn', fill_opacity=0.7, line_opacity=0.2,
               legend_name='Concerts played')
    return inline_map(f)


# In[36]:
state_concert_counts(nineties)

# In[37]:
state_concert_counts(twenties)

# For those of you following along with e.g. GitHub/`ipynbviewer`, this won't
# render because it depends on some hosted .json files. Screenshots below:

# In[38]:
Image('1920s.png')

# In[39]:
Image('1990s.png')