%%time
import pandas as pd
import re
import os
import urllib
import rarfile  # not part of standard distro
import glob
import numpy as np
import matplotlib.pyplot as plt
from difflib import SequenceMatcher
from collections import Counter
%matplotlib inline

%%time
# download and extract charts.rar
# note that this archive is periodically updated
# you can skip this cell if you manually download http://bullfrogspond.com/charts.rar
# and put it in the same directory as this notebook

# use this command if unrar.exe is not in your PATH, changing to your path:
rarfile.UNRAR_TOOL = r"C:\Program Files\WinRAR\UnRAR.exe"

urllib.urlretrieve('http://bullfrogspond.com/charts.rar', 'charts.rar')
with rarfile.RarFile('charts.rar') as rf:
    for member in rf.infolist():
        rf.extract(member)

%%time
# create dataframe from .xls file and manipulate it
# use the most recent .xls file in case more than one is in the directory, i.e.
# you've downloaded and extracted charts.rar on different dates, after
# it's been updated
globlist = glob.glob('*.xls')
globlist.sort()
filename = globlist[-1]

# read excel file into a pandas dataframe. it's a huge file, but only four columns are required.
df_tracks = pd.read_excel(filename, sheetname='Pop Annual', parse_cols='A,B,K,Q')
print "ORIGINAL DATAFRAME:"
print df_tracks.head()

%%time
# The Yearly Rank column has some alphabetic data, e.g. 95a, 95b.
# This is sometimes multiple releases from the same artist, which we
# wish to keep, and sometimes Parts 1 and 2 of the same track,
# which we don't.
# Some Yearly Ranks are n/a, which we will change to 999 to avoid NaNs
# (no year has over 998 entries).
# BTW, we use 'ranked' instead of 'rank' as the column name because
# the latter is in the pandas namespace.

# Add a column 'ranked' as float, with 0.1 added for a, 0.2 added for b, etc.
# While we're at it, change all column names to lower case with underscores.
df_tracks.columns = ['year', 'yearly_rank', 'artist', 'track']
df_tracks['ranked'] = 0.0

def calc_rankfloat(row):
    rank = row.yearly_rank
    if type(rank) != int:
        try:
            try:
                suffix = re.search('([^0-9])', rank).group(1)  # extract alphabetic suffix
                assert len(suffix) == 1  # just in case
                rank = float(rank[:-1])
                rank += (ord(suffix) - 96) * 0.1
            except AttributeError:
                # sometimes Yearly Rank looks like an int, but doesn't pass
                # the type test
                rank = float(rank.strip())
        except ValueError:
            rank = 999  # for n/as
    return float(rank)

df_tracks['ranked'] = df_tracks.apply(calc_rankfloat, axis=1)
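# Quick sanity check of calc_rankfloat (illustrative; not in the original
# notebook). The sample ranks below are hypothetical inputs showing how
# '95a' becomes 95.1, '95b' becomes 95.2, digit strings and plain ints
# pass through unchanged, and 'n/a' becomes 999.
class FakeRow(object):
    # minimal stand-in for a dataframe row
    def __init__(self, yearly_rank):
        self.yearly_rank = yearly_rank

for sample in [95, '95a', '95b', '12', 'n/a']:
    print sample, '->', calc_rankfloat(FakeRow(sample))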
# calculate difference in consecutive ranks so we can evaluate cases
# where difference < 1, i.e. 82a, 82b which became 82.1, 82.2, etc.
df_tracks.sort(['year', 'ranked'], ascending=True, inplace=True)
df_tracks.reset_index(inplace=True, drop=True)
df_tracks['diff_rank'] = 0.0
for i in range(len(df_tracks)):
    if i == 0:
        df_tracks.loc[i, 'diff_rank'] = 1
    elif df_tracks.year.iloc[i] != df_tracks.year.iloc[i-1]:
        df_tracks.loc[i, 'diff_rank'] = 1
    else:
        df_tracks.loc[i, 'diff_rank'] = df_tracks.ranked.iloc[i] - df_tracks.ranked.iloc[i-1]

# go through the dataframe and find consecutive entries where the difference
# in rank is less than 0.5. Perform actions according to the following scenarios:
# 1: Artist same, track names similar, tracks contain 'Part 1' and 'Part 2'
#    -> keep first entry, without 'Part 1'
# 2: Artist same, track names similar
#    -> keep first entry
# Note that 'similar' means SequenceMatcher's result is > 0.5.
# Note that entries are tagged for deletion by changing the year to 0.
# At the end, all rows with year == 0 are deleted.
for i in range(len(df_tracks)):
    if df_tracks.diff_rank.iloc[i] < 0.5 and df_tracks.ranked.iloc[i] != 0:
        diff_rank = df_tracks.diff_rank.iloc[i]
        year = df_tracks.year.iloc[i]
        artist_prev = df_tracks.artist.iloc[i-1]
        artist = df_tracks.artist.iloc[i]
        ranked_prev = df_tracks.ranked.iloc[i-1]
        ranked = df_tracks.ranked.iloc[i]
        track_prev = df_tracks.track.iloc[i-1]
        track = df_tracks.track.iloc[i]
        seq_match = SequenceMatcher(None, track_prev, track).ratio()
        # scenario 1
        if (re.search('[Pp]art 1', track_prev) and
                re.search('[Pp]art 2', track) and seq_match > 0.5):
            df_tracks.loc[i-1, 'track'] = re.sub('[Pp]art 1', '', track_prev)
            df_tracks.loc[i, 'year'] = 0
        # scenario 2
        elif seq_match > 0.5:
            df_tracks.loc[i, 'year'] = 0

df_tracks = df_tracks[df_tracks.year != 0]  # remove those flagged for removal

# remove duplicate song titles in one year -- before the 1960s, it was
# very common for multiple artists to appear in the Billboard chart with
# the same song at about the same time; this skews the results towards
# these songs. After removal, the highest-ranking version will be kept.
print "Before duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
print ""
df_tracks.drop_duplicates(['track', 'year'], inplace=True)
print "After duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]

df_tracks.to_pickle('df_tracks_v1.pickle')
df_tracks.head()

# Make some lists and dicts and functions we will use

# in case you start here
df_tracks = pd.read_pickle('df_tracks_v1.pickle')

# derive the decade from the year. The cell that originally created this
# column isn't shown above, so this assumes a decade is simply the year
# rounded down to a multiple of ten.
df_tracks['decade'] = (df_tracks.year // 10) * 10

# lists of years and decades in df_tracks
decades = list(df_tracks.decade.unique())
decades.sort()
years = list(df_tracks.year.unique())
years.sort()

# dict comprehensions to create dicts of lists of words with decades
# or years as keys; the lists are empty for now, when initialized
decades_words = {decade: [] for decade in decades}
years_words = {year: [] for year in years}

# Define our log-likelihood function
def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the word is
    more likely to occur in corpus 1, otherwise in corpus 2."""
    from numpy import log
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected values
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 2 * ((n1 * log(n1 / e1)) + n2 * (log(n2 / e2)))
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
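# Illustrative check of loglike (not in the original notebook). A word
# seen 30 times in a 1,000-word corpus versus 30 times in a 10,000-word
# corpus is far more characteristic of the first corpus, so the result
# is positive; swapping the corpora flips the sign.
print loglike(30, 1000, 30, 10000)  # positive: overrepresented in corpus 1
print loglike(30, 10000, 30, 1000)  # negative: overrepresented in corpus 2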
len_before = len(df_tracks)
df_tracks = df_tracks.dropna()
print "{} NaN-containing tracks dropped; {} remain".format(len_before - len(df_tracks), len(df_tracks))

%%time
# make lists of words per song, per year and per decade
df_tracks['wordlist'] = ''
for idx, row in df_tracks.iterrows():
    track = unicode(row.track)
    track = re.sub('[^A-Za-z0-9 \']', '', track)  # remove punctuation
    track = re.sub('[Pp]art [0-9]', '', track)
    track = track.lower()
    words = list(set(track.split()))  # removes duplicates in one song title
    for word in words:
        decades_words[row.decade].append(word)
        years_words[row.year].append(word)
    df_tracks.loc[idx, 'wordlist'] = ' '.join(words)

# create dicts of total word counts per decade and per year
decades_count = {decade: len(decades_words[decade]) for decade in decades}
decades_count_max = max(decades_count.values())
years_count = {year: len(years_words[year]) for year in years}

%%time
# create df_year and df_decade dataframes
# 'counted' is the raw count (called 'counted' to avoid a namespace
# conflict with the 'count' method)
dfy_words = []
dfy_years = []
dfy_counts = []
for year in years:
    for word in set(years_words[year]):
        dfy_years.append(year)
        dfy_words.append(word)
        dfy_counts.append(years_words[year].count(word))
df_year = pd.DataFrame({'word': dfy_words, 'year': dfy_years, 'counted': dfy_counts})

def calc_yr_pct(row):
    return row.counted * 100.0 / years_count[row.year]

df_year['pct'] = df_year.apply(calc_yr_pct, axis=1)

dfd_words = []
dfd_decades = []
dfd_counts = []
for decade in decades:
    for word in set(decades_words[decade]):
        dfd_decades.append(decade)
        dfd_words.append(word)
        dfd_counts.append(decades_words[decade].count(word))
df_decade = pd.DataFrame({'word': dfd_words, 'decade': dfd_decades, 'counted': dfd_counts})

def calc_dec_pct(row):
    return row.counted * 100.0 / decades_count[row.decade]

df_decade['pct'] = df_decade.apply(calc_dec_pct, axis=1)

%%time
# add calculated log-likelihood column
decades_pct = {decade: df_decade[df_decade.decade == decade].pct.sum() for decade in decades}

# create dicts of total counts and total pct per word
word_counts = {}
for word in df_decade.word.unique():
    word_counts[word] = df_decade[df_decade.word == word].counted.sum()
word_counts_total = sum(decades_count.values())
assert word_counts_total == df_decade.counted.sum()

word_pcts = {}
for word in df_decade.word.unique():
    word_pcts[word] = df_decade[df_decade.word == word].pct.sum()
word_pcts_total = df_decade.pct.sum()

def calc_ll(row):
    return loglike(row.counted, decades_count[row.decade],
                   word_counts[row.word], word_counts_total)

df_decade['loglike'] = df_decade.apply(calc_ll, axis=1)

# pickle all dataframes
df_tracks.to_pickle('df_tracks_v2.pickle')
df_decade.to_pickle('df_decade.pickle')
df_year.to_pickle('df_year.pickle')

# read from pickle in case you start here:
df_tracks = pd.read_pickle('df_tracks_v2.pickle')
df_tracks = df_tracks[['year', 'decade', 'artist', 'track', 'ranked', 'wordlist']]
df_decade = pd.read_pickle('df_decade.pickle')
df_year = pd.read_pickle('df_year.pickle')

df_tracks.tail()
df_decade.tail()
df_year.tail()

df_decade.sort('loglike', ascending=False, inplace=True)
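# Illustrative peek (not in the original notebook): with df_decade sorted
# by log likelihood, the most characteristic words of any single decade
# can be pulled out with a simple filter. 1960 is just a sample decade.
print df_decade[df_decade.decade == 1960].head(10)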
# determine how many rows are needed until each decade is represented
# at least once
from collections import Counter
c = Counter()
decades = list(df_decade.decade.unique())
remaining_decades = list(df_decade.decade.unique())
num_rows = 0
while len(remaining_decades) > 0:
    decade = df_decade.decade.iloc[num_rows]
    c[decade] += 1
    if decade in remaining_decades:
        remaining_decades.remove(decade)
    num_rows += 1
print '{} rows required for each decade to be represented.'.format(num_rows)
print c

# with this approach, there would be 32 of 64 before 1930.
# instead, let's use the top five for each decade.
import csv

with open('billboard_output.csv', 'wb+') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
    decades = range(1890, 2020, 10)
    for decade in decades:
        dftemp = df_decade[df_decade.decade == decade].sort('loglike', ascending=False)
        for i in range(5):
            output = []
            word = dftemp.word.iloc[i]
            keyness = int(dftemp.loglike.iloc[i])
            # match the word only when it stands alone in the wordlist string
            regex = '(^{0} |^{0}$| {0}$| {0} )'.format(word)
            dftemp2 = df_tracks[(df_tracks.decade == decade) &
                                (df_tracks.wordlist.str.contains(regex))]
            dftemp2.sort(['ranked', 'year'], ascending=True, inplace=True)
            artist = dftemp2.artist.iloc[0]
            track = dftemp2.track.iloc[0]
            year = dftemp2.year.iloc[0]
            print decade, word, keyness, artist, track, year
            output.append(decade)
            output.append(word)
            output.append(keyness)
            output.append(artist)
            output.append(track)
            output.append(year)
            # append the yearly frequency series for this word
            for year in range(1890, 2015):
                dftemp3 = df_year[(df_year.word == word) & (df_year.year == year)]
                if len(dftemp3) > 0:
                    output.append(dftemp3.pct.iloc[0])
                else:
                    output.append(0)
            csvwriter.writerow(output)
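# Illustrative sketch (not in the original notebook): plot how often a
# sample word appears in song titles over time, using the df_year
# frequencies computed above. The word 'twist' is just an example
# choice; substitute any word present in df_year.
word = 'twist'
dfplot = df_year[df_year.word == word].sort('year')
plt.plot(dfplot.year, dfplot.pct)
plt.xlabel('year')
plt.ylabel('% of title words')
plt.title("Frequency of '{}' in Billboard song titles".format(word))
plt.show()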