By David Taylor, http://www.prooffreader.com
Note that all timings listed here are on my cheapo Windows laptop, so chances are you'll do at least as well.
%%time
import pandas as pd
import re
import os
import urllib
import rarfile # not part of standard distro
import glob
import numpy as np
import matplotlib.pyplot as plt
from difflib import SequenceMatcher
from collections import Counter
%matplotlib inline
Wall time: 19.7 s
%%time
# download and extract charts.rar
# note that this archive is periodically updated
# you can skip this cell if you manually download http://bullfrogspond.com/charts.rar
# and put it in the same directory as this notebook
# use this command if unrar.exe is not in your PATH, changing to your path:
# Point rarfile at the UnRAR executable (only needed when unrar.exe is not on PATH).
rarfile.UNRAR_TOOL = r"C:\Program Files\WinRAR\UnRAR.exe"
# Python 2 API; in Python 3 this is urllib.request.urlretrieve
urllib.urlretrieve('http://bullfrogspond.com/charts.rar', 'charts.rar')
# extract every member of the archive into the current directory
with rarfile.RarFile('charts.rar') as rf:
    for member in rf.infolist():
        rf.extract(member)
Wall time: 5.84 s
%%time
# create dataframe from .xls file and manipulate it
# use most recent .xls file in case more than one is in directory, i.e.
# you've downloaded and extracted charts.rar on different dates, after
# it's been updated
globlist = glob.glob('*.xls')
globlist.sort()
# lexicographic max == most recent, assuming dated filenames
filename = globlist[-1]
# read excel file into pandas dataframe. it's a huge file, but only four columns are required.
# NOTE(review): the sheet name as written contains literal double-quote
# characters; this may be an artifact of the notebook-to-text export --
# confirm the actual sheet name in the workbook.
# 'sheetname'/'parse_cols' are pre-0.21 pandas argument names
# (later renamed sheet_name/usecols).
df_tracks = pd.read_excel(filename, sheetname="\"Pop Annual\"", parse_cols='A,B,K,Q')
print "ORIGINAL DATAFRAME:"
print df_tracks.head()
ORIGINAL DATAFRAME: Year Yearly Rank Artist Track 0 2014 162 Scotty McCreery See You Tonight 1 2014 199 B.o.B HeadBand 2 2014 226 OneRepublic Counting Stars 3 2014 285 Passenger Let Her Go 4 2014 296 Bastille Pompeii Wall time: 16.9 s
%%time
# The Yearly Rank column has some alphabetic data, e.g. 95a, 95b
# This is sometimes multiple releases from the same artist, which we
# wish to keep, and sometimes Parts 1 and 2 of the same track,
# which we don't.
# Some Yearly Ranks are n/a, which we will change to 999 to avoid NaNs
# (No year has over 998 entries)
# BTW, we use 'ranked' instead of 'rank' as column name because
# the latter is in the pandas namespace
# Add a column ranked as float, with 0.1 added for a, 0.2 added for b, etc.
# while we're at it, change all column names to lower case with underscores
# Assign a flat list of labels.  The original assigned a nested list
# ([['year', ...]]), which makes pandas build a one-level MultiIndex
# instead of plain column labels.
df_tracks.columns = ['year', 'yearly_rank', 'artist', 'track']
df_tracks['ranked'] = 0.0
def calc_rankfloat(row):
    """Convert a row's yearly_rank into a sortable float.

    Plain numeric ranks become floats unchanged.  Ranks with a single
    trailing letter (e.g. '95a', '95b') become 95.1, 95.2, ... so that
    lettered sub-ranks sort between integer ranks.  Anything unparseable
    (e.g. 'n/a') maps to 999.0 -- no year has over 998 entries, so 999
    sorts last without introducing NaNs.
    """
    rank = row.yearly_rank
    # Numeric ranks need no parsing.  (The original special-cased only
    # int, so a float rank fell through to the string path and was
    # miscomputed; stray whitespace likewise produced wrong values,
    # e.g. ' 42 ' -> 35.6.)
    if isinstance(rank, (int, float)):
        return float(rank)
    # digits optionally followed by one letter, ignoring surrounding whitespace
    m = re.match(r'\s*(\d+)([a-z])?\s*$', str(rank).lower())
    if m is None:
        return 999.0  # 'n/a' and any other unparseable value
    value = float(m.group(1))
    if m.group(2):
        # a -> +0.1, b -> +0.2, ...
        value += (ord(m.group(2)) - 96) * 0.1
    return value
# rank as float: 95a -> 95.1, 95b -> 95.2, n/a -> 999
df_tracks['ranked'] = df_tracks.apply(calc_rankfloat, axis=1)
# calculate difference in consecutive ranks so we can evaluate cases
# where difference < 1, i.e. 82a, 82b which became 82.1, 82.2, etc.
# NOTE: DataFrame.sort is the pre-0.17 pandas API (now sort_values)
df_tracks.sort(['year', 'ranked'], ascending=True, inplace=True)
df_tracks.reset_index(inplace = True, drop=True)
df_tracks['diff_rank'] = 0.0
for i in range(len(df_tracks)):
    if i == 0:
        # very first row: treat as a normal gap of 1
        df_tracks.diff_rank.iloc[i] = 1
    elif df_tracks.year.iloc[i] != df_tracks.year.iloc[i-1]:
        # first row of a new year: no meaningful difference across years
        df_tracks.diff_rank.iloc[i] = 1
    else:
        # NOTE(review): chained .iloc assignment relies on old pandas
        # returning a view here; newer pandas warns and may ignore it
        df_tracks.diff_rank.iloc[i] = df_tracks.ranked.iloc[i] - df_tracks.ranked.iloc[i-1]
# go through dataframe and find consecutive entries where the difference in rank
# is less than one. Perform actions according to the following scenarios
# 1: Artist same, track names similar tracks contain 'Part 1' and 'Part 2'
# Keep first entry, without 'Part 1'
# 2: Artist same, track names similar
# Keep first entry
# Note that 'similar' means SequenceMatcher's result is > 0.5
# Note that entries are tagged for deletion by changing the year to 0.
# At the end, all rows with year == 0 are deleted
for i in range(len(df_tracks)):
    if df_tracks.diff_rank.iloc[i] < 0.5 and df_tracks.ranked.iloc[i] != 0:
        # pull the current and previous rows' fields into locals
        diff_rank = df_tracks.diff_rank.iloc[i]
        year = df_tracks.year.iloc[i]
        artist_prev = df_tracks.artist.iloc[i-1]
        artist = df_tracks.artist.iloc[i]
        ranked_prev = df_tracks.ranked.iloc[i-1]
        ranked = df_tracks.ranked.iloc[i]
        track_prev = df_tracks.track.iloc[i-1]
        track = df_tracks.track.iloc[i]
        # fuzzy title similarity in [0, 1]
        seq_match = SequenceMatcher(None, track_prev, track).ratio()
        #scenario 1: 'Part 1'/'Part 2' pair -- keep first, strip 'Part 1'
        if (re.search('[Pp]art 1', track_prev) and
                re.search('[Pp]art 2', track) and
                seq_match > 0.5):
            # NOTE(review): chained .iloc assignment depends on old pandas
            # view semantics; newer pandas warns and may not write through
            df_tracks.track.iloc[i-1] = re.sub('[Pp]art 1', '', track_prev)
            df_tracks.year.iloc[i] = 0
        #scenario 2: similar titles -- keep first entry only
        elif seq_match > 0.5:
            df_tracks.year.iloc[i] = 0
df_tracks = df_tracks[df_tracks.year != 0] # remove those flagged for removal
# remove duplicate song titles in one year -- before the 1960s, it was
# very common for multiple artists to appear in the Billboard chart with
# the same song at about the same time; this skews the results towards
# these songs. After removal, the highest-ranking version will be kept.
print "Before duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
print ""
df_tracks.drop_duplicates(['track', 'year'], inplace=True)
print "After duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
df_tracks.to_pickle('df_tracks_v1.pickle')
df_tracks.to_pickle('df_tracks_v1.pickle')
df_tracks.head()
Before duplicates removed: year yearly_rank artist \ 10062 1950 6 Nat "King" Cole (Les Baxter & His Orchestra) 10125 1950 69 Victor Young & His Orchestra (Vocal Don Cherry) 10202 1950 146 Harry James & His Orchestra 10203 1950 147 Art Lund 10222 1950 166 Charlie Spivak & His Orchestra 10228 1950 172 Ralph Flanagan & His Orchestra 10373 1950 318 Dennis Day track ranked diff_rank 10062 Mona Lisa 6 1 10125 Mona Lisa 69 1 10202 Mona Lisa 146 1 10203 Mona Lisa 147 1 10222 Mona Lisa 166 1 10228 Mona Lisa 172 1 10373 Mona Lisa 318 1 After duplicates removed: year yearly_rank artist \ 10062 1950 6 Nat "King" Cole (Les Baxter & His Orchestra) track ranked diff_rank 10062 Mona Lisa 6 1 Wall time: 33.3 s
Starting from df_tracks, we will create the per-year (df_year) and per-decade (df_decade) word-count dataframes, along with supporting word lists and count dictionaries.
Note that the following changes to song titles are performed: punctuation is stripped, "Part n" markers are removed, titles are lower-cased, and duplicate words within a single title are dropped.
Note that a very, very few records with NaN values are removed (less than 1 per 10 000 song titles).
# Make some lists and dicts and functions we will use
# in case you start here
df_tracks = pd.read_pickle('df_tracks_v1.pickle')
# lists of years and decades in df_tracks
# NOTE(review): the 'decade' column is referenced here but is not created
# anywhere in the visible code; presumably the cell that derived it from
# 'year' was lost in this export -- confirm before running from the top.
decades = list(df_tracks.decade.unique())
decades.sort()
years = list(df_tracks.year.unique())
years.sort()
# dict comprehension to create dicts of
# lists of words with decades or years as key
# lists are empty for now, when initialized
decades_words = {decade: [] for decade in decades}
years_words = {year: [] for year in years}
# Define our log-likelihood function
def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If result is positive, it is more
    likely to occur in corpus 1, otherwise in corpus 2.

    Zero frequencies follow the usual x*log(x) -> 0 convention; the
    original computed n*log(0) for a zero count, yielding nan."""
    from numpy import log
    # expected frequencies under the null hypothesis that the word is
    # equally common in both corpora
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 0.0
    if n1 > 0:
        LL += n1 * log(n1 / e1)
    if n2 > 0:
        LL += n2 * log(n2 / e2)
    LL *= 2
    # sign convention: negative means more characteristic of corpus 2
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
# a very small number of rows contain NaN fields; drop them and report
len_before = len(df_tracks)
df_tracks = df_tracks.dropna()
print "{} NaN-containing tracks dropped; {} remain".format(len_before - len(df_tracks), len(df_tracks))
3 NaN-containing tracks dropped; 36283 remain
%%time
# make lists of words per song, per year and per decade
df_tracks['wordlist'] = ''
for idx, row in df_tracks.iterrows():
    track = unicode(row.track)  # Python 2 builtin; str() in Python 3
    track = re.sub('[^A-Za-z0-9 \']', '', track) # remove punctuation
    track = re.sub('[Pp]art [0-9]', '', track)  # drop 'Part n' markers
    track = track.lower()
    # set() removes duplicates in one song title; note this also makes
    # the word order stored in 'wordlist' arbitrary
    words = list(set(track.split()))
    for word in words:
        decades_words[row.decade].append(word)
        years_words[row.year].append(word)
    # NOTE(review): chained assignment (df.col[idx] = ...) depends on old
    # pandas write-through behaviour; .loc[idx, 'wordlist'] would be safer
    df_tracks.wordlist[idx] = ' '.join(words)
# create dict of total word counts per decade and per word
decades_count = {decade: len(decades_words[decade]) for decade in decades}
decades_count_max = max(decades_count.values())
years_count = {year: len(years_words[year]) for year in years}
Wall time: 14.3 s
%%time
# create df_year and df_decade dataframes
# 'counted' is raw count (called 'counted' to avoid namespace
# conflict with 'count' method)
dfy_words = []
dfy_years = []
dfy_counts = []
for year in years:
    # Counter tallies every word in one O(n) pass; the original called
    # list.count once per unique word, which is O(n^2) per year.
    # (Counter is imported at the top of the file.)
    for word, n in Counter(years_words[year]).items():
        dfy_years.append(year)
        dfy_words.append(word)
        dfy_counts.append(n)
df_year = pd.DataFrame({'word': dfy_words, 'year': dfy_years, 'counted': dfy_counts})
def calc_yr_pct(row):
    # share (in %) of the year's word tokens contributed by this word
    return row.counted * 100.0 / years_count[row.year]
df_year['pct'] = df_year.apply(calc_yr_pct, axis=1)
dfd_words = []
dfd_decades = []
dfd_counts = []
for decade in decades:
    # one Counter pass per decade instead of an O(n^2) list.count call
    # per unique word (Counter is imported at the top of the file)
    for word, n in Counter(decades_words[decade]).items():
        dfd_decades.append(decade)
        dfd_words.append(word)
        dfd_counts.append(n)
df_decade = pd.DataFrame({'word': dfd_words, 'decade': dfd_decades, 'counted': dfd_counts})
def calc_dec_pct(row):
    # share (in %) of the decade's word tokens contributed by this word
    return row.counted * 100.0 / decades_count[row.decade]
df_decade['pct'] = df_decade.apply(calc_dec_pct, axis=1)
Wall time: 27.3 s
%%time
# add calculated log-likelihood column
decades_pct = {decade: df_decade[df_decade.decade == decade].pct.sum() for decade in decades}
# Totals per word across all decades.  A single groupby pass replaces the
# original loop that filtered the whole dataframe once per unique word
# (O(words * rows)), which dominated this cell's ~6 minute runtime.
word_counts = df_decade.groupby('word').counted.sum().to_dict()
word_counts_total = sum(decades_count.values())
assert word_counts_total == df_decade.counted.sum()
word_pcts = df_decade.groupby('word').pct.sum().to_dict()
word_pcts_total = df_decade.pct.sum()
def calc_ll(row):
    # keyness of this word's count in this decade vs. the whole corpus
    return loglike(row.counted,
                   decades_count[row.decade],
                   word_counts[row.word],
                   word_counts_total)
df_decade['loglike'] = df_decade.apply(calc_ll, axis=1)
Wall time: 5min 55s
#pickle all dataframes
df_tracks.to_pickle('df_tracks_v2.pickle')
df_decade.to_pickle('df_decade.pickle')
df_year.to_pickle('df_year.pickle')
# read from pickle in case you start here:
df_tracks = pd.read_pickle('df_tracks_v2.pickle')
# keep only the columns later cells use, in a fixed order
df_tracks = df_tracks[['year', 'decade', 'artist', 'track', 'ranked', 'wordlist']]
df_decade = pd.read_pickle('df_decade.pickle')
df_year = pd.read_pickle('df_year.pickle')
df_tracks.tail()
year | decade | artist | track | ranked | wordlist | |
---|---|---|---|---|---|---|
39667 | 2014 | 2010 | Frankie Ballard | Helluva Life | 438 | life helluva |
39668 | 2014 | 2010 | Kelly Clarkson | Underneath The Tree | 439 | underneath the tree |
39669 | 2014 | 2010 | Lupe Fiasco | Old School Love | 440 | love school old |
39670 | 2014 | 2010 | Arctic Monkeys | Do I Wanna Know? | 442 | i do wanna know |
39671 | 2014 | 2010 | Aloe Blacc | The Man | 447 | the man |
df_decade.tail()
counted | decade | word | pct | loglike | |
---|---|---|---|---|---|
27712 | 3 | 2010 | friends | 0.052595 | 0.417387 |
27713 | 1 | 2010 | goin | 0.017532 | 3.633835 |
27714 | 1 | 2010 | bch | 0.017532 | 3.633835 |
27715 | 1 | 2010 | runaways | 0.017532 | 3.633835 |
27716 | 4 | 2010 | we're | 0.070126 | 0.850789 |
df_year.tail()
counted | word | year | pct | |
---|---|---|---|---|
59021 | 1 | u | 2014 | 0.114416 |
59022 | 1 | time | 2014 | 0.114416 |
59023 | 1 | walker | 2014 | 0.114416 |
59024 | 1 | she's | 2014 | 0.114416 |
59025 | 1 | hello | 2014 | 0.114416 |
df_decade.sort('loglike', ascending=False, inplace=True)
#determine how many rows are needed until each decade is represented
#at least once
from collections import Counter
c = Counter()
decades = list(df_decade.decade.unique())
remaining_decades = list(df_decade.decade.unique())
decadespop = decades
num_rows = 0
while len(remaining_decades) > 0:
decade = df_decade.decade.iloc[num_rows]
c[decade] += 1
if decade in remaining_decades:
remaining_decades.remove(decade)
num_rows += 1
print '{} rows required for each decade to be represented.'.format(num_rows)
print c
63 rows required for each decade to be represented. Counter({1910: 15, 1900: 12, 1930: 6, 1920: 5, 1890: 5, 2000: 4, 1970: 4, 1940: 4, 1990: 2, 1960: 2, 1980: 2, 2010: 1, 1950: 1})
# with this approach, there would be 32 of 64 before 1930.
# instead, let's use the top five for each decade.
import csv
# 'wb+' is the Python 2 csv file mode; Python 3 would use 'w', newline=''
with open('billboard_output.csv', 'wb+') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter='\t',
                           quotechar='\"', quoting=csv.QUOTE_MINIMAL)
    decades = range(1890, 2020, 10)
    for decade in decades:
        # NOTE: pre-0.17 pandas .sort API (now sort_values)
        dftemp = df_decade[df_decade.decade == decade].sort('loglike', ascending=False)
        # top five keywords per decade; assumes each decade has >= 5 words
        for i in range(5):
            output = []
            word = dftemp.word.iloc[i]
            keyness = int(dftemp.loglike.iloc[i])
            # match the word at start, end, middle, or alone in 'wordlist'.
            # The word is not re.escape()d; wordlist entries contain only
            # letters, digits and apostrophes (see the cleaning cell), so
            # regex metacharacters should not occur -- but escaping would
            # be safer.
            regex = '(^{0} |^{0}$| {0}$| {0} )'.format(word)
            dftemp2 = df_tracks[(df_tracks.decade == decade) &
                                (df_tracks.wordlist.str.contains(regex))]
            # best-ranked, earliest example track containing the word
            dftemp2.sort(['ranked', 'year'], ascending=True, inplace=True)
            artist = dftemp2.artist.iloc[0]
            track = dftemp2.track.iloc[0]
            year = dftemp2.year.iloc[0]
            print decade, word, keyness, artist, track, year
            output.append(decade)
            output.append(word)
            output.append(keyness)
            output.append(artist)
            output.append(track)
            output.append(year)
            # per-year percentage series for the word (0 where absent);
            # note this loop reuses the name 'year' after it was appended
            for year in range(1890,2015):
                dftemp3 = df_year[(df_year.word == word) & (df_year.year == year)]
                if len(dftemp3) > 0:
                    output.append(dftemp3.pct.iloc[0])
                else:
                    output.append(0)
            csvwriter.writerow(output)
1890 uncle 59 Cal Stewart Uncle Josh's Arrival in New York 1898 1890 casey 54 Russell Hunting Michael Casey Taking the Census 1892 1890 josh 53 Cal Stewart Uncle Josh at the Opera 1898 1890 old 26 Dan Quinn A Hot Time in the Old Town 1896 1890 michael 24 Russell Hunting Michael Casey Taking the Census 1892 1900 uncle 58 Cal Stewart Uncle Josh's Huskin' Bee Dance 1901 1900 old 58 Haydn Quartet In the Good Old Summer Time 1903 1900 josh 44 Cal Stewart Uncle Josh On an Automobile 1903 1900 reuben 38 S. H. Dudley When Reuben Comes to Town 1901 1900 when 33 George J. Gaskin When You Were Sweet Sixteen 1900 1910 gems 70 Victor Light Opera Co. Gems from "Naughty Marietta" 1912 1910 rag 52 Original Dixieland Jazz Band Tiger Rag 1918 1910 home 43 Henry Burr When You're a Long, Long Way from Home 1914 1910 land 41 Al Jolson Hello Central, Give Me No Man's Land 1918 1910 old 38 Harry Macdonough Down by the Old Mill Stream 1912 1920 blues 153 Paul Whiteman & His Orchestra Wang Wang Blues 1921 1920 pal 42 Al Jolson Little Pal 1929 1920 sweetheart 27 Isham Jones & His Orchestra Nobody's Sweetheart 1924 1920 rose 25 Ted Lewis & His Band Second Hand Rose 1921 1920 mammy 23 Paul Whiteman & His Orchestra My Mammy 1921 1930 moon 79 Glenn Miller & His Orchestra Moon Love 1939 1930 in 38 Ted Lewis & His Band In A Shanty In Old Shanty Town 1932 1930 swing 34 Ray Noble & His Orchestra Let's Swing It 1935 1930 sing 34 Benny Goodman & His Orchestra (Vocal Martha Tilton) And the Angels Sing 1939 1930 a 30 Ted Lewis & His Band In A Shanty In Old Shanty Town 1932 1940 polka 50 Kay Kyser & His Orchestra Strip Polka 1942 1940 serenade 35 Andrews Sisters Ferry Boat Serenade 1940 1940 boogie 28 Will Bradley & His Orchestra Scrub Me, Mama, With a Boogie Beat 1941 1940 blue 26 Tommy Dorsey & His Orchestra (Vocal Frank Sinatra) In The Blue Of Evening 1943 1940 christmas 22 Bing Crosby White Christmas 1942 1950 christmas 31 Art Mooney & His Orchestra (I'm Getting) Nuttin' For Christmas 1955 1950 
penny 18 Dinah Shore & Tony Martin A Penny A Kiss 1951 1950 mambo 15 Perry Como Papa Loves Mambo 1954 1950 rednosed 15 Gene Autry Rudolph, the Red-Nosed Reindeer 1950 1950 three 15 Browns, The The Three Bells 1959 1960 baby 51 Supremes, The Baby Love 1964 1960 twist 24 Joey Dee & the Starliters Peppermint Twist - Part 1 1962 1960 little 16 Steve Lawrence Go Away Little Girl 1963 1960 twistin' 15 Chubby Checker Slow Twistin' 1962 1960 lonely 14 Bobby Vinton Mr. Lonely 1964 1970 woman 33 Guess Who, The American Woman 1970 1970 disco 31 Johnnie Taylor Disco Lady 1976 1970 rock 24 Elton John Crocodile Rock 1973 1970 music 24 Wild Cherry Play That Funky Music 1976 1970 dancin' 20 Leif Garrett I Was Made For Dancin' 1979 1980 love 48 Joan Jett & The Blackhearts I Love Rock 'N Roll 1982 1980 fire 24 Billy Joel We Didn't Start The Fire 1989 1980 don't 20 Human League, The Don't You Want Me 1982 1980 rock 14 Joan Jett & The Blackhearts I Love Rock 'N Roll 1982 1980 on 14 Bon Jovi Livin' On A Prayer 1987 1990 u 49 Sinead O'Connor Nothing Compares 2 U 1990 1990 you 28 Stevie B Because I Love You (The Postman Song) 1990 1990 up 21 Brandy Sittin' Up In My Room 1996 1990 get 20 En Vogue My Lovin' (You're Never Gonna Get It) 1992 1990 thang 18 Dr. Dre Nuthin' But A "G" Thang 1993 2000 u 71 Usher U Got It Bad 2001 2000 like 28 T.I. Whatever You Like 2008 2000 breathe 25 Faith Hill Breathe 2000 2000 it 24 Usher U Got It Bad 2001 2000 ya 19 OutKast Hey Ya! 2003 2010 we 22 Rihanna We Found Love 2011 2010 yeah 18 Austin Mahone Mmm Yeah 2014 2010 hell 18 Avril Lavigne What The Hell 2011 2010 fk 15 Cee Lo Green F**K You (Forget You) 2011 2010 die 14 Ke$ha Die Young 2012