%%time
import pandas as pd
import re
import os
import urllib
import rarfile  # not part of standard distro
import glob
import numpy as np
import matplotlib.pyplot as plt
from difflib import SequenceMatcher
from collections import Counter
%matplotlib inline

%%time
# download and extract charts.rar
# note that this archive is periodically updated
# you can skip this cell if you manually download http://bullfrogspond.com/charts.rar
# and put it in the same directory as this notebook

# use this command if unrar.exe is not in your PATH, changing to your path:
rarfile.UNRAR_TOOL = r"C:\Program Files\WinRAR\UnRAR.exe"

urllib.urlretrieve('http://bullfrogspond.com/charts.rar', 'charts.rar')
with rarfile.RarFile('charts.rar') as rf:
    for member in rf.infolist():
        rf.extract(member)

%%time
# create dataframe from .xls file and manipulate it
# use the most recent .xls file in case more than one is in the directory, i.e.
# you've downloaded and extracted charts.rar on different dates, after
# it's been updated
globlist = glob.glob('*.xls')
globlist.sort()
filename = globlist[-1]

# read excel file into a pandas dataframe. it's a huge file, but only four columns are required.
df_tracks = pd.read_excel(filename, sheetname='Pop Annual', parse_cols='A,B,K,Q')
print "ORIGINAL DATAFRAME:"
print df_tracks.head()

%%time
# The Yearly Rank column has some alphabetic data, e.g. 95a, 95b.
# This is sometimes multiple releases from the same artist, which we
# wish to keep, and sometimes Parts 1 and 2 of the same track,
# which we don't.
# Some Yearly Ranks are n/a, which we will change to 999 to avoid NaNs
# (no year has over 998 entries).
# BTW, we use 'ranked' instead of 'rank' as the column name because
# the latter is in the pandas namespace.

# Add a column 'ranked' as float, with 0.1 added for a, 0.2 added for b, etc.
# While we're at it, change all column names to lower case with underscores.
df_tracks.columns = ['year', 'yearly_rank', 'artist', 'track']
df_tracks['ranked'] = 0.0

def calc_rankfloat(row):
    rank = row.yearly_rank
    if type(rank) != int:
        try:
            try:
                suffix = re.search('([^0-9])', rank).group(1)  # extract alphabetic suffix
                assert len(suffix) == 1  # just in case
                rank = float(rank[:-1])
                rank += (ord(suffix) - 96) * 0.1
            except AttributeError:
                # sometimes Yearly Rank looks like an int, but doesn't pass
                # the type test
                rank = float(rank.strip())
        except ValueError:
            rank = 999  # for n/as
    return float(rank)

df_tracks['ranked'] = df_tracks.apply(calc_rankfloat, axis=1)
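# Quick sanity check of calc_rankfloat (illustrative; not in the original
# notebook). The sample ranks below are hypothetical inputs showing how
# '95a' becomes 95.1, '95b' becomes 95.2, digit strings and plain ints
# pass through unchanged, and 'n/a' becomes 999.
class FakeRow(object):
    # minimal stand-in for a dataframe row
    def __init__(self, yearly_rank):
        self.yearly_rank = yearly_rank

for sample in [95, '95a', '95b', '12', 'n/a']:
    print sample, '->', calc_rankfloat(FakeRow(sample))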
# calculate difference in consecutive ranks so we can evaluate cases
# where difference < 1, i.e. 82a, 82b which became 82.1, 82.2, etc.
df_tracks.sort(['year', 'ranked'], ascending=True, inplace=True)
df_tracks.reset_index(inplace=True, drop=True)
df_tracks['diff_rank'] = 0.0
for i in range(len(df_tracks)):
    if i == 0:
        df_tracks.loc[i, 'diff_rank'] = 1
    elif df_tracks.year.iloc[i] != df_tracks.year.iloc[i-1]:
        df_tracks.loc[i, 'diff_rank'] = 1
    else:
        df_tracks.loc[i, 'diff_rank'] = df_tracks.ranked.iloc[i] - df_tracks.ranked.iloc[i-1]

# go through the dataframe and find consecutive entries where the difference
# in rank is less than 0.5. Perform actions according to the following scenarios:
# 1: Artist same, track names similar, tracks contain 'Part 1' and 'Part 2'
#    -> keep first entry, without 'Part 1'
# 2: Artist same, track names similar
#    -> keep first entry
# Note that 'similar' means SequenceMatcher's result is > 0.5.
# Note that entries are tagged for deletion by changing the year to 0.
# At the end, all rows with year == 0 are deleted.
for i in range(len(df_tracks)):
    if df_tracks.diff_rank.iloc[i] < 0.5 and df_tracks.ranked.iloc[i] != 0:
        diff_rank = df_tracks.diff_rank.iloc[i]
        year = df_tracks.year.iloc[i]
        artist_prev = df_tracks.artist.iloc[i-1]
        artist = df_tracks.artist.iloc[i]
        ranked_prev = df_tracks.ranked.iloc[i-1]
        ranked = df_tracks.ranked.iloc[i]
        track_prev = df_tracks.track.iloc[i-1]
        track = df_tracks.track.iloc[i]
        seq_match = SequenceMatcher(None, track_prev, track).ratio()
        # scenario 1
        if (re.search('[Pp]art 1', track_prev) and
                re.search('[Pp]art 2', track) and seq_match > 0.5):
            df_tracks.loc[i-1, 'track'] = re.sub('[Pp]art 1', '', track_prev)
            df_tracks.loc[i, 'year'] = 0
        # scenario 2
        elif seq_match > 0.5:
            df_tracks.loc[i, 'year'] = 0

df_tracks = df_tracks[df_tracks.year != 0]  # remove those flagged for removal

# remove duplicate song titles in one year -- before the 1960s, it was
# very common for multiple artists to appear in the Billboard chart with
# the same song at about the same time; this skews the results towards
# these songs. After removal, the highest-ranking version will be kept.
print "Before duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
print ""
df_tracks.drop_duplicates(['track', 'year'], inplace=True)
print "After duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]

df_tracks.to_pickle('df_tracks_v1.pickle')
df_tracks.head()

# Make some lists and dicts and functions we will use

# in case you start here
df_tracks = pd.read_pickle('df_tracks_v1.pickle')

# derive the decade from the year. The cell that originally created this
# column isn't shown above, so this assumes a decade is simply the year
# rounded down to a multiple of ten.
df_tracks['decade'] = (df_tracks.year // 10) * 10

# lists of years and decades in df_tracks
decades = list(df_tracks.decade.unique())
decades.sort()
years = list(df_tracks.year.unique())
years.sort()

# dict comprehensions to create dicts of lists of words with decades
# or years as keys; the lists are empty for now, when initialized
decades_words = {decade: [] for decade in decades}
years_words = {year: [] for year in years}

# Define our log-likelihood function
def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the word is
    more likely to occur in corpus 1, otherwise in corpus 2."""
    from numpy import log
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected values
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 2 * ((n1 * log(n1 / e1)) + n2 * (log(n2 / e2)))
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
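# Illustrative check of loglike (not in the original notebook). A word
# seen 30 times in a 1,000-word corpus versus 30 times in a 10,000-word
# corpus is far more characteristic of the first corpus, so the result
# is positive; swapping the corpora flips the sign.
print loglike(30, 1000, 30, 10000)  # positive: overrepresented in corpus 1
print loglike(30, 10000, 30, 1000)  # negative: overrepresented in corpus 2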
len_before = len(df_tracks)
df_tracks = df_tracks.dropna()
print "{} NaN-containing tracks dropped; {} remain".format(len_before - len(df_tracks), len(df_tracks))

%%time
# make lists of words per song, per year and per decade
df_tracks['wordlist'] = ''
for idx, row in df_tracks.iterrows():
    track = unicode(row.track)
    track = re.sub('[^A-Za-z0-9 \']', '', track)  # remove punctuation
    track = re.sub('[Pp]art [0-9]', '', track)
    track = track.lower()
    words = list(set(track.split()))  # removes duplicates in one song title
    for word in words:
        decades_words[row.decade].append(word)
        years_words[row.year].append(word)
    df_tracks.loc[idx, 'wordlist'] = ' '.join(words)

# create dicts of total word counts per decade and per year
decades_count = {decade: len(decades_words[decade]) for decade in decades}
decades_count_max = max(decades_count.values())
years_count = {year: len(years_words[year]) for year in years}

%%time
# create df_year and df_decade dataframes
# 'counted' is the raw count (called 'counted' to avoid a namespace
# conflict with the 'count' method)
dfy_words = []
dfy_years = []
dfy_counts = []
for year in years:
    for word in set(years_words[year]):
        dfy_years.append(year)
        dfy_words.append(word)
        dfy_counts.append(years_words[year].count(word))
df_year = pd.DataFrame({'word': dfy_words, 'year': dfy_years, 'counted': dfy_counts})

def calc_yr_pct(row):
    return row.counted * 100.0 / years_count[row.year]

df_year['pct'] = df_year.apply(calc_yr_pct, axis=1)

dfd_words = []
dfd_decades = []
dfd_counts = []
for decade in decades:
    for word in set(decades_words[decade]):
        dfd_decades.append(decade)
        dfd_words.append(word)
        dfd_counts.append(decades_words[decade].count(word))
df_decade = pd.DataFrame({'word': dfd_words, 'decade': dfd_decades, 'counted': dfd_counts})

def calc_dec_pct(row):
    return row.counted * 100.0 / decades_count[row.decade]

df_decade['pct'] = df_decade.apply(calc_dec_pct, axis=1)

%%time
# add calculated log-likelihood column
decades_pct = {decade: df_decade[df_decade.decade == decade].pct.sum() for decade in decades}

# create dicts of total counts and total pct per word
word_counts = {}
for word in df_decade.word.unique():
    word_counts[word] = df_decade[df_decade.word == word].counted.sum()
word_counts_total = sum(decades_count.values())
assert word_counts_total == df_decade.counted.sum()

word_pcts = {}
for word in df_decade.word.unique():
    word_pcts[word] = df_decade[df_decade.word == word].pct.sum()
word_pcts_total = df_decade.pct.sum()

def calc_ll(row):
    return loglike(row.counted, decades_count[row.decade],
                   word_counts[row.word], word_counts_total)

df_decade['loglike'] = df_decade.apply(calc_ll, axis=1)

# pickle all dataframes
df_tracks.to_pickle('df_tracks_v2.pickle')
df_decade.to_pickle('df_decade.pickle')
df_year.to_pickle('df_year.pickle')

# read from pickle in case you start here:
df_tracks = pd.read_pickle('df_tracks_v2.pickle')
df_tracks = df_tracks[['year', 'decade', 'artist', 'track', 'ranked', 'wordlist']]
df_decade = pd.read_pickle('df_decade.pickle')
df_year = pd.read_pickle('df_year.pickle')

df_tracks.tail()
df_decade.tail()
df_year.tail()

df_decade.sort('loglike', ascending=False, inplace=True)
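# Illustrative peek (not in the original notebook): with df_decade sorted
# by log likelihood, the most characteristic words of any single decade
# can be pulled out with a simple filter. 1960 is just a sample decade.
print df_decade[df_decade.decade == 1960].head(10)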
# determine how many rows are needed until each decade is represented
# at least once
from collections import Counter
c = Counter()
decades = list(df_decade.decade.unique())
remaining_decades = list(df_decade.decade.unique())
num_rows = 0
while len(remaining_decades) > 0:
    decade = df_decade.decade.iloc[num_rows]
    c[decade] += 1
    if decade in remaining_decades:
        remaining_decades.remove(decade)
    num_rows += 1
print '{} rows required for each decade to be represented.'.format(num_rows)
print c

# with this approach, there would be 32 of 64 before 1930.
# instead, let's use the top five for each decade.
import csv

with open('billboard_output.csv', 'wb+') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"',
                           quoting=csv.QUOTE_MINIMAL)
    decades = range(1890, 2020, 10)
    for decade in decades:
        dftemp = df_decade[df_decade.decade == decade].sort('loglike', ascending=False)
        for i in range(5):
            output = []
            word = dftemp.word.iloc[i]
            keyness = int(dftemp.loglike.iloc[i])
            # match the word only when it stands alone in the wordlist string
            regex = '(^{0} |^{0}$| {0}$| {0} )'.format(word)
            dftemp2 = df_tracks[(df_tracks.decade == decade) &
                                (df_tracks.wordlist.str.contains(regex))]
            dftemp2.sort(['ranked', 'year'], ascending=True, inplace=True)
            artist = dftemp2.artist.iloc[0]
            track = dftemp2.track.iloc[0]
            year = dftemp2.year.iloc[0]
            print decade, word, keyness, artist, track, year
            output.append(decade)
            output.append(word)
            output.append(keyness)
            output.append(artist)
            output.append(track)
            output.append(year)
            # append the yearly frequency series for this word
            for year in range(1890, 2015):
                dftemp3 = df_year[(df_year.word == word) & (df_year.year == year)]
                if len(dftemp3) > 0:
                    output.append(dftemp3.pct.iloc[0])
                else:
                    output.append(0)
            csvwriter.writerow(output)
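# Illustrative sketch (not in the original notebook): plot how often a
# sample word appears in song titles over time, using the df_year
# frequencies computed above. The word 'twist' is just an example
# choice; substitute any word present in df_year.
word = 'twist'
dfplot = df_year[df_year.word == word].sort('year')
plt.plot(dfplot.year, dfplot.pct)
plt.xlabel('year')
plt.ylabel('% of title words')
plt.title("Frequency of '{}' in Billboard song titles".format(word))
plt.show()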