In [1]:
import cPickle as pickle
import errno
import itertools
import json
import math
import operator
import os

import numpy as np
import scipy.sparse

import hdf5_getters
import HartiganOnline, VectorQuantizer

from joblib import Parallel, delayed
In [2]:
# Dataset locations (absolute cluster paths; adjust for other machines).
MSD_DIR = u'/q/boar/boar-p9/MillionSong/'           # root of the Million Song Dataset
MSD_DATA_ROOT = os.path.join(MSD_DIR, 'data')       # sharded per-track HDF5 files
MSD_LFM_ROOT = os.path.join(MSD_DIR, 'Lastfm')      # Last.fm files (path suggests tags; unused in this chunk)
MSD_ADD = os.path.join(MSD_DIR, 'AdditionalFiles')  # metadata text files, e.g. unique_tracks.txt

Building Codebook from a combination of "hot" and not-"hot" songs

In [3]:
# get all the tracks with non-nan hotttnesss
def get_all_song_hotttnesss(msd_dir, ext='.h5') :
    track_to_hotttnesss = dict()
    msd_data_root = os.path.join(msd_dir, 'data')
    with open(os.path.join(msd_dir, 'AdditionalFiles', 'unique_tracks.txt'), 'rb') as f:
        for (count, line) in enumerate(f):
            track_ID, _, _, _ = line.strip().split('<SEP>')
            track_dir = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + ext)
            h5 = hdf5_getters.open_h5_file_read(track_dir)
            hotttnesss = hdf5_getters.get_song_hotttnesss(h5)
            if not math.isnan(hotttnesss):
                track_to_hotttnesss[track_ID] = hotttnesss
            h5.close()  
            if not count % 1000:
                print "%7d tracks processed" % count 
    return track_to_hotttnesss
In [4]:
# Cache the track -> hotttnesss map on disk: rebuilding it means a full
# pass over the MSD, which is expensive.
cache_file = 'track_to_hotttnesss.json'
if os.path.exists(cache_file):
    with open(cache_file, 'rb') as f:
        track_to_hotttnesss = json.load(f)
else:
    track_to_hotttnesss = get_all_song_hotttnesss(MSD_DIR)
    with open(cache_file, 'wb') as f:
        json.dump(track_to_hotttnesss, f)
In [5]:
# see some track-hotttnesss pairs
# Rank tracks by hotttnesss (highest first) and print every 1000th one,
# looking up artist/title in unique_tracks.txt via a shell grep.
track_to_hotttnesss_ordered = sorted(track_to_hotttnesss.iteritems(), key=operator.itemgetter(1), reverse=True)
for i in xrange(0, 50000, 1000):
    track_ID = track_to_hotttnesss_ordered[i][0]
    hotttnesss = track_to_hotttnesss_ordered[i][1]
    # IPython shell magic: runs grep and captures stdout as a list of lines
    out = !grep "$track_ID" "$MSD_ADD"/unique_tracks.txt 
    # fields 2-3 of the record are artist and title
    print out[0].strip().split('<SEP>')[2:4], 'Hotttnesss:', hotttnesss
['Train', "If It's Love"] Hotttnesss: 1.0
['NEEDTOBREATHE', "Lay 'Em Down (Album Version)"] Hotttnesss: 0.910616754441
['MGMT', 'Siberian Breaks'] Hotttnesss: 0.876462939093
['The Clash', 'Rock The Casbah'] Hotttnesss: 0.8562455897
['NEEDTOBREATHE', 'Again (Album Version)'] Hotttnesss: 0.841328129539
['Air Traffic', 'Never Even Told Me Her Name'] Hotttnesss: 0.829882708577
['Foreigner', 'Waiting For A Girl Like You'] Hotttnesss: 0.819999142075
['The Hoosiers', 'Everything Goes Dark'] Hotttnesss: 0.811719327712
['Born Of Osiris', 'Empires Erased (feat. NO)'] Hotttnesss: 0.803972937132
['Dido', 'Honestly Ok'] Hotttnesss: 0.796855414613
['The Rascals', 'People Got To Be Free'] Hotttnesss: 0.790198144188
['Devotchka', 'Such A Lovely Thing'] Hotttnesss: 0.784054133144
['The Magnetic Fields', 'Painted Flower'] Hotttnesss: 0.778348356952
['Red Hot Chili Peppers', 'Parallel Universe (Album Version)'] Hotttnesss: 0.773292762211
['The Beta Band', 'Round The Bend'] Hotttnesss: 0.768472306887
['Bob Welch', 'Sentimental Lady'] Hotttnesss: 0.763751073732
['Bayside', '(Pop)Ular SciencE (Album Version)'] Hotttnesss: 0.759397620647
['The Classic Crime', 'Gravedigging'] Hotttnesss: 0.755067830168
['Texas', 'Put Your Arms Around Me'] Hotttnesss: 0.750777674798
['Bond', 'Kashmir'] Hotttnesss: 0.746971042733
['Maxwell', 'For Lovers Only'] Hotttnesss: 0.743160541216
['Hayley Westenra', 'Wuthering Heights'] Hotttnesss: 0.739326575779
['Hombres G', 'Lo noto (Directo 2003)'] Hotttnesss: 0.735836516521
['Type O Negative', 'Dead Again'] Hotttnesss: 0.732492083918
['Boxcutter', 'Kaleid'] Hotttnesss: 0.729125625342
['The Greenhornes', 'Satisfy My Mind'] Hotttnesss: 0.725975540319
['Patti Smith Group', 'Ask the Angels'] Hotttnesss: 0.722844727957
['PATY CANTU', 'D\xc3\xa9jame Ir'] Hotttnesss: 0.719756606943
['Mad Caddies', 'Leavin'] Hotttnesss: 0.716778766173
['At The Gates', 'All Life Ends - live'] Hotttnesss: 0.713848223098
['Whitechapel', 'Reprogrammed to Hate'] Hotttnesss: 0.710887119948
['Refused', 'Worthless Is The Freedom Bought...'] Hotttnesss: 0.708310522057
['Era', 'Impera'] Hotttnesss: 0.705399758021
['Bilal', "L'almagne"] Hotttnesss: 0.70277208813
['Hoboken', 'Beauty Queen'] Hotttnesss: 0.700057178504
['Jesca Hoop', 'Silverscreen'] Hotttnesss: 0.697480929478
['Michael Crawford', 'It Only Takes A Moment'] Hotttnesss: 0.694955392886
['GRAVEWORM', 'Suicide Code'] Hotttnesss: 0.692559540415
["Caribou (formerly Dan Snaith's Manitoba)", 'Tits & Ass: The Great Canadian Weekend'] Hotttnesss: 0.690288229894
['Blur', 'Sing'] Hotttnesss: 0.687759716257
['Ojos De Brujo', 'Zambra'] Hotttnesss: 0.685483518941
['John Vanderslice', 'Hard Times'] Hotttnesss: 0.683129716224
['Balkan Beat Box', 'Marcha De la Vida'] Hotttnesss: 0.681094572011
['Jeru The Damaja', 'Seinfeld'] Hotttnesss: 0.678918045265
['Maj Karma', 'Sid ja Nancy'] Hotttnesss: 0.676754002877
['Macy Gray', 'Jesus For A Day'] Hotttnesss: 0.674640755004
['Brian Bromberg', 'Choices'] Hotttnesss: 0.67255857403
['Gustavo Cerati', 'Deja Vu'] Hotttnesss: 0.670469485253
['Television', 'Torn Curtain (Remastered LP Version)'] Hotttnesss: 0.66841936937
['Wilson Phillips', 'The Dream is Still Alive'] Hotttnesss: 0.666465073085
In [6]:
# and see how the hotttnesss are distributed
# NOTE(review): `hist` is not imported anywhere visible — presumably
# provided by %pylab; confirm the notebook was started in pylab mode.
hist(track_to_hotttnesss.values(), bins=20)
pass

Now let's get the training split

In [7]:
def get_tracks(filename):
    """Read track IDs from a tab-separated track list.

    Each line is expected to start with a track ID, optionally followed
    by a tab and extra fields; only the first field is kept.

    Parameters
    ----------
    filename : str
        Path to the tab-separated track list file.

    Returns
    -------
    list of str
        Track IDs in file order.
    """
    tracks = list()
    # text mode ('r'), not binary: the file is line-oriented text
    with open(filename, 'r') as f:
        for line in f:
            tracks.append(line.split('\t')[0].strip())
    return tracks
In [8]:
# these 2 files are created in processLastfmTags.ipynb
# these 2 files are created in processLastfmTags.ipynb
# Train/test split of track IDs; each line starts with a track ID.
train_tracks = get_tracks('tracks_tag_train.num')
test_tracks = get_tracks('tracks_tag_test.num')
In [9]:
# Restrict the hotttnesss map to training tracks that actually have a
# known hotttnesss value.
train_track_to_hotttnesss = {track: track_to_hotttnesss[track]
                             for track in train_tracks
                             if track in track_to_hotttnesss}
In [10]:
# distribution of hotttnesss within the training split
# NOTE(review): `hist` presumably comes from %pylab — confirm.
hist(train_track_to_hotttnesss.values(), bins=20)
pass
In [11]:
# Randomly pick 24000 tracks with non-zero hotttnesss and 1000 with zero
# hotttnesss from the training split; their union is the codebook
# training set. Seeded for reproducibility.
np.random.seed(98765)
nonzero_IDs = [t for t in train_track_to_hotttnesss.keys()
               if train_track_to_hotttnesss[t] != 0.0]
zero_IDs = [t for t in train_track_to_hotttnesss.keys()
            if train_track_to_hotttnesss[t] == 0.0]
tracks_nzhotttnesss = np.random.choice(nonzero_IDs, size=24000, replace=False)
tracks_zhotttnesss = np.random.choice(zero_IDs, size=1000, replace=False)
tracks_VQ = np.hstack((tracks_nzhotttnesss, tracks_zhotttnesss)) 
In [12]:
def data_generator(msd_data_root, tracks, shuffle=True, ext='.h5'):
    """Yield the segments_timbre feature matrix for each track.

    Parameters
    ----------
    msd_data_root : str
        Root of the MSD 'data' tree.
    tracks : array-like of str
        Track IDs. NOTE: when shuffle=True this array is shuffled
        IN PLACE, so the caller's ordering changes on every pass —
        build_codewords relies on this to reshuffle between iterations.
    shuffle : bool
        Shuffle both the track order and the rows of each feature matrix.
    ext : str
        Per-track file extension.

    Yields
    ------
    The per-track segments_timbre array from hdf5_getters (rows are
    segments; row order is shuffled when shuffle=True).
    """
    if shuffle:
        np.random.shuffle(tracks)
    for track_ID in tracks:
        # MSD shards files by characters 2-4 of the track ID
        track_dir = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + ext)
        h5 = hdf5_getters.open_h5_file_read(track_dir)
        mfcc = hdf5_getters.get_segments_timbre(h5)
        h5.close()
        if shuffle:
            # row shuffle is safe: mfcc is freshly read for this track
            np.random.shuffle(mfcc)
        yield mfcc
In [13]:
def build_codewords(msd_data_root, tracks, cluster=None, n_clusters=2, max_iter=10, random_state=None):
    """Fit an online clustering codebook over per-segment timbre features.

    Parameters
    ----------
    msd_data_root : str
        Root of the MSD 'data' tree.
    tracks : array-like of str
        Track IDs; shuffled in place on every pass by data_generator.
    cluster : object or None
        Pre-built clusterer exposing partial_fit; when None a fresh
        HartiganOnline instance is created.
    n_clusters : int
        Codebook size (only used when `cluster` is None).
    max_iter : int
        Number of full passes over the data.
    random_state : int, tuple, or None
        int -> np.random.seed; any other non-None value is assumed to be
        a full RNG state tuple as returned by np.random.get_state();
        None leaves the global RNG untouched.

    Returns
    -------
    The fitted clusterer.
    """
    if type(random_state) is int:
        np.random.seed(random_state)
    elif random_state is not None:
        np.random.setstate(random_state)
        
    if cluster is None:    
        cluster = HartiganOnline.HartiganOnline(n_clusters=n_clusters)
    
    for i in xrange(max_iter):
        print 'Iteration %d: passing through the data...' % (i+1)
        for d in data_generator(msd_data_root, tracks):
            cluster.partial_fit(d)
    return cluster
In [ ]:
K = 512  # codebook size
# 3 full passes over the 25k sampled tracks; this is the expensive step
cluster = build_codewords(MSD_DATA_ROOT, tracks_VQ, n_clusters=K, max_iter=3, random_state=98765)
Iteration 3: passing through the data...
In [17]:
# visualize the learned codebook (cluster centers, transposed so
# feature dimensions run vertically and codewords horizontally)
# NOTE(review): figure/imshow/colorbar/cm presumably from %pylab — confirm.
figure(figsize=(22, 4))
imshow(cluster.cluster_centers_.T, cmap=cm.PuOr_r, aspect='auto', interpolation='nearest')
colorbar()
Out[17]:
<matplotlib.colorbar.Colorbar instance at 0x130743b0>
In [ ]:
# persist the fitted codebook so quantization can run in a later session
with open('Codebook_K%d_Hartigan.cPickle' % K, 'wb') as f:
    pickle.dump(cluster, f)

Vector Quantize MSD

In [36]:
# reload the pickled codebook and wrap it in a VectorQuantizer
with open('Codebook_K%d_Hartigan.cPickle' % K, 'rb') as f:
    cluster = pickle.load(f)

vq = VectorQuantizer.VectorQuantizer(clusterer=cluster)
# Manually populate fitted attributes: half squared norms of the centers
# and the centers themselves. NOTE(review): presumably these are what
# VectorQuantizer.transform expects — verify against VectorQuantizer source.
vq.center_norms_ = 0.5 * (vq.clusterer.cluster_centers_**2).sum(axis=1)
vq.components_ = vq.clusterer.cluster_centers_
In [136]:
def quantize_and_save(vq, K, msd_data_root, track_ID):
    """Vector-quantize one track's timbre features and save the histogram.

    Loads the segments_timbre matrix for `track_ID`, sums the per-segment
    codeword assignments from `vq.transform` into a K-bin histogram, and
    writes it to vq_hist/<shard>/<track_ID>_K<K>.npy.

    Parameters
    ----------
    vq : VectorQuantizer
        Fitted quantizer whose transform maps segments to codewords.
    K : int
        Codebook size (used only in the output filename).
    msd_data_root : str
        Root of the MSD 'data' tree.
    track_ID : str
        MSD track ID (characters 2-4 determine the shard directory).
    """
    track_dir = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + '.h5')
    h5 = hdf5_getters.open_h5_file_read(track_dir)
    try:
        mfcc = hdf5_getters.get_segments_timbre(h5)
    finally:
        # always release the handle, even if the getter raises
        h5.close()

    vq_hist = vq.transform(mfcc).sum(axis=0).astype(np.int16)
    tdir = os.path.join('vq_hist', '/'.join(track_ID[2:5]))
    try:
        # create directly and tolerate EEXIST: the exists()-then-makedirs
        # pattern races when several joblib workers hit the same shard
        os.makedirs(tdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    np.save(os.path.join(tdir, track_ID + '_K%d' % K), vq_hist)
In [ ]:
# quantize every train and test track in parallel; each call reads one
# .h5 file and writes one .npy histogram
n_jobs = 5
Parallel(n_jobs=n_jobs)(delayed(quantize_and_save)(vq, K, MSD_DATA_ROOT, track_ID) 
                        for track_ID in itertools.chain(train_tracks, test_tracks))
      0 tracks processed
   1000 tracks processed
   2000 tracks processed
   3000 tracks processed
   4000 tracks processed
   5000 tracks processed
   6000 tracks processed
   7000 tracks processed
   8000 tracks processed
   9000 tracks processed
  10000 tracks processed
  11000 tracks processed
  12000 tracks processed
  13000 tracks processed
  14000 tracks processed
  15000 tracks processed
  16000 tracks processed
  17000 tracks processed
  18000 tracks processed
  19000 tracks processed
  20000 tracks processed
  21000 tracks processed
  22000 tracks processed
  23000 tracks processed
  24000 tracks processed
  25000 tracks processed
  26000 tracks processed
  27000 tracks processed
  28000 tracks processed
  29000 tracks processed
  30000 tracks processed
  31000 tracks processed
  32000 tracks processed
  33000 tracks processed
  34000 tracks processed
  35000 tracks processed
  36000 tracks processed
  37000 tracks processed
  38000 tracks processed
  39000 tracks processed
  40000 tracks processed
  41000 tracks processed
  42000 tracks processed
  43000 tracks processed
  44000 tracks processed
  45000 tracks processed
  46000 tracks processed
  47000 tracks processed
  48000 tracks processed
  49000 tracks processed
  50000 tracks processed
  51000 tracks processed
  52000 tracks processed
  53000 tracks processed
  54000 tracks processed
  55000 tracks processed
  56000 tracks processed
  57000 tracks processed
  58000 tracks processed
  59000 tracks processed
  60000 tracks processed
  61000 tracks processed
  62000 tracks processed
  63000 tracks processed
  64000 tracks processed
  65000 tracks processed
  66000 tracks processed
  67000 tracks processed
  68000 tracks processed
  69000 tracks processed
  70000 tracks processed
  71000 tracks processed
  72000 tracks processed
  73000 tracks processed
  74000 tracks processed
  75000 tracks processed
  76000 tracks processed
  77000 tracks processed
  78000 tracks processed
  79000 tracks processed
  80000 tracks processed
  81000 tracks processed
  82000 tracks processed
  83000 tracks processed
  84000 tracks processed
  85000 tracks processed
  86000 tracks processed
  87000 tracks processed
  88000 tracks processed
  89000 tracks processed
  90000 tracks processed
  91000 tracks processed
  92000 tracks processed
  93000 tracks processed
  94000 tracks processed
  95000 tracks processed
  96000 tracks processed
  97000 tracks processed
  98000 tracks processed
  99000 tracks processed
 100000 tracks processed
 101000 tracks processed
 102000 tracks processed
 103000 tracks processed
 104000 tracks processed
 105000 tracks processed
 106000 tracks processed
 107000 tracks processed
 108000 tracks processed
 109000 tracks processed
 110000 tracks processed
 111000 tracks processed
 112000 tracks processed
 113000 tracks processed
 114000 tracks processed
 115000 tracks processed
 116000 tracks processed
 117000 tracks processed
 118000 tracks processed
 119000 tracks processed
 120000 tracks processed
 121000 tracks processed
 122000 tracks processed
 123000 tracks processed
 124000 tracks processed
 125000 tracks processed
 126000 tracks processed
 127000 tracks processed
 128000 tracks processed
 129000 tracks processed
 130000 tracks processed
 131000 tracks processed
 132000 tracks processed
 133000 tracks processed
 134000 tracks processed
 135000 tracks processed
 136000 tracks processed
 137000 tracks processed
 138000 tracks processed
 139000 tracks processed
 140000 tracks processed
 141000 tracks processed
 142000 tracks processed
 143000 tracks processed
 144000 tracks processed
 145000 tracks processed
 146000 tracks processed
 147000 tracks processed
 152000 tracks processed
 153000 tracks processed
 154000 tracks processed
 155000 tracks processed
 156000 tracks processed
 157000 tracks processed
 158000 tracks processed
 159000 tracks processed
 160000 tracks processed
 161000 tracks processed
 162000 tracks processed
 163000 tracks processed
 164000 tracks processed
 165000 tracks processed
 166000 tracks processed
 167000 tracks processed
 168000 tracks processed
 169000 tracks processed
 170000 tracks processed
 171000 tracks processed
 172000 tracks processed
 173000 tracks processed
 174000 tracks processed
 175000 tracks processed
 176000 tracks processed
 177000 tracks processed
 178000 tracks processed
 179000 tracks processed
 180000 tracks processed
 181000 tracks processed
 182000 tracks processed
 183000 tracks processed
 184000 tracks processed
 185000 tracks processed
 186000 tracks processed
 187000 tracks processed
 188000 tracks processed
 189000 tracks processed
 190000 tracks processed
 191000 tracks processed
 192000 tracks processed
 193000 tracks processed
 194000 tracks processed
 195000 tracks processed
 196000 tracks processed
 197000 tracks processed
 198000 tracks processed
 199000 tracks processed
 200000 tracks processed
 201000 tracks processed