import cPickle as pickle
import errno
import itertools
import json
import math
import operator
import os

import numpy as np
import scipy.sparse

import hdf5_getters
import HartiganOnline, VectorQuantizer
from joblib import Parallel, delayed
# Paths into the Million Song Dataset directory layout.
MSD_DIR = u'/q/boar/boar-p9/MillionSong/'           # dataset root
MSD_DATA_ROOT = os.path.join(MSD_DIR, 'data')       # per-track HDF5 files, sharded A/B/C/
MSD_LFM_ROOT = os.path.join(MSD_DIR, 'Lastfm')      # Last.fm tag data
MSD_ADD = os.path.join(MSD_DIR, 'AdditionalFiles')  # metadata text files, e.g. unique_tracks.txt
# get all the tracks with non-nan hotttnesss
def get_all_song_hotttnesss(msd_dir, ext='.h5') :
track_to_hotttnesss = dict()
msd_data_root = os.path.join(msd_dir, 'data')
with open(os.path.join(msd_dir, 'AdditionalFiles', 'unique_tracks.txt'), 'rb') as f:
for (count, line) in enumerate(f):
track_ID, _, _, _ = line.strip().split('<SEP>')
track_dir = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + ext)
h5 = hdf5_getters.open_h5_file_read(track_dir)
hotttnesss = hdf5_getters.get_song_hotttnesss(h5)
if not math.isnan(hotttnesss):
track_to_hotttnesss[track_ID] = hotttnesss
h5.close()
if not count % 1000:
print "%7d tracks processed" % count
return track_to_hotttnesss
# The full MSD scan is expensive, so the result is cached as JSON on disk:
# recompute only when no cache file exists yet, otherwise just reload it.
if not os.path.exists('track_to_hotttnesss.json'):
    track_to_hotttnesss = get_all_song_hotttnesss(MSD_DIR)
    with open('track_to_hotttnesss.json', 'wb') as f:
        json.dump(track_to_hotttnesss, f)
else:
    with open('track_to_hotttnesss.json', 'rb') as f:
        track_to_hotttnesss = json.load(f)
# See some track-hotttnesss pairs: sort all tracks hotttest-first, then sample
# every 1000th entry among the top 50000 and print artist/title for each.
# NOTE(review): the `!grep` line is IPython shell magic -- this cell only runs
# inside an IPython/Jupyter session, not as a plain Python script.
track_to_hotttnesss_ordered = sorted(track_to_hotttnesss.iteritems(), key=operator.itemgetter(1), reverse=True)
for i in xrange(0, 50000, 1000):
    track_ID = track_to_hotttnesss_ordered[i][0]
    hotttnesss = track_to_hotttnesss_ordered[i][1]
    # Shell out to grep; fields [2:4] of unique_tracks.txt are artist and title.
    out = !grep "$track_ID" "$MSD_ADD"/unique_tracks.txt
    print out[0].strip().split('<SEP>')[2:4], 'Hotttnesss:', hotttnesss
['Train', "If It's Love"] Hotttnesss: 1.0 ['NEEDTOBREATHE', "Lay 'Em Down (Album Version)"] Hotttnesss: 0.910616754441 ['MGMT', 'Siberian Breaks'] Hotttnesss: 0.876462939093 ['The Clash', 'Rock The Casbah'] Hotttnesss: 0.8562455897 ['NEEDTOBREATHE', 'Again (Album Version)'] Hotttnesss: 0.841328129539 ['Air Traffic', 'Never Even Told Me Her Name'] Hotttnesss: 0.829882708577 ['Foreigner', 'Waiting For A Girl Like You'] Hotttnesss: 0.819999142075 ['The Hoosiers', 'Everything Goes Dark'] Hotttnesss: 0.811719327712 ['Born Of Osiris', 'Empires Erased (feat. NO)'] Hotttnesss: 0.803972937132 ['Dido', 'Honestly Ok'] Hotttnesss: 0.796855414613 ['The Rascals', 'People Got To Be Free'] Hotttnesss: 0.790198144188 ['Devotchka', 'Such A Lovely Thing'] Hotttnesss: 0.784054133144 ['The Magnetic Fields', 'Painted Flower'] Hotttnesss: 0.778348356952 ['Red Hot Chili Peppers', 'Parallel Universe (Album Version)'] Hotttnesss: 0.773292762211 ['The Beta Band', 'Round The Bend'] Hotttnesss: 0.768472306887 ['Bob Welch', 'Sentimental Lady'] Hotttnesss: 0.763751073732 ['Bayside', '(Pop)Ular SciencE (Album Version)'] Hotttnesss: 0.759397620647 ['The Classic Crime', 'Gravedigging'] Hotttnesss: 0.755067830168 ['Texas', 'Put Your Arms Around Me'] Hotttnesss: 0.750777674798 ['Bond', 'Kashmir'] Hotttnesss: 0.746971042733 ['Maxwell', 'For Lovers Only'] Hotttnesss: 0.743160541216 ['Hayley Westenra', 'Wuthering Heights'] Hotttnesss: 0.739326575779 ['Hombres G', 'Lo noto (Directo 2003)'] Hotttnesss: 0.735836516521 ['Type O Negative', 'Dead Again'] Hotttnesss: 0.732492083918 ['Boxcutter', 'Kaleid'] Hotttnesss: 0.729125625342 ['The Greenhornes', 'Satisfy My Mind'] Hotttnesss: 0.725975540319 ['Patti Smith Group', 'Ask the Angels'] Hotttnesss: 0.722844727957 ['PATY CANTU', 'D\xc3\xa9jame Ir'] Hotttnesss: 0.719756606943 ['Mad Caddies', 'Leavin'] Hotttnesss: 0.716778766173 ['At The Gates', 'All Life Ends - live'] Hotttnesss: 0.713848223098 ['Whitechapel', 'Reprogrammed to Hate'] Hotttnesss: 0.710887119948 
['Refused', 'Worthless Is The Freedom Bought...'] Hotttnesss: 0.708310522057 ['Era', 'Impera'] Hotttnesss: 0.705399758021 ['Bilal', "L'almagne"] Hotttnesss: 0.70277208813 ['Hoboken', 'Beauty Queen'] Hotttnesss: 0.700057178504 ['Jesca Hoop', 'Silverscreen'] Hotttnesss: 0.697480929478 ['Michael Crawford', 'It Only Takes A Moment'] Hotttnesss: 0.694955392886 ['GRAVEWORM', 'Suicide Code'] Hotttnesss: 0.692559540415 ["Caribou (formerly Dan Snaith's Manitoba)", 'Tits & Ass: The Great Canadian Weekend'] Hotttnesss: 0.690288229894 ['Blur', 'Sing'] Hotttnesss: 0.687759716257 ['Ojos De Brujo', 'Zambra'] Hotttnesss: 0.685483518941 ['John Vanderslice', 'Hard Times'] Hotttnesss: 0.683129716224 ['Balkan Beat Box', 'Marcha De la Vida'] Hotttnesss: 0.681094572011 ['Jeru The Damaja', 'Seinfeld'] Hotttnesss: 0.678918045265 ['Maj Karma', 'Sid ja Nancy'] Hotttnesss: 0.676754002877 ['Macy Gray', 'Jesus For A Day'] Hotttnesss: 0.674640755004 ['Brian Bromberg', 'Choices'] Hotttnesss: 0.67255857403 ['Gustavo Cerati', 'Deja Vu'] Hotttnesss: 0.670469485253 ['Television', 'Torn Curtain (Remastered LP Version)'] Hotttnesss: 0.66841936937 ['Wilson Phillips', 'The Dream is Still Alive'] Hotttnesss: 0.666465073085
# And see how the hotttnesss values are distributed across all tracks.
# NOTE(review): `hist` is never imported here -- presumably injected by
# %pylab / matplotlib in the notebook namespace; confirm before running
# outside the notebook.
hist(track_to_hotttnesss.values(), bins=20)
pass
def get_tracks(filename):
    """Read a tab-separated track listing and return the track IDs.

    Each line is expected to start with a track ID followed by a tab and
    further fields; only the first field (stripped) is kept, in file order.

    Fix: the file is now opened in text mode ('r') instead of binary
    ('rb') -- it is parsed with str operations, and under Python 3 binary
    mode would yield bytes and make `line.split('\t')` raise TypeError.

    Parameters
    ----------
    filename : str
        Path to the listing file.

    Returns
    -------
    list of str
        The first tab-separated field of every line.
    """
    with open(filename, 'r') as f:
        return [line.split('\t')[0].strip() for line in f]
# These 2 files are created in processLastfmTags.ipynb: one track ID plus
# tab-separated fields per line, for the train and test splits.
train_tracks = get_tracks('tracks_tag_train.num')
test_tracks = get_tracks('tracks_tag_test.num')
# Keep only the training tracks that actually have a (non-NaN) hotttnesss.
train_track_to_hotttnesss = dict((track, track_to_hotttnesss[track])
                                 for track in filter(lambda x: x in track_to_hotttnesss, train_tracks))
# Distribution of hotttnesss within the training subset (pylab `hist`).
hist(train_track_to_hotttnesss.values(), bins=20)
pass
# Randomly select 24000 non-zero-hotttnesss tracks and 1000 zero-hotttnesss
# tracks from the training split; these form the codebook-training sample.
np.random.seed(98765)  # fixed seed for a reproducible sample
tracks_nzhotttnesss = np.random.choice(filter(lambda x: train_track_to_hotttnesss[x] != 0.0, train_track_to_hotttnesss.keys()),
                                       size=24000, replace=False)
tracks_zhotttnesss = np.random.choice(filter(lambda x: train_track_to_hotttnesss[x] == 0.0, train_track_to_hotttnesss.keys()),
                                      size=1000, replace=False)
# Single array of track IDs to feed the vector-quantization training.
tracks_VQ = np.hstack((tracks_nzhotttnesss, tracks_zhotttnesss))
def data_generator(msd_data_root, tracks, shuffle=True, ext='.h5'):
    """Yield the segments_timbre matrix of each track in `tracks`.

    Parameters
    ----------
    msd_data_root : str
        Root of the per-track HDF5 tree (sharded A/B/C by ID chars 2-4).
    tracks : sequence of str
        Track IDs to read.  NOTE: when `shuffle` is True this sequence is
        shuffled IN PLACE before iteration.
    shuffle : bool
        Also shuffles the rows of every yielded feature matrix in place.
    ext : str
        Extension of the per-track HDF5 files.
    """
    if shuffle:
        np.random.shuffle(tracks)
    for tid in tracks:
        h5_path = os.path.join(msd_data_root, '/'.join(tid[2:5]), tid + ext)
        handle = hdf5_getters.open_h5_file_read(h5_path)
        features = hdf5_getters.get_segments_timbre(handle)
        handle.close()
        if shuffle:
            np.random.shuffle(features)
        yield features
def build_codewords(msd_data_root, tracks, cluster=None, n_clusters=2, max_iter=10, random_state=None):
    """Fit (or continue fitting) an online clustering model over track features.

    Makes `max_iter` full passes over the data stream produced by
    data_generator and partial-fits `cluster` on each feature matrix.

    Parameters
    ----------
    msd_data_root : str
        Root of the per-track HDF5 tree.
    tracks : sequence of str
        Track IDs to train on (shuffled in place by data_generator).
    cluster : object or None
        An estimator with partial_fit(); a fresh HartiganOnline model with
        `n_clusters` centers is created when None.
    n_clusters : int
        Codebook size used only when `cluster` is None.
    max_iter : int
        Number of passes over the data.
    random_state : int, RNG state tuple, or None
        int -> seed numpy's RNG; non-int, non-None -> treated as a full
        numpy RNG state.  Fix: use isinstance() instead of `type(...) is
        int`, so int subclasses / numpy-converted ints seed correctly
        instead of falling into setstate() and crashing.

    Returns
    -------
    The fitted `cluster` object.
    """
    if isinstance(random_state, int):
        np.random.seed(random_state)
    elif random_state is not None:
        np.random.setstate(random_state)
    if cluster is None:
        cluster = HartiganOnline.HartiganOnline(n_clusters=n_clusters)
    for i in xrange(max_iter):
        # Parenthesized print works under both Python 2 and 3.
        print('Iteration %d: passing through the data...' % (i + 1))
        for batch in data_generator(msd_data_root, tracks):
            cluster.partial_fit(batch)
    return cluster
# Train a K-entry codebook over the sampled tracks with 3 full passes.
K = 512
cluster = build_codewords(MSD_DATA_ROOT, tracks_VQ, n_clusters=K, max_iter=3, random_state=98765)
# NOTE(review): the next line is captured notebook OUTPUT, not code.
Iteration 3: passing through the data...
# Visualize the learned cluster centers (pylab figure/imshow/colorbar,
# presumably injected by %pylab in the notebook namespace).
figure(figsize=(22, 4))
imshow(cluster.cluster_centers_.T, cmap=cm.PuOr_r, aspect='auto', interpolation='nearest')
colorbar()
# NOTE(review): the next line is captured notebook OUTPUT, not code.
<matplotlib.colorbar.Colorbar instance at 0x130743b0>
# Persist the trained codebook with cPickle, then reload it -- allows
# resuming from here without retraining.
with open('Codebook_K%d_Hartigan.cPickle' % K, 'wb') as f:
    pickle.dump(cluster, f)
with open('Codebook_K%d_Hartigan.cPickle' % K, 'rb') as f:
    cluster = pickle.load(f)
# Wrap the clusterer in a VectorQuantizer and precompute the attributes its
# transform() uses: half the squared norms of the centers, and the centers
# themselves as components_.
vq = VectorQuantizer.VectorQuantizer(clusterer=cluster)
vq.center_norms_ = 0.5 * (vq.clusterer.cluster_centers_**2).sum(axis=1)
vq.components_ = vq.clusterer.cluster_centers_
def quantize_and_save(vq, K, msd_data_root, track_ID):
    """Quantize one track's timbre features and save the codeword histogram.

    Reads the track's segments_timbre matrix, maps every frame to its
    codeword via `vq.transform`, sums into a per-codeword count histogram
    (int16), and writes it to vq_hist/<A>/<B>/<C>/<track_ID>_K<K>.npy.

    Parameters
    ----------
    vq : VectorQuantizer
        Fitted quantizer whose transform() one-hot encodes frames.
    K : int
        Codebook size; only used in the output filename.
    msd_data_root : str
        Root of the per-track HDF5 tree.
    track_ID : str
        MSD track ID (chars 2-4 determine the shard directory).
    """
    track_path = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + '.h5')
    h5 = hdf5_getters.open_h5_file_read(track_path)
    try:
        mfcc = hdf5_getters.get_segments_timbre(h5)
    finally:
        # Always release the HDF5 handle, even if the read raises.
        h5.close()
    vq_hist = vq.transform(mfcc).sum(axis=0).astype(np.int16)
    tdir = os.path.join('vq_hist', '/'.join(track_ID[2:5]))
    # This function runs in several parallel workers (joblib), so a
    # check-then-create on the directory races: create unconditionally and
    # tolerate "already exists", re-raising any other failure.
    try:
        os.makedirs(tdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    np.save(os.path.join(tdir, track_ID + '_K%d' % K), vq_hist)
# Quantize every train and test track in parallel with 5 joblib workers.
# Each call writes its own .npy file, so workers share no mutable state.
n_jobs = 5
Parallel(n_jobs=n_jobs)(delayed(quantize_and_save)(vq, K, MSD_DATA_ROOT, track_ID)
                        for track_ID in itertools.chain(train_tracks, test_tracks))
0 tracks processed 1000 tracks processed 2000 tracks processed 3000 tracks processed 4000 tracks processed 5000 tracks processed 6000 tracks processed 7000 tracks processed 8000 tracks processed 9000 tracks processed 10000 tracks processed 11000 tracks processed 12000 tracks processed 13000 tracks processed 14000 tracks processed 15000 tracks processed 16000 tracks processed 17000 tracks processed 18000 tracks processed 19000 tracks processed 20000 tracks processed 21000 tracks processed 22000 tracks processed 23000 tracks processed 24000 tracks processed 25000 tracks processed 26000 tracks processed 27000 tracks processed 28000 tracks processed 29000 tracks processed 30000 tracks processed 31000 tracks processed 32000 tracks processed 33000 tracks processed 34000 tracks processed 35000 tracks processed 36000 tracks processed 37000 tracks processed 38000 tracks processed 39000 tracks processed 40000 tracks processed 41000 tracks processed 42000 tracks processed 43000 tracks processed 44000 tracks processed 45000 tracks processed 46000 tracks processed 47000 tracks processed 48000 tracks processed 49000 tracks processed 50000 tracks processed 51000 tracks processed 52000 tracks processed 53000 tracks processed 54000 tracks processed 55000 tracks processed 56000 tracks processed 57000 tracks processed 58000 tracks processed 59000 tracks processed 60000 tracks processed 61000 tracks processed 62000 tracks processed 63000 tracks processed 64000 tracks processed 65000 tracks processed 66000 tracks processed 67000 tracks processed 68000 tracks processed 69000 tracks processed 70000 tracks processed 71000 tracks processed 72000 tracks processed 73000 tracks processed 74000 tracks processed 75000 tracks processed 76000 tracks processed 77000 tracks processed 78000 tracks processed 79000 tracks processed 80000 tracks processed 81000 tracks processed 82000 tracks processed 83000 tracks processed 84000 tracks processed 85000 tracks processed 86000 tracks processed 87000 
tracks processed 88000 tracks processed 89000 tracks processed 90000 tracks processed 91000 tracks processed 92000 tracks processed 93000 tracks processed 94000 tracks processed 95000 tracks processed 96000 tracks processed 97000 tracks processed 98000 tracks processed 99000 tracks processed 100000 tracks processed 101000 tracks processed 102000 tracks processed 103000 tracks processed 104000 tracks processed 105000 tracks processed 106000 tracks processed 107000 tracks processed 108000 tracks processed 109000 tracks processed 110000 tracks processed 111000 tracks processed 112000 tracks processed 113000 tracks processed 114000 tracks processed 115000 tracks processed 116000 tracks processed 117000 tracks processed 118000 tracks processed 119000 tracks processed 120000 tracks processed 121000 tracks processed 122000 tracks processed 123000 tracks processed 124000 tracks processed 125000 tracks processed 126000 tracks processed 127000 tracks processed 128000 tracks processed 129000 tracks processed 130000 tracks processed 131000 tracks processed 132000 tracks processed 133000 tracks processed 134000 tracks processed 135000 tracks processed 136000 tracks processed 137000 tracks processed 138000 tracks processed 139000 tracks processed 140000 tracks processed 141000 tracks processed 142000 tracks processed 143000 tracks processed 144000 tracks processed 145000 tracks processed 146000 tracks processed 147000 tracks processed 152000 tracks processed 153000 tracks processed 154000 tracks processed 155000 tracks processed 156000 tracks processed 157000 tracks processed 158000 tracks processed 159000 tracks processed 160000 tracks processed 161000 tracks processed 162000 tracks processed 163000 tracks processed 164000 tracks processed 165000 tracks processed 166000 tracks processed 167000 tracks processed 168000 tracks processed 169000 tracks processed 170000 tracks processed 171000 tracks processed 172000 tracks processed 173000 tracks processed 174000 tracks processed 
175000 tracks processed 176000 tracks processed 177000 tracks processed 178000 tracks processed 179000 tracks processed 180000 tracks processed 181000 tracks processed 182000 tracks processed 183000 tracks processed 184000 tracks processed 185000 tracks processed 186000 tracks processed 187000 tracks processed 188000 tracks processed 189000 tracks processed 190000 tracks processed 191000 tracks processed 192000 tracks processed 193000 tracks processed 194000 tracks processed 195000 tracks processed 196000 tracks processed 197000 tracks processed 198000 tracks processed 199000 tracks processed 200000 tracks processed 201000 tracks processed