# Notebook cell command (not plain Python): installs the quilt client used below.
pip install quilt
import quilt
# Package handle passed to every quilt.* call in this notebook.
pkg_name = 'iconix/deephypebot'
# start with an empty package -- revision1
# (left commented out here; the package is built further down once it holds data)
#quilt.build(pkg_name)
import os
import pandas as pd

# The reviews JSON lives in a sibling 'datasets' directory, resolved relative
# to the directory the notebook is run from.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, os.pardir, 'datasets')
reviews_f = 'reviews_and_metadata'
reviews_5yrs_f = os.path.join(DATA_DIR, f'{reviews_f}_5yrs.json')

# Load the whole file into a DataFrame, report its row count, preview the top rows.
reviews_5yrs_df = pd.read_json(reviews_5yrs_f)
print(f'num_reviews: {reviews_5yrs_df.shape[0]}')
reviews_5yrs_df.head()
num_reviews: 25160
artist | artist_id | audio_features | author | content | date_published | dek | desc | direction | domain | ... | pages_rendered | post_title | posturl | rendered_pages | sitename | song_title | spotify_genres | spotify_id | total_pages | word_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | {'danceability': 0.486, 'energy': 0.638, 'key'... | None | New Music\nMt. Joy reached out to us with the ... | 2017-11-20T13:33:10.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.buffablog.com | ... | NaN | Mt. Joy - "Silver Lining" | http://www.buffablog.com/mt-joy-silver-lining/ | 1.0 | buffaBLOG | Silver Lining | [deep new americana, folk-pop, indie pop, mode... | 2fpDrL2Vpee0JnM6AoUFvl | 1 | 86 |
1 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | {'danceability': 0.486, 'energy': 0.638, 'key'... | Martin | Folk rockers Mt. Joy have debuted their new so... | 2017-11-16T00:00:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.discobelle.net | ... | NaN | Mt. Joy – Silver Lining | Discobelle.net | http://www.discobelle.net/2017/11/16/mt-joy-si... | 1.0 | Discobelle | Silver Lining | [deep new americana, folk-pop, indie pop, mode... | 2fpDrL2Vpee0JnM6AoUFvl | 1 | 83 |
2 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | {'danceability': 0.486, 'energy': 0.638, 'key'... | Nasko | You know we're digging Mt. Joy.\nTheir new sin... | 2017-11-14T14:30:26.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.stereofox.com | ... | NaN | Mt. Joy - Silver Lining | Stereofox Music Blog | https://www.stereofox.com/mt-joy-silver-lining/ | 1.0 | Stereofox | Silver Lining | [deep new americana, folk-pop, indie pop, mode... | 2fpDrL2Vpee0JnM6AoUFvl | 1 | 75 |
3 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | {'danceability': 0.486, 'energy': 0.638, 'key'... | D & d | Nothing against the profession, but the U.S. h... | 2018-06-20T10:53:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | indieobsessive.blogspot.com | ... | NaN | “Silver Lining” by Mt. Joy – A Song Review | http://indieobsessive.blogspot.com/2017/11/sil... | 1.0 | Indie Obsessive | Silver Lining | [deep new americana, folk-pop, indie pop, mode... | 2fpDrL2Vpee0JnM6AoUFvl | 1 | 416 |
4 | Opia | 70zHrrrPfBkVV44AEhFyyh | {'danceability': 0.609, 'energy': 0.623, 'key'... | Nat Morawski | Connecticut duo Opia have released a guitar he... | 2018-05-24T00:00:00.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | acidstag.com | ... | NaN | Opia – ‘Four Winds’ | https://acidstag.com/2018/05/opia-four-winds/ | 1.0 | acid stag | Four Winds | [vapor soul] | 3NPiANHZYahLZhUT00GwTw | 1 | 148 |
5 rows × 32 columns
import json

# https://stackoverflow.com/a/38231651
# quilt cannot handle column 'audio_features' because it is a dict, so:
#   1. expand the selected audio-feature keys into one column each, and
#   2. keep the complete original dict as a JSON string under the original
#      column name.
audio_df = reviews_5yrs_df['audio_features'].apply(pd.Series)[[
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]]
# json.dumps produces real JSON text (str) that json.loads can read back.
# The previous astype('|S') produced ASCII bytes of the dict's Python repr
# (single-quoted keys), which is neither a str nor valid JSON.
str_df = reviews_5yrs_df['audio_features'].apply(json.dumps)
# Swap the dict column for its JSON-string form and append the expanded columns.
reviews_5yrs_df = pd.concat(
    [reviews_5yrs_df.drop(['audio_features'], axis=1), str_df, audio_df],
    axis=1,
)
reviews_5yrs_df.columns
Index(['artist', 'artist_id', 'author', 'content', 'date_published', 'dek', 'desc', 'direction', 'domain', 'error', 'excerpt', 'extract_url', 'failed', 'genius_id', 'genres', 'itemid', 'lang', 'lead_image_url', 'masked_content', 'messages', 'next_page_url', 'pages_rendered', 'post_title', 'posturl', 'rendered_pages', 'sitename', 'song_title', 'spotify_genres', 'spotify_id', 'total_pages', 'word_count', 'audio_features', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'], dtype='object')
# quilt exposes the package as an importable object under quilt.data.<owner>.
from quilt.data.iconix import deephypebot
# put data in it: store the DataFrame under the node name 'reviews_and_metadata_5yrs'
deephypebot._set(['reviews_and_metadata_5yrs'], reviews_5yrs_df)
# Call the node to read the stored DataFrame back (displayed as the cell result).
deephypebot.reviews_and_metadata_5yrs()
artist | artist_id | author | content | date_published | dek | desc | direction | domain | error | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | None | New Music\nMt. Joy reached out to us with the ... | 2017-11-20T13:33:10.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.buffablog.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
1 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Martin | Folk rockers Mt. Joy have debuted their new so... | 2017-11-16T00:00:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.discobelle.net | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
2 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Nasko | You know we're digging Mt. Joy.\nTheir new sin... | 2017-11-14T14:30:26.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.stereofox.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
3 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | D & d | Nothing against the profession, but the U.S. h... | 2018-06-20T10:53:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | indieobsessive.blogspot.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
4 | Opia | 70zHrrrPfBkVV44AEhFyyh | Nat Morawski | Connecticut duo Opia have released a guitar he... | 2018-05-24T00:00:00.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | acidstag.com | NaN | ... | -6.575 | 1.0 | 0.1020 | 0.190000 | 0.000034 | 0.0625 | 0.5820 | 87.974 | 223814.0 | 4.0 |
5 | Opia | 70zHrrrPfBkVV44AEhFyyh | Adeel Amini | Now this is inventive. Not that we'd expect an... | 2018-05-17T11:30:22.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | pressplayok.com | NaN | ... | -6.575 | 1.0 | 0.1020 | 0.190000 | 0.000034 | 0.0625 | 0.5820 | 87.974 | 223814.0 | 4.0 |
6 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Oblivious Pop | Since the very first release from Elohim about... | 2018-06-20T16:01:00.000Z | NaN | \n\n?\n\n | ltr | www.obliviouspop.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
7 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | New Music\nNow this is how you get a party sta... | 2018-05-09T22:44:56.000Z | NaN | \n\n?\n\n | ltr | www.buffablog.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
8 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | This Friday, April 27th, ELOHIM releases her l... | None | NaN | \n\n?\n\n | ltr | www.theautumnroses.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
9 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | What a long way Elohim has come since the rele... | 2018-04-24T17:12:03.000Z | NaN | \n\n?\n\n | ltr | www.highclouds.org | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
10 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Acid Stag | Los Angeles artist ELOHIM will be releasing he... | 2018-04-24T00:00:00.000Z | NaN | \n\n?\n\n | ltr | acidstag.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
11 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Adeel Amini | Elohim is no stranger to full-pelt pop, and it... | 2018-04-23T21:35:58.000Z | NaN | \n\n?\n\n | ltr | pressplayok.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
12 | G Flip | 4SdIXLzfabqU61iK7SnKAU | Breaking More Waves Blog | If you follow Breaking More Waves on Twitter, ... | 2018-06-20T08:00:00.000Z | NaN | \n\n?\n\n | ltr | breakingmorewaves.blogspot.com | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
13 | G Flip | 4SdIXLzfabqU61iK7SnKAU | None | Tags G Flip Georgia Flipo\nBased in London, En... | 2018-05-18T11:25:00.000Z | NaN | \n\n?\n\n | ltr | www.highclouds.org | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
14 | G Flip | 4SdIXLzfabqU61iK7SnKAU | Adeel Amini | G Flip - or Georgia to her fam - should be a n... | 2018-05-16T16:32:46.000Z | NaN | \n\n?\n\n | ltr | pressplayok.com | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
15 | Abalone | 3vxlNs9NCsdlD8xlYIPOj3 | None | Although this year's SPOT Festival - which too... | 2018-05-24T08:06:33.000Z | NaN | \n\n“Sleigh Ride” is the 3rd track on “A Lafac... | ltr | dotsanddashes.co.uk | NaN | ... | -8.162 | 0.0 | 0.0322 | 0.011800 | 0.688000 | 0.1230 | 0.1730 | 111.001 | 317250.0 | 4.0 |
16 | Abalone | 3vxlNs9NCsdlD8xlYIPOj3 | Synths of Eden | It has been a while since we last posted a dow... | 2018-05-11T13:08:38.000Z | NaN | \n\n“Sleigh Ride” is the 3rd track on “A Lafac... | ltr | synthsofeden.com | NaN | ... | -8.162 | 0.0 | 0.0322 | 0.011800 | 0.688000 | 0.1230 | 0.1730 | 111.001 | 317250.0 | 4.0 |
17 | Junip | 7HcipAIJatVGT4U6HQrnFW | None | If like me you were wondering what he was up t... | None | NaN | \n\nThe subject of this song finds themself su... | ltr | www.mp3hugger.com | NaN | ... | -8.033 | 1.0 | 0.0417 | 0.472000 | 0.523000 | 0.1110 | 0.2840 | 90.377 | 339293.0 | 4.0 |
18 | Kevin Morby | 6fxk3UXHTFYET8qCT9WlBF | None | Beautiful Strangers -Kevin Morby\nKevin Morby'... | None | NaN | \n\nAmazing\n\n | ltr | noondaytune.com | NaN | ... | -11.127 | 1.0 | 0.0445 | 0.762000 | 0.068900 | 0.1050 | 0.8910 | 100.042 | 375903.0 | 4.0 |
19 | Kevin Morby | 6fxk3UXHTFYET8qCT9WlBF | The Listening Post Blog | Kevin Morby has released new single 'Beautiful... | 2016-10-27T17:34:33.000Z | NaN | \n\nAmazing\n\n | ltr | thelisteningpostblog.wordpress.com | NaN | ... | -11.127 | 1.0 | 0.0445 | 0.762000 | 0.068900 | 0.1050 | 0.8910 | 100.042 | 375903.0 | 4.0 |
20 | Cosmo's Midnight | 4VivsO1n4n2Mi2Btyb5gfL | Adrien | by Adrien * Published 19 February, 2018 * Upd... | 2018-02-19T20:22:54.000Z | NaN | \n\n?\n\n | ltr | doyoulikethatsong.com | NaN | ... | -11.899 | 1.0 | 0.2350 | 0.119000 | 0.002270 | 0.0782 | 0.4420 | 109.079 | 241651.0 | 4.0 |
21 | Cosmo's Midnight | 4VivsO1n4n2Mi2Btyb5gfL | Zuli | It's been quite a while since these Cosmo's Mi... | 2018-02-17T11:37:47.000Z | NaN | \n\n?\n\n | ltr | www.stereofox.com | NaN | ... | -11.899 | 1.0 | 0.2350 | 0.119000 | 0.002270 | 0.0782 | 0.4420 | 109.079 | 241651.0 | 4.0 |
22 | Gilligan Moss | 2fo0F81pRzdXjmWP6MkQqB | Acid Stag | Gilligan Moss are gearing us up for the weeken... | 2018-05-18T00:00:00.000Z | NaN | \n\n“Want U So Bad” is the second single, writ... | ltr | acidstag.com | NaN | ... | -7.189 | 0.0 | 0.0380 | 0.251000 | 0.061800 | 0.2470 | 0.7120 | 117.016 | 247160.0 | 4.0 |
23 | Teleman | 3wrtQM9ICPPeHwoc1GWiyV | None | London's very-own Teleman has unleashed their ... | 2018-05-16T00:00:00.000Z | NaN | \n\n?\n\n | ltr | www.wickeddchildd.com | NaN | ... | -8.772 | 0.0 | 0.0316 | 0.014400 | 0.300000 | 0.2280 | 0.5620 | 124.003 | 293142.0 | 4.0 |
24 | RÜFÜS / RÜFÜS DU SOL | 5Pb27ujIyYb33zBqVysBkj | Acid Stag | Aussie lads R UFUS dropped a new tune over the... | 2018-05-28T00:00:00.000Z | NaN | \n\nRÜFÜS DU SOL’s first single from their upc... | ltr | acidstag.com | NaN | ... | -6.909 | 0.0 | 0.0316 | 0.015600 | 0.184000 | 0.1870 | 0.2050 | 120.001 | 238008.0 | 4.0 |
25 | RÜFÜS / RÜFÜS DU SOL | 5Pb27ujIyYb33zBqVysBkj | Erin Maher | Recently I had a conversation with a few music... | None | NaN | \n\nRÜFÜS DU SOL’s first single from their upc... | ltr | beautifulbuzzz.com | NaN | ... | -6.909 | 0.0 | 0.0316 | 0.015600 | 0.184000 | 0.1870 | 0.2050 | 120.001 | 238008.0 | 4.0 |
26 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | None | Ben Browning - "Sunshine Baby"\nYou may know B... | 2018-05-29T23:12:32.000Z | NaN | \n\n?\n\n | ltr | www.weallwantsomeone.org | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
27 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | Madison Blom | Kicking off his forthcoming album with the rel... | 2018-05-22T12:35:51.000Z | NaN | \n\n?\n\n | ltr | imperfectfifth.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
28 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | None | Three years after Turns, Australian Melbourne-... | 2018-05-20T08:49:53.000Z | NaN | \n\n?\n\n | ltr | sonofmarketing.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
29 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | Buzz | Ben Browning (Photo by Angel Ceballos)\nCut Co... | 2018-05-17T18:43:18.000Z | NaN | \n\n?\n\n | ltr | buzzbands.la | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
25130 | Disclosure | 6nS5roXSAGhTGr34W6n7Et | Jared Silva | New\nWe've heard a couple collaborations from ... | None | NaN | \n\n“F for You” comes courtesy of Howard Lawre... | ltr | www.allthingsgomusic.com | NaN | ... | -7.426 | 0.0 | 0.0674 | 0.006760 | 0.030300 | 0.1100 | 0.2730 | 124.023 | 269947.0 | 4.0 |
25131 | Disclosure | 6nS5roXSAGhTGr34W6n7Et | George\nO'Brien | You can count the number of days beforeDisclos... | None | NaN | \n\n“F for You” comes courtesy of Howard Lawre... | ltr | None | NaN | ... | -7.426 | 0.0 | 0.0674 | 0.006760 | 0.030300 | 0.1100 | 0.2730 | 124.023 | 269947.0 | 4.0 |
25132 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | None | Schoolboy Q might not be getting the plaudits ... | None | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | www.indieshuffle.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25133 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | Human Drizzle | Schoolboy Q, the rapper behind some of the big... | 2013-05-31T08:14:33.000Z | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | humandrizzle.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25134 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | None | ScHoolboy Q 's latest track stands out to me a... | 2013-05-28T04:50:13.000Z | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | beatspill.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25135 | Empire Of The Sun | 67hb7towEyKvt5Z8Bx306c | None | Off the back of an incredibly succcessful summ... | None | NaN | \n\n?\n\n | ltr | stoneyroads.com | NaN | ... | -5.551 | 1.0 | 0.0917 | 0.001680 | 0.747000 | 0.1630 | 0.0925 | 128.007 | 356719.0 | 4.0 |
25136 | //Fractures | 7sjRnhONmeFL1tmlUvdq70 | Confusion | "Twisted" is the lead single from an EP to be ... | None | NaN | \n\n?\n\n | ltr | pigeonsandplanes.com | NaN | ... | -10.524 | 0.0 | 0.0674 | 0.105000 | 0.411000 | 0.2410 | 0.3850 | 134.138 | 260402.0 | 4.0 |
25137 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | b3 | "Actors" is the debut track from newcomers Sti... | 2013-06-04T18:20:09.000Z | NaN | \n\n?\n\n | ltr | blahblahblahscience.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25138 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | None | Still Parade are superstars in waiting. They s... | None | NaN | \n\n?\n\n | ltr | www.indieshuffle.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25139 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | None | "Actors are so fortunate. They can choose whet... | None | NaN | \n\n?\n\n | ltr | joftheday.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25140 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | Justin McCarthy | New\nGenre: Hip Hop Sounds Like: Drake\nThe la... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.allthingsgomusic.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25141 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | While Kanye West is taking the minimalist, mys... | 2013-05-31T19:22:17.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.idolator.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25142 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | J. Cole 's Born Sinner has been highly anticip... | 2013-05-31T18:00:29.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | earmilk.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25143 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | For whatever reason, there is not much J. Cole... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.indieshuffle.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25144 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | Confusion | With Born Sinner coming June 18, J. Cole drops... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | pigeonsandplanes.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25145 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | J. Cole decides to liberate this new track off... | 2013-05-31T01:59:39.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | nahright.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25146 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | None | Did he make you fing dance? That he did. *Robe... | 2012-11-15T21:27:23.000Z | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | earmilk.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25147 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | b3 | As his first release under the Glassnote label... | 2012-10-31T12:38:01.000Z | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | blahblahblahscience.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25148 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | None | Sometimes a little experimentation is good. An... | None | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | www.indieshuffle.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25149 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | Nelson | NYC-based label Glassnote has quite an extensi... | None | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | www.allthingsgomusic.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25150 | f y f e | 0HdNDZaNm7xLt18v9aWDfe | Converted to dEUS-ism in 2001. Keen on screami... | The brilliant Public Service Broadcasting (sti... | 2013-05-31T17:15:28.000Z | NaN | \n\n?\n\n | ltr | www.frontstagemusic.co.uk | NaN | ... | -26.826 | 1.0 | 0.0580 | 0.996000 | 0.926000 | 0.1170 | 0.8560 | 78.980 | 123000.0 | 3.0 |
25151 | f y f e | 0HdNDZaNm7xLt18v9aWDfe | Neil Wood31 May, 2013 | Ex-David's Lyre man Paul Dixon burst onto the ... | None | NaN | \n\n?\n\n | ltr | www.thefourohfive.com | NaN | ... | -26.826 | 1.0 | 0.0580 | 0.996000 | 0.926000 | 0.1170 | 0.8560 | 78.980 | 123000.0 | 3.0 |
25152 | f y f e | 0HdNDZaNm7xLt18v9aWDfe | b3 | FYFE, a/k/a former David's Lyre frontman Paul ... | 2013-05-30T18:46:33.000Z | NaN | \n\n?\n\n | ltr | blahblahblahscience.com | NaN | ... | -26.826 | 1.0 | 0.0580 | 0.996000 | 0.926000 | 0.1170 | 0.8560 | 78.980 | 123000.0 | 3.0 |
25153 | Sylvester | 5TGTpu4g8siFOIctZuQO7y | Neil Barlow | New\nGenre: Remix Sounds Like: Gigamesh\nMy lo... | None | NaN | \n\n“You Make Me Feel (Mighty Real)” is a clas... | ltr | www.allthingsgomusic.com | NaN | ... | -12.942 | 1.0 | 0.0320 | 0.047800 | 0.328000 | 0.0480 | 0.9630 | 131.573 | 395573.0 | 4.0 |
25154 | Baby Alpaca | 3UEWPRL5vRlNOP1zhRQf71 | None | Baby Alpaca - Wild Child\nBrooklyn's, NY deare... | None | NaN | \n\n?\n\n | ltr | soundinjections.gr | NaN | ... | -8.202 | 1.0 | 0.0520 | 0.727000 | 0.000621 | 0.0929 | 0.0616 | 99.939 | 352227.0 | 4.0 |
25155 | Baby Alpaca | 3UEWPRL5vRlNOP1zhRQf71 | b3 | New York band Baby Alpaca ably blend current w... | 2013-05-23T21:14:09.000Z | NaN | \n\n?\n\n | ltr | blahblahblahscience.com | NaN | ... | -8.202 | 1.0 | 0.0520 | 0.727000 | 0.000621 | 0.0929 | 0.0616 | 99.939 | 352227.0 | 4.0 |
25156 | Two Door Cinema Club | 536BYVgOnRky0xjsPT96zl | None | After a couple false starts, RAC has officiall... | None | NaN | \n\nThis song is about homesickness and Alex T... | ltr | www.indieshuffle.com | NaN | ... | -6.498 | 1.0 | 0.0478 | 0.280000 | 0.000000 | 0.0913 | 0.3920 | 123.056 | 286093.0 | 4.0 |
25157 | Adrian Lux | 5kp9Qhzri9LrDkzrtjt5Sh | None | Back to TopMusic\nWritten by James LedgerMay 2... | None | NaN | \n\n?\n\n | ltr | stoneyroads.com | NaN | ... | -4.369 | 0.0 | 0.0363 | 0.002840 | 0.001970 | 0.1360 | 0.3910 | 128.048 | 166253.0 | 4.0 |
25158 | T.I. feat. Lil Wayne | 4OBJLual30L7gRl5UkeRcT | Confusion | T.I. and Lil Wayne are kicking off the America... | None | NaN | \n\n?\n\n | ltr | pigeonsandplanes.com | NaN | ... | -4.737 | 1.0 | 0.2290 | 0.029300 | 0.000000 | 0.0709 | 0.3880 | 142.037 | 305080.0 | 4.0 |
25159 | The Smiths | 3yY2gUcIsjMr8hjo51PoJ8 | None | My love and adoration for The Smiths dates all... | None | NaN | \n\nThe Smiths' second single of 1983, “This C... | ltr | www.indieshuffle.com | NaN | ... | -5.900 | 1.0 | 0.0436 | 0.005090 | 0.000000 | 0.0784 | 0.7510 | 103.912 | 162920.0 | 4.0 |
25160 rows × 45 columns
# revision2 - using https://github.com/iconix/openai/blob/0798aa240dd141cb3e66f54f5715dfc02caa3e22/datasets/reviews_and_metadata_5yrs.json
# Build the package from the package object that now holds the reviews DataFrame.
quilt.build(pkg_name, deephypebot)
# log in and push to the registry
# (login is interactive: it tries to open a browser and prompts for a code)
quilt.login()
# is_public=True is passed so the pushed package is public.
quilt.push(pkg_name, is_public=True)
Launching a web browser... If that didn't work, please visit the following URL: https://pkg.quiltdata.com/login Failed to launch the browser: Command '['xdg-open', 'https://pkg.quiltdata.com/login']' returned non-zero exit status 3. Enter the code from the webpage: eyJjb2RlIjogImMwZGYxNzI5LWY4YzMtNDA0MS05YmNjLWEyZjkyZjZjNmU1NyIsICJpZCI6ICJkMzI3ZTdkYS1lZDFiLTRiMWMtYTFlMC1lM2ZkNzk4NDJlMzkifQ== Fetching upload URLs from the registry...
0%| | 0.00/36.8M [00:00<?, ?B/s]
Uploading 1 fragments (36757347 bytes)...
100%|██████████| 36.8M/36.8M [00:03<00:00, 12.2MB/s]
Uploading package metadata... Updating the 'latest' tag... Push complete. iconix/deephypebot is live: https://quiltdata.com/package/iconix/deephypebot
# Show the package's node tree, with the shape and column types of each node.
quilt.inspect(pkg_name)
/home/ubuntu/.local/share/QuiltCli/quilt_packages/pkgs/Quilt/iconix/deephypebot └── reviews_and_metadata_5yrs: shape (25160, 45), types 'artist': object, 'artist_id': object, 'author': object, 'conte…
Spotify genres are amazingly specific, and the tail is long. Let's do some light clustering to help reduce the tail.
The working theory is that this will benefit downstream conditioning of text generation on genre.
import quilt
pkg_name = 'iconix/deephypebot'
# Fetch the package; if it is already installed locally, quilt asks before overwriting.
quilt.install(pkg_name)
Downloading package metadata... iconix/deephypebot already installed. Overwrite? (y/n) y Fragments already downloaded
from quilt.data.iconix import deephypebot

# Read the stored reviews node back as a DataFrame.
reviews_5yrs_df = deephypebot.reviews_and_metadata_5yrs()
# Preview up to five random rows. Sampling them directly avoids shuffling and
# copying the entire frame only to keep its first five rows; min() keeps the
# call valid for frames with fewer than five rows.
reviews_5yrs_df.sample(n=min(5, len(reviews_5yrs_df)))
artist | artist_id | author | content | date_published | dek | desc | direction | domain | error | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18871 | WILDES | 0ypTT9UqAU5sZpPo5JZmjR | None | 15 MAR 2016\n"Bare" is a song by the London ba... | 2016-03-15T00:00:00.000Z | NaN | \n\n?\n\n | ltr | glamglare.com | NaN | ... | -10.385 | 0.0 | 0.0270 | 0.78300 | 0.005140 | 0.0898 | 0.1450 | 129.876 | 242060.0 | 4.0 |
1688 | Kllo | 0RDC2Krd2nmqseGx5C8PQz | None | While Kllo may not be the most memorable nor r... | 2017-10-03T12:33:48.000Z | NaN | \n\n?\n\n | ltr | dotsanddashes.co.uk | NaN | ... | -7.981 | 1.0 | 0.0591 | 0.24500 | 0.519000 | 0.0971 | 0.4280 | 125.040 | 224109.0 | 4.0 |
6912 | Super Duper | 5zFMLXUnqxwdgTpLCX9LDj | None | Song: Angela\nArtist: Super Duper\nNashvillian... | 2015-05-05T11:10:58.000Z | NaN | \n\n?\n\n | ltr | www.audio-aquarium.com | NaN | ... | -7.081 | 1.0 | 0.0293 | 0.00173 | 0.275000 | 0.3760 | 0.0931 | 88.984 | 175281.0 | 4.0 |
4287 | VILDE | 41EzImgqZFKQDb60paN8io | Wording by Matthew P | New music. It comes in great waves. When the t... | 2018-06-20T23:28:00.000Z | NaN | \n\nThe origin and idea for this song comes fr... | ltr | www.sos-music.co.uk | NaN | ... | -4.719 | 0.0 | 0.0433 | 0.24400 | 0.344000 | 0.1000 | 0.5450 | 169.998 | 248824.0 | 4.0 |
9069 | Owen Rabbit | 7cYskT4KsnVPdwMdjNIUnj | Wording by Matthew P | Showing an incredible diversity in his sound, ... | 2018-06-20T21:53:00.000Z | NaN | \n\n?\n\n | ltr | www.sos-music.co.uk | NaN | ... | -5.778 | 1.0 | 0.1470 | 0.03400 | 0.000017 | 0.6910 | 0.1600 | 70.018 | 201560.0 | 4.0 |
5 rows × 45 columns
from collections import Counter
# Tally how many times each Spotify genre tag occurs across all reviews.
c1 = Counter()
for review_genres in reviews_5yrs_df.spotify_genres:
    c1.update(review_genres)
len(c1), c1.most_common()
(498, [('vapor soul', 4344), ('indie poptimism', 3886), ('pop', 3628), ('indietronica', 3462), ('electropop', 3406), ('indie r&b', 3242), ('tropical house', 2359), ('modern rock', 2171), ('indie psych-rock', 1942), ('indie pop', 1845), ('shimmer pop', 1827), ('edm', 1826), ('chillwave', 1705), ('indie electro-pop', 1451), ('rap', 1427), ('alternative dance', 1418), ('metropopolis', 1303), ('hip hop', 1212), ('pop rap', 1174), ('nu disco', 1086), ('dance pop', 1054), ('chamber pop', 996), ('aussietronica', 973), ('art pop', 973), ('vapor twitch', 915), ('new rave', 907), ('indie folk', 886), ('indie rock', 885), ('house', 867), ('vapor pop', 808), ('electro house', 752), ('indie anthem-folk', 649), ('gauze pop', 647), ('escape room', 641), ('post-teen pop', 606), ('folk-pop', 586), ('deep australian indie', 572), ('electronic trap', 570), ('southern hip hop', 544), ('trap music', 544), ('neo-psychedelic', 503), ('big room', 498), ('underground hip hop', 497), ('filter house', 474), ('dance-punk', 459), ('deep tropical house', 454), ('electronic', 448), ('brostep', 446), ('chamber psych', 409), ('stomp and holler', 408), ('deep house', 408), ('downtempo', 404), ('r&b', 401), ('australian dance', 386), ('freak folk', 384), ('tropical pop edm', 377), ('nu gaze', 372), ('bass trap', 368), ('progressive house', 359), ('neo soul', 340), ('brooklyn indie', 336), ('dream pop', 323), ('future garage', 321), ('conscious hip hop', 314), ('gangster rap', 307), ('preverb', 291), ('alternative hip hop', 284), ('swedish electropop', 273), ('microhouse', 269), ('modern alternative rock', 268), ('progressive electro house', 267), ('neo-synthpop', 264), ('deep groove house', 264), ('bass music', 256), ('urban contemporary', 253), ('trip hop', 239), ('norwegian pop', 232), ('ninja', 232), ('trap soul', 225), ('complextro', 221), ('shiver pop', 218), ('la indie', 212), ('garage rock', 207), ('alt-indie rock', 205), ('disco house', 205), ('noise pop', 203), ('australian alternative 
rock', 199), ('wonky', 189), ('catstep', 184), ('danish electro-pop', 184), ('indie garage rock', 181), ('funk', 176), ('garage psych', 172), ('lo-fi', 171), ('indie dream pop', 168), ('alternative rock', 165), ('dirty south rap', 162), ('chillstep', 161), ('swedish synthpop', 159), ('soul', 155), ('neo mellow', 153), ('intelligent dance music', 152), ('new americana', 142), ('nu jazz', 142), ('canadian pop', 142), ('moombahton', 141), ('float house', 139), ('etherpop', 138), ('minimal techno', 134), ('fluxwork', 128), ('swedish pop', 127), ('indie jazz', 124), ('swedish indie pop', 124), ('big beat', 117), ('french indietronica', 115), ('electroclash', 108), ('vaporwave', 107), ('australian pop', 101), ('hip pop', 98), ('future funk', 97), ('balearic', 97), ('alternative r&b', 96), ('compositional ambient', 95), ('permanent wave', 92), ('swedish soul', 91), ('shimmer psych', 90), ('hardcore hip hop', 90), ('minimal tech house', 89), ('french indie pop', 89), ('bay area indie', 88), ('rock', 86), ('turntablism', 85), ('pop rock', 79), ('east coast hip hop', 79), ('indie rockism', 79), ('noise rock', 78), ('deep big room', 77), ('danish pop', 77), ('detroit hip hop', 77), ('canadian hip hop', 74), ('indie punk', 73), ('canadian indie', 73), ('retro electro', 72), ('tech house', 72), ('outsider house', 71), ('tracestep', 71), ('lo-fi beats', 69), ('norwegian indie', 69), ('indian indie', 69), ('deep underground hip hop', 68), ('chillhop', 67), ('ambient', 67), ('deep disco house', 67), ('viral pop', 65), ('candy pop', 65), ('portland indie', 63), ('glitch hop', 63), ('indie pop rap', 63), ('post rock', 60), ('fourth world', 57), ('grave wave', 56), ('vancouver indie', 55), ('vocal house', 53), ('dark jazz', 52), ('uk funky', 52), ('slow core', 51), ('g funk', 51), ('indie quebecois', 50), ('swedish alternative rock', 50), ('filthstep', 50), ('progressive uplifting trance', 50), ('destroy techno', 48), ('shoegaze', 47), ('athens indie', 46), ('indie emo', 46), 
('icelandic pop', 45), ('new weird america', 45), ('west coast rap', 45), ('indie psych-pop', 45), ('trance', 44), ('deep new americana', 43), ('singer-songwriter', 41), ('electrofox', 41), ('new wave', 40), ('deep euro house', 39), ('deep soul house', 39), ('substep', 39), ('neo-singer-songwriter', 39), ('strut', 39), ('belgian indie', 37), ('deep melodic euro house', 37), ('uk garage', 37), ('disco', 37), ('drone', 36), ('belgian pop', 35), ('grime', 35), ('bmore', 34), ('austindie', 34), ('drill', 34), ('vogue', 34), ('mandible', 33), ('deep funk', 32), ('vocal jazz', 32), ('mashup', 31), ('glitch', 30), ('edmonton indie', 30), ('afrobeat', 30), ('dutch indie', 30), ('progressive trance', 30), ('australian hip hop', 29), ('focus', 29), ('folk rock', 29), ('abstract hip hop', 28), ('acid jazz', 28), ('classic soul', 28), ('irish rock', 28), ('art rock', 27), ('icelandic indie', 27), ('danish alternative rock', 26), ('lilith', 26), ('german techno', 26), ('melancholia', 26), ('wave', 26), ('canadian folk', 24), ('trap queen', 24), ('canadian rock', 24), ('motown', 24), ('experimental rock', 23), ('chicago house', 23), ('ambeat', 23), ('dance rock', 22), ('christmas', 22), ('uplifting trance', 22), ('roots rock', 21), ('icelandic electronic', 21), ('stomp and whittle', 21), ('electro', 21), ('new wave pop', 21), ('german indie', 21), ('hip house', 21), ('drum and bass', 21), ('modern blues', 21), ('math pop', 20), ('deep trap', 20), ('israeli indie', 20), ('emo rap', 20), ('electro swing', 20), ('piano rock', 20), ('ambient idm', 19), ('norwegian jazz', 19), ('contemporary jazz', 19), ('psychedelic rock', 18), ('anti-folk', 18), ('acoustic pop', 18), ('german pop', 18), ('warm drone', 17), ('perth indie', 17), ('vapor trap', 17), ('drift', 17), ('lo-fi house', 17), ('punk blues', 17), ('soul flow', 17), ('lo star', 17), ('uk post-punk', 16), ('dubstep', 16), ('indie fuzzpop', 16), ('deep pop r&b', 16), ('swedish idol pop', 16), ('colombian indie', 15), ('twee pop', 
15), ('movie tunes', 15), ('chicago indie', 15), ('madchester', 15), ('indiecoustica', 15), ('british indie rock', 14), ('footwork', 14), ('adult standards', 14), ('cabaret', 14), ('vancouver punk', 14), ('jazz blues', 14), ('french pop', 14), ('dreamo', 13), ('popgaze', 13), ('african rock', 13), ('south african hip hop', 13), ('dwn trap', 13), ('cello', 13), ('progressive bluegrass', 13), ('europop', 12), ('minimal', 12), ('latintronica', 12), ('erotica', 12), ('acid house', 12), ('pop punk', 12), ('northern soul', 12), ('quiet storm', 12), ('vegas indie', 12), ('electropowerpop', 12), ('fake', 12), ('post-punk', 11), ('bow pop', 11), ('crunk', 11), ('swing', 11), ('nordic house', 11), ('zapstep', 11), ('jam band', 11), ('new romantic', 11), ('emo', 10), ('jazz', 10), ('c86', 10), ('singaporean pop', 10), ('southern soul', 10), ('malaysian pop', 10), ('classical', 10), ('breakbeat', 10), ('reggae', 10), ('roots reggae', 10), ('hauntology', 9), ('modern hard rock', 9), ('teen pop', 8), ('pixie', 8), ('space rock', 8), ('canadian punk', 8), ('blues-rock', 8), ('colombian pop', 8), ('alternative country', 8), ('digital hardcore', 8), ('nintendocore', 8), ('witch house', 8), ('sheffield indie', 8), ('jazz funk', 7), ('lounge', 7), ('abstractro', 7), ('deep pop edm', 7), ('tribal house', 7), ('swedish indie rock', 7), ('toronto indie', 7), ('indie singer-songwriter', 7), ('drill and bass', 7), ('albuquerque indie', 7), ('concert band', 7), ('marching band', 7), ('wind ensemble', 7), ('danish indie pop', 6), ('folk punk', 6), ('stomp and flutter', 6), ('jazz rap', 6), ('bebop', 6), ('cool jazz', 6), ('ok indie', 6), ('norwegian hip hop', 6), ('boston rock', 6), ('britpop', 6), ('synthpop', 6), ('talent show', 6), ('covertrance', 6), ('classic icelandic pop', 6), ('denver indie', 6), ('liquid funk', 6), ('alternative emo', 5), ('no wave', 5), ('classic rock', 5), ('album rock', 5), ('contemporary post-bop', 5), ('jazz fusion', 5), ('jazz trumpet', 5), ('new orleans 
jazz', 5), ('pop quebecois', 5), ('underground rap', 5), ('minimal wave', 5), ('pop emo', 5), ('p funk', 5), ('new jack swing', 5), ('danish indie', 5), ('brazilian hip hop', 5), ('classical era', 5), ('leeds indie', 5), ('memphis soul', 5), ('seattle indie', 5), ('romantic era', 5), ('violin', 5), ('post-disco', 5), ('canadian indigenous', 4), ('belgian hip hop', 4), ('folk', 4), ('vienna indie', 4), ('punk', 4), ('swedish folk pop', 4), ('groove room', 4), ('detroit techno', 4), ('hardcore techno', 4), ('techno', 4), ('ethereal wave', 4), ('icelandic rock', 4), ('instrumental post rock', 4), ('chicano rap', 4), ('latin hip hop', 4), ('latin', 4), ('funky tech house', 4), ('bubblegum dance', 4), ('eurodance', 4), ('soul jazz', 4), ('indie deutschrap', 3), ('j-ambient', 3), ('experimental', 3), ('post-hardcore', 3), ('j-rap', 3), ('italian indie pop', 3), ('afropop', 3), ('world', 3), ('british folk', 3), ('big band', 3), ('easy listening', 3), ('boy band', 3), ('mellow gold', 3), ('soft rock', 3), ('garage pop', 3), ('australian garage punk', 3), ('skweee', 3), ('jump up', 3), ('jazz violin', 3), ('blues', 3), ('electric blues', 3), ('instrumental rock', 3), ('southern rock', 3), ('texas blues', 3), ('deep uplifting trance', 3), ('native american hip hop', 3), ('dutch house', 3), ('deep minimal techno', 3), ('french rock', 3), ('uk hip hop', 3), ('fingerstyle', 3), ('indie shoegaze', 3), ('nu age', 2), ('sound art', 2), ('chinese indie', 2), ('japanese city pop', 2), ('emo punk', 2), ('latin afrobeat', 2), ('bubble trance', 2), ('scottish rock', 2), ('dutch rock', 2), ('pop house', 2), ('traditional folk', 2), ('malaysian indie', 2), ('anthem worship', 2), ('christian uplift', 2), ('math rock', 2), ('stomp pop', 2), ('memphis hip hop', 2), ('pinoy hip hop', 2), ('fidget house', 2), ('west coast trap', 2), ('contemporary classical', 2), ('early modern classical', 2), ('classic polish pop', 2), ('melbourne bounce', 2), ('deep acoustic pop', 2), ('belly dance', 2), 
('world fusion', 2), ('italian alternative', 1), ('gamecore', 1), ('slovak hip hop', 1), ('abstract beats', 1), ('trap latino', 1), ('abstract idm', 1), ('dub techno', 1), ('ecuadorian indie', 1), ('deep southern trap', 1), ('sky room', 1), ('australian indie', 1), ('classic soundtrack', 1), ('soundtrack', 1), ('deep chiptune', 1), ('danish hip hop', 1), ('chanson', 1), ('albanian hip hop', 1), ('alternative pop', 1), ('grunge', 1), ('stoner rock', 1), ('brass band', 1), ('channel pop', 1), ('power pop', 1), ('comic', 1), ('neoclassical', 1), ('progressive trance house', 1), ('irish hip hop', 1), ('gqom', 1), ('mande pop', 1), ('ska', 1), ('laboratorio', 1), ('gypsy jazz', 1), ('jazz guitar', 1), ('girl group', 1), ('belgian rock', 1), ("children's music", 1), ('kids dance party', 1)])
Let's see how many songs don't belong to a genre with at least 90 samples...
freq_threshold = 90
# Keep only the genres whose tally reaches the threshold (tally order is preserved).
freq_genres = Counter(dict(
    (genre, n) for genre, n in c1.items() if n >= freq_threshold
))
len(freq_genres), freq_genres.most_common()
(127, [('vapor soul', 4344), ('indie poptimism', 3886), ('pop', 3628), ('indietronica', 3462), ('electropop', 3406), ('indie r&b', 3242), ('tropical house', 2359), ('modern rock', 2171), ('indie psych-rock', 1942), ('indie pop', 1845), ('shimmer pop', 1827), ('edm', 1826), ('chillwave', 1705), ('indie electro-pop', 1451), ('rap', 1427), ('alternative dance', 1418), ('metropopolis', 1303), ('hip hop', 1212), ('pop rap', 1174), ('nu disco', 1086), ('dance pop', 1054), ('chamber pop', 996), ('aussietronica', 973), ('art pop', 973), ('vapor twitch', 915), ('new rave', 907), ('indie folk', 886), ('indie rock', 885), ('house', 867), ('vapor pop', 808), ('electro house', 752), ('indie anthem-folk', 649), ('gauze pop', 647), ('escape room', 641), ('post-teen pop', 606), ('folk-pop', 586), ('deep australian indie', 572), ('electronic trap', 570), ('southern hip hop', 544), ('trap music', 544), ('neo-psychedelic', 503), ('big room', 498), ('underground hip hop', 497), ('filter house', 474), ('dance-punk', 459), ('deep tropical house', 454), ('electronic', 448), ('brostep', 446), ('chamber psych', 409), ('stomp and holler', 408), ('deep house', 408), ('downtempo', 404), ('r&b', 401), ('australian dance', 386), ('freak folk', 384), ('tropical pop edm', 377), ('nu gaze', 372), ('bass trap', 368), ('progressive house', 359), ('neo soul', 340), ('brooklyn indie', 336), ('dream pop', 323), ('future garage', 321), ('conscious hip hop', 314), ('gangster rap', 307), ('preverb', 291), ('alternative hip hop', 284), ('swedish electropop', 273), ('microhouse', 269), ('modern alternative rock', 268), ('progressive electro house', 267), ('neo-synthpop', 264), ('deep groove house', 264), ('bass music', 256), ('urban contemporary', 253), ('trip hop', 239), ('norwegian pop', 232), ('ninja', 232), ('trap soul', 225), ('complextro', 221), ('shiver pop', 218), ('la indie', 212), ('garage rock', 207), ('alt-indie rock', 205), ('disco house', 205), ('noise pop', 203), ('australian alternative 
rock', 199), ('wonky', 189), ('catstep', 184), ('danish electro-pop', 184), ('indie garage rock', 181), ('funk', 176), ('garage psych', 172), ('lo-fi', 171), ('indie dream pop', 168), ('alternative rock', 165), ('dirty south rap', 162), ('chillstep', 161), ('swedish synthpop', 159), ('soul', 155), ('neo mellow', 153), ('intelligent dance music', 152), ('new americana', 142), ('nu jazz', 142), ('canadian pop', 142), ('moombahton', 141), ('float house', 139), ('etherpop', 138), ('minimal techno', 134), ('fluxwork', 128), ('swedish pop', 127), ('indie jazz', 124), ('swedish indie pop', 124), ('big beat', 117), ('french indietronica', 115), ('electroclash', 108), ('vaporwave', 107), ('australian pop', 101), ('hip pop', 98), ('future funk', 97), ('balearic', 97), ('alternative r&b', 96), ('compositional ambient', 95), ('permanent wave', 92), ('swedish soul', 91), ('shimmer psych', 90), ('hardcore hip hop', 90)])
# scratchpad: peek at the genre list stored for the review at index 0
reviews_5yrs_df.spotify_genres[0]
array(['deep new americana', 'folk-pop', 'indie pop', 'modern rock', 'new americana', 'stomp and holler'], dtype=object)
import numpy as np
import pandas as pd
# scratchpad: does the review at index 0 carry at least one of the frequent genres?
# (single-row trial of the check applied to every review in the next cell)
pd.Series(reviews_5yrs_df.spotify_genres[0]).isin(np.array(list(freq_genres.keys()))).any()
True
# Flag, for every review, whether at least one of its genres is a frequent genre.
# The lookup set is built once, outside the per-review loop, so each membership
# test is O(1) and no temporary Series/array is created per row.
frequent_genre_names = set(freq_genres)
genre_present = pd.Series(
    [any(genre in frequent_genre_names for genre in genres)
     for genres in reviews_5yrs_df.spotify_genres],
    index=reviews_5yrs_df.index,  # keep the mask aligned with the dataframe
    dtype=bool,
)
genre_present.value_counts()
True 19313 False 5847 dtype: int64
So 5,847 reviews don't fit into the 127 most-frequent genres.
Let's see if we can cluster these with less-specific genre tags.
If the 1gram genre is a top genre already, we should also gather samples into that existing genre (e.g. 'minimal tech house' -> 'house').
We'll accomplish this by adding the 1gram to the sample's genre list.
def add1Grams(genres):
    """Return `genres` extended with every whitespace-separated word of each genre.

    For example, ['minimal tech house'] becomes
    ['minimal tech house', 'minimal', 'tech', 'house'], so a review tagged only
    with a specific multi-word genre can also be matched by a one-word genre.

    Args:
        genres: iterable of genre-name strings (may be empty).

    Returns:
        A 1-D object-dtype NumPy array holding the original genres followed by
        their words, de-duplicated in first-seen order.
    """
    words = [word for genre in genres for word in genre.split()]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs (iterating a set of strings is not).
    unique = list(dict.fromkeys([*genres, *words]))
    # dtype=object keeps an empty result a string-comparable array instead of
    # the empty float64 array NumPy would infer from an empty list.
    return np.array(unique, dtype=object)
# Add the 1-gram words to the genre list of every review without a frequent genre.
lacks_frequent_genre = ~genre_present
expanded_genres = reviews_5yrs_df.loc[lacks_frequent_genre, 'spotify_genres'].apply(add1Grams)
reviews_5yrs_df.loc[lacks_frequent_genre, 'spotify_genres'] = expanded_genres
reviews_5yrs_df.loc[lacks_frequent_genre].spotify_genres
15 [] 16 [] 22 [] 47 [] 48 [] 49 [] 50 [] 51 [] 52 [] 53 [] 64 [] 65 [] 68 [] 69 [] 70 [] 78 [] 79 [] 80 [] 81 [] 82 [] 83 [] 84 [] 85 [] 86 [] 89 [] 90 [] 91 [] 92 [] 93 [] 94 [] ... 24973 [] 24974 [] 24975 [] 24976 [] 24977 [] 24978 [] 24979 [] 24983 [roots reggae, roots, reggae] 24987 [boston rock, rock, boston] 24988 [] 25005 [] 25006 [] 25011 [] 25012 [] 25013 [] 25024 [] 25025 [] 25043 [roots reggae, roots, reggae] 25044 [roots reggae, roots, reggae] 25045 [roots reggae, roots, reggae] 25046 [roots reggae, roots, reggae] 25047 [roots reggae, roots, reggae] 25054 [] 25060 [] 25061 [] 25071 [] 25150 [era, violin, romantic era, romantic, classical] 25151 [era, violin, romantic era, romantic, classical] 25152 [era, violin, romantic era, romantic, classical] 25153 [post-disco, disco] Name: spotify_genres, Length: 5847, dtype: object
Recalculate `freq_genres`...
# Re-tally the genre tags, then re-apply the frequency threshold.
c1 = Counter()
for review_genres in reviews_5yrs_df.spotify_genres:
    c1.update(review_genres)
freq_genres = Counter(dict(
    (genre, n) for genre, n in c1.items() if n >= freq_threshold
))
len(freq_genres), freq_genres.most_common()
(130, [('vapor soul', 4344), ('indie poptimism', 3886), ('pop', 3799), ('indietronica', 3462), ('electropop', 3406), ('indie r&b', 3242), ('tropical house', 2359), ('modern rock', 2171), ('indie psych-rock', 1942), ('indie pop', 1845), ('shimmer pop', 1827), ('edm', 1826), ('chillwave', 1705), ('rap', 1452), ('indie electro-pop', 1451), ('alternative dance', 1418), ('metropopolis', 1303), ('hip hop', 1212), ('pop rap', 1174), ('nu disco', 1086), ('dance pop', 1054), ('chamber pop', 996), ('aussietronica', 973), ('art pop', 973), ('house', 917), ('vapor twitch', 915), ('new rave', 907), ('indie folk', 886), ('indie rock', 885), ('vapor pop', 808), ('electro house', 752), ('indie anthem-folk', 649), ('gauze pop', 647), ('escape room', 641), ('post-teen pop', 606), ('folk-pop', 586), ('deep australian indie', 572), ('electronic trap', 570), ('southern hip hop', 544), ('trap music', 544), ('neo-psychedelic', 503), ('big room', 498), ('underground hip hop', 497), ('filter house', 474), ('dance-punk', 459), ('deep tropical house', 454), ('electronic', 454), ('brostep', 446), ('chamber psych', 409), ('stomp and holler', 408), ('deep house', 408), ('downtempo', 404), ('r&b', 401), ('australian dance', 386), ('freak folk', 384), ('tropical pop edm', 377), ('nu gaze', 372), ('bass trap', 368), ('progressive house', 359), ('neo soul', 340), ('brooklyn indie', 336), ('dream pop', 323), ('future garage', 321), ('conscious hip hop', 314), ('gangster rap', 307), ('preverb', 291), ('alternative hip hop', 284), ('swedish electropop', 273), ('microhouse', 269), ('modern alternative rock', 268), ('progressive electro house', 267), ('indie', 264), ('neo-synthpop', 264), ('deep groove house', 264), ('bass music', 256), ('urban contemporary', 253), ('trip hop', 239), ('norwegian pop', 232), ('ninja', 232), ('trap soul', 225), ('complextro', 221), ('shiver pop', 218), ('la indie', 212), ('garage rock', 207), ('alt-indie rock', 205), ('disco house', 205), ('noise pop', 203), ('australian 
alternative rock', 199), ('lo-fi', 192), ('wonky', 189), ('catstep', 184), ('danish electro-pop', 184), ('indie garage rock', 181), ('funk', 177), ('garage psych', 172), ('indie dream pop', 168), ('alternative rock', 165), ('dirty south rap', 162), ('chillstep', 161), ('swedish synthpop', 159), ('soul', 159), ('neo mellow', 153), ('intelligent dance music', 152), ('new americana', 142), ('nu jazz', 142), ('canadian pop', 142), ('moombahton', 141), ('float house', 139), ('etherpop', 138), ('minimal techno', 134), ('fluxwork', 128), ('swedish pop', 127), ('indie jazz', 124), ('swedish indie pop', 124), ('rock', 123), ('big beat', 117), ('french indietronica', 115), ('electroclash', 108), ('vaporwave', 107), ('australian pop', 101), ('hip pop', 98), ('future funk', 97), ('balearic', 97), ('trance', 96), ('alternative r&b', 96), ('compositional ambient', 95), ('permanent wave', 92), ('swedish soul', 91), ('shimmer psych', 90), ('hardcore hip hop', 90)])
Visual diff of `freq_genres`, before and after adding 1grams ... `indie`, `rock`, and `trance` are new entrants, and genres like `pop`, `rap`, `house`, and `lo-fi` receive decent gains.
# Recompute, for every review, whether at least one of its genres is a frequent genre.
# The lookup set is built once, outside the per-review loop, so each membership
# test is O(1) and no temporary Series/array is created per row.
frequent_genre_names = set(freq_genres)
genre_present = pd.Series(
    [any(genre in frequent_genre_names for genre in genres)
     for genres in reviews_5yrs_df.spotify_genres],
    index=reviews_5yrs_df.index,  # keep the mask aligned with the dataframe
    dtype=bool,
)
genre_present.value_counts()
True 19871 False 5289 dtype: int64
Recovered 558 samples doing this.
# Reviews that still have no frequent genre.
genreless_df = reviews_5yrs_df.loc[~genre_present]
# Count the distinct genre lists. The cells are NumPy arrays, which value_counts
# cannot hash or compare as single values, so identical lists (notably the empty
# ones) would each be reported separately; tuples are hashable and group correctly.
genreless_df.spotify_genres.apply(tuple).value_counts()
/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/site-packages/pandas/core/algorithms.py:761: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison keys, counts = f(values, dropna)
[vogue] 15 [neo-singer-songwriter] 15 [fake] 12 [electrofox] 12 [filthstep] 11 [dreamo] 10 [mashup] 6 [covertrance] 6 [chillhop] 6 [tracestep] 4 [j-rap] 3 [disco] 3 [indiecoustica] 2 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 .. [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [era, classical era, classical] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 [] 1 Name: spotify_genres, Length: 5197, dtype: int64
A lot of genre lists are actually empty (duh, forgot that was possible)!
# Flatten the genre lists of the still-unmatched reviews, then tally each tag.
flat_genreless = []
for genre_array in genreless_df.spotify_genres:
    flat_genreless.extend(genre_array)
c2 = Counter(flat_genreless)
len(c2), c2.most_common()
(151, [('hop', 27), ('canadian', 18), ('hip', 18), ('jazz', 16), ('tunes', 15), ('movie', 15), ('movie tunes', 15), ('neo-singer-songwriter', 15), ('vogue', 15), ('deep', 14), ('canadian folk', 13), ('folk', 13), ('classical', 12), ('fake', 12), ('electrofox', 12), ('band', 11), ('filthstep', 11), ('dreamo', 10), ('era', 10), ('roots reggae', 10), ('roots', 10), ('reggae', 10), ('vocal jazz', 9), ('vocal', 9), ('glitch hop', 9), ('glitch', 9), ('new', 8), ('disco', 8), ('destroy techno', 7), ('techno', 7), ('destroy', 7), ('wind', 7), ('marching', 7), ('wind ensemble', 7), ('concert band', 7), ('marching band', 7), ('concert', 7), ('ensemble', 7), ('norwegian jazz', 6), ('norwegian', 6), ('chillhop', 6), ('mashup', 6), ('covertrance', 6), ('deep new americana', 5), ('americana', 5), ('big', 5), ('minimal', 5), ('minimal wave', 5), ('wave', 5), ('grave', 5), ('grave wave', 5), ('brazilian hip hop', 5), ('brazilian', 5), ('classical era', 5), ('violin', 5), ('romantic era', 5), ('romantic', 5), ('post-disco', 5), ('belgian', 4), ('belgian hip hop', 4), ('punk', 4), ('swing', 4), ('tracestep', 4), ('deep trap', 4), ('trap', 4), ('drill', 4), ('canadian hip hop', 4), ('drone', 3), ('warm drone', 3), ('warm', 3), ('j-rap', 3), ('big band', 3), ('adult', 3), ('christmas', 3), ('listening', 3), ('easy listening', 3), ('adult standards', 3), ('standards', 3), ('lounge', 3), ('easy', 3), ('cabaret', 3), ('abstractro', 3), ('weird', 3), ('america', 3), ('new weird america', 3), ('up', 3), ('jump up', 3), ('jump', 3), ('idm', 3), ('ambient', 3), ('ambient idm', 3), ('dance', 3), ('emo', 2), ('emo punk', 2), ('vancouver', 2), ('vancouver punk', 2), ('deep underground hip hop', 2), ('underground', 2), ('indiecoustica', 2), ('deep big room', 2), ('room', 2), ('contemporary classical', 2), ('modern', 2), ('early modern classical', 2), ('contemporary', 2), ('early', 2), ('belly dance', 2), ('world', 2), ('belly', 2), ('fusion', 2), ('world fusion', 2), ('canadian indigenous', 1), 
('indigenous', 1), ('italian alternative', 1), ('alternative', 1), ('italian', 1), ('gamecore', 1), ('australian hip hop', 1), ('australian', 1), ('slovak hip hop', 1), ('slovak', 1), ('classic', 1), ('classic soundtrack', 1), ('soundtrack', 1), ('retro', 1), ('deep chiptune', 1), ('retro electro', 1), ('electro', 1), ('chiptune', 1), ('danish hip hop', 1), ('danish', 1), ('albanian', 1), ('albanian hip hop', 1), ('brass', 1), ('brass band', 1), ('gqom', 1), ('ska', 1), ('gypsy jazz', 1), ('gypsy', 1), ('jazz guitar', 1), ('guitar', 1), ('cool', 1), ('bebop', 1), ('cool jazz', 1), ('kids dance party', 1), ('kids', 1), ('music', 1), ('party', 1), ("children's music", 1), ("children's", 1), ('grime', 1)])
No other 1gram genres come close to 90 samples.
# Show the genre lists of the unmatched reviews that contain the 1-gram 'canadian'.
has_canadian = genreless_df.spotify_genres.apply(lambda genres: 'canadian' in set(genres))
genreless_df.loc[has_canadian, 'spotify_genres']
119 [canadian indigenous, indigenous, canadian] 7102 [canadian folk, canadian, folk] 7103 [canadian folk, canadian, folk] 7104 [canadian folk, canadian, folk] 7105 [canadian folk, canadian, folk] 9055 [canadian folk, canadian, folk] 9056 [canadian folk, canadian, folk] 9057 [canadian folk, canadian, folk] 9058 [canadian folk, canadian, folk] 16398 [canadian folk, canadian, folk] 16510 [canadian hip hop, hip, canadian, hop] 16511 [canadian hip hop, hip, canadian, hop] 16512 [canadian hip hop, hip, canadian, hop] 16513 [canadian hip hop, hip, canadian, hop] 19553 [canadian folk, canadian, folk] 19554 [canadian folk, canadian, folk] 19555 [canadian folk, canadian, folk] 19556 [canadian folk, canadian, folk] Name: spotify_genres, dtype: object
Let's special case the 2-gram 'hip hop'
# Current tally for 'hip hop' among the frequent genres, before special-casing it.
freq_genres.get('hip hop')
1212
def addHipHop(genres):
    """Append the generic 'hip hop' label when both of its 1-grams are present.

    Args:
        genres: sequence of genre strings for one sample (list or 1-D array).

    Returns:
        ``genres`` itself when nothing needs adding (either 'hip' or 'hop' is
        missing, or 'hip hop' is already there); otherwise a new NumPy array
        holding the original labels followed by 'hip hop'.
    """
    # Test membership on a plain set rather than with `in` on an ndarray:
    # the ndarray form compares element-wise, which is presumably what raised
    # the FutureWarning on empty (non-string) genre arrays.
    labels = set(genres)
    if 'hip hop' in labels or not {'hip', 'hop'} <= labels:
        # Returning early when 'hip hop' already exists keeps the function
        # idempotent, so re-running the notebook cell cannot duplicate it.
        return genres
    return np.append(genres, ['hip hop'])
# Run addHipHop over the genre lists of every sample that has no frequent
# genre yet, write the results back, then display those lists.
reviews_5yrs_df.loc[~genre_present, 'spotify_genres'] = (
    reviews_5yrs_df.loc[~genre_present, 'spotify_genres'].apply(addHipHop)
)
reviews_5yrs_df.loc[~genre_present].spotify_genres
/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
15 [] 16 [] 22 [] 47 [] 48 [] 49 [] 50 [] 51 [] 52 [] 53 [] 64 [] 65 [] 68 [] 69 [] 70 [] 78 [] 79 [] 80 [] 81 [] 82 [] 83 [] 84 [] 85 [] 86 [] 89 [] 90 [] 91 [] 92 [] 93 [] 94 [] ... 24972 [] 24973 [] 24974 [] 24975 [] 24976 [] 24977 [] 24978 [] 24979 [] 24983 [roots reggae, roots, reggae] 24988 [] 25005 [] 25006 [] 25011 [] 25012 [] 25013 [] 25024 [] 25025 [] 25043 [roots reggae, roots, reggae] 25044 [roots reggae, roots, reggae] 25045 [roots reggae, roots, reggae] 25046 [roots reggae, roots, reggae] 25047 [roots reggae, roots, reggae] 25054 [] 25060 [] 25061 [] 25071 [] 25150 [era, violin, romantic era, romantic, classical] 25151 [era, violin, romantic era, romantic, classical] 25152 [era, violin, romantic era, romantic, classical] 25153 [post-disco, disco] Name: spotify_genres, Length: 5289, dtype: object
# Tally every genre label across all samples, then keep only the labels whose
# count reaches freq_threshold and list them from most to least common.
c1 = Counter(genre for genres in reviews_5yrs_df.spotify_genres for genre in genres)
freq_genres = Counter({genre: n for genre, n in c1.items() if n >= freq_threshold})
freq_genres.most_common()
[('vapor soul', 4344), ('indie poptimism', 3886), ('pop', 3799), ('indietronica', 3462), ('electropop', 3406), ('indie r&b', 3242), ('tropical house', 2359), ('modern rock', 2171), ('indie psych-rock', 1942), ('indie pop', 1845), ('shimmer pop', 1827), ('edm', 1826), ('chillwave', 1705), ('rap', 1452), ('indie electro-pop', 1451), ('alternative dance', 1418), ('metropopolis', 1303), ('hip hop', 1230), ('pop rap', 1174), ('nu disco', 1086), ('dance pop', 1054), ('chamber pop', 996), ('aussietronica', 973), ('art pop', 973), ('house', 917), ('vapor twitch', 915), ('new rave', 907), ('indie folk', 886), ('indie rock', 885), ('vapor pop', 808), ('electro house', 752), ('indie anthem-folk', 649), ('gauze pop', 647), ('escape room', 641), ('post-teen pop', 606), ('folk-pop', 586), ('deep australian indie', 572), ('electronic trap', 570), ('southern hip hop', 544), ('trap music', 544), ('neo-psychedelic', 503), ('big room', 498), ('underground hip hop', 497), ('filter house', 474), ('dance-punk', 459), ('deep tropical house', 454), ('electronic', 454), ('brostep', 446), ('chamber psych', 409), ('stomp and holler', 408), ('deep house', 408), ('downtempo', 404), ('r&b', 401), ('australian dance', 386), ('freak folk', 384), ('tropical pop edm', 377), ('nu gaze', 372), ('bass trap', 368), ('progressive house', 359), ('neo soul', 340), ('brooklyn indie', 336), ('dream pop', 323), ('future garage', 321), ('conscious hip hop', 314), ('gangster rap', 307), ('preverb', 291), ('alternative hip hop', 284), ('swedish electropop', 273), ('microhouse', 269), ('modern alternative rock', 268), ('progressive electro house', 267), ('indie', 264), ('neo-synthpop', 264), ('deep groove house', 264), ('bass music', 256), ('urban contemporary', 253), ('trip hop', 239), ('norwegian pop', 232), ('ninja', 232), ('trap soul', 225), ('complextro', 221), ('shiver pop', 218), ('la indie', 212), ('garage rock', 207), ('alt-indie rock', 205), ('disco house', 205), ('noise pop', 203), ('australian 
alternative rock', 199), ('lo-fi', 192), ('wonky', 189), ('catstep', 184), ('danish electro-pop', 184), ('indie garage rock', 181), ('funk', 177), ('garage psych', 172), ('indie dream pop', 168), ('alternative rock', 165), ('dirty south rap', 162), ('chillstep', 161), ('swedish synthpop', 159), ('soul', 159), ('neo mellow', 153), ('intelligent dance music', 152), ('new americana', 142), ('nu jazz', 142), ('canadian pop', 142), ('moombahton', 141), ('float house', 139), ('etherpop', 138), ('minimal techno', 134), ('fluxwork', 128), ('swedish pop', 127), ('indie jazz', 124), ('swedish indie pop', 124), ('rock', 123), ('big beat', 117), ('french indietronica', 115), ('electroclash', 108), ('vaporwave', 107), ('australian pop', 101), ('hip pop', 98), ('future funk', 97), ('balearic', 97), ('trance', 96), ('alternative r&b', 96), ('compositional ambient', 95), ('permanent wave', 92), ('swedish soul', 91), ('shimmer psych', 90), ('hardcore hip hop', 90)]
('hip hop', 1212)
=> ('hip hop', 1230)
# Flag each sample that carries at least one frequent genre.
# Membership is checked with hash lookups against freq_genres directly; the
# previous form rebuilt the key array and a fresh pd.Series for every row.
genre_present = reviews_5yrs_df.spotify_genres.apply(
    lambda genres: any(genre in freq_genres for genre in genres)
)
genre_present.value_counts()
True 19889 False 5271 dtype: int64
18 more samples recovered with the generic genre 'hip hop'.
Remove the remaining genreless samples (including empty ones that Spotify didn't have genre info for).
# Isolate the samples that still have no frequent genre, flatten their
# remaining labels into one list, and count how often each label occurs.
genreless_df = reviews_5yrs_df.loc[~genre_present]
final_genreless = [label for genres in genreless_df.spotify_genres for label in genres]
c3 = Counter(final_genreless)
len(c3), c3.most_common()
(135, [('jazz', 16), ('tunes', 15), ('movie', 15), ('movie tunes', 15), ('neo-singer-songwriter', 15), ('vogue', 15), ('canadian', 14), ('canadian folk', 13), ('folk', 13), ('deep', 12), ('classical', 12), ('fake', 12), ('electrofox', 12), ('band', 11), ('filthstep', 11), ('dreamo', 10), ('era', 10), ('roots reggae', 10), ('roots', 10), ('reggae', 10), ('vocal jazz', 9), ('vocal', 9), ('glitch hop', 9), ('glitch', 9), ('hop', 9), ('new', 8), ('disco', 8), ('destroy techno', 7), ('techno', 7), ('destroy', 7), ('wind', 7), ('marching', 7), ('wind ensemble', 7), ('concert band', 7), ('marching band', 7), ('concert', 7), ('ensemble', 7), ('norwegian jazz', 6), ('norwegian', 6), ('chillhop', 6), ('mashup', 6), ('covertrance', 6), ('deep new americana', 5), ('americana', 5), ('big', 5), ('minimal', 5), ('minimal wave', 5), ('wave', 5), ('grave', 5), ('grave wave', 5), ('classical era', 5), ('violin', 5), ('romantic era', 5), ('romantic', 5), ('post-disco', 5), ('punk', 4), ('swing', 4), ('tracestep', 4), ('deep trap', 4), ('trap', 4), ('drill', 4), ('drone', 3), ('warm drone', 3), ('warm', 3), ('j-rap', 3), ('big band', 3), ('adult', 3), ('christmas', 3), ('listening', 3), ('easy listening', 3), ('adult standards', 3), ('standards', 3), ('lounge', 3), ('easy', 3), ('cabaret', 3), ('abstractro', 3), ('weird', 3), ('america', 3), ('new weird america', 3), ('up', 3), ('jump up', 3), ('jump', 3), ('idm', 3), ('ambient', 3), ('ambient idm', 3), ('dance', 3), ('emo', 2), ('emo punk', 2), ('vancouver', 2), ('vancouver punk', 2), ('indiecoustica', 2), ('deep big room', 2), ('room', 2), ('contemporary classical', 2), ('modern', 2), ('early modern classical', 2), ('contemporary', 2), ('early', 2), ('belly dance', 2), ('world', 2), ('belly', 2), ('fusion', 2), ('world fusion', 2), ('canadian indigenous', 1), ('indigenous', 1), ('italian alternative', 1), ('alternative', 1), ('italian', 1), ('gamecore', 1), ('classic', 1), ('classic soundtrack', 1), ('soundtrack', 1), ('retro', 1), 
('deep chiptune', 1), ('retro electro', 1), ('electro', 1), ('chiptune', 1), ('brass', 1), ('brass band', 1), ('gqom', 1), ('ska', 1), ('gypsy jazz', 1), ('gypsy', 1), ('jazz guitar', 1), ('guitar', 1), ('cool', 1), ('bebop', 1), ('cool jazz', 1), ('kids dance party', 1), ('kids', 1), ('music', 1), ('party', 1), ("children's music", 1), ("children's", 1), ('grime', 1)])
# Look up the overall count of the 'jazz' label in c1 (the all-samples tally).
c1.get('jazz')
28
# Number of genreless samples whose genre list is completely empty.
sum(1 for genres in genreless_df.spotify_genres if len(genres) == 0)
5030
Removing 5030 empty-genre samples and 241 genreless samples.
# Remove every row whose index label appears in genreless_df, then preview
# the first five remaining rows.
reviews_5yrs_df = reviews_5yrs_df.drop(index=genreless_df.index)
reviews_5yrs_df.head(5)
artist | artist_id | author | content | date_published | dek | desc | direction | domain | error | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | None | New Music\nMt. Joy reached out to us with the ... | 2017-11-20T13:33:10.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.buffablog.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.178 | 144.458 | 199827.0 | 4.0 |
1 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Martin | Folk rockers Mt. Joy have debuted their new so... | 2017-11-16T00:00:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.discobelle.net | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.178 | 144.458 | 199827.0 | 4.0 |
2 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Nasko | You know we're digging Mt. Joy.\nTheir new sin... | 2017-11-14T14:30:26.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.stereofox.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.178 | 144.458 | 199827.0 | 4.0 |
3 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | D & d | Nothing against the profession, but the U.S. h... | 2018-06-20T10:53:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | indieobsessive.blogspot.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.178 | 144.458 | 199827.0 | 4.0 |
4 | Opia | 70zHrrrPfBkVV44AEhFyyh | Nat Morawski | Connecticut duo Opia have released a guitar he... | 2018-05-24T00:00:00.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | acidstag.com | NaN | ... | -6.575 | 1.0 | 0.1020 | 0.190000 | 0.000034 | 0.0625 | 0.582 | 87.974 | 223814.0 | 4.0 |
5 rows × 45 columns
# Recompute the genre tally and the frequent-genre subset on the reduced
# DataFrame, then re-flag which samples carry at least one frequent genre.
c1 = Counter(genre for genres in reviews_5yrs_df.spotify_genres for genre in genres)
freq_genres = Counter({genre: n for genre, n in c1.items() if n >= freq_threshold})
# Hash lookups against freq_genres replace the previous per-row rebuild of
# the key array and a temporary pd.Series.
genre_present = reviews_5yrs_df.spotify_genres.apply(
    lambda genres: any(genre in freq_genres for genre in genres)
)
genre_present.value_counts()
True 19889 dtype: int64
# Number the frequent genres from 1, most common first.
list(enumerate(freq_genres.most_common(), start=1))
[(1, ('vapor soul', 4344)), (2, ('indie poptimism', 3886)), (3, ('pop', 3799)), (4, ('indietronica', 3462)), (5, ('electropop', 3406)), (6, ('indie r&b', 3242)), (7, ('tropical house', 2359)), (8, ('modern rock', 2171)), (9, ('indie psych-rock', 1942)), (10, ('indie pop', 1845)), (11, ('shimmer pop', 1827)), (12, ('edm', 1826)), (13, ('chillwave', 1705)), (14, ('rap', 1452)), (15, ('indie electro-pop', 1451)), (16, ('alternative dance', 1418)), (17, ('metropopolis', 1303)), (18, ('hip hop', 1230)), (19, ('pop rap', 1174)), (20, ('nu disco', 1086)), (21, ('dance pop', 1054)), (22, ('chamber pop', 996)), (23, ('aussietronica', 973)), (24, ('art pop', 973)), (25, ('house', 917)), (26, ('vapor twitch', 915)), (27, ('new rave', 907)), (28, ('indie folk', 886)), (29, ('indie rock', 885)), (30, ('vapor pop', 808)), (31, ('electro house', 752)), (32, ('indie anthem-folk', 649)), (33, ('gauze pop', 647)), (34, ('escape room', 641)), (35, ('post-teen pop', 606)), (36, ('folk-pop', 586)), (37, ('deep australian indie', 572)), (38, ('electronic trap', 570)), (39, ('southern hip hop', 544)), (40, ('trap music', 544)), (41, ('neo-psychedelic', 503)), (42, ('big room', 498)), (43, ('underground hip hop', 497)), (44, ('filter house', 474)), (45, ('dance-punk', 459)), (46, ('deep tropical house', 454)), (47, ('electronic', 454)), (48, ('brostep', 446)), (49, ('chamber psych', 409)), (50, ('stomp and holler', 408)), (51, ('deep house', 408)), (52, ('downtempo', 404)), (53, ('r&b', 401)), (54, ('australian dance', 386)), (55, ('freak folk', 384)), (56, ('tropical pop edm', 377)), (57, ('nu gaze', 372)), (58, ('bass trap', 368)), (59, ('progressive house', 359)), (60, ('neo soul', 340)), (61, ('brooklyn indie', 336)), (62, ('dream pop', 323)), (63, ('future garage', 321)), (64, ('conscious hip hop', 314)), (65, ('gangster rap', 307)), (66, ('preverb', 291)), (67, ('alternative hip hop', 284)), (68, ('swedish electropop', 273)), (69, ('microhouse', 269)), (70, ('modern alternative 
rock', 268)), (71, ('progressive electro house', 267)), (72, ('indie', 264)), (73, ('neo-synthpop', 264)), (74, ('deep groove house', 264)), (75, ('bass music', 256)), (76, ('urban contemporary', 253)), (77, ('trip hop', 239)), (78, ('norwegian pop', 232)), (79, ('ninja', 232)), (80, ('trap soul', 225)), (81, ('complextro', 221)), (82, ('shiver pop', 218)), (83, ('la indie', 212)), (84, ('garage rock', 207)), (85, ('alt-indie rock', 205)), (86, ('disco house', 205)), (87, ('noise pop', 203)), (88, ('australian alternative rock', 199)), (89, ('lo-fi', 192)), (90, ('wonky', 189)), (91, ('catstep', 184)), (92, ('danish electro-pop', 184)), (93, ('indie garage rock', 181)), (94, ('funk', 177)), (95, ('garage psych', 172)), (96, ('indie dream pop', 168)), (97, ('alternative rock', 165)), (98, ('dirty south rap', 162)), (99, ('chillstep', 161)), (100, ('swedish synthpop', 159)), (101, ('soul', 159)), (102, ('neo mellow', 153)), (103, ('intelligent dance music', 152)), (104, ('new americana', 142)), (105, ('nu jazz', 142)), (106, ('canadian pop', 142)), (107, ('moombahton', 141)), (108, ('float house', 139)), (109, ('etherpop', 138)), (110, ('minimal techno', 134)), (111, ('fluxwork', 128)), (112, ('swedish pop', 127)), (113, ('indie jazz', 124)), (114, ('swedish indie pop', 124)), (115, ('rock', 123)), (116, ('big beat', 117)), (117, ('french indietronica', 115)), (118, ('electroclash', 108)), (119, ('vaporwave', 107)), (120, ('australian pop', 101)), (121, ('hip pop', 98)), (122, ('future funk', 97)), (123, ('balearic', 97)), (124, ('trance', 96)), (125, ('alternative r&b', 96)), (126, ('compositional ambient', 95)), (127, ('permanent wave', 92)), (128, ('swedish soul', 91)), (129, ('shimmer psych', 90)), (130, ('hardcore hip hop', 90))]
Since samples can have multiple genres, let's see how often this occurs with the 'top' genres.
First, let's remove all non-top genres.
# scratchpad
# Prototype on the sample at index label 0: its full genre array next to the
# same array restricted to labels that are keys of freq_genres.
np.array(pd.Series(reviews_5yrs_df.spotify_genres[0])), np.array(pd.Series(reviews_5yrs_df.spotify_genres[0]).where(pd.Series(reviews_5yrs_df.spotify_genres[0]).isin(np.array(list(freq_genres.keys())))).dropna())
(array(['deep new americana', 'folk-pop', 'indie pop', 'modern rock', 'new americana', 'stomp and holler'], dtype=object), array(['folk-pop', 'indie pop', 'modern rock', 'new americana', 'stomp and holler'], dtype=object))
# Restrict every sample's genre list to the frequent genres, preserving order.
# The filter runs on the column itself with hash lookups against freq_genres;
# the previous row-wise DataFrame.apply built two pd.Series and the key array
# for every row. Explicit ['spotify_genres'] assignment replaces attribute
# assignment, which only works when the column already exists.
reviews_5yrs_df['spotify_genres'] = reviews_5yrs_df['spotify_genres'].apply(
    lambda genres: np.array([genre for genre in genres if genre in freq_genres], dtype=object)
)
Now we can simply get genre counts per row in spotify_genres
# Number of (frequent) genres attached to each sample.
num_genres_per = reviews_5yrs_df.spotify_genres.apply(len)
# Distribution as (number of genres, sample count, share of samples).
# value_counts() orders by frequency, so pairing it with range(1, ...) labels
# each row with its rank rather than the genre count; sort by the index and
# use the index value itself as the label instead.
[
    (n_genres, n_samples, n_samples / len(num_genres_per))
    for n_genres, n_samples in num_genres_per.value_counts().sort_index().items()
]
[(1, 4479, 0.2251998592186636), (2, 2769, 0.13922268590678263), (3, 2629, 0.1321836190859269), (4, 2617, 0.13158027050128213), (5, 2199, 0.11056362813615567), (6, 1695, 0.08522298758107497), (7, 1139, 0.05726783649253356), (8, 849, 0.04268691236361808), (9, 457, 0.022977525265221982), (10, 331, 0.016642365126451807), (11, 187, 0.009402182110714466), (12, 174, 0.008748554477349289), (13, 128, 0.006435718236210971), (14, 80, 0.004022323897631856), (15, 45, 0.0022625571924179194), (16, 31, 0.0015586505103323445), (17, 27, 0.0013575343154507518), (18, 24, 0.001206697169289557), (19, 18, 0.0009050228769671677), (20, 10, 0.000502790487203982), (21, 1, 5.027904872039821e-05)]
# Artist, title and genres of the samples that carry exactly 21 genres.
reviews_5yrs_df.loc[
    reviews_5yrs_df.spotify_genres.apply(len) == 21,
    ['artist', 'song_title', 'spotify_genres'],
]
artist | song_title | spotify_genres | |
---|---|---|---|
1160 | Destroyer | Tinseltown Swimming In Blood | [alternative dance, alternative rock, art pop,... |
# Full genre arrays of the samples that carry exactly 21 genres.
[genres for genres in reviews_5yrs_df.spotify_genres if len(genres) == 21]
[array(['alternative dance', 'alternative rock', 'art pop', 'chamber pop', 'chamber psych', 'chillwave', 'dream pop', 'folk-pop', 'freak folk', 'garage psych', 'indie folk', 'indie pop', 'indie rock', 'indietronica', 'lo-fi', 'modern rock', 'neo-psychedelic', 'noise pop', 'nu gaze', 'preverb', 'stomp and holler'], dtype=object)]
We might try capping the number of frequent genres per sample at 10 when conditioning.
cap = 10
# Share of samples that already have at most `cap` frequent genres.
# Slicing value_counts()[:cap] sums the `cap` most common lengths (the result
# is frequency-sorted), which only equals this share when frequency happens to
# fall monotonically with length; comparing the lengths directly is exact.
float((num_genres_per <= cap).mean())
0.9635476896777114
# IPython magic selecting matplotlib's inline backend for this notebook.
% matplotlib inline
# Histogram of genres-per-sample, with one bin per distinct count value.
num_genres_per.hist(bins=len(num_genres_per.value_counts()))
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e546b39e8>
# Count distinct genre combinations regardless of label order.
# frozenset gives an order-independent, hashable key; tuple(set(...)) is not
# canonical because two equal sets may iterate in different orders.
'unique combinations of genres:', len({frozenset(genres) for genres in reviews_5yrs_df.spotify_genres})
('unique combinations of genres:', 1135)
# Attach the cleaned DataFrame to the package object under the node name
# 'reviews_and_metadata_5yrs', then call that node to display its contents.
# NOTE(review): `deephypebot` is defined outside this section and `_set` is an
# underscore-prefixed method, presumably quilt's package-node API; confirm.
deephypebot._set(['reviews_and_metadata_5yrs'], reviews_5yrs_df)
deephypebot.reviews_and_metadata_5yrs()
artist | artist_id | author | content | date_published | dek | desc | direction | domain | error | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | None | New Music\nMt. Joy reached out to us with the ... | 2017-11-20T13:33:10.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.buffablog.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
1 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Martin | Folk rockers Mt. Joy have debuted their new so... | 2017-11-16T00:00:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.discobelle.net | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
2 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | Nasko | You know we're digging Mt. Joy.\nTheir new sin... | 2017-11-14T14:30:26.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | www.stereofox.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
3 | Mt. Joy | 69tiO1fG8VWduDl3ji2qhI | D & d | Nothing against the profession, but the U.S. h... | 2018-06-20T10:53:00.000Z | NaN | “Silver Lining” is the fourth released single ... | ltr | indieobsessive.blogspot.com | NaN | ... | -6.231 | 1.0 | 0.0323 | 0.000921 | 0.000000 | 0.0780 | 0.1780 | 144.458 | 199827.0 | 4.0 |
4 | Opia | 70zHrrrPfBkVV44AEhFyyh | Nat Morawski | Connecticut duo Opia have released a guitar he... | 2018-05-24T00:00:00.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | acidstag.com | NaN | ... | -6.575 | 1.0 | 0.1020 | 0.190000 | 0.000034 | 0.0625 | 0.5820 | 87.974 | 223814.0 | 4.0 |
5 | Opia | 70zHrrrPfBkVV44AEhFyyh | Adeel Amini | Now this is inventive. Not that we'd expect an... | 2018-05-17T11:30:22.000Z | NaN | \n\nI don’t know why but this feels like a bre... | ltr | pressplayok.com | NaN | ... | -6.575 | 1.0 | 0.1020 | 0.190000 | 0.000034 | 0.0625 | 0.5820 | 87.974 | 223814.0 | 4.0 |
6 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Oblivious Pop | Since the very first release from Elohim about... | 2018-06-20T16:01:00.000Z | NaN | \n\n?\n\n | ltr | www.obliviouspop.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
7 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | New Music\nNow this is how you get a party sta... | 2018-05-09T22:44:56.000Z | NaN | \n\n?\n\n | ltr | www.buffablog.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
8 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | This Friday, April 27th, ELOHIM releases her l... | None | NaN | \n\n?\n\n | ltr | www.theautumnroses.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
9 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | None | What a long way Elohim has come since the rele... | 2018-04-24T17:12:03.000Z | NaN | \n\n?\n\n | ltr | www.highclouds.org | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
10 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Acid Stag | Los Angeles artist ELOHIM will be releasing he... | 2018-04-24T00:00:00.000Z | NaN | \n\n?\n\n | ltr | acidstag.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
11 | ELOHIM | 6wKxOKEA3K6R2UZ3COLXEY | Adeel Amini | Elohim is no stranger to full-pelt pop, and it... | 2018-04-23T21:35:58.000Z | NaN | \n\n?\n\n | ltr | pressplayok.com | NaN | ... | -6.560 | 1.0 | 0.0347 | 0.139000 | 0.000003 | 0.1110 | 0.2990 | 120.046 | 225531.0 | 4.0 |
12 | G Flip | 4SdIXLzfabqU61iK7SnKAU | Breaking More Waves Blog | If you follow Breaking More Waves on Twitter, ... | 2018-06-20T08:00:00.000Z | NaN | \n\n?\n\n | ltr | breakingmorewaves.blogspot.com | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
13 | G Flip | 4SdIXLzfabqU61iK7SnKAU | None | Tags G Flip Georgia Flipo\nBased in London, En... | 2018-05-18T11:25:00.000Z | NaN | \n\n?\n\n | ltr | www.highclouds.org | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
14 | G Flip | 4SdIXLzfabqU61iK7SnKAU | Adeel Amini | G Flip - or Georgia to her fam - should be a n... | 2018-05-16T16:32:46.000Z | NaN | \n\n?\n\n | ltr | pressplayok.com | NaN | ... | -5.865 | 1.0 | 0.0446 | 0.442000 | 0.000002 | 0.0742 | 0.6090 | 102.988 | 217148.0 | 4.0 |
17 | Junip | 7HcipAIJatVGT4U6HQrnFW | None | If like me you were wondering what he was up t... | None | NaN | \n\nThe subject of this song finds themself su... | ltr | www.mp3hugger.com | NaN | ... | -8.033 | 1.0 | 0.0417 | 0.472000 | 0.523000 | 0.1110 | 0.2840 | 90.377 | 339293.0 | 4.0 |
18 | Kevin Morby | 6fxk3UXHTFYET8qCT9WlBF | None | Beautiful Strangers -Kevin Morby\nKevin Morby'... | None | NaN | \n\nAmazing\n\n | ltr | noondaytune.com | NaN | ... | -11.127 | 1.0 | 0.0445 | 0.762000 | 0.068900 | 0.1050 | 0.8910 | 100.042 | 375903.0 | 4.0 |
19 | Kevin Morby | 6fxk3UXHTFYET8qCT9WlBF | The Listening Post Blog | Kevin Morby has released new single 'Beautiful... | 2016-10-27T17:34:33.000Z | NaN | \n\nAmazing\n\n | ltr | thelisteningpostblog.wordpress.com | NaN | ... | -11.127 | 1.0 | 0.0445 | 0.762000 | 0.068900 | 0.1050 | 0.8910 | 100.042 | 375903.0 | 4.0 |
20 | Cosmo's Midnight | 4VivsO1n4n2Mi2Btyb5gfL | Adrien | by Adrien * Published 19 February, 2018 * Upd... | 2018-02-19T20:22:54.000Z | NaN | \n\n?\n\n | ltr | doyoulikethatsong.com | NaN | ... | -11.899 | 1.0 | 0.2350 | 0.119000 | 0.002270 | 0.0782 | 0.4420 | 109.079 | 241651.0 | 4.0 |
21 | Cosmo's Midnight | 4VivsO1n4n2Mi2Btyb5gfL | Zuli | It's been quite a while since these Cosmo's Mi... | 2018-02-17T11:37:47.000Z | NaN | \n\n?\n\n | ltr | www.stereofox.com | NaN | ... | -11.899 | 1.0 | 0.2350 | 0.119000 | 0.002270 | 0.0782 | 0.4420 | 109.079 | 241651.0 | 4.0 |
23 | Teleman | 3wrtQM9ICPPeHwoc1GWiyV | None | London's very-own Teleman has unleashed their ... | 2018-05-16T00:00:00.000Z | NaN | \n\n?\n\n | ltr | www.wickeddchildd.com | NaN | ... | -8.772 | 0.0 | 0.0316 | 0.014400 | 0.300000 | 0.2280 | 0.5620 | 124.003 | 293142.0 | 4.0 |
24 | RÜFÜS / RÜFÜS DU SOL | 5Pb27ujIyYb33zBqVysBkj | Acid Stag | Aussie lads R UFUS dropped a new tune over the... | 2018-05-28T00:00:00.000Z | NaN | \n\nRÜFÜS DU SOL’s first single from their upc... | ltr | acidstag.com | NaN | ... | -6.909 | 0.0 | 0.0316 | 0.015600 | 0.184000 | 0.1870 | 0.2050 | 120.001 | 238008.0 | 4.0 |
25 | RÜFÜS / RÜFÜS DU SOL | 5Pb27ujIyYb33zBqVysBkj | Erin Maher | Recently I had a conversation with a few music... | None | NaN | \n\nRÜFÜS DU SOL’s first single from their upc... | ltr | beautifulbuzzz.com | NaN | ... | -6.909 | 0.0 | 0.0316 | 0.015600 | 0.184000 | 0.1870 | 0.2050 | 120.001 | 238008.0 | 4.0 |
26 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | None | Ben Browning - "Sunshine Baby"\nYou may know B... | 2018-05-29T23:12:32.000Z | NaN | \n\n?\n\n | ltr | www.weallwantsomeone.org | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
27 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | Madison Blom | Kicking off his forthcoming album with the rel... | 2018-05-22T12:35:51.000Z | NaN | \n\n?\n\n | ltr | imperfectfifth.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
28 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | None | Three years after Turns, Australian Melbourne-... | 2018-05-20T08:49:53.000Z | NaN | \n\n?\n\n | ltr | sonofmarketing.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
29 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | Buzz | Ben Browning (Photo by Angel Ceballos)\nCut Co... | 2018-05-17T18:43:18.000Z | NaN | \n\n?\n\n | ltr | buzzbands.la | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
30 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | Kai Street | Cut Copy bassist, Ben Browning, has done it ag... | 2018-05-12T16:16:45.000Z | NaN | \n\n?\n\n | ltr | frenchshuffle.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
31 | Ben-Browning | 497VSHqaZA32b7tgCETFEH | None | photo by Angel Ceballos\nBen Browning has play... | None | NaN | \n\n?\n\n | ltr | www.brooklynvegan.com | NaN | ... | -8.365 | 1.0 | 0.0276 | 0.072400 | 0.009450 | 0.1420 | 0.7730 | 108.980 | 214400.0 | 4.0 |
32 | SOFI TUKKER | 586uxXMyD5ObPuzjtrzO1Q | None | Sofi Tukker have written a love song to each o... | 2017-09-21T20:42:39.000Z | NaN | \n\nEvery Friday, Spotify compiles the hottest... | ltr | www.analoguetrash.com | NaN | ... | -5.704 | 1.0 | 0.0392 | 0.010900 | 0.079400 | 0.1520 | 0.3000 | 102.023 | 184880.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
25126 | Mt Eden | 3YAy9OUnnRsk2JLTaOLcw2 | None | It was only last month that AraabMuzik sustain... | 2013-06-04T19:30:33.000Z | NaN | None | ltr | consequenceofsound.net | NaN | ... | -1.713 | 0.0 | 0.0517 | 0.007250 | 0.000076 | 0.2090 | 0.5020 | 138.941 | 179440.0 | 4.0 |
25127 | The Colourist | 0hK26QoGcuKV3lH0x9MvUU | Converted to dEUS-ism in 2001. Keen on screami... | Remember The Colourist ace single Yes Yes from... | 2013-06-03T17:15:45.000Z | NaN | \n\n?\n\n | ltr | www.frontstagemusic.co.uk | NaN | ... | -8.003 | 1.0 | 0.0359 | 0.088000 | 0.001560 | 0.1000 | 0.3100 | 140.003 | 284013.0 | 4.0 |
25128 | The Colourist | 0hK26QoGcuKV3lH0x9MvUU | None | May 30, 2013\nChris / IMF, Remixes / magic man... | 2013-05-30T16:48:05.000Z | NaN | \n\n?\n\n | ltr | indiemusicfilter.com | NaN | ... | -8.003 | 1.0 | 0.0359 | 0.088000 | 0.001560 | 0.1000 | 0.3100 | 140.003 | 284013.0 | 4.0 |
25129 | The Colourist | 0hK26QoGcuKV3lH0x9MvUU | ATG | New\nThe Colourist have had a monster 2013 wit... | None | NaN | \n\n?\n\n | ltr | www.allthingsgomusic.com | NaN | ... | -8.003 | 1.0 | 0.0359 | 0.088000 | 0.001560 | 0.1000 | 0.3100 | 140.003 | 284013.0 | 4.0 |
25130 | Disclosure | 6nS5roXSAGhTGr34W6n7Et | Jared Silva | New\nWe've heard a couple collaborations from ... | None | NaN | \n\n“F for You” comes courtesy of Howard Lawre... | ltr | www.allthingsgomusic.com | NaN | ... | -7.426 | 0.0 | 0.0674 | 0.006760 | 0.030300 | 0.1100 | 0.2730 | 124.023 | 269947.0 | 4.0 |
25131 | Disclosure | 6nS5roXSAGhTGr34W6n7Et | George\nO'Brien | You can count the number of days beforeDisclos... | None | NaN | \n\n“F for You” comes courtesy of Howard Lawre... | ltr | None | NaN | ... | -7.426 | 0.0 | 0.0674 | 0.006760 | 0.030300 | 0.1100 | 0.2730 | 124.023 | 269947.0 | 4.0 |
25132 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | None | Schoolboy Q might not be getting the plaudits ... | None | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | www.indieshuffle.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25133 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | Human Drizzle | Schoolboy Q, the rapper behind some of the big... | 2013-05-31T08:14:33.000Z | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | humandrizzle.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25134 | ScHoolboy Q | 5IcR3N7QB1j6KBL8eImZ8m | None | ScHoolboy Q 's latest track stands out to me a... | 2013-05-28T04:50:13.000Z | NaN | \n\nThis song premiered during the 2013 NBA Pl... | ltr | beatspill.com | NaN | ... | -7.230 | 0.0 | 0.1590 | 0.370000 | 0.000000 | 0.1250 | 0.1760 | 123.966 | 272453.0 | 4.0 |
25135 | Empire Of The Sun | 67hb7towEyKvt5Z8Bx306c | None | Off the back of an incredibly succcessful summ... | None | NaN | \n\n?\n\n | ltr | stoneyroads.com | NaN | ... | -5.551 | 1.0 | 0.0917 | 0.001680 | 0.747000 | 0.1630 | 0.0925 | 128.007 | 356719.0 | 4.0 |
25136 | //Fractures | 7sjRnhONmeFL1tmlUvdq70 | Confusion | "Twisted" is the lead single from an EP to be ... | None | NaN | \n\n?\n\n | ltr | pigeonsandplanes.com | NaN | ... | -10.524 | 0.0 | 0.0674 | 0.105000 | 0.411000 | 0.2410 | 0.3850 | 134.138 | 260402.0 | 4.0 |
25137 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | b3 | "Actors" is the debut track from newcomers Sti... | 2013-06-04T18:20:09.000Z | NaN | \n\n?\n\n | ltr | blahblahblahscience.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25138 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | None | Still Parade are superstars in waiting. They s... | None | NaN | \n\n?\n\n | ltr | www.indieshuffle.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25139 | Still Parade | 3CXevh2SLL5B4cuTedOkj5 | None | "Actors are so fortunate. They can choose whet... | None | NaN | \n\n?\n\n | ltr | joftheday.com | NaN | ... | -12.838 | 0.0 | 0.0344 | 0.878000 | 0.789000 | 0.2130 | 0.3600 | 90.967 | 241922.0 | 4.0 |
25140 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | Justin McCarthy | New\nGenre: Hip Hop Sounds Like: Drake\nThe la... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.allthingsgomusic.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25141 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | While Kanye West is taking the minimalist, mys... | 2013-05-31T19:22:17.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.idolator.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25142 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | J. Cole 's Born Sinner has been highly anticip... | 2013-05-31T18:00:29.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | earmilk.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25143 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | For whatever reason, there is not much J. Cole... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | www.indieshuffle.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25144 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | Confusion | With Born Sinner coming June 18, J. Cole drops... | None | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | pigeonsandplanes.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25145 | J. Cole | 6l3HvQ5sa6mXTsMTB19rO5 | None | J. Cole decides to liberate this new track off... | 2013-05-31T01:59:39.000Z | NaN | \n\nThis is the third release off of J Cole’s ... | ltr | nahright.com | NaN | ... | -7.394 | 0.0 | 0.1740 | 0.033400 | 0.000002 | 0.1030 | 0.3270 | 138.046 | 217240.0 | 4.0 |
25146 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | None | Did he make you fing dance? That he did. *Robe... | 2012-11-15T21:27:23.000Z | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | earmilk.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25147 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | b3 | As his first release under the Glassnote label... | 2012-10-31T12:38:01.000Z | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | blahblahblahscience.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25148 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | None | Sometimes a little experimentation is good. An... | None | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | www.indieshuffle.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25149 | Robert DeLong | 42crL07E4WPfVovyUtMpvC | Nelson | NYC-based label Glassnote has quite an extensi... | None | NaN | \n\n“Global Concepts” is the 2nd track on Robe... | ltr | www.allthingsgomusic.com | NaN | ... | -3.758 | 0.0 | 0.2080 | 0.332000 | 0.000010 | 0.3640 | 0.6510 | 107.926 | 278147.0 | 4.0 |
25154 | Baby Alpaca | 3UEWPRL5vRlNOP1zhRQf71 | None | Baby Alpaca - Wild Child\nBrooklyn's, NY deare... | None | NaN | \n\n?\n\n | ltr | soundinjections.gr | NaN | ... | -8.202 | 1.0 | 0.0520 | 0.727000 | 0.000621 | 0.0929 | 0.0616 | 99.939 | 352227.0 | 4.0 |
25155 | Baby Alpaca | 3UEWPRL5vRlNOP1zhRQf71 | b3 | New York band Baby Alpaca ably blend current w... | 2013-05-23T21:14:09.000Z | NaN | \n\n?\n\n | ltr | blahblahblahscience.com | NaN | ... | -8.202 | 1.0 | 0.0520 | 0.727000 | 0.000621 | 0.0929 | 0.0616 | 99.939 | 352227.0 | 4.0 |
25156 | Two Door Cinema Club | 536BYVgOnRky0xjsPT96zl | None | After a couple false starts, RAC has officiall... | None | NaN | \n\nThis song is about homesickness and Alex T... | ltr | www.indieshuffle.com | NaN | ... | -6.498 | 1.0 | 0.0478 | 0.280000 | 0.000000 | 0.0913 | 0.3920 | 123.056 | 286093.0 | 4.0 |
25157 | Adrian Lux | 5kp9Qhzri9LrDkzrtjt5Sh | None | Back to TopMusic\nWritten by James LedgerMay 2... | None | NaN | \n\n?\n\n | ltr | stoneyroads.com | NaN | ... | -4.369 | 0.0 | 0.0363 | 0.002840 | 0.001970 | 0.1360 | 0.3910 | 128.048 | 166253.0 | 4.0 |
25158 | T.I. feat. Lil Wayne | 4OBJLual30L7gRl5UkeRcT | Confusion | T.I. and Lil Wayne are kicking off the America... | None | NaN | \n\n?\n\n | ltr | pigeonsandplanes.com | NaN | ... | -4.737 | 1.0 | 0.2290 | 0.029300 | 0.000000 | 0.0709 | 0.3880 | 142.037 | 305080.0 | 4.0 |
25159 | The Smiths | 3yY2gUcIsjMr8hjo51PoJ8 | None | My love and adoration for The Smiths dates all... | None | NaN | \n\nThe Smiths' second single of 1983, “This C... | ltr | www.indieshuffle.com | NaN | ... | -5.900 | 1.0 | 0.0436 | 0.005090 | 0.000000 | 0.0784 | 0.7510 | 103.912 | 162920.0 | 4.0 |
19889 rows × 45 columns
# Record what changed in this revision in the package-level metadata.
deephypebot._meta['description'] = 'All reviews contain 1+ "top" genres'
# Build the local package from the in-memory package object.
quilt.build(pkg_name, deephypebot) # revision3
# Print the package tree (node names, shapes, column types).
quilt.inspect(pkg_name)
/home/ubuntu/.local/share/QuiltCli/quilt_packages/pkgs/Quilt/iconix/deephypebot └── reviews_and_metadata_5yrs: shape (19889, 45), types 'artist': object, 'artist_id': object, 'author': object, 'conte…
# Authenticate against the Quilt registry (prompts for a code from the login page).
quilt.login()
# Upload the locally built package and make it publicly visible.
quilt.push(pkg_name, is_public=True)
Launching a web browser... If that didn't work, please visit the following URL: https://pkg.quiltdata.com/login Failed to launch the browser: Command '['xdg-open', 'https://pkg.quiltdata.com/login']' returned non-zero exit status 3. Enter the code from the webpage: eyJjb2RlIjogIjg3MTM3MzZkLTgwNDAtNGEyNi05OTUyLTc5MTRmYjg2NTNmOCIsICJpZCI6ICJkMzI3ZTdkYS1lZDFiLTRiMWMtYTFlMC1lM2ZkNzk4NDJlMzkifQ== Fetching upload URLs from the registry...
0%| | 0.00/29.1M [00:00<?, ?B/s]
Uploading 2 fragments (29078344 bytes)...
100%|██████████| 29.1M/29.1M [00:03<00:00, 9.31MB/s]
Uploading package metadata... Updating the 'latest' tag... Push complete. iconix/deephypebot is live: https://quiltdata.com/package/iconix/deephypebot
import quilt
pkg_name = 'iconix/deephypebot'
# Fetch the package from the registry; asks before overwriting an installed copy.
quilt.install(pkg_name)
Downloading package metadata... iconix/deephypebot already installed. Overwrite? (y/n) y Fragments already downloaded
from quilt.data.iconix import deephypebot
# Load the reviews node from the installed package.
reviews_5yrs_df = deephypebot.reviews_and_metadata_5yrs()
# Shuffle every row (frac=1) and preview the first five of the shuffled result.
reviews_5yrs_df.sample(frac=1).head()
artist | artist_id | author | content | date_published | dek | desc | direction | domain | error | ... | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10016 | Atlas Genius | 42vg2T0Xg9yPaAgogJzoQH | ITJYM | It's been an absolute age since we've heard an... | 2015-06-30T00:00:00.000Z | NaN | \n\n“Molecules” is the lead single from Atlas ... | ltr | inthejunkyardmusic.co.uk | NaN | ... | -4.822 | 0.0 | 0.0440 | 0.000442 | 0.000010 | 0.1950 | 0.513 | 117.028 | 249267.0 | 4.0 |
14296 | MEG MAC | 4faUajx9k93O56nlmpkOuz | None | I posted Meg Mac a while back, and it was quit... | None | NaN | \n\n?\n\n | ltr | musicthatwelike.com | NaN | ... | -8.176 | 0.0 | 0.0483 | 0.079600 | 0.000023 | 0.0414 | 0.371 | 80.118 | 193405.0 | 4.0 |
16588 | DYLYN | 3hOdLrtKdSs3AEuwcR7ses | Laurence\nDay | DYLYN deals in big statements: cascading beats... | None | NaN | \n\n?\n\n | ltr | None | NaN | ... | -3.677 | 0.0 | 0.0452 | 0.054500 | 0.000007 | 0.0766 | 0.905 | 154.964 | 187320.0 | 4.0 |
9729 | Major Lazer | 738wLrAtLtCtFOLvQBXOXp | theneedledrop | July 22, 2015New MusictheneedledropComment\nMa... | None | NaN | \n\nMajor Lazer strikes back with “Lost,” a Re... | ltr | www.theneedledrop.com | NaN | ... | -5.061 | 1.0 | 0.0286 | 0.005600 | 0.002120 | 0.0875 | 0.847 | 139.928 | 195649.0 | 4.0 |
4419 | FUTURO PELO | 5EqHv5lvGgrnkuGEFn67yN | Ivo | Electronic\nIt's so difficult to describe Futu... | 2017-02-23T14:09:18.000Z | NaN | \n\n | ltr | www.stereofox.com | NaN | ... | -7.852 | 0.0 | 0.0560 | 0.020700 | 0.137000 | 0.0984 | 0.860 | 142.057 | 170704.0 | 4.0 |
5 rows × 45 columns
from collections import Counter

# Minimum number of occurrences for a genre to be kept.
freq_threshold = 90

# Tally every genre tag across all reviews; each row holds an iterable of genres.
c1 = Counter(
    genre
    for genre_list in reviews_5yrs_df.spotify_genres
    for genre in genre_list
)

# Restrict the tally to genres seen at least `freq_threshold` times.
freq_genres = Counter({
    genre: count
    for genre, count in c1.items()
    if count >= freq_threshold
})
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
# Build the cloud from the genre -> count mapping.
wordcloud = WordCloud(width=975).generate_from_frequencies(dict(freq_genres))
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation="bilinear")
# Hide the axes and remove padding so only the cloud is drawn.
plt.axis("off")
plt.tight_layout(pad=0)
# Save to disk; 'k' is matplotlib's shorthand for black.
plt.savefig('genrecloud.png', bbox_inches='tight', facecolor='k')
For VAE training
from quilt.data.iconix import deephypebot
import spacy
import time

start = time.time()

nlp = spacy.load('en')
df = deephypebot.reviews_and_metadata_5yrs()


def _split_sentences(doc):
    """Return the sentences of a parsed spaCy doc with whitespace runs collapsed."""
    return [' '.join(sent.text.split()) for sent in doc.sents]


# nlp.pipe() feeds the texts through the pipeline in batches instead of
# invoking the full pipeline once per review; the resulting docs are the same.
# NOTE(review): the recorded output shows the "<artist>" placeholder being
# split into "<" / "artist>" across sentences -- the tokenizer may need
# special cases for the mask tokens; confirm before relying on these splits.
df = df.assign(
    content_sentences=lambda x: [
        _split_sentences(doc) for doc in nlp.pipe(x.masked_content)
    ]
)

print(f'Runtime: {time.time() - start:.2f}s')

df.content_sentences.head()
Runtime: 571.64s
0 [New Music <artist> reached out to us with the... 1 [Folk rockers <artist> have debuted their new ... 2 [You know we're digging <, artist>., Their new... 3 [Nothing against the profession, but the U.S. ... 4 [Connecticut duo <artist> have released a guit... Name: content_sentences, dtype: object
import quilt
pkg_name = 'iconix/deephypebot'
# Replace the package's reviews node with the DataFrame held in `df`.
deephypebot._set(['reviews_and_metadata_5yrs'], df)
# Read the node back to confirm the content_sentences column is present.
deephypebot.reviews_and_metadata_5yrs().content_sentences.head()
0 [New Music <artist> reached out to us with the... 1 [Folk rockers <artist> have debuted their new ... 2 [You know we're digging <, artist>., Their new... 3 [Nothing against the profession, but the U.S. ... 4 [Connecticut duo <artist> have released a guit... Name: content_sentences, dtype: object
# Record what changed in this revision in the package-level metadata.
deephypebot._meta['description'] = 'Split content into sentences'
# Build the local package from the in-memory package object.
quilt.build(pkg_name, deephypebot) # revision4
# Print the package tree (node names, shapes, column types).
quilt.inspect(pkg_name)
/home/ubuntu/.local/share/QuiltCli/quilt_packages/pkgs/Quilt/iconix/deephypebot └── reviews_and_metadata_5yrs: shape (19889, 46), types 'artist': object, 'artist_id': object, 'author': object, 'conte…
# Authenticate against the Quilt registry (prompts for a code from the login page).
quilt.login()
# Upload the locally built package and make it publicly visible.
quilt.push(pkg_name, is_public=True)
Launching a web browser... If that didn't work, please visit the following URL: https://pkg.quiltdata.com/login Failed to launch the browser: Command '['xdg-open', 'https://pkg.quiltdata.com/login']' returned non-zero exit status 3. Enter the code from the webpage: eyJjb2RlIjogIjBhMmM2N2RhLTgwN2UtNDE3MC04YTUyLTFmYWZiZjljNzBiYyIsICJpZCI6ICJkMzI3ZTdkYS1lZDFiLTRiMWMtYTFlMC1lM2ZkNzk4NDJlMzkifQ== Fetching upload URLs from the registry...
0%| | 0.00/40.1M [00:00<?, ?B/s]
Uploading 2 fragments (40111029 bytes)...
100%|██████████| 40.1M/40.1M [00:03<00:00, 12.6MB/s]
Uploading package metadata... Updating the 'latest' tag... Push complete. iconix/deephypebot is live: https://quiltdata.com/package/iconix/deephypebot
#df.to_json('../datasets/reviews_and_metadata_5yrs.json', orient='records')
Based on topic model explorations below, tour dates frequently muddy the dataset and don't represent legit sentences - so let's remove the worst offenders.
Basically, when `n_topics=4`, one topic cluster neatly gathers a bunch of crud above a certain probability level.
Quickly duplicating topic modeling code from below:
from pytorchtextvae import datasets
import torch
# pip install git+https://github.com/iconix/pytorch-text-vae.git
from pytorchtextvae import generate

# CPU inference
DEVICE = torch.device('cpu')

# Restore the saved model state together with its cached stored-info pickle.
(vae, input_side, output_side, pairs,
 dataset, EMBED_SIZE, random_state) = generate.load_model(
    '../../pytorch-text-vae/model/best/reviews_and_metadata_5yrs_state.pt',
    'reviews_and_metadata_5yrs_stored_info.pkl',
    DEVICE,
    cache_path='../../pytorch-text-vae/model/best/tmp',
)
def tokenize(line):
    """Normalize `line` and split the result into tokens on single spaces."""
    # One strip() already removes both leading and trailing whitespace.
    normalized = datasets.normalize_string(line.strip())
    return normalized.split(' ')
# The first element of each cached pair is the sentence text.
sents = [pair[0] for pair in pairs]
texts = [tokenize(sentence) for sentence in sents]

from nltk.corpus import stopwords

# Remove stop words and the placeholder tokens (e.g. `artist`, `song_title`).
# Words that appear only once are NOT filtered here.
stoplist = [datasets.normalize_string(word) for word in stopwords.words('english')]
fillerlist = ['author', 'song_title', 'artist', 'sitename']
# A set gives O(1) membership tests; the two lists are kept because other
# cells reference them by name.
excluded = set(stoplist) | set(fillerlist)
texts = [[word for word in text if word not in excluded] for text in texts]

from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(texts)

from gensim.models.ldamodel import LdaModel

n_topics = 4
passes = 20 # number of passes through documents
iterations = 400
# None disables perplexity evaluation; a value of 1 would estimate it on every
# update, which is the slowdown this setting is meant to avoid.
eval_every = None

corpus = [dictionary.doc2bow(text) for text in texts]

# Train the model on the corpus.
lda = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=n_topics,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
)
#lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)
lda.print_topics(n_topics)
Fetching cached info at ../../pytorch-text-vae/model/best/tmp/reviews_and_metadata_5yrs_stored_info.pkl Cache ../../pytorch-text-vae/model/best/tmp/reviews_and_metadata_5yrs_stored_info.pkl loaded (load time: 0.57s) Found saved model ../../pytorch-text-vae/model/best/reviews_and_metadata_5yrs_state.pt MAX_SAMPLE: False; TRUNCATED_SAMPLE: True Trained for 360000 steps (load time: 18.91s) Setting new random seed
[(0, '0.015*"track" + 0.015*"pop" + 0.013*"vocals" + 0.007*"sound" + 0.006*"production" + 0.006*"song" + 0.005*"electronic" + 0.005*"vocal" + 0.005*"house" + 0.005*"like"'), (1, '0.011*"tour" + 0.008*"music" + 0.006*"10" + 0.006*"festival" + 0.005*"live" + 0.005*"london" + 0.005*"los" + 0.004*"angeles" + 0.004*"dates" + 0.004*"uk"'), (2, '0.031*"new" + 0.019*"single" + 0.019*"album" + 0.015*"track" + 0.012*"release" + 0.011*"ep" + 0.011*"year" + 0.011*"first" + 0.010*"released" + 0.010*"debut"'), (3, '0.015*"like" + 0.013*"one" + 0.012*"song" + 0.011*"music" + 0.008*"get" + 0.008*"time" + 0.006*"something" + 0.006*"really" + 0.006*"love" + 0.006*"us"')]
from operator import itemgetter

# One line per sentence: "(dominant topic, probability) normalized sentence".
out = []
for idx in range(len(texts)):
    dominant = max(lda[corpus[idx]], key=itemgetter(1))
    out.append(f'{dominant} {datasets.normalize_string(sents[idx])}\n')
# Descending string order groups lines by topic id, highest probability first.
out.sort(reverse=True)

with open(f'sentence_{n_topics}topics.txt', 'w') as f:
    f.writelines(out)
import os
import pandas as pd
# Resolve the datasets directory relative to the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')
reviews_f = 'reviews_and_metadata'
reviews_5yrs_f = os.path.join(DATA_DIR, f'{reviews_f}_5yrs.json')
# Load the JSON export into a DataFrame and report how many rows it has.
reviews_5yrs_df = pd.read_json(reviews_5yrs_f)
print(f'num_reviews: {len(reviews_5yrs_df)}')
reviews_5yrs_df.content_sentences.head()
num_reviews: 19889
0 [New Music <artist> reached out to us with the... 1 [Folk rockers <artist> have debuted their new ... 2 [You know we're digging <, artist>., Their new... 3 [Nothing against the profession, but the U.S. ... 4 [Connecticut duo <artist> have released a guit... Name: content_sentences, dtype: object
def sentence_to_topic(sent):
    """Return the LDA topic distribution for a single sentence.

    The sentence is tokenized, stop words and filler tokens are dropped, and
    the remaining words are converted to a bag-of-words that is looked up in
    the module-level `lda` model.
    """
    kept = []
    for word in tokenize(sent):
        if word in stoplist or word in fillerlist:
            continue
        kept.append(word)
    return lda[dictionary.doc2bow(kept)]


# Example: topic distribution of the first sentence of the second review.
sentence_to_topic(reviews_5yrs_df.content_sentences[1][0])
[(0, 0.22650239), (2, 0.42532876), (3, 0.33853623)]
# For each sentence of the first review: (sentence, dominant (topic, prob),
# review index, sentence index).
[
    (
        sentence,
        max(sentence_to_topic(sentence), key=itemgetter(1)),
        f'ssi: {review_idx}',
        f'si: {sent_idx}',
    )
    for review_idx, review in enumerate(reviews_5yrs_df.content_sentences[:1])
    for sent_idx, sentence in enumerate(review)
]
[('New Music <artist> reached out to us with the powerful sentiment of their alt-folk anthem, "<song_title>.', (0, 0.43400997), 'ssi: 0', 'si: 0'), ('" The song is about coping with the feelings of loss after losing friends to addiction.', (3, 0.9059083), 'ssi: 0', 'si: 1'), ('The heavy subject matter is touching and eloquently-delivered...', (3, 0.50518245), 'ssi: 0', 'si: 2'), ('And its hard to deny the powerful chorus of "', (0, 0.51616824), 'ssi: 0', 'si: 3'), ('But if it\'s the drugs, the women, the wine, the weed."', (1, 0.4500785), 'ssi: 0', 'si: 4'), ('Recommended for fans of Mumford & Sons, Edward Sharpe, or Noah and the Whale.', (2, 0.41642538), 'ssi: 0', 'si: 5'), ("Drop <artist> a like on Instagram while you're at it.", (3, 0.56144327), 'ssi: 0', 'si: 6')]
# Locate a known tour-date listing: every sentence whose normalized text contains
# the long literal below is returned with its review (ssi) and sentence (si) index.
[(s, f'ssi: {ssi}', f'si: {si}') for ssi, ss in enumerate(reviews_5yrs_df.content_sentences) for si, s in enumerate(ss) if "the boulder theatre sep 20th emos austin tx sep 22nd granada theater dallas tx sep 24th orange peel asheville nc sep 25th cannery ballroom nashville tn sep 26th midpoint music festival cincinnati oh sep 27th pygmalion music festival champaign urbana il sep 29th royale boston ma oct 1st terminal 5 new york ny oct 3rd the forum london united kingdom oct 4th the art school glasgow united kingdom oct 5th stylus leeds united kingdom oct 6th trinity centre bristol united kingdom oct 8th la gaite lyrique paris france oct 9th vk club brussels belgium oct 10th melkweg old room amsterdam netherlands oct 11th uebel gefahrlich hamburg germany oct 12th gebaude 9 cologne germany oct 14th c club berlin germany oct 15th basen warsaw" in datasets.normalize_string(s)]
# Id of the topic that collects tour dates and similar junk, and the
# probability at/above which a sentence assigned to it is dropped.
# NOTE: topic ids are not stable across LDA training runs; re-check the id
# against `lda.print_topics()` whenever the model is retrained.
# These constants have their own names so that the function defined below does
# not overwrite the topic id it is supposed to compare against.
REMOVE_TOPIC_ID = 1
REMOVE_TOPIC_MIN_PROB = 0.75


def remove_topic(column, topic_id=REMOVE_TOPIC_ID, min_prob=REMOVE_TOPIC_MIN_PROB, classify=None):
    """Return the sentences of `column` that are not confidently in the junk topic.

    A sentence is dropped only when its dominant topic is `topic_id` AND that
    topic's probability is at least `min_prob`; every other sentence is kept
    in its original order.

    Args:
        column: iterable of sentence strings (one review's sentences).
        topic_id: id of the topic to remove.
        min_prob: minimum dominant-topic probability for removal.
        classify: callable mapping a sentence to a list of
            ``(topic_id, probability)`` pairs; defaults to the module-level
            `sentence_to_topic`.

    Returns:
        A new list with the surviving sentences.
    """
    if classify is None:
        classify = sentence_to_topic

    kept = []
    for sent in column:
        # Classify once per sentence; inference is the expensive step.
        topics = classify(sent)
        if topics:
            top_topic, top_prob = max(topics, key=itemgetter(1))
            if top_topic == topic_id and top_prob >= min_prob:
                continue
        kept.append(sent)
    return kept
import time

start = time.time()

# Run every review's sentence list through remove_topic and store the result
# back in the same column.
filtered_sentences = reviews_5yrs_df.content_sentences.apply(remove_topic)
reviews_5yrs_df.content_sentences = filtered_sentences

elapsed = time.time() - start
print(f'runtime: {elapsed:.2f}s')

reviews_5yrs_df.content_sentences.head()
runtime: 158.94s
0 [New Music <artist> reached out to us with the... 1 [Folk rockers <artist> have debuted their new ... 2 [You know we're digging <, artist>., Their new... 3 [Nothing against the profession, but the U.S. ... 4 [Connecticut duo <artist> have released a guit... Name: content_sentences, dtype: object
# Repeat the tour-date probe: return every sentence whose normalized text contains
# the long literal below, with its review (ssi) and sentence (si) index.
[(s, f'ssi: {ssi}', f'si: {si}') for ssi, ss in enumerate(reviews_5yrs_df.content_sentences) for si, s in enumerate(ss) if "the boulder theatre sep 20th emos austin tx sep 22nd granada theater dallas tx sep 24th orange peel asheville nc sep 25th cannery ballroom nashville tn sep 26th midpoint music festival cincinnati oh sep 27th pygmalion music festival champaign urbana il sep 29th royale boston ma oct 1st terminal 5 new york ny oct 3rd the forum london united kingdom oct 4th the art school glasgow united kingdom oct 5th stylus leeds united kingdom oct 6th trinity centre bristol united kingdom oct 8th la gaite lyrique paris france oct 9th vk club brussels belgium oct 10th melkweg old room amsterdam netherlands oct 11th uebel gefahrlich hamburg germany oct 12th gebaude 9 cologne germany oct 14th c club berlin germany oct 15th basen warsaw" in datasets.normalize_string(s)]
[]
import quilt
from quilt.data.iconix import deephypebot

pkg_name = 'iconix/deephypebot'

# Replace the package's reviews node with the filtered DataFrame.
deephypebot._set(['reviews_and_metadata_5yrs'], reviews_5yrs_df)
# Read the node back under the name it was stored as; the DataFrame's variable
# name (`reviews_5yrs_df`) is not a node of the package.
deephypebot.reviews_and_metadata_5yrs().content_sentences.head()

# Record what changed in this revision, then rebuild and inspect the package.
deephypebot._meta['description'] = 'Remove tour dates and some other junk'
quilt.build(pkg_name, deephypebot) # revision5
quilt.inspect(pkg_name)

# Authenticate and publish the new revision.
quilt.login()
quilt.push(pkg_name, is_public=True)
from quilt.data.iconix import deephypebot
import quilt
pkg_name = 'iconix/deephypebot'
# Print the package tree (node names, shapes, column types).
quilt.inspect(pkg_name)
/home/ubuntu/.local/share/QuiltCli/quilt_packages/pkgs/Quilt/iconix/deephypebot └── reviews_and_metadata_5yrs: shape (19889, 46), types 'artist': object, 'artist_id': object, 'author': object, 'conte…
import dill as pickle

cache_file = '../../pytorch-text-vae/tmp/reviews_and_metadata_5yrs_stored_info.pkl'

# Unpickle the cached tuple first, then unpack it into its named parts.
with open(cache_file, "rb", buffering=0) as f:
    stored_info = pickle.load(f)

(input_side, output_side, pairs, dataset, Z_SIZE, CONDITION_SIZE,
 DECODER_HIDDEN_SIZE, ENCODER_HIDDEN_SIZE, N_ENCODER_LAYERS) = stored_info

pairs[0]
('ultimatum is out now and it features malian musician and singer songwriter fatoumata diawara', 'arawaid atamuotaf retirwgnos regnis dna naicisum nailam serutaef ti dna won tuo si mutamitlu', array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
# bytearray() cannot consume a file object directly (it raises TypeError);
# read the raw bytes from the file first.
with open(cache_file, "rb", buffering=0) as f:
    stored_info_bytes = bytearray(f.read())
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-9-f2bf9d3754cc> in <module>() 1 with open(cache_file, "rb", buffering=0) as f: ----> 2 bytearray(f) TypeError: an integer is required
# bytes(file_object) raises TypeError, and per the Quilt API _set() expects
# either a DataFrame or a file path relative to the build directory -- so pass
# the path itself rather than an open handle or the file's contents.
deephypebot._set(['reviews_and_metadata_5yrs_stored_info'], cache_file)
deephypebot.reviews_and_metadata_5yrs_stored_info()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-4-1db04e4ea5f2> in <module>() 1 with open(cache_file, "rb", buffering=0) as f: ----> 2 deephypebot._set(['reviews_and_metadata_5yrs_stored_info'], bytes(f)) 3 deephypebot.reviews_and_metadata_5yrs_stored_info() TypeError: 'bytes' object cannot be interpreted as an integer
import dill as pickle

# pickle.load() needs a file-like object with read()/readline(); per the Quilt
# API, calling a file node yields the stored file's path, so open it first.
# NOTE: only unpickle data from a trusted source -- unpickling can execute
# arbitrary code.
with open(deephypebot.reviews_and_metadata_5yrs_stored_info(), 'rb') as f:
    (input_side, output_side, pairs, dataset, Z_SIZE, CONDITION_SIZE,
     DECODER_HIDDEN_SIZE, ENCODER_HIDDEN_SIZE, N_ENCODER_LAYERS) = pickle.load(f)

pairs[0]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-6-8b038f66b88b> in <module>() 1 import dill as pickle 2 ----> 3 input_side, output_side, pairs, dataset, Z_SIZE, CONDITION_SIZE, DECODER_HIDDEN_SIZE, ENCODER_HIDDEN_SIZE, N_ENCODER_LAYERS = pickle.load(deephypebot.reviews_and_metadata_5yrs_stored_info()) 4 5 pairs[0] ~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/dill/dill.py in load(file) 284 def load(file): 285 """unpickle an object from a file""" --> 286 pik = Unpickler(file) 287 pik._main = _main_module 288 obj = pik.load() ~/src/anaconda3/envs/fastai/lib/python3.6/site-packages/dill/dill.py in __init__(self, *args, **kwds) 446 447 def __init__(self, *args, **kwds): --> 448 StockUnpickler.__init__(self, *args, **kwds) 449 self._main = _main_module 450 pass TypeError: file must have 'read' and 'readline' attributes
import os
import pandas as pd
# Resolve the datasets directory relative to the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, '..', 'datasets')
reviews_f = 'reviews_and_metadata'
reviews_5yrs_f = os.path.join(DATA_DIR, f'{reviews_f}_5yrs.json')
# Load the JSON export into a DataFrame and report how many rows it has.
reviews_5yrs_df = pd.read_json(reviews_5yrs_f)
print(f'num_reviews: {len(reviews_5yrs_df)}')
reviews_5yrs_df.content_sentences.head()
num_reviews: 19889
0 [New Music <artist> reached out to us with the... 1 [Folk rockers <artist> have debuted their new ... 2 [You know we're digging <, artist>., Their new... 3 [Nothing against the profession, but the U.S. ... 4 [Connecticut duo <artist> have released a guit... Name: content_sentences, dtype: object
import torch
# pip install git+https://github.com/iconix/pytorch-text-vae.git
from pytorchtextvae import generate

# CPU inference
DEVICE = torch.device('cpu')

# Restore the saved model state together with its cached stored-info pickle.
(vae, input_side, output_side, pairs,
 dataset, EMBED_SIZE, random_state) = generate.load_model(
    '../../pytorch-text-vae/model/best/reviews_and_metadata_5yrs_state.pt',
    'reviews_and_metadata_5yrs_stored_info.pkl',
    DEVICE,
    cache_path='../../pytorch-text-vae/model/best/tmp',
)
Fetching cached info at ../../pytorch-text-vae/model/best/tmp/reviews_and_metadata_5yrs_stored_info.pkl Cache ../../pytorch-text-vae/model/best/tmp/reviews_and_metadata_5yrs_stored_info.pkl loaded (load time: 0.57s) Found saved model ../../pytorch-text-vae/model/best/reviews_and_metadata_5yrs_state.pt MAX_SAMPLE: False; TRUNCATED_SAMPLE: True Trained for 360000 steps (load time: 18.86s) Setting new random seed
from pytorchtextvae import datasets
def tokenize(line):
    """Normalize `line` and split the result into tokens on single spaces."""
    # One strip() already removes both leading and trailing whitespace.
    normalized = datasets.normalize_string(line.strip())
    return normalized.split(' ')
#sents = [sentence for review in reviews_5yrs_df.content_sentences for sentence in review]
# Take the first element of every cached pair as the sentence text.
sents = [first for first, *_ in pairs]
# Tokenize each sentence.
texts = list(map(tokenize, sents))

texts[:3]
[['tribal', 'chants', 'are', 'pitched', 'warped', 'and', 'layered', 'all', 'while', 'being', 'wrapped', 'in', 'waves', 'of', 'warm', 'synths', 'and', 'building', 'percussion'], ['artist', 'new', 'live', 'lp', 'heaven', 'on', 'earth', 'is', 'in', 'stores', 'now'], ['in', 'true', 'artist', 'fashion', 'he', 'has', 'tantalised', 'everyone', 'with', 'a', 'series', 'of', 'sporadic', 'uploading', 'of', 'tracks', 'to', 'his', 'soundcloud', 'and', 'the', 'latest', 'is', 'a', 'beautiful', 'cut', 'titled', 'the', 'light', 'featuring', 'the', 'ever', 'soulful', 'vocals', 'courtesy', 'of', 'denai', 'moore', 'who', 'has', 'also', 'worked', 'with', 'fantastic', 'mr', 'fox', '3']]
from nltk.corpus import stopwords

# Remove stop words and the placeholder tokens (e.g. `artist`, `song_title`).
# Words that appear only once are NOT filtered here.
stoplist = [datasets.normalize_string(word) for word in stopwords.words('english')]
fillerlist = ['author', 'song_title', 'artist', 'sitename']
# A set gives O(1) membership tests; the two lists are kept because other
# cells reference them by name.
excluded = set(stoplist) | set(fillerlist)
texts = [[word for word in text if word not in excluded] for text in texts]

texts[:3]
[['tribal', 'chants', 'pitched', 'warped', 'layered', 'wrapped', 'waves', 'warm', 'synths', 'building', 'percussion'], ['new', 'live', 'lp', 'heaven', 'earth', 'stores'], ['true', 'fashion', 'tantalised', 'everyone', 'series', 'sporadic', 'uploading', 'tracks', 'soundcloud', 'latest', 'beautiful', 'cut', 'titled', 'light', 'featuring', 'ever', 'soulful', 'vocals', 'courtesy', 'denai', 'moore', 'also', 'worked', 'fantastic', 'mr', 'fox', '3']]
from gensim.corpora.dictionary import Dictionary

# Alternative dictionary-level filtering, kept for reference only (disabled by
# wrapping it in a string literal).
'''# remove stop words and words that appear only once
stoplist = list(stopwords.words('english'))
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
if stopword in dictionary.token2id]
filler_ids = [dictionary.token2id[fillword] for fillword in ['author', 'song_title', 'artist', 'sitename']
if fillword in dictionary.token2id]
dictionary.filter_tokens(stop_ids + filler_ids) # remove stop words
dictionary.compactify() # remove gaps in id sequence after words that were removed'''

dictionary = Dictionary(texts)

from gensim.models.ldamodel import LdaModel

n_topics = 2
passes = 20 # number of passes through documents
iterations = 400
# None disables perplexity evaluation; a value of 1 would estimate it on every
# update, which is the slowdown this setting is meant to avoid.
eval_every = None

corpus = [dictionary.doc2bow(text) for text in texts]

# Train the model on the corpus.
lda = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=n_topics,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
)
#lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)
lda.print_topics(n_topics)
[(0, '0.011*"track" + 0.010*"like" + 0.008*"song" + 0.007*"one" + 0.006*"music" + 0.006*"pop" + 0.006*"vocals" + 0.005*"time" + 0.004*"sound" + 0.004*"love"'), (1, '0.021*"new" + 0.013*"single" + 0.013*"album" + 0.009*"track" + 0.009*"release" + 0.009*"ep" + 0.008*"year" + 0.007*"first" + 0.007*"debut" + 0.006*"remix"')]
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import warnings
# Suppress DeprecationWarning and FutureWarning from this point on.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# Prepare the topic visualization and save it as an HTML file named after the topic count.
p = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(p, f'lda_{n_topics}topics.html')
Let's look at a few sentences and their topics...
from operator import itemgetter

# Show the dominant (topic, probability) for the first ten sentences.
for idx in range(10):
    dominant = max(lda[corpus[idx]], key=itemgetter(1))
    print(dominant, datasets.normalize_string(sents[idx]))
(0, 0.9573547) tribal chants are pitched warped and layered all while being wrapped in waves of warm synths and building percussion (1, 0.8950334) artist new live lp heaven on earth is in stores now (1, 0.5538248) in true artist fashion he has tantalised everyone with a series of sporadic uploading of tracks to his soundcloud and the latest is a beautiful cut titled the light featuring the ever soulful vocals courtesy of denai moore who has also worked with fantastic mr fox 3 (1, 0.9520523) now hes back with another single song_title offering another taste of his forthcoming solo project artist s demo (0, 0.9283465) if youre a bit familiar with artist youll probably recognise his signature sound (0, 0.9440481) its first half is a lush idyllic soundscape before it unfurls into a hand clapping hip shaking number (1, 0.9285623) the ep is due to be released on august 12th 2013 (1, 0.9440406) aka ben moorhouse and leo duncan will release their debut album (0, 0.7605752) look numbers dont mean everything but it means that i probably listened to their previous single about 900 thousand times which means over 100k people listened to that song (0, 0.7546815) we ve seen him dive into all sorts of dance pop trance electro and this time hes returned to his version of progressive club house
from collections import Counter

# Count sentences per dominant topic, reported with 1-based topic numbers.
Counter(
    max(lda[corpus[idx]], key=itemgetter(1))[0] + 1
    for idx in range(len(texts))
)
Counter({1: 63454, 2: 41013})
# One line per sentence: "(dominant topic, probability) normalized sentence".
out = []
for idx in range(len(texts)):
    dominant = max(lda[corpus[idx]], key=itemgetter(1))
    out.append(f'{dominant} {datasets.normalize_string(sents[idx])}\n')
# Descending string order groups lines by topic id, highest probability first.
out.sort(reverse=True)

with open(f'sentence_{n_topics}topics.txt', 'w') as f:
    f.writelines(out)
Most "representative":
(*) desirable topic
n_topics = 3

# Bag-of-words representation of every tokenized sentence.
corpus = list(map(dictionary.doc2bow, texts))

# Train the model on the corpus.
lda = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=n_topics,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
)
#lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)
lda.print_topics(n_topics)
[(0, '0.014*"pop" + 0.013*"vocals" + 0.012*"track" + 0.007*"production" + 0.006*"house" + 0.006*"sound" + 0.005*"vocal" + 0.005*"electronic" + 0.005*"synth" + 0.005*"guitar"'), (1, '0.012*"like" + 0.011*"song" + 0.011*"one" + 0.009*"music" + 0.007*"time" + 0.006*"track" + 0.006*"love" + 0.006*"way" + 0.005*"get" + 0.005*"much"'), (2, '0.026*"new" + 0.015*"single" + 0.015*"album" + 0.012*"track" + 0.010*"release" + 0.010*"ep" + 0.009*"year" + 0.009*"debut" + 0.008*"first" + 0.008*"remix"')]
# Prepare the topic visualization and save it as HTML.
p = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(p, f'lda_{n_topics}topics.html')

# Show the dominant (topic, probability) for the first ten sentences.
for idx in range(10):
    dominant = max(lda[corpus[idx]], key=itemgetter(1))
    print(dominant, datasets.normalize_string(sents[idx]))
(0, 0.94405675) tribal chants are pitched warped and layered all while being wrapped in waves of warm synths and building percussion (2, 0.696002) artist new live lp heaven on earth is in stores now (2, 0.5910036) in true artist fashion he has tantalised everyone with a series of sporadic uploading of tracks to his soundcloud and the latest is a beautiful cut titled the light featuring the ever soulful vocals courtesy of denai moore who has also worked with fantastic mr fox 3 (2, 0.9425159) now hes back with another single song_title offering another taste of his forthcoming solo project artist s demo (1, 0.6511736) if youre a bit familiar with artist youll probably recognise his signature sound (0, 0.84212404) its first half is a lush idyllic soundscape before it unfurls into a hand clapping hip shaking number (2, 0.9047589) the ep is due to be released on august 12th 2013 (2, 0.9256436) aka ben moorhouse and leo duncan will release their debut album (1, 0.7314071) look numbers dont mean everything but it means that i probably listened to their previous single about 900 thousand times which means over 100k people listened to that song (0, 0.5583165) we ve seen him dive into all sorts of dance pop trance electro and this time hes returned to his version of progressive club house
# Count sentences per dominant topic, reported with 1-based topic numbers.
Counter(
    max(lda[corpus[idx]], key=itemgetter(1))[0] + 1
    for idx in range(len(texts))
)
Counter({1: 22837, 3: 39023, 2: 42607})
# One line per sentence: "(dominant topic, probability) normalized sentence".
out = []
for idx in range(len(texts)):
    dominant = max(lda[corpus[idx]], key=itemgetter(1))
    out.append(f'{dominant} {datasets.normalize_string(sents[idx])}\n')
# Descending string order groups lines by topic id, highest probability first.
out.sort(reverse=True)

with open(f'sentence_{n_topics}topics.txt', 'w') as f:
    f.writelines(out)
Most "representative":
(*) desirable topic (although, note that I don't want the tour dates at top of the topic...)
n_topics = 4

# Bag-of-words representation of every tokenized sentence.
corpus = list(map(dictionary.doc2bow, texts))

# Train the model on the corpus.
lda = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=n_topics,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
)
#lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)
lda.print_topics(n_topics)
[(0, '0.015*"track" + 0.011*"vocals" + 0.011*"pop" + 0.008*"sound" + 0.006*"song" + 0.006*"like" + 0.006*"production" + 0.005*"vocal" + 0.004*"electronic" + 0.004*"house"'), (1, '0.016*"one" + 0.014*"like" + 0.011*"song" + 0.011*"time" + 0.011*"music" + 0.008*"get" + 0.007*"us" + 0.006*"im" + 0.006*"know" + 0.006*"love"'), (2, '0.009*"music" + 0.009*"tour" + 0.008*"live" + 0.005*"10" + 0.005*"festival" + 0.004*"show" + 0.004*"dates" + 0.003*"2" + 0.003*"club" + 0.003*"london"'), (3, '0.029*"new" + 0.020*"single" + 0.020*"album" + 0.016*"track" + 0.014*"release" + 0.013*"ep" + 0.012*"debut" + 0.011*"first" + 0.010*"year" + 0.010*"released"')]
# Prepare the pyLDAvis visualisation data for the model fitted above and save
# it to an HTML file whose name carries the current topic count.
p = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(p, f'lda_{n_topics}topics.html')
# Spot-check: print the dominant (topic, probability) pair next to the
# normalized text of each of the first ten sentences.
for i in range(10):
    top_topic = max(lda[corpus[i]], key=itemgetter(1))
    print(top_topic, datasets.normalize_string(sents[i]))
(0, 0.9373901) tribal chants are pitched warped and layered all while being wrapped in waves of warm synths and building percussion (3, 0.7175353) artist new live lp heaven on earth is in stores now (3, 0.403456) in true artist fashion he has tantalised everyone with a series of sporadic uploading of tracks to his soundcloud and the latest is a beautiful cut titled the light featuring the ever soulful vocals courtesy of denai moore who has also worked with fantastic mr fox 3 (3, 0.93666035) now hes back with another single song_title offering another taste of his forthcoming solo project artist s demo (1, 0.3998728) if youre a bit familiar with artist youll probably recognise his signature sound (0, 0.8428147) its first half is a lush idyllic soundscape before it unfurls into a hand clapping hip shaking number (3, 0.89285654) the ep is due to be released on august 12th 2013 (3, 0.91666096) aka ben moorhouse and leo duncan will release their debut album (1, 0.58346903) look numbers dont mean everything but it means that i probably listened to their previous single about 900 thousand times which means over 100k people listened to that song (0, 0.34985387) we ve seen him dive into all sorts of dance pop trance electro and this time hes returned to his version of progressive club house
# Tally sentences by their most probable topic (reported with 1-based ids).
# corpus holds exactly one bag-of-words per entry of texts, so walking it
# directly visits the same documents as indexing by position.
Counter(max(lda[bow], key=itemgetter(1))[0] + 1 for bow in corpus)
Counter({1: 32601, 4: 30634, 2: 31296, 3: 9936})
# One line per sentence: its dominant (topic, probability) pair followed by the
# normalized sentence text. Ordering is done on the formatted string itself,
# in descending order, before everything is written out in one go.
out = [
    f'{max(lda[corpus[idx]], key=itemgetter(1))} {datasets.normalize_string(sents[idx])}\n'
    for idx in range(len(texts))
]
out.sort(reverse=True)
with open(f'sentence_{n_topics}topics.txt', 'w') as f:
    f.write(''.join(out))
Most "representative":
(*) desirable topic
# Refit the sentence-level LDA model with five topics. The iterations, passes
# and eval_every settings are taken from names defined earlier in the notebook.
n_topics = 5
corpus = list(map(dictionary.doc2bow, texts))
lda = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=n_topics,
    iterations=iterations,
    passes=passes,
    eval_every=eval_every,
)
# Alternative that leaves the training settings at gensim's defaults:
#lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics)
lda.print_topics(n_topics)
[(0, '0.014*"remix" + 0.010*"soundcloud" + 0.009*"check" + 0.008*"music" + 0.007*"video" + 0.007*"stream" + 0.006*"facebook" + 0.006*"via" + 0.005*"free" + 0.005*"available"'), (1, '0.015*"like" + 0.014*"one" + 0.011*"song" + 0.011*"music" + 0.010*"time" + 0.010*"track" + 0.007*"love" + 0.006*"us" + 0.006*"get" + 0.006*"something"'), (2, '0.007*"song" + 0.006*"years" + 0.005*"two" + 0.005*"music" + 0.005*"band" + 0.005*"work" + 0.004*"three" + 0.004*"working" + 0.004*"later" + 0.004*"one"'), (3, '0.023*"pop" + 0.019*"vocals" + 0.019*"track" + 0.009*"production" + 0.008*"electronic" + 0.008*"house" + 0.008*"sound" + 0.008*"vocal" + 0.008*"indie" + 0.007*"synth"'), (4, '0.031*"new" + 0.021*"single" + 0.021*"album" + 0.015*"release" + 0.014*"ep" + 0.013*"track" + 0.012*"debut" + 0.011*"year" + 0.011*"first" + 0.011*"released"')]
# Prepare the pyLDAvis visualisation data for the model fitted above and save
# it to an HTML file whose name carries the current topic count.
p = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(p, f'lda_{n_topics}topics.html')
# Spot-check: print the dominant (topic, probability) pair next to the
# normalized text of each of the first ten sentences.
for i in range(10):
    top_topic = max(lda[corpus[i]], key=itemgetter(1))
    print(top_topic, datasets.normalize_string(sents[i]))
(2, 0.56225455) tribal chants are pitched warped and layered all while being wrapped in waves of warm synths and building percussion (4, 0.8843004) artist new live lp heaven on earth is in stores now (4, 0.458874) in true artist fashion he has tantalised everyone with a series of sporadic uploading of tracks to his soundcloud and the latest is a beautiful cut titled the light featuring the ever soulful vocals courtesy of denai moore who has also worked with fantastic mr fox 3 (4, 0.93293434) now hes back with another single song_title offering another taste of his forthcoming solo project artist s demo (1, 0.41141158) if youre a bit familiar with artist youll probably recognise his signature sound (3, 0.5411828) its first half is a lush idyllic soundscape before it unfurls into a hand clapping hip shaking number (4, 0.8856973) the ep is due to be released on august 12th 2013 (4, 0.46677127) aka ben moorhouse and leo duncan will release their debut album (1, 0.5935679) look numbers dont mean everything but it means that i probably listened to their previous single about 900 thousand times which means over 100k people listened to that song (3, 0.61135703) we ve seen him dive into all sorts of dance pop trance electro and this time hes returned to his version of progressive club house
# Tally sentences by their most probable topic (reported with 1-based ids).
# corpus holds exactly one bag-of-words per entry of texts, so walking it
# directly visits the same documents as indexing by position.
Counter(max(lda[bow], key=itemgetter(1))[0] + 1 for bow in corpus)
Counter({3: 9414, 5: 29043, 2: 41333, 4: 16679, 1: 7998})
# One line per sentence: its dominant (topic, probability) pair followed by the
# normalized sentence text. Ordering is done on the formatted string itself,
# in descending order, before everything is written out in one go.
out = [
    f'{max(lda[corpus[idx]], key=itemgetter(1))} {datasets.normalize_string(sents[idx])}\n'
    for idx in range(len(texts))
]
out.sort(reverse=True)
with open(f'sentence_{n_topics}topics.txt', 'w') as f:
    f.write(''.join(out))
Most "representative":
(*) desirable topic (note that the most desirable topic here is still highly fragmented)
import spacy
# Load spaCy's 'en' pipeline, then preview the tag_ sequence it assigns to each
# of the first five documents (each document's tokens are re-joined with spaces
# before being parsed).
nlp = spacy.load('en')
[[tok.tag_ for tok in nlp(' '.join(words))] for words in texts[:5]]
[['JJ', 'NN', 'VBD', 'PRP', 'JJ', 'NN', 'NN', 'NN', 'NN'], ['NN', 'VBG', 'NNS', 'NN', 'VBG', 'NNS', 'NN'], ['JJ', 'JJ', 'NN', 'VBG', 'RB', 'VBN'], ['RB', 'NN', 'JJ', 'NN'], ['NNS', 'NNS', 'NN', 'NN']]
# Replace every document by the sequence of POS tags spaCy assigns to it.
# Each document is a token list, so it is re-joined with spaces before parsing.
# nlp.pipe streams the strings through the pipeline in batches and yields the
# Docs in input order, which gives the same tags as calling nlp() once per
# document but avoids the per-call overhead across the whole corpus.
pos_texts = [
    [token.tag_ for token in doc]
    for doc in nlp.pipe(' '.join(text) for text in texts)
]
pos_texts[:3]
[['JJ', 'NN', 'VBD', 'PRP', 'JJ', 'NN', 'NN', 'NN', 'NN'], ['NN', 'VBG', 'NNS', 'NN', 'VBG', 'NNS', 'NN'], ['JJ', 'JJ', 'NN', 'VBG', 'RB', 'VBN']]
# Fit a second LDA model whose "words" are POS tags rather than tokens.
pos_dictionary = Dictionary(pos_texts)
pos_corpus = list(map(pos_dictionary.doc2bow, pos_texts))
# NOTE(review): n_topics is whatever an earlier cell last assigned. The recorded
# output for this cell lists three topics, so confirm the intended value before
# re-running.
pos_lda = LdaModel(pos_corpus, num_topics=n_topics, id2word=pos_dictionary)
pos_lda.print_topics(n_topics)
[(0, '0.402*"LS" + 0.247*"XX" + 0.145*"UH" + 0.089*"CC" + 0.058*"FW" + 0.019*"NN" + 0.017*"ADD" + 0.008*"DT" + 0.006*"AFX" + 0.006*"NFP"'), (1, '0.291*"NN" + 0.236*"CD" + 0.210*"VB" + 0.058*"PRP" + 0.046*"IN" + 0.042*"MD" + 0.037*"RB" + 0.022*"VBN" + 0.014*"NNS" + 0.011*"NNP"'), (2, '0.384*"NN" + 0.203*"JJ" + 0.113*"NNS" + 0.069*"RB" + 0.046*"VBG" + 0.045*"VBP" + 0.035*"VBZ" + 0.035*"VBN" + 0.033*"VBD" + 0.021*"IN"')]
# Prepare the pyLDAvis data for the POS-tag model; as the cell's final
# expression the result is displayed rather than stored or saved.
pyLDAvis.gensim.prepare(pos_lda, pos_corpus, pos_dictionary)
/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=True'. To retain the current behavior and silence the warning, pass sort=False return pd.concat([default_term_info] + list(topic_dfs))
# Tally documents by their most probable POS-topic (reported with 1-based ids).
# pos_corpus holds exactly one bag-of-tags per entry of pos_texts, so walking
# it directly visits the same documents as indexing by position.
Counter(max(pos_lda[bow], key=itemgetter(1))[0] + 1 for bow in pos_corpus)
Counter({3: 158910, 2: 31753, 1: 52064})
# Spot-check: print the dominant POS-topic (topic, probability) pair next to
# the normalized text of each of the first ten sentences.
for i in range(10):
    top_topic = max(pos_lda[pos_corpus[i]], key=itemgetter(1))
    print(top_topic, normalize_string(sents[i]))
(2, 0.75887287) new music artist reached out to us with the powerful sentiment of their alt folk anthem song_title (2, 0.9140886) the song is about coping with the feelings of loss after losing friends to addiction (2, 0.9022116) the heavy subject matter is touching and eloquently delivered (2, 0.8601445) and its hard to deny the powerful chorus of (2, 0.8613848) but if its the drugs the women the wine the weed (2, 0.92435616) recommended for fans of mumford sons edward sharpe or noah and the whale (1, 0.82347447) drop artist a like on instagram while youre at it (2, 0.97328794) folk rockers artist have debuted their new song aeuroe song_title aeur an attempt by guitarist vocalist matt quinn to find some solace after tragedy aeur in this case the friends in his community lost too young to drugs and the devastation they left behind (2, 0.9543618) itaeur tm s a compellingly open hearted piece of songwriting that exemplifies why artist have found such a passionate following in little more than a year as a band (2, 0.7302367) see artist live in la on december 8th at the lodge room
# One line per sentence: the 1-based id of its dominant POS-topic followed by
# the normalized sentence text. Lines are ordered ascending on the formatted
# string and then written out in one go.
out = [
    f'{max(pos_lda[pos_corpus[idx]], key=itemgetter(1))[0] + 1} {normalize_string(sents[idx])}\n'
    for idx in range(len(pos_texts))
]
out.sort()
with open('pos_topics.txt', 'w') as f:
    f.write(''.join(out))
import pandas as pd
plots = pd.read_csv('../../pytorch-text-vae/pytorchtextvae/plots.txt', sep='\t', header=None)
from matplotlib import pyplot as plt
import numpy as np
from scipy.interpolate import spline
%matplotlib inline
x = plots[0]
y = plots[1]
x_new = np.linspace(x.min(), x.max(), 300) #300 represents number of points to make between T.min and T.max
y_smooth = spline(x, y, x_new)
plt.plot(x_new, y_smooth)
/home/ubuntu/src/anaconda3/envs/fastai/lib/python3.6/site-packages/ipykernel_launcher.py:12: DeprecationWarning: `spline` is deprecated! spline is deprecated in scipy 0.19.0, use Bspline class instead. if sys.path[0] == '':
[<matplotlib.lines.Line2D at 0x7fe142298b38>]
# Plot column 3 of the loaded table against column 0, without any smoothing
# (the table was read with header=None, so columns are addressed by position).
plt.plot(plots[0], plots[3])
[<matplotlib.lines.Line2D at 0x7f31ecc45d68>]