In [2]:
# import the graphlab
import graphlab as gl
In [3]:
# reading the csv file
data = gl.SFrame.read_csv("C:\learn\ML\ML00caseStudy\week01Intro\song_data.csv")
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Atul\AppData\Local\Temp\graphlab_server_1469128219.log.0
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: C:\Users\Atul\AppData\Local\Temp\graphlab_server_1469128219.log.0
Finished parsing file C:\learn\ML\ML00caseStudy\week01Intro\song_data.csv
Parsing completed. Parsed 100 lines in 4.06285 secs.
This non-commercial license of GraphLab Create for academic use is assigned to [email protected] and will expire on December 09, 2016.
------------------------------------------------------
Read 637410 lines. Lines per second: 192485
Finished parsing file C:\learn\ML\ML00caseStudy\week01Intro\song_data.csv
Parsing completed. Parsed 1000000 lines in 3.68689 secs.
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
In [4]:
# getting no of rows and columns
data.num_rows(), data.num_columns() , len(data)
Out[4]:
(1000000, 5, 1000000)
In [5]:
# saving the dataset
data.save('songs')
In [6]:
# loading the dataset
sdata = gl.load_sframe('songs')
In [7]:
# head 5 rows
sdata.head(5)
Out[7]:
song_id title release artist_name year
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0
[5 rows x 5 columns]
In [8]:
# likewise tail 4 rows
sdata.tail(4)
Out[8]:
song_id title release artist_name year
SOXQYIQ12A8C137FBB Jago Chhadeo Naale Baba Lassi Pee Gya Kuldeep Manak 0
SOHODZI12A8C137BB3 Novemba Dub_Connected: electronic
music ...
Gabriel Le Mar 0
SOLXGOR12A81C21EB7 Faraday The Trance Collection
Vol. 2 ...
Elude 0
SOWXJXQ12AB0189F43 Fernweh feat. Sektion
Kuchikäschtli ...
So Oder So Texta 2004
[4 rows x 5 columns]
In [9]:
# checking the column names and data type
sdata.column_types(), sdata.column_names()
Out[9]:
([str, str, str, str, int],
 ['song_id', 'title', 'release', 'artist_name', 'year'])

Modify the data

In [10]:
# creating a new columns
sdata['my_rating'] = 0
sdata
Out[10]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
[1000000 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [11]:
# generating a new colmns called song_age respective to my age

my_age = 1989

sdata['song_age']=sdata['year']-my_age
sdata
Out[11]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
song_age
14
6
17
14
-1989
-1989
-1989
4
-1989
-1989
[1000000 rows x 7 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [12]:
# generating another column title length
sdata['title length'] = sdata['title'].apply(lambda x : len(x.split()))
sdata
Out[12]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
song_age title length
14 2
6 2
17 4
14 3
-1989 3
-1989 9
-1989 4
4 4
-1989 1
-1989 6
[1000000 rows x 8 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [13]:
# adding one more column
sdata.add_column(sdata.select_column('year').apply(lambda x: x - my_age),name='how_old_was_i')
sdata
Out[13]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
song_age title length how_old_was_i
14 2 14
6 2 6
17 4 17
14 3 14
-1989 3 -1989
-1989 9 -1989
-1989 4 -1989
4 4 4
-1989 1 -1989
-1989 6 -1989
[1000000 rows x 9 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [14]:
# can add multiple columns at a time
sdata[['col1','col2']] = [sdata['title length'],sdata['my_rating']]
sdata
Out[14]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
song_age title length how_old_was_i col1 col2
14 2 14 2 0
6 2 6 2 0
17 4 17 4 0
14 3 14 3 0
-1989 3 -1989 3 0
-1989 9 -1989 9 0
-1989 4 -1989 4 0
4 4 4 4 0
-1989 1 -1989 1 0
-1989 6 -1989 6 0
[1000000 rows x 11 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [15]:
#adding some more
#sdata[['col3','col4']] = [[3,4]]  this is not allowed, assigned values should be SArrray
#sdata
In [16]:
# deleting column
del sdata['song_age']
In [17]:
# rename columns
sdata.rename({'title length':'title_length'})
sdata
Out[17]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0
title_length how_old_was_i col1 col2
2 14 2 0
2 6 2 0
4 17 4 0
3 14 3 0
3 -1989 3 0
9 -1989 9 0
4 -1989 4 0
4 4 4 0
1 -1989 1 0
6 -1989 6 0
[1000000 rows x 10 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [18]:
# swap columns location
sdata.swap_columns('year','title_length')
sdata
Out[18]:
song_id title release artist_name title_length
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 2
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 4
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 3
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 3
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 9
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 4
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 4
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 1
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
6
my_rating year how_old_was_i col1 col2
0 2003 14 2 0
0 1995 6 2 0
0 2006 17 4 0
0 2003 14 3 0
0 0 -1989 3 0
0 0 -1989 9 0
0 0 -1989 4 0
0 1993 4 4 0
0 0 -1989 1 0
0 0 -1989 6 0
[1000000 rows x 10 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [19]:
# change the column data type
sdata.column_names(),sdata.column_types()
Out[19]:
(['song_id',
  'title',
  'release',
  'artist_name',
  'title_length',
  'my_rating',
  'year',
  'how_old_was_i',
  'col1',
  'col2'],
 [str, str, str, str, int, int, int, int, int, int])
In [20]:
sdata['my_rating'] = sdata['my_rating'].astype(float)
sdata.column_types()
Out[20]:
[str, str, str, str, int, float, int, int, int, int]
In [21]:
# swap column location
sdata.swap_columns('title_length','year')
Out[21]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0.0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0.0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0.0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0.0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic 0 0.0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery 0 0.0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence 0 0.0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0.0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke 0 0.0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
0 0.0
title_length how_old_was_i col1 col2
2 14 2 0
2 6 2 0
4 17 4 0
3 14 3 0
3 -1989 3 0
9 -1989 9 0
4 -1989 4 0
4 4 4 0
1 -1989 1 0
6 -1989 6 0
[1000000 rows x 10 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [22]:
# let's try to calculate how many 'a' are there in 'title', 'release' & 'artist_name' column
sdata.add_column(sdata['title', 'release' , 'artist_name'].apply(lambda row:sum(word.count('a') for word in row.values())),'no_of_a')
sdata.head(3)
Out[22]:
song_id title release artist_name year my_rating title_length
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0.0 2
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0.0 2
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0.0 4
how_old_was_i col1 col2 no_of_a
14 2 0 5
6 2 0 8
17 4 0 1
[3 rows x 11 columns]

Checking Missing values

In [23]:
# checking the missing year
year_count = sdata.groupby('year', gl.aggregate.COUNT)
year_count.head(6)
Out[23]:
year Count
1987 5125
2011 1
1998 15858
1990 7258
1991 8650
1950 84
[6 rows x 2 columns]
In [24]:
# no of unique years
print "No of unique years :", str(len(year_count))
No of unique years : 90
In [25]:
print "no of invalid year count "
year_count.topk('year', reverse=True, k=1)
no of invalid year count 
Out[25]:
year Count
0 484424
[1 rows x 2 columns]
In [26]:
# year 0 is invalid value to better convert it to None
sdata['year'] = sdata['year'].apply(lambda x : None if x==0 else x)
sdata.head(5)
Out[26]:
song_id title release artist_name year my_rating title_length
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0.0 2
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0.0 2
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0.0 4
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0.0 3
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic None 0.0 3
how_old_was_i col1 col2 no_of_a
14 2 0 5
6 2 0 8
17 4 0 1
14 3 0 3
-1989 3 0 2
[5 rows x 11 columns]
In [27]:
# now I have to fix the value of 'How_old_Was_i' column as well
# I can do this two ways, one - substract - year - born_year 
# or I can use the same apply function with NONE if x < 0
sdata['how_old_was_i'] = sdata['year'].apply(lambda x : None if x is None else x-my_age)
sdata.head(5)
Out[27]:
song_id title release artist_name year my_rating title_length
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0.0 2
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0.0 2
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0.0 4
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0.0 3
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic None 0.0 3
how_old_was_i col1 col2 no_of_a
14 2 0 5
6 2 0 8
17 4 0 1
14 3 0 3
None 3 0 2
[5 rows x 11 columns]
In [28]:
# check the no of songs which have valid years
print len(sdata[sdata['year']>0])
515576
In [29]:
tmp = sdata['year']>0
print tmp
[1L, 1L, 1L, 1L, None, None, None, 1L, None, None, 1L, 1L, None, 1L, None, 1L, 1L, 1L, 1L, None, None, None, 1L, 1L, None, None, 1L, 1L, None, None, None, None, None, 1L, 1L, 1L, None, 1L, None, 1L, None, 1L, None, 1L, 1L, 1L, None, 1L, None, None, None, None, 1L, 1L, 1L, None, None, 1L, None, None, 1L, 1L, None, 1L, 1L, 1L, None, None, None, 1L, None, None, 1L, None, 1L, 1L, None, None, None, None, 1L, None, None, 1L, 1L, None, 1L, None, 1L, None, None, None, 1L, 1L, 1L, 1L, None, 1L, 1L, 1L, ... ]
In [30]:
# Look at lots of descriptive statistics of title_length
print "mean: " + str(sdata['title_length'].mean())
print "std: " + str(sdata['title_length'].std())
print "var: " + str(sdata['title_length'].var())
print "min: " + str(sdata['title_length'].min())
print "max: " + str(sdata['title_length'].max())
print "sum: " + str(sdata['title_length'].sum())
print "number of non-zero entries: " + str(sdata['title_length'].nnz())
mean: 3.369894
std: 2.17226527587
var: 4.71873642876
min: 0
max: 47
sum: 3369894
number of non-zero entries: 999985
In [31]:
approx_sketch = sdata['title_length'].sketch_summary()
print approx_sketch
+--------------------+---------------+----------+
|        item        |     value     | is exact |
+--------------------+---------------+----------+
|       Length       |    1000000    |   Yes    |
|        Min         |      0.0      |   Yes    |
|        Max         |      47.0     |   Yes    |
|        Mean        |    3.369894   |   Yes    |
|        Sum         |   3369894.0   |   Yes    |
|      Variance      | 4.71873642876 |   Yes    |
| Standard Deviation | 2.17226527587 |   Yes    |
|  # Missing Values  |       0       |   Yes    |
|  # unique values   |       44      |    No    |
+--------------------+---------------+----------+

Most frequent items:
+-------+--------+--------+--------+--------+-------+-------+-------+-------+
| value |   2    |   3    |   1    |   4    |   5   |   6   |   7   |   8   |
+-------+--------+--------+--------+--------+-------+-------+-------+-------+
| count | 241587 | 217674 | 164124 | 152569 | 92148 | 54674 | 30777 | 17826 |
+-------+--------+--------+--------+--------+-------+-------+-------+-------+
+-------+------+
|   9   |  10  |
+-------+------+
| 10465 | 6633 |
+-------+------+

Quantiles: 
+-----+-----+-----+-----+-----+-----+-----+------+------+
|  0% |  1% |  5% | 25% | 50% | 75% | 95% | 99%  | 100% |
+-----+-----+-----+-----+-----+-----+-----+------+------+
| 0.0 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 7.0 | 11.0 | 47.0 |
+-----+-----+-----+-----+-----+-----+-----+------+------+

In [32]:
# lets check which songs are having largest and smallest length
top_title_length = sdata.topk('title_length')
print top_title_length
+--------------------+-------------------------------+
|      song_id       |             title             |
+--------------------+-------------------------------+
| SOAALOO12AC468C4ED | Resolution Island Suite I)... |
| SOSEWOR12AB018BDF3 | Guayacan Mix: Amor Traicio... |
| SOEBPPZ12AB0183730 | Son Of Scheherazade: Pt.1-... |
| SODRVPW12A8C13DDCB | And by our own hand did ev... |
| SODEYRT12A8AE47972 | Any Place I Hang My Hat is... |
| SOEICJI12AC3DFAAD4 | Throw Away Comedy Medley: ... |
| SORHPYP12AB017C661 | Happenings' Medley: Oh! Th... |
| SOKNNWV12AB017B473 | If I Had My Way/Irish Rose... |
| SOWKFQF12AAA8C85E4 | Manitoba Ne Répond Plus (C... |
| SOGFUVA12A58A773AB | They Call Me Rock 'N' Roll... |
+--------------------+-------------------------------+
+-------------------------------+--------------------------+------+-----------+
|            release            |       artist_name        | year | my_rating |
+-------------------------------+--------------------------+------+-----------+
|      Allegory Of Hearing      |      Roy Montgomery      | None |    0.0    |
|        Como en un baile       |         Guayacan         | None |    0.0    |
|        British Tour '76       |       Renaissance        | None |    0.0    |
| Every Red Heart Shines Tow... |       Red Sparowes       | 2006 |    0.0    |
| Too Marvelous For Words - ... |       Lee Lessack        | None |    0.0    |
|       Live At The Sands       |       Dean Martin        | None |    0.0    |
| The Cat & The Fiddle - 66 ... | The Mother Goose Singers | None |    0.0    |
| 1999 International Barbers... |        Swing City        | None |    0.0    |
|    Manitoba Ne Répond Plus    |      Gérard Manset       | 2008 |    0.0    |
|     Motor City Connection     |   Brownsville Station    | 1975 |    0.0    |
+-------------------------------+--------------------------+------+-----------+
+--------------+---------------+------+------+---------+
| title_length | how_old_was_i | col1 | col2 | no_of_a |
+--------------+---------------+------+------+---------+
|      47      |      None     |  47  |  0   |    10   |
|      46      |      None     |  46  |  0   |    22   |
|      46      |      None     |  46  |  0   |    17   |
|      45      |       17      |  45  |  0   |    16   |
|      44      |      None     |  44  |  0   |    14   |
|      44      |      None     |  44  |  0   |    14   |
|      40      |      None     |  40  |  0   |    6    |
|      40      |      None     |  40  |  0   |    21   |
|      39      |       19      |  39  |  0   |    21   |
|      39      |      -14      |  39  |  0   |    9    |
+--------------+---------------+------+------+---------+
[10 rows x 11 columns]

In [33]:
# what about lowest
lowest_title_lenght = sdata.topk('title_length', reverse=True)
print lowest_title_lenght
+--------------------+-------+-------------------------------+----------------+
|      song_id       | title |            release            |  artist_name   |
+--------------------+-------+-------------------------------+----------------+
| SOVICLT12A58A7C4D0 |       |        Of Flies And Men       |     Milton     |
| SOAGRAA12AB018D567 |       |          Puce de luxe         | Sébastien Roch |
| SOAUTVB12AB018AFF0 |       |          Puce de luxe         | Sébastien Roch |
| SOMPGYE12AB018AFE0 |       |          Puce de luxe         | Sébastien Roch |
| SOVHAZH12AB018D5B3 |       |          Puce de luxe         | Sébastien Roch |
| SOECGBQ12AB018D582 |       |          Puce de luxe         | Sébastien Roch |
| SOFDILP12AC960A4EF |       | Recupera tus Clásicos - Fobia |     Fobia      |
| SOSFNKO12AB018D5BB |       |          Puce de luxe         | Sébastien Roch |
| SOZDBDL12AB018AFFF |       |          Puce de luxe         | Sébastien Roch |
| SOQUGMS12AB018B01D |       |          Puce de luxe         | Sébastien Roch |
+--------------------+-------+-------------------------------+----------------+
+------+-----------+--------------+---------------+------+------+---------+
| year | my_rating | title_length | how_old_was_i | col1 | col2 | no_of_a |
+------+-----------+--------------+---------------+------+------+---------+
| None |    0.0    |      0       |      None     |  0   |  0   |    0    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    3    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
| None |    0.0    |      0       |      None     |  0   |  0   |    1    |
+------+-----------+--------------+---------------+------+------+---------+
[10 rows x 11 columns]

In [34]:
# lowest 15
# what about lowest
lowest_title_lenght = sdata.topk('title_length', reverse=True, k =15)
print lowest_title_lenght
+--------------------+-------+---------------------+----------------+------+-----------+
|      song_id       | title |       release       |  artist_name   | year | my_rating |
+--------------------+-------+---------------------+----------------+------+-----------+
| SOVICLT12A58A7C4D0 |       |   Of Flies And Men  |     Milton     | None |    0.0    |
| SOAGRAA12AB018D567 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOOAFJX12AB018A028 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOLDTFD12AB018AFE6 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOAUTVB12AB018AFF0 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOMPGYE12AB018AFE0 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOVHAZH12AB018D5B3 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOECGBQ12AB018D582 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOTJWHR12AB018D5A0 |       |     Puce de luxe    | Sébastien Roch | None |    0.0    |
| SOBDDHR12AB01888A6 |       | Grand Glam Outtakes |     Remute     | None |    0.0    |
+--------------------+-------+---------------------+----------------+------+-----------+
+--------------+---------------+------+------+---------+
| title_length | how_old_was_i | col1 | col2 | no_of_a |
+--------------+---------------+------+------+---------+
|      0       |      None     |  0   |  0   |    0    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    1    |
|      0       |      None     |  0   |  0   |    3    |
+--------------+---------------+------+------+---------+
[15 rows x 11 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [35]:
before_i_was_born = sdata['how_old_was_i'] < 0
before_i_was_born.all(), before_i_was_born.any()
Out[35]:
(False, True)
In [36]:
# get total songs in an album and display the top album by no
sdata.groupby(['artist_name','release'], {'no_of_songs_in_album':gl.aggregate.COUNT} ).topk('no_of_songs_in_album')
Out[36]:
artist_name release no_of_songs_in_album
Fanny First Time In A Long
Time: The Reprise ...
85
Bernard Herrmann The Twilight Zone 81
Spanky & Our Gang The Complete Mercury
Recordings ...
75
The Smashing Pumpkins Rarities & B-Sides 72
Big Star Keep An Eye On The Sky 71
Jacques Dutronc Intégrale Les Cactus 69
The Stooges 1970: The Complete Fun
House Sessions ...
67
Lull Moments 64
Willie Clancy Willie Clancy The Gold
Ring ...
61
Jack Dangers Forbidden Planet Explored
/ Sci-Fi Sound Effects ...
60
[10 rows x 3 columns]
In [37]:
# this will download the 118 MB file
#usage_data = gl.SFrame.read_csv("https://static.turi.com/datasets/millionsong/10000.txt", header=False, delimiter='\t', column_type_hints={'X3':int})
#usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
In [38]:
# Read the data
usage_data = gl.SFrame.read_csv("C:/learn/ML/ML00caseStudy/week01Intro/10000.txt", header=False, delimiter="\t", column_type_hints={'X3':int})
usage_data.rename({'X1':'user_id', 'X2':'song_id', 'X3':'listen_count'})
Finished parsing file C:\learn\ML\ML00caseStudy\week01Intro\10000.txt
Parsing completed. Parsed 100 lines in 3.04661 secs.
Read 844838 lines. Lines per second: 265452
Finished parsing file C:\learn\ML\ML00caseStudy\week01Intro\10000.txt
Parsing completed. Parsed 2000000 lines in 4.35327 secs.
Out[38]:
user_id song_id listen_count
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOAKIMP12A8C130995 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOBBMDR12A8C13253B 2
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOBXHDL12A81C204C0 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOBYHAJ12A6701BF1D 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SODACBL12A8C13C273 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SODDNQT12A6D4F5F7E 5
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SODXRTY12AB0180F3B 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOFGUAY12AB017B0A8 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOFRQTD12A81C233C0 1
b80344d063b5ccb3212f76538
f3d9e43d87dca9e ...
SOHQWYZ12A6D4FA701 1
[2000000 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [39]:
# saving this data frame
usage_data.save('usage_data')
# loading  the data
usage_data = gl.load_sframe('usage_data')
In [40]:
# find out the unique users
print len(usage_data['user_id'].unique())
76353
In [41]:
# let's create two datasets which we can join
ds1 = sdata[((sdata['artist_name'] == 'Relient K')
                           | (sdata['artist_name'] == 'Streetlight Manifesto'))
                          & (sdata['how_old_was_i'] >= 14) & (sdata['how_old_was_i'] <= 18)]
ds1
Out[41]:
song_id title release artist_name year my_rating
SOQNUHJ12A6D4F9E19 Let It All Out (Album
Version) ...
MMHMM Relient K 2004 0.0
SOADZOM12A8C13E446 The Blonde Lead The Blind
(Album Version) ...
Somewhere In The Between Streetlight Manifesto 2007 0.0
SOKUTMT12A58A77CFB Give Until There's
Nothing Left ...
Five Score And Seven
Years Ago ...
Relient K 2007 0.0
SOIONAH12A58A76FD1 We Are The Few (Album
Version) ...
Everything Goes Numb Streetlight Manifesto 2003 0.0
SOUUYHK12A6D4F9E16 I So Hate Consequences
(Album Version) ...
MMHMM Relient K 2004 0.0
SOCBUJT12A6D4F9E1A Who I Am Hates Who I've
Been (mmhmm Album ...
MMHMM Relient K 2004 0.0
SOBUHJR12A6D4FDC7C Here's To Life (Album
Version) ...
Everything Goes Numb Streetlight Manifesto 2003 0.0
SONKBWG12A6D4FB91D Giving Up_ Giving In (LP
Version) ...
Keasbey Nights Streetlight Manifesto 2006 0.0
SOSFAVU12A6D4FDC6A Everything Went Numb
(Album Version) ...
Everything Goes Numb Streetlight Manifesto 2003 0.0
SORYDMW12A6D4FB923 This One Goes Out To....
(LP Version) ...
Keasbey Nights Streetlight Manifesto 2006 0.0
title_length how_old_was_i col1 col2 no_of_a
6 15 6 0 0
7 18 7 0 2
5 18 5 0 1
6 14 6 0 1
6 15 6 0 1
10 15 10 0 1
5 14 5 0 1
6 17 6 0 2
5 14 5 0 1
7 17 7 0 2
[? rows x 11 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.
In [42]:
# Let's join ds1 with the 10000.txt dataset usage_data
dsjoin = ds1.join(usage_data, 'song_id')
dsjoin
Out[42]:
song_id title release artist_name year my_rating
SOMMLRO12A6D4F9FD1 Must Have Done Something
Right ...
Must Have Done Something
Right ...
Relient K 2007 0.0
SOCLCYG12A6D4FDC71 Point/Counterpoint (Album
Version) ...
Everything Goes Numb Streetlight Manifesto 2003 0.0
SOINPKF12A6D4FDC75 A Better Place_ A Better
Time (Album Version) ...
Everything Goes Numb Streetlight Manifesto 2003 0.0
SOMMLRO12A6D4F9FD1 Must Have Done Something
Right ...
Must Have Done Something
Right ...
Relient K 2007 0.0
SOMMLRO12A6D4F9FD1 Must Have Done Something
Right ...
Must Have Done Something
Right ...
Relient K 2007 0.0
SOMMLRO12A6D4F9FD1 Must Have Done Something
Right ...
Must Have Done Something
Right ...
Relient K 2007 0.0
SOPSQOS12A6D4F9E15 High Of 75 (Album
Version) ...
MMHMM Relient K 2004 0.0
SOSTGAF12A58A7B18B The Best Thing Five Score And Seven
Years Ago ...
Relient K 2007 0.0
SOUUYHK12A6D4F9E16 I So Hate Consequences
(Album Version) ...
MMHMM Relient K 2004 0.0
SOMMLRO12A6D4F9FD1 Must Have Done Something
Right ...
Must Have Done Something
Right ...
Relient K 2007 0.0
title_length how_old_was_i col1 col2 no_of_a user_id listen_count
5 18 5 0 2 3f9ed694a79835c921ef6d94a
cd28f876c1d901e ...
3
3 14 3 0 1 956044d724390e40c8511b49e
5bf6bc28071de3a ...
1
8 14 8 0 2 956044d724390e40c8511b49e
5bf6bc28071de3a ...
1
5 18 5 0 2 354cfdb566f543bb5b810a4d8
959d974a30797fd ...
1
5 18 5 0 2 e4d4e50c99ed5b3dfe740fa1c
cbe6be41eeb4f35 ...
14
5 18 5 0 2 ed0f4979ffddc0010244638a3
b956510624dc6a9 ...
3
5 15 5 0 0 ed0f4979ffddc0010244638a3
b956510624dc6a9 ...
2
3 18 3 0 1 ed0f4979ffddc0010244638a3
b956510624dc6a9 ...
1
6 15 6 0 1 ed0f4979ffddc0010244638a3
b956510624dc6a9 ...
1
5 18 5 0 2 0e741cfe121bb619177be8d8a
135a2d3692d9c90 ...
4
[571 rows x 13 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [43]:
# total row in ds1 and dsjoin datasets
len(ds1), len(dsjoin)
Out[43]:
(64, 571)
In [44]:
len(ds1['song_id'].unique()), len(dsjoin['song_id'].unique()), len(ds1['song_id']), len(dsjoin['song_id'])
Out[44]:
(64, 7, 64, 571)
In [45]:
# find out most popular songs when I was between 14 n 18
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count':gl.aggregate.SUM('listen_count'), 
                                             'num_unique_users':gl.aggregate.COUNT('user_id')})
most_popular
Out[45]:
song_id num_unique_users total_listen_count
SOCVOVH12A6D4FB912 107 555
SOPSQOS12A6D4F9E15 79 207
SOMMLRO12A6D4F9FD1 104 331
SOUUYHK12A6D4F9E16 79 171
SOINPKF12A6D4FDC75 63 196
SOSTGAF12A58A7B18B 72 168
SOCLCYG12A6D4FDC71 67 188
[7 rows x 3 columns]
In [46]:
# to get artist name we have to join this data
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[46]:
song_id num_unique_users total_listen_count title release
SOCVOVH12A6D4FB912 107 555 Keasbey Nights (LP
Version) ...
Keasbey Nights
SOMMLRO12A6D4F9FD1 104 331 Must Have Done Something
Right ...
Must Have Done Something
Right ...
SOPSQOS12A6D4F9E15 79 207 High Of 75 (Album
Version) ...
MMHMM
SOINPKF12A6D4FDC75 63 196 A Better Place_ A Better
Time (Album Version) ...
Everything Goes Numb
SOCLCYG12A6D4FDC71 67 188 Point/Counterpoint (Album
Version) ...
Everything Goes Numb
SOUUYHK12A6D4F9E16 79 171 I So Hate Consequences
(Album Version) ...
MMHMM
SOSTGAF12A58A7B18B 72 168 The Best Thing Five Score And Seven
Years Ago ...
artist_name year my_rating title_length how_old_was_i col1 col2 no_of_a
Streetlight Manifesto 2006 0.0 4 17 4 0 3
Relient K 2007 0.0 5 18 5 0 2
Relient K 2004 0.0 5 15 5 0 0
Streetlight Manifesto 2003 0.0 8 14 8 0 2
Streetlight Manifesto 2003 0.0 3 14 3 0 1
Relient K 2004 0.0 6 15 6 0 1
Relient K 2007 0.0 3 18 3 0 1
[7 rows x 13 columns]
In [47]:
# let's append a row with max liste count and check whether it comes in above result or not
me = gl.SFrame({'user_id':['evan'],'song_id':['SOSFAVU12A6D4FDC6A'],'listen_count':[4000]})
# adding this data to usage data
usage_data = usage_data.append(me)
In [48]:
# repeating the above join n group by statement
dsjoin = ds1.join(usage_data, 'song_id')
most_popular = dsjoin.groupby(['song_id'], {'total_listen_count':gl.aggregate.SUM('listen_count'), 
                                             'num_unique_users':gl.aggregate.COUNT('user_id')})
most_popular.join(sdata, 'song_id').topk('total_listen_count',k=20)
Out[48]:
song_id num_unique_users total_listen_count title release
SOSFAVU12A6D4FDC6A 1 4000 Everything Went Numb
(Album Version) ...
Everything Goes Numb
SOCVOVH12A6D4FB912 107 555 Keasbey Nights (LP
Version) ...
Keasbey Nights
SOMMLRO12A6D4F9FD1 104 331 Must Have Done Something
Right ...
Must Have Done Something
Right ...
SOPSQOS12A6D4F9E15 79 207 High Of 75 (Album
Version) ...
MMHMM
SOINPKF12A6D4FDC75 63 196 A Better Place_ A Better
Time (Album Version) ...
Everything Goes Numb
SOCLCYG12A6D4FDC71 67 188 Point/Counterpoint (Album
Version) ...
Everything Goes Numb
SOUUYHK12A6D4F9E16 79 171 I So Hate Consequences
(Album Version) ...
MMHMM
SOSTGAF12A58A7B18B 72 168 The Best Thing Five Score And Seven
Years Ago ...
artist_name year my_rating title_length how_old_was_i col1 col2 no_of_a
Streetlight Manifesto 2003 0.0 5 14 5 0 1
Streetlight Manifesto 2006 0.0 4 17 4 0 3
Relient K 2007 0.0 5 18 5 0 2
Relient K 2004 0.0 5 15 5 0 0
Streetlight Manifesto 2003 0.0 8 14 8 0 2
Streetlight Manifesto 2003 0.0 3 14 3 0 1
Relient K 2004 0.0 6 15 6 0 1
Relient K 2007 0.0 3 18 3 0 1
[8 rows x 13 columns]

Splitting and Sampling

Lets check, how we can randomly split the data for test

In [49]:
# Randomly split data rows into two subsets
first_set, second_set = sdata.random_split(0.8, seed = 1)
first_set.num_rows(), second_set.num_rows()
Out[49]:
(800135, 199865)

If you want to split on a predicate though, you'll have to do that manually.

In [50]:
songs_before = sdata[sdata['how_old_was_i'] < 0]
songs_after = sdata[sdata['how_old_was_i'] >= 0]
songs_before.num_rows(), songs_after.num_rows()
Out[50]:
(75581, 439995)
In [51]:
# generating sample data
sample = sdata.sample(0.4)
sample.num_rows()
Out[51]:
399454

SArray

In [52]:
arr = gl.SArray([1,2,3])
arr
Out[52]:
dtype: int
Rows: 3
[1L, 2L, 3L]
In [53]:
arr2 = 2*arr
arr2
Out[53]:
dtype: int
Rows: 3
[2L, 4L, 6L]
In [54]:
# add
arr + arr2
Out[54]:
dtype: int
Rows: 3
[3L, 6L, 9L]
In [55]:
# multiply
arr * arr2
Out[55]:
dtype: int
Rows: 3
[2L, 8L, 18L]
In [56]:
# divide
arr2 / arr
Out[56]:
dtype: float
Rows: 3
[2.0, 2.0, 2.0]
In [57]:
# iterating with SFrame
for i in sdata:
    if i['title_length'] >= 45:
        print "Whoa that's long!"
Whoa that's long!
Whoa that's long!
Whoa that's long!
Whoa that's long!

Using apply function on SFrame

In [58]:
sdata['title_artist_length'] = sdata['title','artist_name'].apply(lambda row: sum([len(col) for col in row.values()]))
sdata
Out[58]:
song_id title release artist_name year my_rating
SOQMMHC12AB0180CB8 Silent Night Monster Ballads X-Mas Faster Pussy cat 2003 0.0
SOVFVAK12A8C1350D9 Tanssi vaan Karkuteillä Karkkiautomaatti 1995 0.0
SOGTUKN12AB017F4F1 No One Could Ever Butter Hudson Mohawke 2006 0.0
SOBNYVR12A8C13558C Si Vos Querés De Culo Yerba Brava 2003 0.0
SOHSBXH12A8C13B0DF Tangle Of Aspens Rene Ablaze Presents
Winter Sessions ...
Der Mystic None 0.0
SOZVAPQ12A8C13B63C Symphony No. 1 G minor
"Sinfonie ...
Berwald: Symphonies Nos.
1/2/3/4 ...
David Montgomery None 0.0
SOQVRHI12A6D4FB2D7 We Have Got Love Strictly The Best Vol. 34 Sasha / Turbulence None 0.0
SOEYRFT12AB018936C 2 Da Beat Ch'yall Da Bomb Kris Kross 1993 0.0
SOPMIYT12A6D4F851E Goodbye Danny Boy Joseph Locke None 0.0
SOJCFMH12A8C13B0C2 Mama_ mama can't you see
? ...
March to cadence with the
US marines ...
The Sun Harbor's Chorus-
Documentary Recordings ...
None 0.0
title_length how_old_was_i col1 col2 no_of_a title_artist_length
2 14 2 0 5 28
2 6 2 0 8 27
4 17 4 0 1 31
3 14 3 0 3 25
3 None 3 0 2 26
9 None 9 0 3 78
4 None 4 0 3 34
4 4 4 0 4 27
1 None 1 0 1 19
6 None 6 0 10 72
[1000000 rows x 12 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Saving Our Work

In [59]:
# save as csv
sdata.save('sdata_new.csv', format='csv')
In [60]:
sdata.save('sdata_new')
In [ ]: