import graphlab as gl
# Direct GraphLab Canvas output (SFrame / SGraph views) to the IPython notebook.
canvas_target = 'ipynb'
gl.canvas.set_target(canvas_target)
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1525500461.log
This non-commercial license of GraphLab Create for academic use is assigned to wangchengjun@nju.edu.cn and will expire on March 14, 2019.
# Load the song-usage table from a local copy of the data set.
# Alternative remote source (kept for reference):
#   http://s3.amazonaws.com/dato-datasets/millionsong/10000.txt
train_file = '/Users/datalab/bigdata/cjc/millionsong/song_usage_10000.txt'

# The file is parsed as tab-separated with no header row, so the columns come
# back with the default names X1..X3 and are renamed below.
sf = gl.SFrame.read_csv(train_file, delimiter='\t', header=False, verbose=False)
column_names = {'X1': 'user_id', 'X2': 'music_id', 'X3': 'rating'}
# Left as the final bare expression so the notebook displays the result.
sf.rename(column_names)
user_id | music_id | rating |
---|---|---|
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOAKIMP12A8C130995 | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOBBMDR12A8C13253B | 2 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOBXHDL12A81C204C0 | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOBYHAJ12A6701BF1D | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SODACBL12A8C13C273 | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SODDNQT12A6D4F5F7E | 5 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SODXRTY12AB0180F3B | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOFGUAY12AB017B0A8 | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOFRQTD12A81C233C0 | 1 |
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ... |
SOHQWYZ12A6D4FA701 | 1 |
# Split the observations into train/test with a 0.8 fraction; the seed is
# fixed so the split is repeatable.
train_set, test_set = sf.random_split(0.8, seed=1)

# Baseline recommender built on the training split.
popularity_model = gl.popularity_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
)
Recsys training: model = popularity
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 1.2441s
1599753 observations to process; with 10000 unique items.
# Item-similarity recommender trained on the same split, using cosine
# similarity over the 'rating' column.
item_sim_options = {
    'user_id': 'user_id',
    'item_id': 'music_id',
    'target': 'rating',
    'similarity_type': 'cosine',
}
item_sim_model = gl.item_similarity_recommender.create(train_set, **item_sim_options)
Recsys training: model = item_similarity
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 1.18984s
Training model from provided data.
Gathering per-item and per-user statistics.
+--------------------------------+------------+
| Elapsed Time (Item Statistics) | % Complete |
+--------------------------------+------------+
| 1.549ms | 1.25 |
| 46.313ms | 100 |
+--------------------------------+------------+
Setting up lookup tables.
Processing data in one pass using dense lookup tables.
+-------------------------------------+------------------+-----------------+
| Elapsed Time (Constructing Lookups) | Total % Complete | Items Processed |
+-------------------------------------+------------------+-----------------+
| 264.127ms | 0 | 0 |
| 994.743ms | 100 | 10000 |
+-------------------------------------+------------------+-----------------+
Finalizing lookup tables.
Generating candidate set for working with new users.
Finished training in 1.06117s
# Factorization-based recommender on the training split. Only the column names
# are specified; every other option is left at the library default.
# NOTE(review): the variable name says "machine", but the call used here is
# factorization_recommender.create -- the name is kept because later cells
# reference it.
factorization_machine_model = gl.recommender.factorization_recommender.create(
    train_set,
    user_id='user_id',
    item_id='music_id',
    target='rating',
)
Recsys training: model = factorization_recommender
Preparing data set.
Data has 1599753 observations with 76085 users and 10000 items.
Data prepared in: 1.21379s
Training factorization_recommender for recommendations.
+--------------------------------+--------------------------------------------------+----------+
| Parameter | Description | Value |
+--------------------------------+--------------------------------------------------+----------+
| num_factors | Factor Dimension | 8 |
| regularization | L2 Regularization on Factors | 1e-08 |
| solver | Solver used for training | sgd |
| linear_regularization | L2 Regularization on Linear Coefficients | 1e-10 |
| max_iterations | Maximum Number of Iterations | 50 |
+--------------------------------+--------------------------------------------------+----------+
Optimizing model using SGD; tuning step size.
Using 199969 / 1599753 points for tuning the step size.
+---------+-------------------+------------------------------------------+
| Attempt | Initial Step Size | Estimated Objective Value |
+---------+-------------------+------------------------------------------+
| 0 | 25 | No Decrease (227.564 >= 37.2517) |
| 1 | 6.25 | No Decrease (218.774 >= 37.2517) |
| 2 | 1.5625 | No Decrease (189.223 >= 37.2517) |
| 3 | 0.390625 | No Decrease (84.7797 >= 37.2517) |
| 4 | 0.0976562 | 12.0849 |
| 5 | 0.0488281 | 8.26182 |
| 6 | 0.0244141 | 21.5168 |
+---------+-------------------+------------------------------------------+
| Final | 0.0488281 | 8.26182 |
+---------+-------------------+------------------------------------------+
Starting Optimization.
+---------+--------------+-------------------+-----------------------+-------------+
| Iter. | Elapsed Time | Approx. Objective | Approx. Training RMSE | Step Size |
+---------+--------------+-------------------+-----------------------+-------------+
| Initial | 72us | 43.795 | 6.61778 | |
+---------+--------------+-------------------+-----------------------+-------------+
| 1 | 164.211ms | 43.5009 | 6.59512 | 0.0488281 |
| 2 | 310.478ms | 40.8579 | 6.39166 | 0.0290334 |
| 3 | 428.164ms | 37.9236 | 6.15785 | 0.0214205 |
| 4 | 543.402ms | 35.1185 | 5.92569 | 0.0172633 |
| 5 | 661.456ms | 32.6788 | 5.7161 | 0.014603 |
| 6 | 784.067ms | 30.599 | 5.53115 | 0.0127367 |
| 10 | 1.25s | 24.7663 | 4.97592 | 0.008683 |
| 11 | 1.38s | 23.5067 | 4.84768 | 0.00808399 |
| 20 | 2.49s | 17.6493 | 4.20017 | 0.00516295 |
| 30 | 3.65s | 14.3453 | 3.78639 | 0.00380916 |
| 40 | 4.88s | 12.6728 | 3.55862 | 0.00306991 |
| 50 | 6.13s | 11.2253 | 3.34901 | 0.00218366 |
+---------+--------------+-------------------+-----------------------+-------------+
Optimization Complete: Maximum number of passes through the data reached.
Computing final objective value and training RMSE.
Final objective value: 9.81513
Final training RMSE: 3.1314
# Number of observations in the training split (displayed by the notebook).
train_set.num_rows()
1599753
# Evaluate all three recommenders on the held-out split, using a 0.5 sample of
# users. The models are reported in list order.
# NOTE(review): skip_set=train_set presumably keeps already-seen training
# interactions out of the recommendations being scored -- confirm against the
# compare_models / evaluate documentation.
candidate_models = [popularity_model, item_sim_model, factorization_machine_model]
result = gl.recommender.util.compare_models(
    test_set,
    candidate_models,
    user_sample=0.5,
    skip_set=train_set,
)
compare_models: using 34355 users to estimate model performance PROGRESS: Evaluate model M0
recommendations finished on 1000/34355 queries. users per second: 18058.4
recommendations finished on 2000/34355 queries. users per second: 21323.8
recommendations finished on 3000/34355 queries. users per second: 22068.6
recommendations finished on 4000/34355 queries. users per second: 22236.7
recommendations finished on 5000/34355 queries. users per second: 22389.4
recommendations finished on 6000/34355 queries. users per second: 22147
recommendations finished on 7000/34355 queries. users per second: 22349.2
recommendations finished on 8000/34355 queries. users per second: 22358.7
recommendations finished on 9000/34355 queries. users per second: 22426.9
recommendations finished on 10000/34355 queries. users per second: 22486.8
recommendations finished on 11000/34355 queries. users per second: 22207.4
recommendations finished on 12000/34355 queries. users per second: 22163.2
recommendations finished on 13000/34355 queries. users per second: 22060.5
recommendations finished on 14000/34355 queries. users per second: 22038
recommendations finished on 15000/34355 queries. users per second: 22088.7
recommendations finished on 16000/34355 queries. users per second: 22082.8
recommendations finished on 17000/34355 queries. users per second: 22162.7
recommendations finished on 18000/34355 queries. users per second: 22229.9
recommendations finished on 19000/34355 queries. users per second: 22293.3
recommendations finished on 20000/34355 queries. users per second: 22182.1
recommendations finished on 21000/34355 queries. users per second: 22132.2
recommendations finished on 22000/34355 queries. users per second: 22136.5
recommendations finished on 23000/34355 queries. users per second: 21993.7
recommendations finished on 24000/34355 queries. users per second: 21916.9
recommendations finished on 25000/34355 queries. users per second: 21920.7
recommendations finished on 26000/34355 queries. users per second: 21987.5
recommendations finished on 27000/34355 queries. users per second: 21954.6
recommendations finished on 28000/34355 queries. users per second: 22011.1
recommendations finished on 29000/34355 queries. users per second: 22063.8
recommendations finished on 30000/34355 queries. users per second: 22121.1
recommendations finished on 31000/34355 queries. users per second: 22215.6
recommendations finished on 32000/34355 queries. users per second: 22137.3
recommendations finished on 33000/34355 queries. users per second: 22185.3
recommendations finished on 34000/34355 queries. users per second: 22219.4
Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 1 | 0.000320186290205 | 2.62069953527e-05 | | 2 | 0.000363848057051 | 0.000157978795732 | | 3 | 0.000368699364479 | 0.000246448831037 | | 4 | 0.000451171590744 | 0.00045461967667 | | 5 | 0.000518119633241 | 0.000646663196468 | | 6 | 0.000499684665017 | 0.000719981467646 | | 7 | 0.000478200303553 | 0.000800163591435 | | 8 | 0.000451171590744 | 0.000835628106468 | | 9 | 0.000501301767493 | 0.000996939435667 | | 10 | 0.000474457866395 | 0.00104312051081 | +--------+-------------------+-------------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 6.470445808584627) Per User RMSE (best) +-------------------------------+-------+------+ | user_id | count | rmse | +-------------------------------+-------+------+ | 6d61c9b3678aa6c015ea9fd502... | 1 | 0.0 | +-------------------------------+-------+------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 50996bbabb6f7857bf0c801943... | 2 | 647.013311924 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+-----------------+ | music_id | count | rmse | +--------------------+-------+-----------------+ | SOXDPFW12A81C2319B | 8 | 0.0735294117647 | +--------------------+-------+-----------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOUAGPQ12A8AE47B3A | 8 | 323.517367637 | +--------------------+-------+---------------+ [1 rows x 3 columns] PROGRESS: Evaluate model M1
recommendations finished on 1000/34355 queries. users per second: 17896.8
recommendations finished on 2000/34355 queries. users per second: 20141.8
recommendations finished on 3000/34355 queries. users per second: 21115
recommendations finished on 4000/34355 queries. users per second: 21548.9
recommendations finished on 5000/34355 queries. users per second: 22269.8
recommendations finished on 6000/34355 queries. users per second: 22353.6
recommendations finished on 7000/34355 queries. users per second: 22388.9
recommendations finished on 8000/34355 queries. users per second: 22353.4
recommendations finished on 9000/34355 queries. users per second: 22462.1
recommendations finished on 10000/34355 queries. users per second: 22408.8
recommendations finished on 11000/34355 queries. users per second: 21999.6
recommendations finished on 12000/34355 queries. users per second: 22154.4
recommendations finished on 13000/34355 queries. users per second: 22417.7
recommendations finished on 14000/34355 queries. users per second: 22434.6
recommendations finished on 15000/34355 queries. users per second: 22602.1
recommendations finished on 16000/34355 queries. users per second: 22675.6
recommendations finished on 17000/34355 queries. users per second: 22789.5
recommendations finished on 18000/34355 queries. users per second: 22858.7
recommendations finished on 19000/34355 queries. users per second: 22980.9
recommendations finished on 20000/34355 queries. users per second: 23016.8
recommendations finished on 21000/34355 queries. users per second: 23015.3
recommendations finished on 22000/34355 queries. users per second: 23089
recommendations finished on 23000/34355 queries. users per second: 23022.7
recommendations finished on 24000/34355 queries. users per second: 23052.6
recommendations finished on 25000/34355 queries. users per second: 22970.8
recommendations finished on 26000/34355 queries. users per second: 22781.5
recommendations finished on 27000/34355 queries. users per second: 22733.5
recommendations finished on 28000/34355 queries. users per second: 22789.1
recommendations finished on 29000/34355 queries. users per second: 22741.9
recommendations finished on 30000/34355 queries. users per second: 22669.3
recommendations finished on 31000/34355 queries. users per second: 22684
recommendations finished on 32000/34355 queries. users per second: 22677.7
recommendations finished on 33000/34355 queries. users per second: 22675.2
recommendations finished on 34000/34355 queries. users per second: 22616.2
Precision and recall summary statistics by cutoff +--------+-----------------+-----------------+ | cutoff | mean_precision | mean_recall | +--------+-----------------+-----------------+ | 1 | 0.0505894338524 | 0.0151970338993 | | 2 | 0.0616795226313 | 0.0336650217808 | | 3 | 0.0729927715519 | 0.0543725425713 | | 4 | 0.0751055159365 | 0.0704135431553 | | 5 | 0.0741726095183 | 0.0842718589105 | | 6 | 0.0725076408092 | 0.0963338861128 | | 7 | 0.0700792149198 | 0.106287905834 | | 8 | 0.0680978023577 | 0.115967909942 | | 9 | 0.0657740261 | 0.124363659149 | | 10 | 0.0636239266482 | 0.13230804769 | +--------+-----------------+-----------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 7.150718576843326) Per User RMSE (best) +-------------------------------+-------+-------------------+ | user_id | count | rmse | +-------------------------------+-------+-------------------+ | dad5cd4678a6f6df34932432bc... | 1 | 0.000917145184108 | +-------------------------------+-------+-------------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 50996bbabb6f7857bf0c801943... | 2 | 650.121367005 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOJUKCL12A6D4F7DF7 | 3 | 0.75550628309 | +--------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOUAGPQ12A8AE47B3A | 8 | 325.077941681 | +--------------------+-------+---------------+ [1 rows x 3 columns] PROGRESS: Evaluate model M2
recommendations finished on 1000/34355 queries. users per second: 16771.2
recommendations finished on 2000/34355 queries. users per second: 18825.8
recommendations finished on 3000/34355 queries. users per second: 19420.1
recommendations finished on 4000/34355 queries. users per second: 20088.8
recommendations finished on 5000/34355 queries. users per second: 20395.8
recommendations finished on 6000/34355 queries. users per second: 20666.2
recommendations finished on 7000/34355 queries. users per second: 20774.3
recommendations finished on 8000/34355 queries. users per second: 20775.1
recommendations finished on 9000/34355 queries. users per second: 20780.4
recommendations finished on 10000/34355 queries. users per second: 20950.3
recommendations finished on 11000/34355 queries. users per second: 20620.6
recommendations finished on 12000/34355 queries. users per second: 20654.8
recommendations finished on 13000/34355 queries. users per second: 20770.8
recommendations finished on 14000/34355 queries. users per second: 20732.1
recommendations finished on 15000/34355 queries. users per second: 20729.6
recommendations finished on 16000/34355 queries. users per second: 20752.5
recommendations finished on 17000/34355 queries. users per second: 20774.8
recommendations finished on 18000/34355 queries. users per second: 20801.7
recommendations finished on 19000/34355 queries. users per second: 20786.5
recommendations finished on 20000/34355 queries. users per second: 20808.1
recommendations finished on 21000/34355 queries. users per second: 20682
recommendations finished on 22000/34355 queries. users per second: 20684.4
recommendations finished on 23000/34355 queries. users per second: 20683.2
recommendations finished on 24000/34355 queries. users per second: 20687.9
recommendations finished on 25000/34355 queries. users per second: 20722.5
recommendations finished on 26000/34355 queries. users per second: 20771.3
recommendations finished on 27000/34355 queries. users per second: 20842.5
recommendations finished on 28000/34355 queries. users per second: 20878
recommendations finished on 29000/34355 queries. users per second: 20863.7
recommendations finished on 30000/34355 queries. users per second: 20824.5
recommendations finished on 31000/34355 queries. users per second: 20777.4
recommendations finished on 32000/34355 queries. users per second: 20721.2
recommendations finished on 33000/34355 queries. users per second: 20763
recommendations finished on 34000/34355 queries. users per second: 20792.3
Precision and recall summary statistics by cutoff +--------+-------------------+-------------------+ | cutoff | mean_precision | mean_recall | +--------+-------------------+-------------------+ | 1 | 0.000436617668462 | 8.84193897883e-05 | | 2 | 0.000392955901615 | 0.000146983613211 | | 3 | 0.000397807209043 | 0.000273528343889 | | 4 | 0.000400232862757 | 0.000328267207427 | | 5 | 0.000424974530636 | 0.00040585165237 | | 6 | 0.000460874205598 | 0.000574532683748 | | 7 | 0.00049483335759 | 0.000721448772082 | | 8 | 0.000531218163295 | 0.000887467388127 | | 9 | 0.00055628325167 | 0.00109888572439 | | 10 | 0.000593800029108 | 0.00130662212516 | +--------+-------------------+-------------------+ [10 rows x 3 columns] ('\nOverall RMSE: ', 8.111262615677196) Per User RMSE (best) +-------------------------------+-------+-------------------+ | user_id | count | rmse | +-------------------------------+-------+-------------------+ | 06aad545e9390a6332a7fee7f3... | 1 | 0.000536064571246 | +-------------------------------+-------+-------------------+ [1 rows x 3 columns] Per User RMSE (worst) +-------------------------------+-------+---------------+ | user_id | count | rmse | +-------------------------------+-------+---------------+ | 50996bbabb6f7857bf0c801943... | 2 | 689.967975368 | +-------------------------------+-------+---------------+ [1 rows x 3 columns] Per Item RMSE (best) +--------------------+-------+-----------------+ | music_id | count | rmse | +--------------------+-------+-----------------+ | SOJWSGL12A81C217B0 | 2 | 0.0503145302263 | +--------------------+-------+-----------------+ [1 rows x 3 columns] Per Item RMSE (worst) +--------------------+-------+---------------+ | music_id | count | rmse | +--------------------+-------+---------------+ | SOUAGPQ12A8AE47B3A | 8 | 345.207580713 | +--------------------+-------+---------------+ [1 rows x 3 columns]
# Produce the top-K item-similarity recommendations for 100 users taken from
# the distinct user ids in the full table.
K = 10
unique_user_ids = sf['user_id'].unique()
users = gl.SArray(unique_user_ids.head(100))
recs = item_sim_model.recommend(users=users, k=K)
# Left as the final bare expression so the notebook displays the first rows.
recs.head()
user_id | music_id | score | rank |
---|---|---|---|
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOXUQNR12AF72A69D6 | 3.02242265145 | 1 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOUFAZA12AC3DFAB20 | 1.33684277534 | 2 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOSFSTC12A8C141219 | 1.09198212624 | 3 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOVIWFP12A58A7D1BD | 1.04516386986 | 4 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOBMTQD12AB01833D0 | 1.02945168813 | 5 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOCMNRG12AB0189D3F | 0.975643793742 | 6 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOXOHUM12A67ADC826 | 0.950687328974 | 7 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOWBFVW12A6D4F612B | 0.909237066905 | 8 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOXFYTY127E9433E7D | 0.897727807363 | 9 |
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ... |
SOYBLYP12A58A79D32 | 0.897092819214 | 10 |