In [1]:

from pyspark.sql import SparkSession

In [2]:

spark = SparkSession.builder.appName('CF').getOrCreate()

In [3]:

from pyspark.ml.recommendation import ALS

In [4]:

from pyspark.ml.evaluation import RegressionEvaluator

In [5]:

# data source:
# https://github.com/yennanliu/movie_recommendation
# https://grouplens.org/datasets/movielens/
# https://github.com/khanhnamle1994/movielens

In [6]:

data = spark.read.csv("movie_ratings.csv", inferSchema=True, header=True)

In [7]:

data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

In [8]:

data.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows

In [9]:

data.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|            100004|            100004|            100004|              100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|1.1296390869392424E9|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|1.9168582602710962E8|
|    min|                 1|                 1|               0.5|           789652009|
|    max|               671|            163949|               5.0|          1476640644|
+-------+------------------+------------------+------------------+--------------------+

In [10]:

# train, test split 
training, test = data.randomSplit([0.8, 0.2])

In [11]:

# set up the model and super-parameters 
als = ALS(maxIter=5, 
          regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol= 'rating')

In [12]:

# train the model 
model  = als.fit(training)

In [13]:

# do the prediction 
predicitons = model.transform(test)

In [14]:

# show the prediciton
predicitons.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   452|    463|   2.0| 976424451| 2.4552588|
|    85|    471|   3.0| 837512312| 3.9172719|
|   588|    471|   3.0| 842298526| 4.7625732|
|   460|    471|   5.0|1072836030| 3.8125675|
|   274|    471|   5.0|1074104142| 3.6691563|
|   292|    471|   3.5|1140049920| 4.0752306|
|    15|    471|   3.0|1166586067|  2.311449|
|    73|    471|   4.0|1296460183| 3.3499885|
|   354|    471|   5.0| 846062674|  4.579715|
|   529|    471|   4.0| 965497394| 3.1544423|
|   184|    471|   5.0| 833525100| 4.5493975|
|   311|    471|   0.5|1062015819| 2.6232295|
|   521|    471|   3.5|1370072127|  4.019308|
|   547|    496|   3.0| 974778561| 2.5938766|
|   463|   1088|   3.0|1050499697| 3.0081568|
|    52|   1088|   4.0|1231766626|  4.288722|
|   500|   1088|   4.0|1229098924| 2.4964237|
|   387|   1088|   4.0| 974790964| 2.1745355|
|   514|   1088|   3.0| 853896732| 3.0606182|
|   160|   1088|   4.0| 974258881| 4.6870093|
+------+-------+------+----------+----------+
only showing top 20 rows

In [15]:

# evaluate the model  (RegressionEvaluator : rmse)
evaluator = RegressionEvaluator(
            metricName = 'rmse',
            labelCol= 'rating',
            predictionCol = 'prediction')

In [16]:

rmse = evaluator.evaluate(predicitons)

In [17]:

print ('RMSE')
print (rmse)

RMSE
nan

In [18]:

test.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   1129|   2.0|1260759185|
|     1|   1287|   2.0|1260759187|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   2294|   2.0|1260759108|
|     2|     10|   4.0| 835355493|
|     2|    161|   3.0| 835355493|
|     2|    186|   3.0| 835355664|
|     2|    208|   3.0| 835355511|
|     2|    292|   3.0| 835355492|
|     2|    300|   3.0| 835355532|
|     2|    339|   3.0| 835355492|
|     2|    367|   3.0| 835355619|
|     2|    457|   3.0| 835355511|
|     2|    468|   4.0| 835355790|
|     2|    474|   2.0| 835355828|
|     2|    515|   4.0| 835355817|
|     2|    550|   3.0| 835356109|
|     2|    587|   3.0| 835355779|
|     3|    736|   3.5|1298932787|
+------+-------+------+----------+
only showing top 20 rows

In [19]:

# predict single user's taste
single_user = test.filter(test['userId']== 11).select(['userId', 'movieId'])

In [20]:

# movies a single users has watched
single_user.show()

+------+-------+
|userId|movieId|
+------+-------+
|    11|     70|
|    11|   1027|
|    11|   1201|
|    11|   1408|
|    11|   2042|
|    11|   3424|
|    11|  71211|
|    11|  77455|
|    11|  81158|
|    11|  81562|
|    11|  96079|
|    11|  96861|
+------+-------+

In [21]:

recommendations = model.transform(single_user)

In [22]:

# recommendations output 
recommendations.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    11|   1201|  5.064004|
|    11|  71211|  2.056238|
|    11|   2042| 2.8747272|
|    11|  96079|  5.072752|
|    11|  81562| 5.3010736|
|    11|  81158| 1.9231318|
|    11|  96861| 1.9171791|
|    11|     70|  3.744793|
|    11|   1027| 4.1614656|
|    11|   1408| 4.0286865|
|    11|   3424| 3.1237607|
|    11|  77455| 4.8629804|
+------+-------+----------+

In [23]:

recommendations.orderBy('prediction', ascending=False).show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    11|  81562| 5.3010736|
|    11|  96079|  5.072752|
|    11|   1201|  5.064004|
|    11|  77455| 4.8629804|
|    11|   1027| 4.1614656|
|    11|   1408| 4.0286865|
|    11|     70|  3.744793|
|    11|   3424| 3.1237607|
|    11|   2042| 2.8747272|
|    11|  71211|  2.056238|
|    11|  81158| 1.9231318|
|    11|  96861| 1.9171791|
+------+-------+----------+

In [25]:

# end of course (CF) :  15.56 
# next : 16 : NLP