This notebook contains code used to load and pre-process the MovieLens 1M dataset, consisting of a collection of 1 million movie ratings by different users along with side information about them, and taking extra information about the movies from the tag genome dataset which is taken from the larger MovieLens 25M dataset.
This data is then used in a usage guide for building recommender systems with the cmfrec package.
The user side information is enhanced with an external dataset about US zip codes, US states, and US geographical regions, while the item information (tag genome) - a very high-dimensional dataset - is simplified by taking the first 50 principal components.
import numpy as np, pandas as pd, re
# Load the MovieLens 1M ratings triplets. The .dat files use a "::"
# separator, which requires pandas' python parsing engine.
ratings = pd.read_table(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=['UserId', 'ItemId', 'Rating', 'Timestamp'],
)
# The timestamp is not used downstream — keep only the (user, item, rating) triplet.
ratings = ratings[['UserId', 'ItemId', 'Rating']]
ratings.head()
UserId | ItemId | Rating | |
---|---|---|---|
0 | 1 | 1193 | 5 |
1 | 1 | 661 | 3 |
2 | 1 | 914 | 3 |
3 | 1 | 3408 | 4 |
4 | 1 | 2355 | 5 |
# Basic size statistics of the ratings table.
n_users = ratings["UserId"].nunique()
n_items = ratings["ItemId"].nunique()
n_ratings = ratings["Rating"].count()
print("Number of users: %d" % n_users)
print("Number of items: %d" % n_items)
print("Number of ratings: %d" % n_ratings)
Number of users: 6040 Number of items: 3706 Number of ratings: 1000209
# Movie catalog of the 1M dataset; titles are latin-1 encoded.
movie_titles = pd.read_table(
    'ml-1m/movies.dat',
    sep='::',
    engine='python',
    header=None,
    encoding='latin_1',
    names=['ItemId', 'title', 'genres'],
)
# Genres are not needed — keep only the ID/title mapping.
movie_titles = movie_titles.drop('genres', axis=1)
movie_titles.head()
ItemId | title | |
---|---|---|
0 | 1 | Toy Story (1995) |
1 | 2 | Jumanji (1995) |
2 | 3 | Grumpier Old Men (1995) |
3 | 4 | Waiting to Exhale (1995) |
4 | 5 | Father of the Bride Part II (1995) |
# Plain-dict lookup: MovieLens 1M item ID -> movie title.
movie_id_to_title = dict(zip(movie_titles["ItemId"], movie_titles["title"]))
# Link the 25M catalog's movieId to the 1M catalog's ItemId by joining on the
# title string — the only attribute the two catalogs share.
movies = pd.read_csv('ml-25m/movies.csv')[['movieId', 'title']]
movies = movies.merge(movie_titles)[['movieId', 'ItemId']]

# Tag genome: reshape long (movieId, tagId, relevance) rows into a wide
# table with one "tagN" column per tag.
tags = pd.read_csv('ml-25m/genome-scores.csv')
tags_wide = tags.pivot(index='movieId', columns='tagId', values='relevance')
tags_wide.columns = ["tag" + str(c) for c in tags_wide.columns]

# Attach the genome to the 1M item IDs; items absent from the genome are dropped.
item_side_info = movies.merge(tags_wide, how='inner', left_on='movieId', right_index=True)
item_side_info = item_side_info.drop('movieId', axis=1)
item_side_info.head()
ItemId | tag1 | tag2 | tag3 | tag4 | tag5 | tag6 | tag7 | tag8 | tag9 | ... | tag1119 | tag1120 | tag1121 | tag1122 | tag1123 | tag1124 | tag1125 | tag1126 | tag1127 | tag1128 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.02875 | 0.02375 | 0.06250 | 0.07575 | 0.14075 | 0.14675 | 0.06350 | 0.20375 | 0.2020 | ... | 0.04050 | 0.01425 | 0.03050 | 0.03500 | 0.14125 | 0.05775 | 0.03900 | 0.02975 | 0.08475 | 0.02200 |
1 | 2 | 0.04125 | 0.04050 | 0.06275 | 0.08275 | 0.09100 | 0.06125 | 0.06925 | 0.09600 | 0.0765 | ... | 0.05250 | 0.01575 | 0.01250 | 0.02000 | 0.12225 | 0.03275 | 0.02100 | 0.01100 | 0.10525 | 0.01975 |
2 | 3 | 0.04675 | 0.05550 | 0.02925 | 0.08700 | 0.04750 | 0.04775 | 0.04600 | 0.14275 | 0.0285 | ... | 0.06275 | 0.01950 | 0.02225 | 0.02300 | 0.12200 | 0.03475 | 0.01700 | 0.01800 | 0.09100 | 0.01775 |
3 | 4 | 0.03425 | 0.03800 | 0.04050 | 0.03100 | 0.06500 | 0.03575 | 0.02900 | 0.08650 | 0.0320 | ... | 0.05325 | 0.02800 | 0.01675 | 0.03875 | 0.18200 | 0.07050 | 0.01625 | 0.01425 | 0.08850 | 0.01500 |
4 | 5 | 0.04300 | 0.05325 | 0.03800 | 0.04100 | 0.05400 | 0.06725 | 0.02775 | 0.07650 | 0.0215 | ... | 0.05350 | 0.02050 | 0.01425 | 0.02550 | 0.19225 | 0.02675 | 0.01625 | 0.01300 | 0.08700 | 0.01600 |
5 rows × 1129 columns
from sklearn.decomposition import PCA

# Reduce the ~1,128 tag-relevance columns to 50 principal components.
# NOTE: with this many features and n_components=50, scikit-learn's 'auto'
# svd_solver selects the *randomized* SVD, so random_state must be fixed for
# the notebook outputs to be reproducible across runs.
pca_obj = PCA(n_components=50, random_state=123)
item_sideinfo_reduced = item_side_info.drop("ItemId", axis=1)
item_sideinfo_pca = pca_obj.fit_transform(item_sideinfo_reduced)
item_sideinfo_pca = pd.DataFrame(
    item_sideinfo_pca,
    columns=["pc" + str(i + 1) for i in range(item_sideinfo_pca.shape[1])]
)
# Re-attach the item IDs and move them to the first column.
item_sideinfo_pca['ItemId'] = item_side_info["ItemId"].to_numpy()
item_sideinfo_pca = item_sideinfo_pca[["ItemId"] + item_sideinfo_pca.columns[:50].tolist()]
item_sideinfo_pca.head()
ItemId | pc1 | pc2 | pc3 | pc4 | pc5 | pc6 | pc7 | pc8 | pc9 | ... | pc41 | pc42 | pc43 | pc44 | pc45 | pc46 | pc47 | pc48 | pc49 | pc50 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.193171 | 2.085621 | 2.634135 | 1.156088 | 0.721649 | 0.995436 | 1.250474 | -0.779532 | 1.616702 | ... | -0.317134 | -0.070338 | -0.019553 | 0.169051 | 0.201415 | -0.094831 | -0.250461 | -0.149919 | -0.031735 | -0.177708 |
1 | 2 | -1.333533 | 1.743796 | 1.352161 | 0.795724 | -0.484175 | 0.380645 | 0.804462 | -0.598527 | 0.917250 | ... | 0.300060 | -0.261956 | 0.054457 | 0.003863 | 0.304605 | -0.315796 | 0.360203 | 0.152770 | 0.144790 | -0.096549 |
2 | 3 | -1.363395 | -0.017107 | 0.530395 | -0.316202 | 0.469430 | 0.164630 | 0.019083 | 0.159188 | -0.232969 | ... | 0.215020 | -0.060682 | -0.280852 | 0.001087 | 0.084960 | -0.257190 | -0.136963 | -0.113914 | 0.128352 | -0.203658 |
3 | 4 | -1.237840 | -0.993731 | 0.809815 | -0.303009 | -0.088991 | -0.049621 | -0.179544 | -0.771278 | -0.400499 | ... | 0.066207 | 0.056054 | -0.223027 | 0.400157 | 0.292300 | 0.260936 | -0.307608 | -0.224141 | 0.488955 | 0.439189 |
4 | 5 | -1.611499 | -0.251899 | 1.126443 | -0.135702 | 0.403340 | 0.187289 | 0.108451 | -0.275341 | -0.261142 | ... | 0.109560 | -0.086042 | -0.236327 | 0.461589 | 0.013350 | -0.192557 | -0.234025 | -0.369643 | -0.041060 | -0.074656 |
5 rows × 51 columns
# Count how many 1M items have tag-genome side info.
# np.in1d is deprecated (NumPy >= 1.25) in favor of isin; pandas'
# Series.isin does the same membership test here.
print("Number of items from MovieLens 1M with side info: %d" %
      ratings["ItemId"][ratings["ItemId"].isin(item_sideinfo_pca["ItemId"])].nunique())
Number of items from MovieLens 1M with side info: 3080
# Lookup: full US state name -> two-letter abbreviation (external states.csv).
zipcode_abbs = pd.read_csv("states.csv", low_memory=False)
zipcode_abbs_dct = dict(zip(zipcode_abbs["State"], zipcode_abbs["Abbreviation"]))

# Grouping of the US states into six broad geographical regions.
us_regs_table = [
    ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),
    ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),
    ('South', 'Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, Missouri, North Carolina, South Carolina, Tennessee, Virginia, West Virginia'),
    ('Midwest', 'Illinois, Indiana, Iowa, Kansas, Michigan, Minnesota, Nebraska, North Dakota, Ohio, South Dakota, Wisconsin'),
    ('Southwest', 'Arizona, New Mexico, Oklahoma, Texas'),
    ('West', 'Alaska, California, Colorado, Hawaii, Idaho, Montana, Nevada, Oregon, Utah, Washington, Wyoming')
]
# Split each comma-separated state list into clean state names.
us_regs_table = [(region, [s.strip() for s in states.split(",")]) for region, states in us_regs_table]
# Final lookup: state abbreviation -> region name.
us_regs_dct = {
    zipcode_abbs_dct[state]: region
    for region, states in us_regs_table
    for state in states
}
# External zip-code database; a zip code can appear several times, keep the
# first entry for each.
zipcode_info = pd.read_csv("free-zipcode-database.csv", low_memory=False)
zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()

# Non-US zip codes all collapse into a single catch-all category.
is_us = zipcode_info["Country"] == "US"
zipcode_info.loc[~is_us, 'State'] = 'UnknownOrNonUS'

# Start the region from the state, then translate US states into their
# geographical region; US states/territories outside the table become 'UsOther'.
zipcode_info['Region'] = zipcode_info['State'].copy()
zipcode_info.loc[is_us, "Region"] = zipcode_info.loc[is_us, "Region"].map(
    lambda s: us_regs_dct.get(s, 'UsOther')
)
zipcode_info = zipcode_info[['Zipcode', 'Region']]
zipcode_info.head()
Zipcode | Region | |
---|---|---|
0 | 501 | Middle Atlantic |
1 | 544 | Middle Atlantic |
2 | 601 | UsOther |
3 | 602 | UsOther |
4 | 603 | UsOther |
# User demographics from MovieLens 1M.
users = pd.read_table(
    'ml-1m/users.dat',
    sep='::',
    engine='python',
    encoding='cp1252',
    names=["UserId", "Gender", "Age", "Occupation", "Zipcode"],
)
# Zip+4 codes such as "98107-2117" -> keep the 5-digit prefix, as an integer
# so it matches the integer Zipcode key in zipcode_info.
users["Zipcode"] = users["Zipcode"].map(lambda z: int(re.sub("-.*", "", z)))
users = users.merge(zipcode_info, on='Zipcode', how='left')
# Zip codes missing from the external table fall into the catch-all region.
users['Region'] = users["Region"].fillna('UnknownOrNonUS')
# MovieLens 1M encodes occupation as an integer 0-20; translate to the
# human-readable labels from the dataset's README (list order = code).
occupations = dict(enumerate([
    '"other" or not specified',
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
]))
users['Occupation'] = users["Occupation"].map(occupations)
# Age is a categorical bucket (1, 18, 25, ...); represent it as a string so
# it is treated as categorical rather than numeric downstream.
users['Age'] = users["Age"].map(str)
users.head()
UserId | Gender | Age | Occupation | Zipcode | Region | |
---|---|---|---|---|---|---|
0 | 1 | F | 1 | K-12 student | 48067 | Midwest |
1 | 2 | M | 56 | self-employed | 70072 | South |
2 | 3 | M | 25 | scientist | 55117 | Midwest |
3 | 4 | M | 45 | executive/managerial | 2460 | New England |
4 | 5 | M | 25 | writer | 55455 | Midwest |
# One-hot encode the categorical user attributes. The raw Zipcode is dropped;
# only the derived Region carries the geographical signal.
user_side_info = pd.get_dummies(users.drop('Zipcode', axis=1))
user_side_info.head()
UserId | Gender_F | Gender_M | Age_1 | Age_18 | Age_25 | Age_35 | Age_45 | Age_50 | Age_56 | ... | Occupation_unemployed | Occupation_writer | Region_Middle Atlantic | Region_Midwest | Region_New England | Region_South | Region_Southwest | Region_UnknownOrNonUS | Region_UsOther | Region_West | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | True | False | True | False | False | False | False | False | False | ... | False | False | False | True | False | False | False | False | False | False |
1 | 2 | False | True | False | False | False | False | False | False | True | ... | False | False | False | False | False | True | False | False | False | False |
2 | 3 | False | True | False | False | True | False | False | False | False | ... | False | False | False | True | False | False | False | False | False | False |
3 | 4 | False | True | False | False | False | False | True | False | False | ... | False | False | False | False | True | False | False | False | False | False |
4 | 5 | False | True | False | False | True | False | False | False | False | ... | False | True | False | True | False | False | False | False | False | False |
5 rows × 39 columns
# Sanity check: every user should have demographic side information.
n_users_with_info = user_side_info["UserId"].nunique()
print("Number of users with demographic information: %d" % n_users_with_info)
Number of users with demographic information: 6040
import pickle

# Persist the processed tables for the cmfrec usage guide.
# Use `with` blocks so each file handle is closed deterministically —
# the original `pickle.dump(obj, open(path, "wb"))` pattern leaked the
# open handles (closure relied on garbage collection).
_outputs = [
    ("ratings.p", ratings),
    ("item_sideinfo_pca.p", item_sideinfo_pca),
    ("user_side_info.p", user_side_info),
    ("movie_id_to_title.p", movie_id_to_title),
]
for _fname, _obj in _outputs:
    with open(_fname, "wb") as _f:
        pickle.dump(_obj, _f)