# Notebook setup: auto-reload edited modules so library changes take effect live.
%reload_ext autoreload
%autoreload 2
#export
from nb_007a import *
from pandas import Series,DataFrame
# Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
PATH = Path('data/ml-latest-small/')  # root of the unzipped MovieLens (small) dataset
# Ratings table: one row per (userId, movieId) pair with its rating.
# Load the user/movie/rating triples.
ratings = pd.read_csv(PATH/'ratings.csv')
ratings.head()
# Movies table: used to look up the titles of the movies.
# Load the movie-id -> title lookup table and inspect the ratings columns.
movies = pd.read_csv(PATH/'movies.csv')
movies.head()
ratings.columns
#export
def series2cat(df:DataFrame, *col_names):
    "Turn the columns `col_names` of `df` into ordered categoricals, in place."
    # `col_names` is already a tuple (varargs), so it can be iterated directly;
    # the previous `listify(col_names)` round-trip was redundant.
    for c in col_names: df[c] = df[c].astype('category').cat.as_ordered()
# Make the id columns ordered categoricals so `.cat.codes` yields contiguous ids.
series2cat(ratings, 'userId','movieId')
ratings.userId.dtype
#export
@dataclass
class ColabFilteringDataset(DatasetBase):
    "Dataset pairing (user, item) category codes with a target rating."
    user:Series      # categorical user column
    item:Series      # categorical item column
    ratings:DataFrame  # NOTE(review): annotated DataFrame, but `from_df` actually passes a float32 ndarray — confirm intended type

    def __post_init__(self):
        # Category codes give contiguous 0-based ids, ready for embedding lookup.
        self.user_ids = np.array(self.user.cat.codes, dtype=np.int64)
        self.item_ids = np.array(self.item.cat.codes, dtype=np.int64)

    def __len__(self)->int:
        return len(self.ratings)

    def __getitem__(self, idx:int)->Tuple[Tuple[int,int],float]:
        # One sample: ((user_id, item_id), rating).
        return (self.user_ids[idx],self.item_ids[idx]), self.ratings[idx]

    @property
    def c(self) -> int:
        "Number of targets per sample."
        return 1

    @property
    def n_user(self)->int:
        "Number of distinct users (embedding rows needed)."
        return len(self.user.cat.categories)

    @property
    def n_item(self)->int:
        "Number of distinct items (embedding rows needed)."
        return len(self.item.cat.categories)

    @classmethod
    def from_df(cls, rating_df:DataFrame, pct_val:float=0.2, user_name:Optional[str]=None, item_name:Optional[str]=None,
                rating_name:Optional[str]=None) -> Tuple['ColabFilteringDataset','ColabFilteringDataset']:
        "Randomly split `rating_df` into a (train, valid) pair of datasets."
        # Default to the first three columns: user, item, rating.
        cols = rating_df.columns
        if user_name   is None: user_name   = cols[0]
        if item_name   is None: item_name   = cols[1]
        if rating_name is None: rating_name = cols[2]
        users, items = rating_df[user_name], rating_df[item_name]
        targets = np.array(rating_df[rating_name], dtype=np.float32)
        perm = np.random.permutation(len(targets))
        n_val = int(pct_val * len(targets))
        val_idx, trn_idx = perm[:n_val], perm[n_val:]
        return (cls(users[trn_idx], items[trn_idx], targets[trn_idx]),
                cls(users[val_idx], items[val_idx], targets[val_idx]))

    @classmethod
    def from_csv(cls, csv_name:str, **kwargs) -> Tuple['ColabFilteringDataset','ColabFilteringDataset']:
        "Read `csv_name` and split it into a (train, valid) pair of datasets."
        table = pd.read_csv(csv_name)
        return cls.from_df(table, **kwargs)
# 80/20 random split of the ratings into train/validation datasets.
train_ds, valid_ds = ColabFilteringDataset.from_df(ratings)
len(ratings), len(train_ds), len(valid_ds)
bs = 64
data = DataBunch.create(train_ds, valid_ds, bs=bs, num_workers=0)
#export
def trunc_normal_(x:Tensor, mean:float=0., std:float=1.) -> Tensor:
    "In-place truncated-normal init: N(0,1) samples wrapped into (-2,2), then scaled by `std` and shifted by `mean`."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    x.normal_()
    x.fmod_(2)
    x.mul_(std)
    x.add_(mean)
    return x
def get_embedding(ni:int,nf:int) -> Model:
    "Build an `ni` x `nf` embedding layer, truncated-normal initialized (std 0.01)."
    # Small-std truncated-normal init — see https://arxiv.org/abs/1711.09160
    layer = nn.Embedding(ni, nf)
    with torch.no_grad():
        trunc_normal_(layer.weight, std=0.01)
    return layer
class EmbeddingDotBias(nn.Module):
    "Base model for collaborative filtering: user·item dot product plus per-user and per-item biases."
    def __init__(self, n_factors:int, n_users:int, n_items:int, min_score:float=None, max_score:float=None):
        super().__init__()
        # When `min_score` is set, predictions are squashed into [min_score, max_score].
        self.min_score,self.max_score = min_score,max_score
        (self.u_weight, self.i_weight, self.u_bias, self.i_bias) = [get_embedding(*o) for o in [
            (n_users, n_factors), (n_items, n_factors), (n_users,1), (n_items,1)
        ]]
    def forward(self, users:LongTensor, items:LongTensor) -> Tensor:
        dot = self.u_weight(users)* self.i_weight(items)
        # squeeze(-1), not squeeze(): the bias embeddings yield shape (bs, 1) and a
        # plain squeeze() would also drop the batch dimension when bs == 1,
        # returning a 0-d tensor and breaking batch-size-1 inference.
        res = dot.sum(1) + self.u_bias(users).squeeze(-1) + self.i_bias(items).squeeze(-1)
        if self.min_score is None: return res
        return torch.sigmoid(res) * (self.max_score-self.min_score) + self.min_score
#export
def get_collab_learner(n_factors:int, data:DataBunch, min_score:float=None, max_score:float=None,
                       loss_fn:LossFunction=F.mse_loss, **kwargs) -> Learner:
    "Create a `Learner` wrapping an `EmbeddingDotBias` model sized from `data`'s training set."
    train_set = data.train_ds
    model = EmbeddingDotBias(n_factors, train_set.n_user, train_set.n_item, min_score, max_score)
    return Learner(data, model, loss_fn=loss_fn, **kwargs)
n_factors = 50  # width of the user/item embedding vectors
# Ratings range over [0, 5]; pass that as the sigmoid-scaling bounds.
learn = get_collab_learner(n_factors, data, 0, 5, wd=1e-1)
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, 5e-3)
# RMSE corresponding to a validation MSE of 0.77.
math.sqrt(0.77)