wukong-recommendation
Experiments on MovieLens Latest: Wukong seems unable to beat MLPs without any interaction modules
Dear clabrugere,
Thanks for your great work on reproducing Wukong's experiments.
According to the paper's claims, Wukong performs very well on several datasets (e.g. Table 2).
But when I re-ran the experiments using the code from this repo, I found that Wukong cannot beat plain MLPs (without any interaction module).
I have been confused about this for weeks. After checking the code and the paper over and over, I still cannot figure out what's wrong.
If you don't mind, I would really appreciate knowing whether you have reproduced Wukong's experiments yourself, and whether they worked well for you.
dataset
criteo (Kaggle Display Advertising dataset): https://go.criteo.net/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz, using train.txt
movielens_latest: http://grouplens.org/datasets/movielens/, using ratings.csv from the latest version
preprocess
follows pytorch-fm's preprocessing: https://github.com/rixwew/pytorch-fm/blob/master/torchfm/dataset/movielens.py (a rough sketch of what it does is below)
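For reference, that preprocessing roughly does the following, a minimal sketch as I understand the torchfm class (assumptions: ratings.csv has columns userId,movieId,rating,timestamp; exact details such as the id offset may differ slightly):

```python
import numpy as np
import pandas as pd


def load_movielens(path: str = "ratings.csv"):
    # keep only (userId, movieId, rating)
    data = pd.read_csv(path).to_numpy()[:, :3]
    items = data[:, :2].astype(np.int64) - 1       # two sparse fields, ids shifted to start at 0
    targets = (data[:, 2] > 3).astype(np.float32)  # rating > 3 -> positive label, else negative
    field_dims = np.max(items, axis=0) + 1         # vocabulary size per field
    return items, targets, field_dims
```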
results
- movielens-latest
method | epochs | valid auc (best) | test auc |
---|---|---|---|
mlps(400, 400) | 19 | 0.915191645690709 | 0.9153241141382682 |
dcnv2(1 layer) | 28 | 0.9151184084632096 | 0.9152726643561548 |
wukong(1 layer) | 9 | 0.912915048710929 | 0.9132910202259985 |
- criteo
method | epochs | valid auc (best) | test auc |
---|---|---|---|
mlps(400, 400) | 24 | 0.812205612493922 | 0.812158279764408 |
num_layer: 1, mlps: (400, 400) | 27 | 0.8131887068022828 | 0.813106074441504 |
num_layer: 1, num_lcb: 16, num_fmb: 16, fmb_mlps: (400, 400) | 28 | 0.8122118809319111 | 0.8122262149453786 |
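For context, the "valid auc (best)" / "test auc" pair above comes from the usual protocol of keeping the checkpoint with the best validation AUC and then reporting its test AUC. A minimal sketch of that kind of loop is below (the optimizer, learning rate and weight decay here are illustrative placeholders, not necessarily the exact settings used for the numbers above):

```python
import torch
from sklearn.metrics import roc_auc_score


def evaluate(model, data_loader, device):
    """Compute AUC of a model over a data loader."""
    model.eval()
    targets, predicts = [], []
    with torch.no_grad():
        for fields, target in data_loader:
            fields, target = fields.to(device), target.to(device)
            predicts.extend(model(fields).tolist())
            targets.extend(target.tolist())
    return roc_auc_score(targets, predicts)


def train_and_test(model, train_loader, valid_loader, test_loader, device, epochs=50):
    # illustrative hyperparameters only
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
    best_valid_auc, best_state = 0.0, None
    for _ in range(epochs):
        model.train()
        for fields, target in train_loader:
            fields, target = fields.to(device), target.to(device)
            loss = criterion(model(fields), target.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        valid_auc = evaluate(model, valid_loader, device)
        if valid_auc > best_valid_auc:
            best_valid_auc = valid_auc
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_valid_auc, evaluate(model, test_loader, device)
```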
code
I moved most of the code into torchfm, expecting to get a fair comparison. Here is the code:
import torch
from torch import Tensor, nn
from torchfm.layer import FeaturesEmbedding, MultiLayerPerceptron
import numpy as np
class MLP(torch.nn.Sequential):
def __init__(
self,
dim_in: int,
num_hidden: int,
dim_hidden: int,
dim_out: int | None = None,
batch_norm: bool = True,
dropout: float = 0.0,
) -> None:
layers = []
for _ in range(num_hidden - 1):
layers.append(torch.nn.Linear(dim_in, dim_hidden))
if batch_norm:
layers.append(torch.nn.BatchNorm1d(dim_hidden))
layers.append(torch.nn.ReLU())
layers.append(torch.nn.Dropout(dropout))
dim_in = dim_hidden
if dim_out:
layers.append(torch.nn.Linear(dim_in, dim_out))
else:
layers.append(torch.nn.Linear(dim_in, dim_hidden))
super().__init__(*layers)
class MLP_DLRM(torch.nn.Module):
def __init__(self, input_dim, embed_dims, dropout, output_layer=True, output_dim=1):
super().__init__()
self.layers = list()
for embed_dim in embed_dims:
self.layers.append(torch.nn.Linear(input_dim, embed_dim))
self.layers.append(torch.nn.BatchNorm1d(embed_dim))
self.layers.append(torch.nn.ReLU())
self.layers.append(torch.nn.Dropout(p=dropout))
input_dim = embed_dim
if output_layer:
self.layers.append(torch.nn.Linear(input_dim, output_dim))
self.mlp = torch.nn.Sequential(*self.layers)
# self._reset_parameters()
def _reset_parameters(self) -> None:
for layer in self.layers:
if isinstance(layer, torch.nn.Linear):
mean = 0.0
std_dev = np.sqrt(2 / (layer.weight.size(0) + layer.weight.size(1)))
W = np.random.normal(mean, std_dev, size=layer.weight.shape).astype(np.float32)
std_dev = np.sqrt(1 / layer.weight.size(0))
bt = np.random.normal(mean, std_dev, size=layer.bias.shape).astype(np.float32)
with torch.no_grad():
layer.weight.copy_(torch.tensor(W))
layer.bias.copy_(torch.tensor(bt))
def forward(self, x):
"""
:param x: Float tensor of size ``(batch_size, embed_dim)``
"""
return self.mlp(x)
class MLP_PURE(torch.nn.Module):
def __init__(self, input_dim, embed_dims, dropout, output_layer=True, output_dim=1):
super().__init__()
self.layers = list()
for embed_dim in embed_dims:
self.layers.append(torch.nn.Linear(input_dim, embed_dim))
self.layers.append(torch.nn.BatchNorm1d(embed_dim))
self.layers.append(torch.nn.ReLU())
self.layers.append(torch.nn.Dropout(p=dropout))
input_dim = embed_dim
if output_layer:
self.layers.append(torch.nn.Linear(input_dim, output_dim))
self.mlp = torch.nn.Sequential(*self.layers)
# self._reset_parameters()
def _reset_parameters(self,):
for layer in self.layers:
if isinstance(layer, torch.nn.Linear):
torch.nn.init.xavier_uniform_(layer.weight)
def forward(self, x):
"""
:param x: Float tensor of size ``(batch_size, embed_dim)``
"""
return self.mlp(x)
class LinearCompressBlock(nn.Module):
def __init__(self, num_emb_in: int, num_emb_out: int) -> None:
super().__init__()
self.weight = nn.Parameter(torch.empty((num_emb_in, num_emb_out)))
self._reset_parameters()
def _reset_parameters(self) -> None:
nn.init.kaiming_uniform_(self.weight)
def forward(self, inputs: Tensor) -> Tensor:
# (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
outputs = inputs.permute(0, 2, 1)
# (bs, dim_emb, num_emb_in) @ (num_emb_in, num_emb_out) -> (bs, dim_emb, num_emb_out)
outputs = outputs @ self.weight
# (bs, dim_emb, num_emb_out) -> (bs, num_emb_out, dim_emb)
outputs = outputs.permute(0, 2, 1)
return outputs
class FactorizationMachineBlock(nn.Module):
def __init__(
self,
num_emb_in: int,
num_emb_out: int,
dim_emb: int,
rank: int,
mlp_dims: tuple,
dropout: float,
) -> None:
super().__init__()
self.num_emb_in = num_emb_in
self.num_emb_out = num_emb_out
self.dim_emb = dim_emb
self.rank = rank
self.weight = nn.Parameter(torch.empty((num_emb_in, rank)))
self.norm = nn.LayerNorm(num_emb_in * rank)
# self.mlp = MLP(
# dim_in=num_emb_in * rank,
# num_hidden=num_hidden,
# dim_hidden=dim_hidden,
# dim_out=num_emb_out * dim_emb,
# dropout=dropout,
# )
re_mlp_dims = list(mlp_dims)
re_mlp_dims[-1] = num_emb_out * dim_emb
# self.mlp = MultiLayerPerceptron(num_emb_in * rank, re_mlp_dims, dropout, output_layer=False)
self.mlp = MLP_PURE(num_emb_in * rank, mlp_dims[0:-1], dropout, output_layer=True, output_dim=num_emb_out * dim_emb)
# self.mlp = MLP(num_emb_in * rank, 3, 400, dim_out=num_emb_out * dim_emb, batch_norm=True, dropout=dropout)
self._reset_parameters()
def _reset_parameters(self):
nn.init.kaiming_uniform_(self.weight)
def forward(self, inputs: Tensor) -> Tensor:
# (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
outputs = inputs.permute(0, 2, 1)
# (bs, dim_emb, num_emb_in) @ (num_emb_in, rank) -> (bs, dim_emb, rank)
outputs = outputs @ self.weight
# (bs, num_emb_in, dim_emb) @ (bs, dim_emb, rank) -> (bs, num_emb_in, rank)
outputs = torch.bmm(inputs, outputs)
# (bs, num_emb_in, rank) -> (bs, num_emb_in * rank)
outputs = outputs.view(-1, self.num_emb_in * self.rank)
# (bs, num_emb_in * rank) -> (bs, num_emb_out * dim_emb)
outputs = self.mlp(self.norm(outputs))
# (bs, num_emb_out * dim_emb) -> (bs, num_emb_out, dim_emb)
outputs = outputs.view(-1, self.num_emb_out, self.dim_emb)
return outputs
class ResidualProjection(nn.Module):
def __init__(self, num_emb_in: int, num_emb_out: int) -> None:
super().__init__()
self.weight = nn.Parameter(torch.empty((num_emb_in, num_emb_out)))
self._reset_parameters()
def _reset_parameters(self):
nn.init.kaiming_uniform_(self.weight)
def forward(self, inputs: Tensor) -> Tensor:
# (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
outputs = inputs.permute(0, 2, 1)
# (bs, dim_emb, num_emb_in) @ (num_emb_in, num_emb_out) -> (bs, dim_emb, num_emb_out)
outputs = outputs @ self.weight
# # (bs, dim_emb, num_emb_out) -> (bs, num_emb_out, dim_emb)
outputs = outputs.permute(0, 2, 1)
return outputs
class WukongLayer(nn.Module):
def __init__(
self,
num_emb_in: int,
dim_emb: int,
num_emb_lcb: int,
num_emb_fmb: int,
rank_fmb: int,
mlp_dims: tuple,
dropout: float,
) -> None:
super().__init__()
self.lcb = LinearCompressBlock(num_emb_in, num_emb_lcb)
self.fmb = FactorizationMachineBlock(
num_emb_in,
num_emb_fmb,
dim_emb,
rank_fmb,
mlp_dims,
dropout,
)
self.norm = nn.LayerNorm(dim_emb)
if num_emb_in != num_emb_lcb + num_emb_fmb:
self.residual_projection = ResidualProjection(num_emb_in, num_emb_lcb + num_emb_fmb)
else:
self.residual_projection = nn.Identity()
def forward(self, inputs: Tensor) -> Tensor:
# (bs, num_emb_in, dim_emb) -> (bs, num_emb_lcb, dim_emb)
lcb = self.lcb(inputs)
# (bs, num_emb_in, dim_emb) -> (bs, num_emb_fmb, dim_emb)
fmb = self.fmb(inputs)
# (bs, num_emb_lcb, dim_emb), (bs, num_emb_fmb, dim_emb) -> (bs, num_emb_lcb + num_emb_fmb, dim_emb)
outputs = torch.concat((fmb, lcb), dim=1)
# (bs, num_emb_lcb + num_emb_fmb, dim_emb) -> (bs, num_emb_lcb + num_emb_fmb, dim_emb)
outputs = self.norm(outputs + self.residual_projection(inputs))
return outputs
# class Wukong(torch.nn.Module): # layer
# def __init__(self, input_dim, num_layers, num_fields, embed_dim):
# """
# input_dim: num_fields*embed_dim
# """
# super().__init__()
# self.num_layers = num_layers
# self.num_fields = num_fields
# self.embed_dim = embed_dim
# self.input_dim = input_dim
# self.W = torch.nn.ParameterList([
# torch.nn.Parameter(torch.Tensor(input_dim, input_dim)) for _ in range(num_layers)
# ])
# # self.W = torch.nn.ModuleList([
# # torch.nn.Linear(input_dim, input_dim, bias=False) for _ in range(num_layers)
# # ])
# self.b = torch.nn.ParameterList([
# torch.nn.Parameter(torch.zeros(input_dim, )) for _ in range(num_layers)
# ])
# for i in range(num_layers):
# torch.nn.init.xavier_uniform_(self.W[i])
# def forward(self, x):
# """
# x: Tensor of size ``(batch_size, num_fields*embed_dim)``
# """
# x0 = x
# for i in range(self.num_layers):
# # x = x.unsqueeze(2)
# # xw = self.W[i](x)
# xw = torch.matmul(x, self.W[i])
# # xw = xw.squeeze(2)
# # x = x.squeeze(2)
# x = x0 * (xw + self.b[i]) + x
# return x
class WukongModel(nn.Module):
def __init__(
self,
field_dims: list,
embed_dim: int,
num_layers: int,
mlp_dims: tuple = (400, 400, 400),
dropout: float = 0.0,
num_emb_lcb: int = 16,
num_emb_fmb: int = 16,
rank_fmb: int = 24,
) -> None:
super().__init__()
self.embed_dim = embed_dim
self.num_emb_lcb = num_emb_lcb
self.num_emb_fmb = num_emb_fmb
# self.embedding = Embedding(num_sparse_emb, dim_emb, dim_input_dense)
self.num_fields = len(field_dims)
self.embedding = FeaturesEmbedding(field_dims, embed_dim)
self.embed_output_dim = self.num_fields * embed_dim
hidden_fields = self.num_fields
# num_emb_in = dim_input_sparse + dim_input_dense
num_emb_in = len(field_dims)
self.interaction_layers = nn.Sequential()
for _ in range(num_layers):
# print("self.num_fields:", self.num_fields)
self.interaction_layers.append(
WukongLayer(
hidden_fields,
embed_dim,
num_emb_lcb,
num_emb_fmb,
rank_fmb,
(400, 400, 400),
dropout,
),
)
hidden_fields = num_emb_lcb + num_emb_fmb
# self.mlp = MultiLayerPerceptron((num_emb_lcb + num_emb_fmb) * embed_dim, mlp_dims, dropout, output_layer=True)
self.mlp = MLP_DLRM((num_emb_lcb + num_emb_fmb) * embed_dim, mlp_dims[0:-1], dropout, output_layer=True, output_dim=1)
# self.mlp = MLP((num_emb_lcb + num_emb_fmb) * embed_dim, 3, 400, dim_out=1, batch_norm=True, dropout=dropout)
def forward(self, x) -> Tensor:
# outputs = self.embedding(x).view(-1, self.embed_output_dim)
embed_x = self.embedding(x).view(-1, self.num_fields, self.embed_dim)
# # print("x old: ", embed_x.size())
# embed_x = self.embedding(x).view(self.num_fields, -1, self.embed_dim)
# # print("x new: ", embed_x.size())
# # return
outputs = self.interaction_layers(embed_x)
outputs = outputs.view(-1, (self.num_emb_lcb + self.num_emb_fmb) * self.embed_dim)
outputs = self.mlp(outputs)
return torch.sigmoid(outputs.squeeze(1))
# class DNNModel(torch.nn.Module):
# def __init__(self, field_dims, embed_dim, mlp_dims, dropout):
# super().__init__()
# self.embedding = FeaturesEmbedding(field_dims, embed_dim)
# self.embed_output_dim = len(field_dims) * embed_dim
# self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)
# def forward(self, x):
# """
# :param x: Long tensor of size ``(batch_size, num_fields)``
# """
# embed_x = self.embedding(x)
# x = self.mlp(embed_x.view(-1, self.embed_output_dim))
# return torch.sigmoid(x.squeeze(1))
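In case it helps anyone sanity-check the wiring, here is a minimal usage sketch of the WukongModel above (the field_dims and the random batch are made up, just to show the expected input/output shapes):

```python
import torch

# hypothetical field dims: two categorical fields (e.g. user id, movie id)
field_dims = [1000, 2000]
model = WukongModel(
    field_dims,
    embed_dim=16,
    num_layers=1,
    mlp_dims=(400, 400, 400),
    dropout=0.0,
    num_emb_lcb=16,
    num_emb_fmb=16,
    rank_fmb=24,
)

x = torch.randint(0, 1000, (8, len(field_dims)))  # (batch_size, num_fields) long tensor of feature ids
y = model(x)                                      # (batch_size,) click probabilities in (0, 1)
print(y.shape)                                    # torch.Size([8])
```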