
Experiments on MovieLens Latest: Wukong seems unable to beat MLPs without any interaction module

Open · YH-learning opened this issue 5 months ago · 0 comments

Dear clabrugere, thanks for your great work on reproducing Wukong's experiments. According to the paper's claims, Wukong performs strongly on several datasets (see Table 2 of the paper). However, when I re-ran the experiments using the code from this repo, I found that Wukong cannot beat plain MLPs (without any interaction module). This has puzzled me for weeks; after going over the code and the paper many times, I still cannot figure out what is wrong. If you are willing, I would be grateful to know whether you have reproduced Wukong's experiments and whether they work well for you.

dataset

  • criteo (Kaggle Display Advertising dataset): https://go.criteo.net/criteo-research-kaggle-display-advertising-challenge-dataset.tar.gz, using train.txt
  • movielens_latest: http://grouplens.org/datasets/movielens/, using rating.csv from the latest version

preprocess

Preprocessing follows pytorch-fm's pipeline: https://github.com/rixwew/pytorch-fm/blob/master/torchfm/dataset/movielens.py
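
For reference, here is a minimal sketch of what that preprocessing amounts to, as I read the pytorch-fm MovieLens dataset class: two categorical fields (user id, movie id), with ratings binarized into click labels. The file name, column names, and the threshold of 3 are my assumptions from that file, not verified against the exact version used here.

# Minimal sketch of the MovieLens preprocessing (my reading of the pytorch-fm
# dataset class; file name, column names, and the threshold of 3 are assumptions).
import numpy as np
import pandas as pd


def load_movielens(path="rating.csv", threshold=3.0):
    df = pd.read_csv(path)                                    # columns: userId, movieId, rating, timestamp
    items = df[["userId", "movieId"]].to_numpy(np.int64)      # two categorical fields
    targets = (df["rating"].to_numpy() > threshold).astype(np.float32)  # rating > 3 -> positive label
    field_dims = items.max(axis=0) + 1                        # vocabulary size per field
    return items, targets, field_dims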

results

  • movielens-latest

| method           | epochs | valid auc (best)   | test auc           |
| ---------------- | ------ | ------------------ | ------------------ |
| mlps (400, 400)  | 19     | 0.915191645690709  | 0.9153241141382682 |
| dcnv2 (1 layer)  | 28     | 0.9151184084632096 | 0.9152726643561548 |
| wukong (1 layer) | 9      | 0.912915048710929  | 0.9132910202259985 |
  • criteo

| method                                                       | epochs | valid auc (best)   | test auc           |
| ------------------------------------------------------------ | ------ | ------------------ | ------------------ |
| mlps (400, 400)                                              | 24     | 0.812205612493922  | 0.812158279764408  |
| num_layer: 1, mlps: (400, 400)                               | 27     | 0.8131887068022828 | 0.813106074441504  |
| num_layer: 1, num_lcb: 16, num_fmb: 16, fmb_mlps: (400, 400) | 28     | 0.8122118809319111 | 0.8122262149453786 |

code

I moved most of the code into torchfm, expecting to get a fair comparison. Here is the code:

import torch
from torch import Tensor, nn
from torchfm.layer import FeaturesEmbedding, MultiLayerPerceptron
import numpy as np


class MLP(torch.nn.Sequential):
    def __init__(
        self,
        dim_in: int,
        num_hidden: int,
        dim_hidden: int,
        dim_out: int | None = None,
        batch_norm: bool = True,
        dropout: float = 0.0,
    ) -> None:
        layers = []
        for _ in range(num_hidden - 1):
            layers.append(torch.nn.Linear(dim_in, dim_hidden))

            if batch_norm:
                layers.append(torch.nn.BatchNorm1d(dim_hidden))

            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(dropout))
            dim_in = dim_hidden

        if dim_out:
            layers.append(torch.nn.Linear(dim_in, dim_out))
        else:
            layers.append(torch.nn.Linear(dim_in, dim_hidden))

        super().__init__(*layers)


class MLP_DLRM(torch.nn.Module):

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True, output_dim=1):
        super().__init__()
        self.layers = list()
        for embed_dim in embed_dims:
            self.layers.append(torch.nn.Linear(input_dim, embed_dim))
            self.layers.append(torch.nn.BatchNorm1d(embed_dim))
            self.layers.append(torch.nn.ReLU())
            self.layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim
        if output_layer:
            self.layers.append(torch.nn.Linear(input_dim, output_dim))

        self.mlp = torch.nn.Sequential(*self.layers)
        # self._reset_parameters()
    
    def _reset_parameters(self) -> None:
        for layer in self.layers:
            if isinstance(layer, torch.nn.Linear):
                mean = 0.0
                std_dev = np.sqrt(2 / (layer.weight.size(0) + layer.weight.size(1)))
                W = np.random.normal(mean, std_dev, size=layer.weight.shape).astype(np.float32)
                std_dev = np.sqrt(1 / layer.weight.size(0))
                bt = np.random.normal(mean, std_dev, size=layer.bias.shape).astype(np.float32)
                with torch.no_grad():
                    layer.weight.copy_(torch.tensor(W))
                    layer.bias.copy_(torch.tensor(bt))

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)


class MLP_PURE(torch.nn.Module):

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True, output_dim=1):
        super().__init__()
        self.layers = list()
        for embed_dim in embed_dims:
            self.layers.append(torch.nn.Linear(input_dim, embed_dim))
            self.layers.append(torch.nn.BatchNorm1d(embed_dim))
            self.layers.append(torch.nn.ReLU())
            self.layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim
        if output_layer:
            self.layers.append(torch.nn.Linear(input_dim, output_dim))

        self.mlp = torch.nn.Sequential(*self.layers)
        # self._reset_parameters()
    
    def _reset_parameters(self):
        for layer in self.layers:
            if isinstance(layer, torch.nn.Linear):
                torch.nn.init.xavier_uniform_(layer.weight)


    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)



class LinearCompressBlock(nn.Module):
    def __init__(self, num_emb_in: int, num_emb_out: int) -> None:
        super().__init__()

        self.weight = nn.Parameter(torch.empty((num_emb_in, num_emb_out)))
        self._reset_parameters()

    def _reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.weight)

    def forward(self, inputs: Tensor) -> Tensor:
        # (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
        outputs = inputs.permute(0, 2, 1)

        # (bs, dim_emb, num_emb_in) @ (num_emb_in, num_emb_out) -> (bs, dim_emb, num_emb_out)
        outputs = outputs @ self.weight

        # (bs, dim_emb, num_emb_out) -> (bs, num_emb_out, dim_emb)
        outputs = outputs.permute(0, 2, 1)

        return outputs


class FactorizationMachineBlock(nn.Module):
    def __init__(
        self,
        num_emb_in: int,
        num_emb_out: int,
        dim_emb: int,
        rank: int,
        mlp_dims: tuple,
        dropout: float,
    ) -> None:
        super().__init__()

        self.num_emb_in = num_emb_in
        self.num_emb_out = num_emb_out
        self.dim_emb = dim_emb
        self.rank = rank

        self.weight = nn.Parameter(torch.empty((num_emb_in, rank)))
        self.norm = nn.LayerNorm(num_emb_in * rank)
        # self.mlp = MLP(
        #     dim_in=num_emb_in * rank,
        #     num_hidden=num_hidden,
        #     dim_hidden=dim_hidden,
        #     dim_out=num_emb_out * dim_emb,
        #     dropout=dropout,
        # )

        re_mlp_dims = list(mlp_dims)
        re_mlp_dims[-1] = num_emb_out * dim_emb


        # self.mlp = MultiLayerPerceptron(num_emb_in * rank, re_mlp_dims, dropout, output_layer=False)
        self.mlp = MLP_PURE(num_emb_in * rank, mlp_dims[0:-1], dropout, output_layer=True, output_dim=num_emb_out * dim_emb)
        # self.mlp = MLP(num_emb_in * rank, 3, 400, dim_out=num_emb_out * dim_emb, batch_norm=True, dropout=dropout)


        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight)

    def forward(self, inputs: Tensor) -> Tensor:
        # (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
        outputs = inputs.permute(0, 2, 1)

        # (bs, dim_emb, num_emb_in) @ (num_emb_in, rank) -> (bs, dim_emb, rank)
        outputs = outputs @ self.weight

        # (bs, num_emb_in, dim_emb) @ (bs, dim_emb, rank) -> (bs, num_emb_in, rank)
        outputs = torch.bmm(inputs, outputs)

        # (bs, num_emb_in, rank) -> (bs, num_emb_in * rank)
        outputs = outputs.view(-1, self.num_emb_in * self.rank)

        # (bs, num_emb_in * rank) -> (bs, num_emb_out * dim_emb)
        outputs = self.mlp(self.norm(outputs))

        # (bs, num_emb_out * dim_emb) -> (bs, num_emb_out, dim_emb)
        outputs = outputs.view(-1, self.num_emb_out, self.dim_emb)

        return outputs


class ResidualProjection(nn.Module):
    def __init__(self, num_emb_in: int, num_emb_out: int) -> None:
        super().__init__()

        self.weight = nn.Parameter(torch.empty((num_emb_in, num_emb_out)))
        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight)

    def forward(self, inputs: Tensor) -> Tensor:
        # (bs, num_emb_in, dim_emb) -> (bs, dim_emb, num_emb_in)
        outputs = inputs.permute(0, 2, 1)

        # (bs, dim_emb, num_emb_in) @ (num_emb_in, num_emb_out) -> (bs, dim_emb, num_emb_out)
        outputs = outputs @ self.weight

        # (bs, dim_emb, num_emb_out) -> (bs, num_emb_out, dim_emb)
        outputs = outputs.permute(0, 2, 1)

        return outputs


class WukongLayer(nn.Module):
    def __init__(
        self,
        num_emb_in: int,
        dim_emb: int,
        num_emb_lcb: int,
        num_emb_fmb: int,
        rank_fmb: int,
        mlp_dims: tuple,
        dropout: float,
    ) -> None:
        super().__init__()

        self.lcb = LinearCompressBlock(num_emb_in, num_emb_lcb)
        self.fmb = FactorizationMachineBlock(
            num_emb_in,
            num_emb_fmb,
            dim_emb,
            rank_fmb,
            mlp_dims,
            dropout,
        )
        self.norm = nn.LayerNorm(dim_emb)

        if num_emb_in != num_emb_lcb + num_emb_fmb:
            self.residual_projection = ResidualProjection(num_emb_in, num_emb_lcb + num_emb_fmb)
        else:
            self.residual_projection = nn.Identity()

    def forward(self, inputs: Tensor) -> Tensor:
        # (bs, num_emb_in, dim_emb) -> (bs, num_emb_lcb, dim_emb)
        lcb = self.lcb(inputs)

        # (bs, num_emb_in, dim_emb) -> (bs, num_emb_fmb, dim_emb)
        fmb = self.fmb(inputs)

        # (bs, num_emb_lcb, dim_emb), (bs, num_emb_fmb, dim_emb) -> (bs, num_emb_lcb + num_emb_fmb, dim_emb)
        outputs = torch.concat((fmb, lcb), dim=1)

        # (bs, num_emb_lcb + num_emb_fmb, dim_emb) -> (bs, num_emb_lcb + num_emb_fmb, dim_emb)
        outputs = self.norm(outputs + self.residual_projection(inputs))

        return outputs


# class Wukong(torch.nn.Module):  # layer

#     def __init__(self, input_dim, num_layers, num_fields, embed_dim):
#         """
#         input_dim: num_fields*embed_dim
#         """
#         super().__init__()
#         self.num_layers = num_layers
#         self.num_fields = num_fields
#         self.embed_dim = embed_dim
#         self.input_dim = input_dim
#         self.W = torch.nn.ParameterList([
#             torch.nn.Parameter(torch.Tensor(input_dim, input_dim)) for _ in range(num_layers)
#         ])
#         # self.W = torch.nn.ModuleList([
#         #     torch.nn.Linear(input_dim, input_dim, bias=False) for _ in range(num_layers)
#         # ])
#         self.b = torch.nn.ParameterList([
#             torch.nn.Parameter(torch.zeros(input_dim, )) for _ in range(num_layers)
#         ])
#         for i in range(num_layers):
#             torch.nn.init.xavier_uniform_(self.W[i])

#     def forward(self, x):
#         """
#         x: Tensor of size ``(batch_size, num_fields*embed_dim)``
#         """
#         x0 = x
#         for i in range(self.num_layers):
#             # x = x.unsqueeze(2)
#             # xw = self.W[i](x)
#             xw = torch.matmul(x, self.W[i])
#             # xw = xw.squeeze(2)
#             # x = x.squeeze(2)
#             x = x0 * (xw + self.b[i]) + x
#         return x


class WukongModel(nn.Module):
    def __init__(
        self,
        field_dims: list,
        embed_dim: int,
        num_layers: int,
        mlp_dims: tuple = (400, 400, 400),
        dropout: float = 0.0,
        num_emb_lcb: int = 16,
        num_emb_fmb: int = 16,
        rank_fmb: int = 24,
    ) -> None:
        super().__init__()

        self.embed_dim = embed_dim
        self.num_emb_lcb = num_emb_lcb
        self.num_emb_fmb = num_emb_fmb

        # self.embedding = Embedding(num_sparse_emb, dim_emb, dim_input_dense)

        self.num_fields = len(field_dims)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.embed_output_dim = self.num_fields * embed_dim
        hidden_fields = self.num_fields

        # num_emb_in = dim_input_sparse + dim_input_dense
        num_emb_in = len(field_dims)

        self.interaction_layers = nn.Sequential()
        for _ in range(num_layers):
            # print("self.num_fields:", self.num_fields)
            self.interaction_layers.append(
                WukongLayer(
                    hidden_fields,
                    embed_dim,
                    num_emb_lcb,
                    num_emb_fmb,
                    rank_fmb,
                    (400, 400, 400),
                    dropout,
                ),
            )
            hidden_fields = num_emb_lcb + num_emb_fmb

        # self.mlp = MultiLayerPerceptron((num_emb_lcb + num_emb_fmb) * embed_dim, mlp_dims, dropout, output_layer=True)
        self.mlp = MLP_DLRM((num_emb_lcb + num_emb_fmb) * embed_dim, mlp_dims[0:-1], dropout, output_layer=True, output_dim=1)
        # self.mlp = MLP((num_emb_lcb + num_emb_fmb) * embed_dim, 3, 400, dim_out=1, batch_norm=True, dropout=dropout)

    def forward(self, x) -> Tensor:

        # outputs = self.embedding(x).view(-1, self.embed_output_dim)

        embed_x = self.embedding(x).view(-1, self.num_fields, self.embed_dim)
        # # print("x old: ", embed_x.size())
        # embed_x = self.embedding(x).view(self.num_fields, -1, self.embed_dim)
        # # print("x new: ", embed_x.size())
        # # return 

        outputs = self.interaction_layers(embed_x)
        outputs = outputs.view(-1, (self.num_emb_lcb + self.num_emb_fmb) * self.embed_dim)
        outputs = self.mlp(outputs)

        return torch.sigmoid(outputs.squeeze(1))


# class DNNModel(torch.nn.Module):
#     def __init__(self, field_dims, embed_dim, mlp_dims, dropout):
#         super().__init__()
#         self.embedding = FeaturesEmbedding(field_dims, embed_dim)
#         self.embed_output_dim = len(field_dims) * embed_dim
#         self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)

#     def forward(self, x):
#         """
#         :param x: Long tensor of size ``(batch_size, num_fields)``
#         """
#         embed_x = self.embedding(x)
#         x = self.mlp(embed_x.view(-1, self.embed_output_dim))
#         return torch.sigmoid(x.squeeze(1))
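
For completeness, this is how I would smoke-test the model defined above; a minimal sketch assuming the torchfm FeaturesEmbedding shown in the imports, with made-up field_dims and batch size rather than the actual experiment configuration.

# Hypothetical smoke test for WukongModel; field_dims and batch size are
# placeholder values, not the experiment configuration.
if __name__ == "__main__":
    field_dims = [1000, 2000]                          # e.g. user / item vocabulary sizes
    model = WukongModel(field_dims, embed_dim=16, num_layers=1)
    x = torch.randint(0, 1000, (32, len(field_dims)))  # (batch_size, num_fields) long tensor
    out = model(x)                                     # (batch_size,) predicted probabilities
    print(out.shape)                                   # expected: torch.Size([32])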

YH-learning · Sep 24 '24 09:09