Combining listwise ranking with feature preprocessing

Open · hyyan112 opened this issue 2 years ago • 0 comments

Following the listwise ranking tutorial at https://www.tensorflow.org/recommenders/examples/listwise_ranking, I wrote the code below:

import pprint

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

ratings = ratings.map(
    lambda x: {
        "movie_title": x["movie_title"],
        "user_id": x["user_id"],
        "user_rating": x["user_rating"],
    }
)
movies = movies.map(lambda x: x["movie_title"])

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(
    np.concatenate(list(ratings.batch(1_000).map(lambda x: x["user_id"])))
)

tf.random.set_seed(42)

# Split between train and test sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train, num_list_per_user=50, num_examples_per_list=5, seed=42
)
test = tfrs.examples.movielens.sample_listwise(
    test, num_list_per_user=1, num_examples_per_list=5, seed=42
)

for example in train.take(1):
    pprint.pprint(example)


class RankingModel(tfrs.Model):
    def __init__(self, loss):
        super().__init__()
        embedding_dimension = 32

        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_user_ids),
                tf.keras.layers.Embedding(
                    len(unique_user_ids) + 2, embedding_dimension
                ),
            ]
        )

        self.movie_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(vocabulary=unique_movie_titles),
                tf.keras.layers.Embedding(
                    len(unique_movie_titles) + 2, embedding_dimension
                ),
            ]
        )

        max_tokens = 1000
        self.title_text_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.TextVectorization(max_tokens=max_tokens),
                tf.keras.layers.Embedding(max_tokens, embedding_dimension, mask_zero=True),
                # We average the embedding of individual words to get one embedding vector
                # per title.
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )

        self.score_model = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(64, activation="relu"),
                tf.keras.layers.Dense(1),
            ]
        )

        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
                tf.keras.metrics.RootMeanSquaredError(),
            ],
        )

    def call(self, features):
        tf.print(features)  # debug: inspect the incoming feature shapes
        user_embeddings = self.user_embeddings(features["user_id"])

        movie_embeddings = self.movie_embeddings(features["movie_title"])

        text_embeddings = self.title_text_embedding(features["movie_title"])

        movie_model = tf.concat([movie_embeddings, text_embeddings], axis=1)

        list_length = features["movie_title"].shape[1]
        user_embedding_repeated = tf.repeat(
            tf.expand_dims(user_embeddings, 1), [list_length], axis=1
        )

        concatenated_embeddings = tf.concat(
            [user_embedding_repeated, movie_model], 2
        )

        return self.score_model(concatenated_embeddings)

    def compute_loss(self, features, training=False):
        labels = features.pop("user_rating")

        scores = self(features)

        return self.task(
            labels=labels,
            predictions=tf.squeeze(scores, axis=-1),
        )


epochs = 30

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(8192).cache()

listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

listwise_model.fit(cached_train, epochs=epochs, verbose=False)

listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["ndcg_metric"]))

Notice that I added a text feature:

text_embeddings = self.title_text_embedding(features["movie_title"])

but got this error:

in user code:
    
        File "xxxxx/rank_demo.py", line 104, in call  *
            text_embeddings = self.title_text_embedding(features["movie_title"])
        File "xxxxx/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "xxxxx/text_vectorization.py", line 573, in _preprocess
            raise ValueError(
    
        ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
        
        When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 5) with rank=2
        
        Call arguments received by layer 'text_vectorization' (type TextVectorization):
          • inputs=tf.Tensor(shape=(None, 5), dtype=string)
    
    
    Call arguments received by layer 'ranking_model' (type RankingModel):
      • features={'user_id': 'tf.Tensor(shape=(None,), dtype=string)', 'movie_title': 'tf.Tensor(shape=(None, 5), dtype=string)'}
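
If it helps, the constraint from the error message is easy to reproduce in isolation: TextVectorization accepts rank-1 input, or rank-2 input whose last dimension is 1, but not a full (batch, list_size) matrix of titles like mine. A tiny standalone check (title strings made up):

tv = tf.keras.layers.TextVectorization(max_tokens=1000)
tv.adapt(tf.constant(["some movie title", "another movie title"]))

tv(tf.constant(["some movie title", "another movie title"]))      # rank 1: works
tv(tf.constant([["some movie title"], ["another movie title"]]))  # rank 2, last dim 1: works
tv(tf.constant([["some movie title", "another movie title"]]))    # rank 2, last dim > 1: the ValueError above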

I think it's because tfrs.examples.movielens.sample_listwise reshaped the dataset to shape=(None, 5), but how should I fix it? I'd really appreciate some help here.
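
One workaround I'm considering (untested sketch, replacing the text_embeddings line in call): flatten the (batch, list_size) titles to rank 1 before the text embedding, then restore the list dimension afterwards. This assumes the list length is static (5 here, from num_examples_per_list=5), and that the TextVectorization layer has been adapted on the movie titles first, as in the featurization tutorial:

# Flatten (batch, list_size) -> (batch * list_size,) so TextVectorization sees rank-1 input.
flat_titles = tf.reshape(features["movie_title"], [-1])
flat_text_embeddings = self.title_text_embedding(flat_titles)

# Restore the list dimension: (batch * list_size, 32) -> (batch, list_size, 32),
# where 32 is embedding_dimension.
list_length = features["movie_title"].shape[1]
text_embeddings = tf.reshape(flat_text_embeddings, [-1, list_length, 32])

Does this look like the right way to do it, or is there a built-in way to handle listwise string features?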

hyyan112 · Nov 07 '23 08:11