recommenders
recommenders copied to clipboard
combine listwise ranking and feature preprocessing
https://www.tensorflow.org/recommenders/examples/listwise_ranking by follow this doucument I wrote some code
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")
ratings = ratings.map(
lambda x: {
"movie_title": x["movie_title"],
"user_id": x["user_id"],
"user_rating": x["user_rating"],
}
)
movies = movies.map(lambda x: x["movie_title"])
unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(
np.concatenate(list(ratings.batch(1_000).map(lambda x: x["user_id"])))
)
tf.random.set_seed(42)
# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)
# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
train, num_list_per_user=50, num_examples_per_list=5, seed=42
)
test = tfrs.examples.movielens.sample_listwise(
test, num_list_per_user=1, num_examples_per_list=5, seed=42
)
for example in train.take(1):
pprint.pprint(example)
class RankingModel(tfrs.Model):
def __init__(self, loss):
super().__init__()
embedding_dimension = 32
self.user_embeddings = tf.keras.Sequential(
[
tf.keras.layers.StringLookup(vocabulary=unique_user_ids),
tf.keras.layers.Embedding(
len(unique_user_ids) + 2, embedding_dimension
),
]
)
self.movie_embeddings = tf.keras.Sequential(
[
tf.keras.layers.StringLookup(vocabulary=unique_movie_titles),
tf.keras.layers.Embedding(
len(unique_movie_titles) + 2, embedding_dimension
),
]
)
max_tokens = 1000
self.title_text_embedding = tf.keras.Sequential(
[
tf.keras.layers.TextVectorization(max_tokens=max_tokens),
tf.keras.layers.Embedding(max_tokens, embedding_dimension, mask_zero=True),
# We average the embedding of individual words to get one embedding vector
# per title.
tf.keras.layers.GlobalAveragePooling1D(),
]
)
self.score_model = tf.keras.Sequential(
[
tf.keras.layers.Dense(256, activation="relu"),
tf.keras.layers.Dense(64, activation="relu"),
tf.keras.layers.Dense(1),
]
)
self.task = tfrs.tasks.Ranking(
loss=loss,
metrics=[
tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
tf.keras.metrics.RootMeanSquaredError(),
],
)
def call(self, features):
tf.print(features)
user_embeddings = self.user_embeddings(features["user_id"])
movie_embeddings = self.movie_embeddings(features["movie_title"])
text_embeddings = self.title_text_embedding(features["movie_title"])
movie_model = tf.concat([movie_embeddings, text_embeddings], axis=1)
list_length = features["movie_title"].shape[1]
user_embedding_repeated = tf.repeat(
tf.expand_dims(user_embeddings, 1), [list_length], axis=1
)
concatenated_embeddings = tf.concat(
[user_embedding_repeated, movie_model], 2
)
return self.score_model(concatenated_embeddings)
def compute_loss(self, features, training=False):
labels = features.pop("user_rating")
scores = self(features)
return self.task(
labels=labels,
predictions=tf.squeeze(scores, axis=-1),
)
epochs = 30
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(8192).cache()
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
listwise_model.fit(cached_train, epochs=epochs, verbose=False)
listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["ndcg_metric"]))
notice that I add a text feature
text_embeddings = self.title_text_embedding(features["movie_title"])
but got error
in user code:
File "xxxxx/rank_demo.py", line 104, in call *
text_embeddings = self.title_text_embedding(features["movie_title"])
File "xxxxx/traceback_utils.py", line 70, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "xxxxx/text_vectorization.py", line 573, in _preprocess
raise ValueError(
ValueError: Exception encountered when calling layer 'text_vectorization' (type TextVectorization).
When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 5) with rank=2
Call arguments received by layer 'text_vectorization' (type TextVectorization):
• inputs=tf.Tensor(shape=(None, 5), dtype=string)
Call arguments received by layer 'ranking_model' (type RankingModel):
• features={'user_id': 'tf.Tensor(shape=(None,), dtype=string)', 'movie_title': 'tf.Tensor(shape=(None, 5), dtype=string)'}
I think it's because the tfrs.examples.movielens.sample_listwise reshaped the datasets to shape=(None, 5), but how should I fix it? Really need some help here