Optimization for large values of num_list_per_user
I noticed that the current algorithm is slow for large values of num_list_per_user. As a fix, I propose substituting the regular Python lists with NumPy arrays for faster indexing. You can check the difference in https://colab.research.google.com/drive/1PMUomKqlEe48kzCeIZqQWNB0Rm_bLnJd?usp=sharing

I'd also like to suggest a generator-based rewrite of sample_listwise in general. It removes the need to materialize the large tensor_slices dictionary up front (a sketch of the helpers it assumes, including the NumPy-based indexing, follows the snippet):
```python
from collections import defaultdict

import numpy as np
import tensorflow as tf


def sample_listwise(rating_dataset, num_list_per_user, num_examples_per_list, seed):
  random_state = np.random.RandomState(seed)

  # Group each user's movie titles and ratings together.
  example_lists_by_user = defaultdict(_create_feature_dict)
  movie_title_vocab = set()
  for example in rating_dataset:
    user_id = example["user_id"].numpy()
    example_lists_by_user[user_id]["movie_title"].append(example["movie_title"])
    example_lists_by_user[user_id]["user_rating"].append(example["user_rating"])
    # Collected in case a title vocabulary is needed downstream.
    movie_title_vocab.add(example["movie_title"].numpy())

  def sampled_customer_ratings_generator():
    for user_id, feature_lists in example_lists_by_user.items():
      # Drop the user if they don't have enough ratings.
      if len(feature_lists["movie_title"]) < num_examples_per_list:
        continue
      for _ in range(num_list_per_user):
        sampled_movie_titles, sampled_ratings = _sample_list(
            feature_lists,
            num_examples_per_list,
            random_state=random_state,
        )
        yield {
            "user_id": user_id,
            "movie_title": sampled_movie_titles,
            "user_rating": sampled_ratings,
        }

  # Stream examples from the generator above instead of building a large
  # tensor_slices dictionary in memory.
  return tf.data.Dataset.from_generator(
      sampled_customer_ratings_generator,
      output_signature={
          "user_id": tf.TensorSpec([], dtype=tf.string),
          "movie_title": tf.TensorSpec([num_examples_per_list], dtype=tf.string),
          "user_rating": tf.TensorSpec([num_examples_per_list], dtype=tf.float32),
      },
  )
```
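
For reference, here is a minimal sketch of the two helpers the snippet assumes. `_create_feature_dict` matches the tutorial; `_sample_list` is rewritten here to index into NumPy arrays instead of plain Python lists, per the suggestion above, so treat it as an illustration rather than the tutorial's exact code:

```python
def _create_feature_dict():
  """Empty per-user feature dict for the defaultdict above."""
  return {"movie_title": [], "user_rating": []}


def _sample_list(feature_lists, num_examples_per_list, random_state=None):
  """Samples one list of titles/ratings using NumPy fancy indexing."""
  if random_state is None:
    random_state = np.random.RandomState()

  # Convert the user's features to NumPy arrays; fancy indexing on these is
  # much cheaper than repeated Python-list indexing. For the biggest win the
  # conversion could be hoisted out and done once per user.
  movie_titles = np.asarray([t.numpy() for t in feature_lists["movie_title"]])
  ratings = np.asarray([r.numpy() for r in feature_lists["user_rating"]])

  sampled_indices = random_state.choice(
      len(movie_titles), size=num_examples_per_list, replace=False)
  return (
      tf.convert_to_tensor(movie_titles[sampled_indices]),
      tf.convert_to_tensor(ratings[sampled_indices], dtype=tf.float32),
  )
```

Usage is unchanged from the tutorial, e.g. `train = sample_listwise(ratings, num_list_per_user=50, num_examples_per_list=5, seed=42)`, assuming `ratings` is a dataset of MovieLens rating examples.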