Very low top-k accuracy during training; real-world accuracy around 40% at k=5000
Sorry if this is the wrong place to ask; I couldn't find a lead anywhere after weeks of googling.
I've implemented a retrieval model that creates embeddings for cosine-similarity search in a job board application. The retrieval pairs are users (query) and jobs (candidate): the application surfaces likely-positive jobs for a user, then passes those jobs to a ranking model for better recommendations.
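For context, this is roughly how we serve retrieval (a sketch rather than our exact production code; trained_model, jobs_ds and user_features are illustrative names):

import tensorflow_recommenders as tfrs

# Build a brute-force index over all job embeddings, querying with the user tower.
index = tfrs.layers.factorized_top_k.BruteForce(
    query_model=trained_model.query_model, k=5000
)
index.index_from_dataset(
    jobs_ds.batch(512).map(
        lambda data: (data["job_id"], trained_model.candidate_model(data))
    )
)
# Both towers end in L2 normalization, so the index's dot product
# is equivalent to cosine similarity.
scores, job_ids = index(user_features)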
In our real-world testing, we're seeing around 35-40% agreement with observed user behaviour when retrieving the top 5000 candidates by cosine similarity (as the model improves, we plan to lower this cutoff; the goal is 500). The top-k metrics during training are also quite poor (<4% even at k=100):
factorized_top_k/top_1_categorical_accuracy: 9.3203e-04
factorized_top_k/top_5_categorical_accuracy: 0.0032
factorized_top_k/top_10_categorical_accuracy: 0.0048
factorized_top_k/top_50_categorical_accuracy: 0.0202
factorized_top_k/top_100_categorical_accuracy: 0.0369
loss: 1307.5373 - regularization_loss: 0.0000e+00 - total_loss: 1307.5373
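(For what it's worth, FactorizedTopK accepts a ks argument, and the cutoffs above are its defaults; a sketch of measuring at cutoffs closer to our production k would be:)

# Sketch: report top-k accuracy nearer the production cutoff of k=5000.
# candidates_ds stands in for the mapped (job_id, embedding) dataset
# built in RetrievalModel.__init__ below.
metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidates_ds,
    ks=(10, 100, 1000, 5000),
)
task = tfrs.tasks.Retrieval(metrics=metrics)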
In our dataset, we have about the following (a sketch of the resulting training input follows the list):
- 105k user-job interactions (54k positive)
- 7k users
- 137k jobs
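That yields a training input shaped roughly like this (a simplified sketch; interactions_df stands in for our joined table of interactions with user and job features):

import tensorflow as tf

# Sketch: one element per interaction, carrying the user_* and job_* feature
# keys both towers expect, plus "job_id" for the retrieval task.
train_ds = (
    tf.data.Dataset.from_tensor_slices(dict(interactions_df))
    .shuffle(105_000, seed=42)
    .batch(4096)
)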
We're at a loss as to how to improve the model using the recommenders library and the retrieval algorithm.
Here's the model code:
from typing import Dict, Text, List
import tensorflow as tf
import tensorflow_recommenders as tfrs
from ..utils import get_buckets, _make_text_vect_layer, CONFIG
# left side is model layer name, right side is key in dataset
# {layer_name: data_key}
user_model_feature_to_data_map = {
"user_skills_layer": "user_skills",
"user_specialization_layer": "user_current_specialization",
"user_job_title_layer": "user_job_title",
"user_min_salary_layer": "user_min_salary",
"user_seniority_layer": "user_seniority",
"user_remote_pref_layer": "user_remote",
"user_job_perks_layer": "user_job_perks",
"user_countries_layer": "user_countries",
"user_country_states_layer": "user_country_states",
}
job_model_feature_to_data_map = {
"job_job_title_layer": "job_job_title",
"job_seniority_layer": "job_required_seniority",
"job_remote_pref_layer": "job_remote",
"job_required_skills_layer": "job_required_skills",
"job_normalized_title_layer": "job_normalized_title",
"job_culture_values": "job_culture_values",
"job_perks_layer": "job_perks",
"job_country_layer": "job_country",
"job_country_state_layer": "job_country_state",
"job_must_reside_in_layer": "job_must_reside_in",
"job_min_salary_layer": "job_min_salary",
"job_max_salary_layer": "job_max_salary",
}
class UserModel(tf.keras.Model):
"""
build sequential model for each feature
pass outputs to dense/cross layers
concatentate the outputs
the produced embedding represents the features
of a Playlist known at query time
"""
def __init__(self, vocab_dict, embedding_dim, seed, layer_sizes):
super().__init__()
self.user_skills_layer = _make_text_vect_layer(
vocab_dict,
"user_skills",
embedding_dim,
layer_name="user_skills",
no_embedding=True,
)
self.user_specialization_layer = _make_text_vect_layer(
vocab_dict, "user_current_specializations", no_embedding=True
)
self.user_job_title_layer = _make_text_vect_layer(
vocab_dict, "user_job_titles", no_embedding=True
)
self.user_seniority_layer = _make_text_vect_layer(
vocab_dict, "user_seniorities", no_embedding=True
)
self.user_remote_pref_layer = _make_text_vect_layer(
vocab_dict, "user_remotes", no_embedding=True
)
user_min_salary_max_val = CONFIG["salary_bucket_max"]
user_min_salary_bucket_step = CONFIG["salary_bucket_step"]
user_min_salary_bucket_num = (
user_min_salary_max_val // user_min_salary_bucket_step
)
self.user_min_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=user_min_salary_max_val,
buckets_num=user_min_salary_bucket_num,
),
output_mode="one_hot",
),
# XXX(Phong): only use the embedding layer if output_mode is "int"
# tf.keras.layers.Embedding(
# input_dim=user_min_salary_bucket_num + 1,
# output_dim=embedding_dim,
# name="user_min_salary_emb_layer",
# mask_zero=False,
# ),
],
name="user_min_salary_layer",
)
self.user_job_perks_layer = _make_text_vect_layer(
vocab_dict, "user_job_perks", no_embedding=True
)
self.user_countries_layer = _make_text_vect_layer(
vocab_dict, "user_countries", no_embedding=True
)
self.user_country_states_layer = _make_text_vect_layer(
vocab_dict, "user_country_states", embedding_dim=embedding_dim
)
self.dense_layers = tf.keras.Sequential(name="user_dense_layers")
for layer_size in layer_sizes:
self.dense_layers.add(
tf.keras.layers.Dense(
units=layer_size,
activation="relu",
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
)
)
# if use_dropout:
# self.dense_layers.add(tf.keras.layers.Dropout(rate=0.2))
        # Add L2 normalization at the end to fix the vector magnitude
        # (unit-length embeddings), good for cosine similarity comparisons.
self.dense_layers.add(
tf.keras.layers.Lambda(
lambda x: tf.nn.l2_normalize(x, axis=-1), name="l2_normalization"
)
)
# self.dense_layers.add(
# tf.keras.layers.LayerNormalization(name="normalize_dense")
# )
def call(self, data):
all_embs = tf.concat(
[
getattr(self, layer_name)(data[data_key])
for layer_name, data_key in user_model_feature_to_data_map.items()
],
axis=1,
)
return self.dense_layers(all_embs)
class JobModel(tf.keras.Model):
"""
build sequential model for each feature
pass outputs to dense/cross layers
concatentate the outputs
the produced embedding represents the features
of a Playlist known at query time
"""
def __init__(self, vocab_dict, embedding_dim, seed, layer_sizes):
super().__init__()
        ### output_mode="int" is very important; this layer runs VERY slowly if it is not set
self.job_job_title_layer = _make_text_vect_layer(
vocab_dict,
"job_job_titles",
embedding_dim,
layer_name="job_job_title",
output_mode="int",
)
self.job_seniority_layer = _make_text_vect_layer(
vocab_dict, "job_seniorities", no_embedding=True
)
self.job_remote_pref_layer = _make_text_vect_layer(
vocab_dict, "job_remotes", no_embedding=True
)
self.job_required_skills_layer = _make_text_vect_layer(
vocab_dict, "job_required_skills", no_embedding=True
)
self.job_normalized_title_layer = _make_text_vect_layer(
vocab_dict, "job_normalized_titles", no_embedding=True
)
self.job_culture_values = _make_text_vect_layer(
vocab_dict, "job_culture_values", no_embedding=True
)
self.job_perks_layer = _make_text_vect_layer(
vocab_dict, "job_perks", no_embedding=True
)
self.job_country_layer = _make_text_vect_layer(
vocab_dict, "job_countries", no_embedding=True
)
self.job_country_state_layer = _make_text_vect_layer(
vocab_dict, "job_country_states", embedding_dim=embedding_dim
)
self.job_must_reside_in_layer = _make_text_vect_layer(
vocab_dict, "job_must_reside_in", no_embedding=True
)
salary_bucket_max = CONFIG["salary_bucket_max"]
salary_bucket_step = CONFIG["salary_bucket_step"]
salary_bucket_num = salary_bucket_max // salary_bucket_step
self.job_min_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=salary_bucket_max,
buckets_num=salary_bucket_num,
),
output_mode="one_hot",
),
],
name="job_min_salary_layer",
)
self.job_max_salary_layer = tf.keras.Sequential(
[
tf.keras.layers.Discretization(
get_buckets(
min_val=1,
max_val=salary_bucket_max,
buckets_num=salary_bucket_num,
),
output_mode="one_hot",
),
],
name="job_max_salary_layer",
)
self.dense_layers = tf.keras.Sequential(name="job_dense_layers")
for layer_size in layer_sizes:
self.dense_layers.add(
tf.keras.layers.Dense(
units=layer_size,
activation="relu",
kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
)
)
# if use_dropout:
# self.dense_layers.add(tf.keras.layers.Dropout(rate=0.2))
        # Add L2 normalization at the end to fix the vector magnitude
        # (unit-length embeddings), good for cosine similarity comparisons.
self.dense_layers.add(
tf.keras.layers.Lambda(
lambda x: tf.nn.l2_normalize(x, axis=-1), name="l2_normalization"
)
)
# self.dense_layers.add(
# tf.keras.layers.LayerNormalization(name="normalize_dense")
# )
def call(self, data):
all_embs = tf.concat(
[
getattr(self, layer_name)(data[data_key])
for layer_name, data_key in job_model_feature_to_data_map.items()
],
axis=1,
)
return self.dense_layers(all_embs)
class RetrievalModel(tfrs.Model):
def __init__(self, vocab_dict, dataset):
super().__init__()
self.embedding_dim = 128
# XXX(Phong): this is the output layer, if you change this, you need to
# run a migration to update the db embeddings dims on helix-service
self.layer_sizes = [256]
self.seed = 42
self.query_model: tf.keras.Model = UserModel(
vocab_dict=vocab_dict,
embedding_dim=self.embedding_dim,
seed=self.seed,
layer_sizes=self.layer_sizes,
)
self.candidate_model: tf.keras.Model = JobModel(
vocab_dict=vocab_dict,
embedding_dim=self.embedding_dim,
seed=self.seed,
layer_sizes=self.layer_sizes,
)
self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
metrics=tfrs.metrics.FactorizedTopK(
# XXX(Phong): all occurences of the candidate in the dataset,
# not affected by train/test split
candidates=dataset.batch(512).map(
lambda data: (
data["job_id"],
# XXX(Phong): need to pull out the relevant keys
# or the model will infer the inputs from the wrong ones
self.candidate_model(
{
key: data[key]
for key in job_model_feature_to_data_map.values()
}
),
)
)
)
)
"""
XXX(Phong): `data` is the training data being passed into the
RetrievalModel when you run model.fit()
"""
def compute_loss(self, data: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
CANDIDATE_ID_KEY = "job_id"
# XXX(Phong): need to pull out the relevant training data for each sub-
# model, otherwise it will infer the inputs from the keys of the data
user_model_train_data = {
key: data[key] for key in user_model_feature_to_data_map.values()
}
query_embeddings = self.query_model(user_model_train_data)
job_model_train_data = {
key: data[key] for key in job_model_feature_to_data_map.values()
}
candidate_embeddings = self.candidate_model(job_model_train_data)
return self.task(
query_embeddings,
candidate_embeddings,
candidate_ids=data[CANDIDATE_ID_KEY],
)
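And this is roughly how we train it (a sketch; the optimizer and epoch count are representative rather than exact):

# Sketch of the training setup; jobs_ds is the unbatched dataset of job
# features (including "job_id") consumed by RetrievalModel.__init__ above.
model = RetrievalModel(vocab_dict=vocab_dict, dataset=jobs_ds)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
model.fit(train_ds, epochs=5)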
Are we using the library wrong? There are some differences from the MovieLens example, as we're following this repo instead.