When loading model saved locally, I get an error message huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'model_name'. Use `repo_type` argument if needed.
Have you searched existing issues? 🔎
- [x] I have searched and found no existing issues
Describe the bug
Python: 3.9.12 Bertopic: 0.16.4
I get this error when trying to read a saved BERTopic model. I am loading using the same package and environment:
Traceback (most recent call last):
File "/scratch/ta480/hormonal/hormanalBertopicModular.py", line 91, in repo_type argument if needed.
This is the complete code in case you'd like to review: import json import os import gc from collections import defaultdict
import sys import threading import time from datetime import datetime import logging
from hdbscan import HDBSCAN from bertopic import BERTopic import gensim.corpora as corpora from sklearn.feature_extraction.text import CountVectorizer from gensim.models.coherencemodel import CoherenceModel from umap import UMAP import pandas as pd import json import random
Set up logging to both console and a log file
# --- Logging: mirror all DEBUG+ messages to both stdout and process.log ---
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')

ch = logging.StreamHandler(sys.stdout)
fh = logging.FileHandler('process.log')
for _handler in (ch, fh):
    _handler.setLevel(logging.DEBUG)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)


def log_info(message):
    """Log *message* at INFO level on the root logger."""
    logger.info(message)
Read and preprocess documents
def create_threads_list_from_directory(input_dir): threads = defaultdict(list) for filename in os.listdir(input_dir): if filename.endswith(".jsonl"): file_path = os.path.join(input_dir, filename) with open(file_path, "r", encoding="utf-8") as f: for line in f: try: data = json.loads(line) link_id = data.get("link_id") body = data.get("body", "").strip() if link_id and body and body.lower() not in ["[deleted]", "[removed]"]: threads[link_id].append(body) except json.JSONDecodeError: continue return [" ".join(comments) for comments in threads.values()]
Function to compute coherence score
def compute_coherence_score(topic_model, docs_text):
    """Compute the u_mass coherence of a fitted BERTopic model over *docs_text*.

    Tokenizes the documents with the model's own vectorizer so the vocabulary
    matches the topic representations, then scores the topics with gensim's
    CoherenceModel.  Returns None when no topic yields any terms.
    """
    log_info("Computing coherence score...")

    # Reuse the model's vectorizer so tokenization matches the topic words.
    analyzer = topic_model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in docs_text if isinstance(doc, str)]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Gather term lists per topic; the "- 1" skips the outlier topic (-1),
    # assuming topic ids run 0..n-2 — presumably true after HDBSCAN clustering.
    topic_words = []
    num_topics = len(set(topic_model.get_topics())) - 1
    for topic in range(num_topics):
        topic_data = topic_model.get_topic(topic)
        log_info(f"Topic {topic} raw data: {topic_data}")
        if topic_data is not None and len(topic_data) > 0:
            topic_terms = [word for word, _ in topic_data]
            if len(topic_terms) > 0:
                topic_words.append(topic_terms)

    if not topic_words:
        log_info("No valid topics extracted. Skipping coherence calculation.")
        return None

    coherence_model = CoherenceModel(topics=topic_words, texts=tokens,
                                     corpus=corpus, dictionary=dictionary,
                                     coherence='u_mass')
    coherence = coherence_model.get_coherence()
    log_info(f"Coherence score: {coherence}")
    return coherence
Load and compute coherence score for existing BERTopic model
# --- Score the previously saved BERTopic model ---
log_info("Loading pre-trained BERTopic model.")
model_filename = "/scratch/ta480/hormonal/bertopic_n10_m10_g(1,2).model"
topic_model = BERTopic.load(model_filename, embedding_model="all-MiniLM-L6-v2")

input_directory = "/scratch/ta480/hormonal/"
docs_text = create_threads_list_from_directory(input_directory)

coherence_score = compute_coherence_score(topic_model, docs_text)
log_info(f"Computed coherence score for {model_filename}: {coherence_score}")

# Release the loaded model before the training sweep below.
del topic_model
gc.collect()
Restart training and coherence calculation with new parameters
umap_neighbors = [20, 30, 40, 50] # UMAP neighborhood sizes min_topic_sizes = [20, 30, 40, 50] # For the clustering model (MiniBatchKMeans doesn't use min_topic_size but you can adjust n_clusters) ngram_ranges = [(1, 2), (1, 3)] # Different n-gram tokenization strategies
def train_and_evaluate_models():
    """Grid-search BERTopic configurations and score each with u_mass coherence.

    Iterates over the module-level ``umap_neighbors`` x ``min_topic_sizes`` x
    ``ngram_ranges`` grid, trains one BERTopic model per combination on the
    global ``docs_text``, and records its coherence score.

    Returns:
        tuple[list, list]: the trained models and their coherence values,
        in the same grid order.
    """
    log_info("Starting training of new BERTopic models.")
    model_list = []
    coherence_values = []
    # BUG FIX: the original iterated over "dumap_neighbors", an undefined
    # name (the grid is defined as "umap_neighbors"), raising NameError
    # before any model was trained.
    for n_neighbors in umap_neighbors:
        for min_size in min_topic_sizes:
            for ngram in ngram_ranges:
                log_info(f"Training Model with UMAP({n_neighbors}), HDBSCAN({min_size}), N-gram{ngram}")
                umap_model = UMAP(n_neighbors=n_neighbors, n_components=3, min_dist=0.1, metric='cosine')
                hdbscan_model = HDBSCAN(min_cluster_size=min_size, metric='euclidean',
                                        cluster_selection_method='eom')
                vectorizer_model = CountVectorizer(ngram_range=ngram, stop_words="english")
                topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model,
                                       vectorizer_model=vectorizer_model,
                                       embedding_model="all-MiniLM-L6-v2", low_memory=True)
                topics, probs = topic_model.fit_transform(docs_text)
                log_info("BERTopic model trained successfully.")
                model_list.append(topic_model)
                coherence = compute_coherence_score(topic_model, docs_text)
                coherence_values.append(coherence)
    log_info("Model training and coherence computation completed.")
    return model_list, coherence_values
log_info("Commencing new BERTopic model training and coherence calculation.") model_list, coherence_values = train_and_evaluate_models() log_info("Process finished successfully.")
Reproduction
from bertopic import BERTopic
BERTopic Version
0.16.4
I am getting same error. In case it is useful:
from bertopic import BERTopic
import os
import glob
def main():
directory = "/scratch/ysc4337/constellate/data/"
models: list = []
# List all models in a directory
part_directories = glob.glob(os.path.join(directory, "part-4*"))
# List files in each found directory
for part_dir in part_directories:
if os.path.isdir(part_dir):
for file in os.listdir(part_dir):
if file.startswith("bertopic_model") and not file.endswith(".html"):
models.append(os.path.abspath(file))
print(models)
models = models[:3]
# Iteratively merge models
# Initialize the first model
# Load each model
loaded_models = [
BERTopic.load(model_path, embedding_model="all-MiniLM-L6-v2")
for model_path in models
]
# Merge models
merged_model = BERTopic.merge_models(loaded_models)
# Save the merged model
merged_model.save("/scratch/ysc4337/constellate/model_output/merged_model_part_4")
# Visualize the models
merged_model.visualize_topics().write_html(
"/scratch/ysc4337/constellate/visualizations/merged_topics_part_4.html"
)
merged_model.visualize_heatmap().write_html(
"/scratch/ysc4337/constellate/visualizations/merged_heatmp_part_4.html"
)
merged_model.visualize_barchart().write_html(
"/scratch/ysc4337/constellate/visualizations/merged_barchart_terms_part4.html"
)
if __name__ == "__main__":
main()
Versions:
bertopic 0.16.4
Python 3.9.21
Traceback:
Traceback (most recent call last):
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/batch1/merge-models.py", line 52, in <module>
main()
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/batch1/merge-models.py", line 26, in main
loaded_models = [
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/batch1/merge-models.py", line 27, in <listcomp>
BERTopic.load(model_path, embedding_model="all-MiniLM-L6-v2")
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/.venv/lib/python3.9/site-packages/bertopic/_bertopic.py", line 3394, in load
topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/.venv/lib/python3.9/site-packages/bertopic/_save_utils.py", line 227, in load_files_from_hf
topics = load_cfg_from_json(hf_hub_download(path, TOPICS_NAME, revision=None))
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/.venv/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
validate_repo_id(arg_value)
File "/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/.venv/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
raise HFValidationError(
huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/projects/p32234/projects/aerith/anthropocene-reconcile/anthropocene-ML/anthropoceneCodebase/constellate/batch1/bertopic_model_part-4_50000_60000'. Use `repo_type` argument if needed.
It seems that it thinks the path you shared must be a HF-hub link, which isn't the case. What if you were to remove the "/" at the beginning of the path?
It seems that it thinks the path you shared must be a HF-hub link, which isn't the case. What if you were to remove the "/" at the beginning of the path?
Just tried it. Still same error.
Could one of you provide a minimal reproducible example? I cannot seem to reproduce the issue.
I just tried the exact same code as in my first comment, and it seemed to work fine this time. I will continue to try to reproduce the error for knowledge's sake. I will note that I am working in a scratch directory on remote clusters, which is the one similarity we both have. I am wondering if that may be contributing to the issue?
Not sure, but I wouldn't be surprised if it is. huggingface-hub needs to have specific paths, so perhaps it is related to that (and to an extension a remote env).