[BUG] `pipeline.log` cache location not consistent within the same `Pipeline`
Describe the bug
Apparently, the cache location differs within the `Pipeline.run` method before and after the call to `super().run()`: that call updates the pipeline signature, and the signature determines the cache path. As a result, `pipeline.log` is dumped into one directory (path derived from the signature calculated before `super().run()`), while the rest of the cache files end up in another one (path derived from the signature calculated after `super().run()`).
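As a rough way to observe the symptom after running the reproduction below, one can list the pipeline cache directory and check which files each signature directory contains. This is only a sketch: the cache root used here (`~/.cache/distilabel/pipelines`) is an assumption, so point it at wherever distilabel actually writes its pipeline cache on your machine.

```python
# Minimal sketch to observe the symptom after running the reproduction below.
# NOTE: the cache root is an assumption, not a documented constant; adjust it
# to wherever the distilabel pipeline cache lives on your machine.
from pathlib import Path

cache_root = Path.home() / ".cache" / "distilabel" / "pipelines"

for signature_dir in sorted(p for p in cache_root.rglob("*") if p.is_dir()):
    files = [f.name for f in signature_dir.iterdir() if f.is_file()]
    if files:
        print(signature_dir.relative_to(cache_root), "->", files)

# With the bug present, one directory contains only `pipeline.log`, while a
# sibling directory (different signature) contains the rest of the files.
```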
To Reproduce

```python
import time

from distilabel.llms import LlamaCppLLM, OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadHubDataset, PushToHub
from distilabel.steps.combine import CombineColumns
from distilabel.steps.generators.data import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration, UltraFeedback

if __name__ == "__main__":
    start_time = time.time()

    with Pipeline(name="ultrafeedback-dpo") as pipeline:
        load_dataset = LoadHubDataset(
            name="load_dataset",
            output_mappings={"prompt": "instruction"},
        )

        text_generation_zephyr = TextGeneration(
            name="text_generation_zephyr",
            llm=LlamaCppLLM(
                model_path="./models/zephyr-7b-beta.Q4_K_M.gguf",  # type: ignore
                n_gpu_layers=-1,
                n_ctx=1024,
            ),
            input_batch_size=10,
            output_mappings={"model_name": "generation_model"},
        )
        load_dataset.connect(text_generation_zephyr)

        text_generation_gemma = TextGeneration(
            name="text_generation_gemma",
            llm=LlamaCppLLM(
                model_path="./models/gemma-7b-it-Q4_K_M.gguf",  # type: ignore
                n_gpu_layers=-1,
                n_ctx=1024,
            ),
            input_batch_size=10,
            output_mappings={"model_name": "generation_model"},
        )
        load_dataset.connect(text_generation_gemma)

        combine_columns = CombineColumns(
            name="combine_columns",
            columns=["generation", "generation_model"],
            output_columns=["generations", "generation_models"],
        )
        text_generation_zephyr.connect(combine_columns)
        text_generation_gemma.connect(combine_columns)

        ultrafeedback = UltraFeedback(
            name="ultrafeedback",
            llm=OpenAILLM(
                model="gpt-4",
                api_key="sk-...",  # type: ignore
            ),
            aspect="overall-rating",
            input_batch_size=10,
            output_mappings={"model_name": "ultrafeedback_model"},
        )
        combine_columns.connect(ultrafeedback)

    distiset = pipeline.run(
        parameters={
            "load_dataset": {
                "repo_id": "distilabel-internal-testing/instruction-dataset-mini",
                "split": "test",
            },
            "text_generation_zephyr": {
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
            "text_generation_gemma": {
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
            "ultrafeedback": {
                "llm": {
                    "generation_kwargs": {
                        "max_new_tokens": 1024,
                        "temperature": 0.7,
                    },
                },
            },
        }
    )

    print("--- %s seconds ---" % (time.time() - start_time))

    distiset.push_to_hub(
        "alvarobartt/example-distilabel", token="..."
    )
```
Expected behaviour
The cache location should be the same throughout the `Pipeline.run` method, so that `pipeline.log` and the rest of the cache files end up in the same directory.
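One possible direction, sketched below with toy stand-in classes (none of the names are the actual distilabel internals): resolve the cache location once, before `super().run()` can update the signature, and reuse that same path for both the log file and the remaining cache files.

```python
# Toy sketch of the expected behaviour; classes and file names are
# illustrative stand-ins, NOT the actual distilabel internals.
import hashlib
from pathlib import Path


class BasePipeline:
    def __init__(self, name: str) -> None:
        self.name = name
        self.parameters: dict = {}

    @property
    def signature(self) -> str:
        # The signature depends on the runtime parameters, so it changes
        # once `run()` applies them.
        payload = f"{self.name}-{sorted(self.parameters.items())}"
        return hashlib.sha1(payload.encode()).hexdigest()[:8]

    def cache_location(self, root: Path) -> Path:
        return root / self.name / self.signature

    def run(self, parameters: dict) -> None:
        self.parameters = parameters


class Pipeline(BasePipeline):
    def run(self, parameters: dict, root: Path = Path(".cache")) -> None:
        # Resolve the cache location ONCE, before the signature can change...
        cache_dir = self.cache_location(root)
        cache_dir.mkdir(parents=True, exist_ok=True)
        (cache_dir / "pipeline.log").touch()

        super().run(parameters)  # this call updates the signature

        # ...and reuse the same directory for the remaining cache files,
        # instead of recomputing the path from the (now different) signature.
        (cache_dir / "pipeline.yaml").touch()


if __name__ == "__main__":
    Pipeline(name="ultrafeedback-dpo").run({"temperature": 0.7})
    # Both files now live under the same `.cache/ultrafeedback-dpo/<signature>/`.
```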
Desktop:
- Package version: `develop`
- Python version: 3.11