distilabel
distilabel copied to clipboard
[BUG] UltraFeedback pipeline doesn't work
Describe the bug
I want to use the UltraFeedback task in a pipeline, but I already have the dataset, so the pipeline only includes loading the dataset and then passing it to the UltraFeedback module. Doing this gives this error:
WARNING ['distilabel.step.ultrafeedback'] ⚠️ Processing batch 12 with step step_wrapper.py:240
'ultrafeedback' failed. Sending empty batch filled with None
s...
WARNING ['distilabel.step.ultrafeedback'] Subprocess traceback: step_wrapper.py:244
Traceback (most recent call last):
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/pipeline/step_wrapper.py", line 228, in
_non_generator_process_loop
result = next(step.process_applying_mappings(*batch.data))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/steps/base.py", line 545, in
process_applying_mappings
for output_rows in generator:
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/steps/tasks/base.py", line 198, in
process
outputs = self.llm.generate(
^^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/llms/base.py", line 357, in generate
return self.event_loop.run_until_complete(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/nest_asyncio.py", line 98, in run_until_complete
return f.result()
^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/asyncio/futures.py", line 203, in result
raise self._exception.with_traceback(self._exception_tb)
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/asyncio/tasks.py", line 279, in __step
result = coro.throw(exc)
^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/llms/base.py", line 327, in _agenerate
return await asyncio.gather(*tasks)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/asyncio/tasks.py", line 349, in __wakeup
future.result()
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/asyncio/tasks.py", line 277, in __step
result = coro.send(None)
^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/distilabel/llms/openai.py", line 268, in agenerate
completion = await
self._aclient.chat.completions.create(**kwargs) # type: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/openai/resources/chat/completions.py", line 1295,
in create
return await self._post(
^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/openai/_base_client.py", line 1826, in post
return await self.request(cast_to, opts, stream=stream,
stream_cls=stream_cls)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/openai/_base_client.py", line 1519, in request
return await self._request(
^^^^^^^^^^^^^^^^^^^^
File
"/home/jovyan/synthetic_data_workspace/synthetic_env/lib/python3.1
1/site-packages/openai/_base_client.py", line 1620, in _request
raise self._make_status_error_from_response(err.response) from
None
openai.BadRequestError: Error code: 400 - {'object': 'error',
'message': "This model's maximum context length is 24288 tokens.
However, you requested 74604 tokens (73580 in the messages, 1024
in the completion). Please reduce the length of the messages or
completion.", 'type': 'BadRequestError', 'param': None, 'code':
400}
To Reproduce Code to reproduce
# --- Reproduction script (bug report) --------------------------------------
# Loads a pre-generated dataset from disk and feeds it straight into an
# UltraFeedback task. The commented-out EvolInstruct step is the variant
# that reportedly works; wiring the loader directly to UltraFeedback
# triggers the 400 "maximum context length" error quoted above.
from distilabel.llms import TransformersLLM, OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import ConversationTemplate, DeitaFiltering, ExpandColumns, LoadDataFromHub
from distilabel.steps.tasks import ComplexityScorer, EvolInstruct, EvolQuality, GenerateEmbeddings, QualityScorer, UltraFeedback
import pandas as pd
from huggingface_hub import notebook_login
import os
# Dummy key: the OpenAILLM below points at a self-hosted endpoint via
# base_url, so the OpenAI API key value is never actually validated.
os.environ["OPENAI_API_KEY"] = "EMPTY"
from distilabel.steps import LoadDataFromFileSystem
pipeline = Pipeline(name="DEITA")
# NOTE(review): the JSONL presumably already contains the columns a prior
# EvolInstruct run produced ("evolved_instructions", "answers", ...) —
# confirm the per-row shape of those columns (scalar string vs list).
loader = LoadDataFromFileSystem(data_files="./dataset/distilabel_dataset.jsonl", pipeline=pipeline, batch_size=16)
loader.load()
from distilabel.llms import OpenAILLM
# LLM used as the UltraFeedback judge; served behind an OpenAI-compatible
# endpoint (the 24288-token context limit in the error comes from this server).
llm = OpenAILLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
base_url="http://172.18.21.136:8000/v1",
timeout=15000,
generation_kwargs={
"max_new_tokens": 1024,
"temperature": 0.8,
"top_p": 0.8
}
)
# Working variant (kept for reference): when this step runs, UltraFeedback
# consumes EvolInstruct's output instead of the raw dataset columns.
# evol_instruction_complexity = EvolInstruct(
# name="evol_instruction_complexity",
# llm=llm,
# num_evolutions=4,
# store_evolutions=True,
# generate_answers=True,
# include_original_instruction=False,
# pipeline=pipeline,
# input_batch_size=8
# )
# NOTE(review): with EvolInstruct commented out, "evolved_instructions" and
# "answers" are taken as-is from the loaded dataset via input_mappings. If
# the dataset was produced with store_evolutions=True (as above), each row
# holds a *list* of 4 evolutions/answers, and UltraFeedback builds a single
# judge prompt containing every generation of the row — a plausible cause of
# the 73580-token request that trips the 24288-token context limit.
# TODO: confirm by inspecting one dataset row.
ultrafeedback = UltraFeedback(
name="ultrafeedback",
llm=llm,
aspect="truthfulness",
input_mappings={"instruction": "evolved_instructions", "generations": "answers"},
output_mappings={"model_name": "ultrafeedback_model"},
pipeline=pipeline,
input_batch_size=8
)
# Un-nests the list-valued columns (one output row per list element) — note
# this runs *after* UltraFeedback, so the lists are not expanded before the
# judge step sees them.
expand_evolved_instructions = ExpandColumns(
name="expand_evolved_instructions",
columns=['evolved_instructions', 'answers', 'types', 'ratings', 'rationales-for-ratings'],
pipeline=pipeline,
)
# loader.connect(evol_instruction_complexity)
# evol_instruction_complexity.connect(ultrafeedback)
# Failing topology: loader -> ultrafeedback -> expand.
loader.connect(ultrafeedback)
ultrafeedback.connect(expand_evolved_instructions)
distiset = pipeline.run(
parameters={
"load_data_from_file_system_0": {
# NOTE(review): "repo_id" is passed as a runtime parameter to a
# *file-system* loader constructed with data_files=... — verify
# this is the parameter name the step actually expects.
"repo_id": "./dataset/distilabel_dataset.jsonl",
"batch_size":16
},
# "evol_instruction_complexity": {
# "llm": {"generation_kwargs": {"max_new_tokens": 1024, "temperature": 0.8, "top_p":0.8}},
# "input_batch_size":8
# },
"ultrafeedback": {
"llm": {
"generation_kwargs": {
"max_new_tokens": 1024,
"temperature": 0.8,
}
},
"input_batch_size":8
},
},
use_cache=False,
)
Expected behaviour If you notice, the code contains some commented-out parts. If I use UltraFeedback together with those parts enabled — for example, right after the EvolInstruct module — it works very well. All I did was comment out the EvolInstruct module and connect the loader directly to UltraFeedback (because I already have the generated dataset), and it no longer works, failing with a prompt-length problem. I don't understand why.
Screenshots If applicable, add screenshots to help explain your problem.
Desktop (please complete the following information):
- Package version: Distilabel 1.3.1
- Python version: 3.11.9
Additional context Add any other context about the problem here.