opencompass icon indicating copy to clipboard operation
opencompass copied to clipboard

[Bug] Can NOT find valid BAAI/TACO dataset

Open ShaneTian opened this issue 3 months ago • 1 comment

Prerequisite

Type

I'm evaluating with the officially supported tasks/models/datasets.

Environment

{'CUDA available': True,
 'CUDA_HOME': '/usr/local/cuda',
 'GCC': 'gcc (GCC) 11.4.0',
 'GPU 0,1,2,3,4,5,6,7': 'NVIDIA H20',
 'MMEngine': '0.10.7',
 'MUSA available': False,
 'NVCC': 'Cuda compilation tools, release 12.8, V12.8.93',
 'PyTorch': '2.7.1+cu126',
 'PyTorch compiling details': 'PyTorch built with:\n'
                              '  - GCC 11.2\n'
                              '  - C++ Version: 201703\n'
                              '  - Intel(R) oneAPI Math Kernel Library Version '
                              '2024.2-Product Build 20240605 for Intel(R) 64 '
                              'architecture applications\n'
                              '  - Intel(R) MKL-DNN v3.7.1 (Git Hash '
                              '8d263e693366ef8db40acc569cc7d8edf644556d)\n'
                              '  - OpenMP 201511 (a.k.a. OpenMP 4.5)\n'
                              '  - LAPACK is enabled (usually provided by '
                              'MKL)\n'
                              '  - NNPACK is enabled\n'
                              '  - CPU capability usage: AVX512\n'
                              '  - CUDA Runtime 12.6\n'
                              '  - NVCC architecture flags: '
                              '-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n'       
                              '  - CuDNN 90.5.1\n'
                              '  - Magma 2.6.1\n'
                              '  - Build settings: BLAS_INFO=mkl, '
                              'BUILD_TYPE=Release, '
                              'COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, '
                              'CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, '
                              'CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, '
                              'CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 '
                              '-fvisibility-inlines-hidden -DUSE_PTHREADPOOL '
                              '-DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER '
                              '-DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM '
                              '-DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK '
                              '-DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC '
                              '-Wall -Wextra -Werror=return-type '
                              '-Werror=non-virtual-dtor '
                              '-Werror=range-loop-construct '
                              '-Werror=bool-operation -Wnarrowing '
                              '-Wno-missing-field-initializers '
                              '-Wno-unknown-pragmas -Wno-unused-parameter '
                              '-Wno-strict-overflow -Wno-strict-aliasing '
                              '-Wno-stringop-overflow -Wsuggest-override '
                              '-Wno-psabi -Wno-error=old-style-cast '
                              '-fdiagnostics-color=always -faligned-new '
                              '-Wno-maybe-uninitialized -fno-math-errno '
                              '-fno-trapping-math -Werror=format '
                              '-Wno-stringop-overflow, LAPACK_INFO=mkl, '
                              'PERF_WITH_AVX=1, PERF_WITH_AVX2=1, '
                              'TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, '
                              'USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, '
                              'USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, '
                              'USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, '
                              'USE_OPENMP=ON, USE_ROCM=OFF, '
                              'USE_ROCM_KERNEL_ASSERT=OFF, \n',
 'Python': '3.10.18 (main, Jun  5 2025, 13:14:17) [GCC 11.2.0]',
 'TorchVision': '0.22.1+cu126',
 'lmdeploy': "not installed:No module named 'lmdeploy'",
 'numpy_random_seed': 2147483648,
 'opencompass': '0.4.2+6eee45e',
 'sys.platform': 'linux',
 'transformers': '4.56.0'}

Reproduces the problem - code/configuration sample

from mmengine.config import read_base

from opencompass.models import VLLMwithChatTemplate, OpenAISDK
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content


#######################################################################
#                          PART 1  Datasets List                      #
#######################################################################

# Import the TACO dataset config inside mmengine's `read_base()` context.
with read_base():
    from opencompass.configs.datasets.taco.taco_gen_c7893a import TACO_datasets


# Concatenate every variable in scope whose name ends with "_datasets"
# (here only `TACO_datasets`) into a single flat list; the `[]` start value
# makes `sum` perform list concatenation.
# assumes each matching variable is a list of dataset configs — TODO confirm
datasets = sum(
    (v for k, v in locals().items() if k.endswith("_datasets")),
    [],
)

#######################################################################
#                          PART 2  Model List                         #
#######################################################################

# One model is evaluated: DeepSeek-R1-Distill-Qwen-1.5B, configured with the
# VLLMwithChatTemplate model type.
models = [
    dict(
        type=VLLMwithChatTemplate,
        abbr="deepseek-r1-distill-qwen-1.5b-vllm",
        path="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        max_out_len=32768,
        batch_size=16,
        # Sampling settings used for generation.
        generation_kwargs=dict(temperature=0.6, top_p=0.95, do_sample=True),
        model_kwargs=dict(tensor_parallel_size=1),
        run_cfg=dict(num_gpus=1),
        # presumably strips the reasoning part of each prediction before
        # scoring — confirm against `extract_non_reasoning_content`
        pred_postprocessor=dict(type=extract_non_reasoning_content)
    )
]


#######################################################################
#                          PART 3  Inference/Evaluation               #
#######################################################################

# Inference configuration: NumWorkerPartitioner (num_worker=8) creates the
# tasks, and LocalRunner executes them as OpenICLInferTask.
# NOTE(review): per the traceback in this report, NumWorkerPartitioner builds
# each dataset to obtain its size, so dataset loading (and any download
# attempt) already happens at partition time, before inference starts.
infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask)
    ),
)

# Evaluation configuration: NaivePartitioner creates the tasks, and
# LocalRunner executes them as OpenICLEvalTask.
# NOTE(review): the name `eval` shadows the Python builtin; presumably this is
# the key OpenCompass expects in the config — confirm before renaming.
eval = dict(
    partitioner=dict(
        # `n=8` is passed through to NaivePartitioner; its exact meaning is
        # not visible here — see the NaivePartitioner documentation.
        type=NaivePartitioner, n=8
    ),
    runner=dict(
        type=LocalRunner,
        task=dict(
            type=OpenICLEvalTask)
    ),
)


#######################################################################
#                          PART 4  Utils                              #
#######################################################################

# Root output directory; the run log in this report shows the experiment
# folder being created as a timestamped subdirectory of this path.
work_dir = "outputs/deepseek_r1_distill_qwen_1p5b_taco"

Reproduces the problem - command or script

python run.py eval_deepseek_r1_distill_qwen_1p5b_taco.py --debug

Reproduces the problem - error message

INFO 08-31 23:20:47 [__init__.py:241] Automatically detected platform cuda.
08/31 23:20:48 - OpenCompass - INFO - Current exp folder: outputs/deepseek_r1_distill_qwen_1p5b_taco/20250831_232048
08/31 23:20:48 - OpenCompass - WARNING - SlurmRunner is not used, so the partition argument is ignored.
08/31 23:20:48 - OpenCompass - INFO - ./data/BAAI-TACO does not exist!Start Download data automatically!If you have downloaded the data before,You can specific `COMPASS_DATA_CACHE` to avoid downloading~
Traceback (most recent call last):
  File "/root/opencompass/run.py", line 4, in <module>
    main()
  File "/root/opencompass/opencompass/cli/main.py", line 345, in main
    tasks = partitioner(cfg)
  File "/root/opencompass/opencompass/partitioners/base.py", line 83, in __call__
    tasks = self.partition(**model_and_dataset_args,
  File "/root/opencompass/opencompass/partitioners/num_worker.py", line 72, in partition
    dataset_size = self.get_size(dataset)
  File "/root/opencompass/opencompass/partitioners/num_worker.py", line 145, in get_size
    dataset = build_dataset_from_cfg(dataset)
  File "/root/opencompass/opencompass/utils/build.py", line 12, in build_dataset_from_cfg
    return LOAD_DATASET.build(dataset_cfg)
  File "/root/miniconda3/envs/opencompass/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/root/miniconda3/envs/opencompass/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/root/opencompass/opencompass/datasets/base.py", line 19, in __init__
    dataset = self.load(**kwargs)
  File "/root/opencompass/opencompass/datasets/taco.py", line 42, in load
    path = get_data_path(path, local_mode=True)
  File "/root/opencompass/opencompass/utils/datasets.py", line 30, in get_data_path
    return download_dataset(local_path, cache_dir)
  File "/root/opencompass/opencompass/utils/datasets.py", line 90, in download_dataset
    assert dataset_name, f'No valid url for {data_path}!\n' + \
AssertionError: No valid url for ./data/BAAI-TACO!
Please make sure  `./data/BAAI-TACO` is correct

Other information

The dataset `path` in the config is a local path (`./data/BAAI-TACO`): https://github.com/open-compass/opencompass/blob/main/opencompass/configs/datasets/taco/taco_gen_c7893a.py#L22

The TACO loader passes it to `get_data_path(path, local_mode=True)`: https://github.com/open-compass/opencompass/blob/main/opencompass/datasets/taco.py#L42-L43

However, I cannot find a valid dataset ID or URL for this path in: https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py

ShaneTian avatar Aug 31 '25 15:08 ShaneTian