optimum-benchmark
optimum-benchmark copied to clipboard
Warning on loading quantized model
Hello,
I'm running base Mistral and quantizing it with bitsandbytes (bnb) as shown below:
"""Benchmark Mistral-7B under 4-bit NF4 bitsandbytes quantization with optimum-benchmark."""
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
from optimum_benchmark.experiment import ExperimentConfig, launch
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.logging_utils import setup_logging
import torch

if __name__ == "__main__":
    setup_logging(level="INFO")

    # Run the benchmark in a spawned process; device isolation is disabled,
    # so the GPU may be shared with other processes during the run.
    launcher_config = ProcessConfig(device_isolation=False)

    benchmark_config = InferenceConfig(
        memory=True,
        latency=True,
        input_shapes={"batch_size": 4, "sequence_length": 128},
        # Pin min == max so every measured run decodes exactly 128 new tokens.
        generate_kwargs={"max_new_tokens": 128, "min_new_tokens": 128},
    )

    backend_config = PyTorchConfig(
        model="mistralai/Mistral-7B-v0.1",
        device="cuda",
        device_ids="0",
        torch_dtype="float16",
        # no_weights=True: the backend instantiates the model with random
        # weights instead of downloading the checkpoint — presumably the
        # source of the "model already has a quantization config" warning
        # when combined with a bnb quantization_config; TODO confirm against
        # the optimum-benchmark backend implementation.
        no_weights=True,
        quantization_scheme="bnb",
        quantization_config={
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": "float16",
            "bnb_4bit_quant_type": "nf4",
        },
    )

    experiment_config = ExperimentConfig(
        # NOTE(review): renamed from "mistral-fp16-base" — this configuration
        # runs a 4-bit NF4 bnb-quantized model, so the old name mislabeled
        # the results as an fp16 baseline.
        experiment_name="mistral-4bit-nf4",
        benchmark=benchmark_config,
        launcher=launcher_config,
        backend=backend_config,
    )

    benchmark_report = launch(experiment_config)
When I run this, I get an error saying the model already has a quantization config. Is this expected behaviour,
or is there a mistake in how I'm passing the quantization config to the PyTorch backend?