Multi-GPU/CPU offloading is still not working as intended
I believe this should have been solved in #668; however, it is still happening when I run "examples/quantize.py" on a finetuned version of meta-llama/Llama-3.3-70B-Instruct. My system has 125GB of available RAM and 2x A6000 GPUs with 48GB of VRAM each.
Running the script without modifications results in a RAM OOM and the process is killed (it only prints "Killed" after the tqdm progress bar).
If I change "device_map" to "auto", I get a CUDA OOM.
If I add a "max_memory" dict, the script runs, but at some point I get the following error:
NotImplementedError: Cannot copy out of meta tensor; no data!
I tried experimenting with different values for max_memory, n_parallel_calib_samples, and max_calib_samples, but the issue persists (one variant of the max_memory mapping I tried is sketched below).
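For reference, the mappings I experimented with were roughly of this shape (the exact sizes varied between runs; the optional "cpu" entry is the way accelerate lets you also cap the CPU offload budget):

# Rough shape of the max_memory mappings I tried -- integer keys are CUDA
# device indices, and the optional "cpu" key caps how much can be offloaded
# to system RAM (the exact sizes here are just examples).
max_memory = {
    0: "38GiB",
    1: "38GiB",
    "cpu": "100GiB",
}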
Here is my full script:
import argparse

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from datasets import load_dataset


def parse_args():
    parser = argparse.ArgumentParser(description="Quantize a model using AWQ.")
    parser.add_argument('--model_path', type=str, required=True, help='Path to the pretrained model.')
    parser.add_argument('--quant_path', type=str, required=True, help='Path to save the quantized model.')
    parser.add_argument('--max_memory', type=str, default="38GIB", help='Max memory allocation per device.')
    parser.add_argument('--max_calib_samples', type=int, default=128, help='Maximum number of calibration samples.')
    return parser.parse_args()


def main():
    args = parse_args()

    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
    # Cap per-GPU memory so the rest of the model can be offloaded
    model_init_kwargs = {"max_memory": {0: args.max_memory, 1: args.max_memory}}

    # Load model
    model = AutoAWQForCausalLM.from_pretrained(args.model_path, device_map="auto", **model_init_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    # Define data loading methods
    def load_custom_dataset():
        data = load_dataset('****', split="train")

        # concatenate question and answer into a single text field
        def concatenate_data(x):
            return {"text": x['question'] + '\n' + x['answer']}

        concatenated = data.map(concatenate_data)
        return [text for text in concatenated["text"]]

    # Quantize
    model.quantize(
        tokenizer,
        calib_data=load_custom_dataset(),
        quant_config=quant_config,
        max_calib_samples=args.max_calib_samples,
    )

    # Save quantized model
    model.save_quantized(args.quant_path)
    tokenizer.save_pretrained(args.quant_path)

    print(f'Model is quantized and saved at "{args.quant_path}"')


if __name__ == "__main__":
    main()
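For completeness, this is roughly how I invoke the script (the model and output paths below are placeholders, not my real paths; I saved my modified copy of examples/quantize.py as quantize.py):

# Example invocation with placeholder paths
python quantize.py \
    --model_path /path/to/finetuned-llama-3.3-70b-instruct \
    --quant_path /path/to/llama-3.3-70b-instruct-awq \
    --max_memory 38GIB \
    --max_calib_samples 128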
Same here. I think it has broken with the newer transformers release.