code:

# -*- coding: utf-8 -*-

import argparse
import math

import torch
from awq.models.qwen2vl import Qwen2VLAWQForCausalLM
from transformers import AutoTokenizer
from transformers import Qwen2VLProcessor

def infer(args):
    """Quantize the model at ``args.model_name`` with AWQ and save the result.

    Despite its name, this function performs no inference: it loads the
    model, its processor and its tokenizer from ``args.model_name``, runs
    AWQ quantization, and writes the quantized weights together with the
    processor files to ``args.output``.

    Args:
        args: Object exposing ``model_name`` (location the model, processor
            and tokenizer are loaded from) and ``output`` (location the
            quantized model and processor are saved to).
    """
    source_dir = args.model_name

    # AWQ settings: 4-bit weights, groups of 128, zero-point enabled,
    # "GEMM" kernel version.
    awq_settings = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM",
    }

    # The model is loaded with the KV cache switched off; it is switched
    # back on after quantization, right before saving.
    awq_model = Qwen2VLAWQForCausalLM.from_pretrained(
        source_dir,
        model_type="qwen2_vl",
        use_cache=False,
    )
    vl_processor = Qwen2VLProcessor.from_pretrained(source_dir)
    text_tokenizer = AutoTokenizer.from_pretrained(
        source_dir,
        trust_remote_code=True,
    )
    target_dir = args.output

    # No calibration data is passed, so the library's default is used.
    awq_model.quantize(text_tokenizer, quant_config=awq_settings)

    # Re-enable the cache on both the model config and the generation
    # config so the saved checkpoint carries use_cache=True.
    wrapped = awq_model.model
    wrapped.config.use_cache = True
    wrapped.generation_config.use_cache = True

    awq_model.save_quantized(target_dir, safetensors=True, shard_size="4GB")
    # Store the processor next to the weights in the same output location.
    vl_processor.save_pretrained(target_dir)

    print("Completed infer test data")

def main():
    """Parse the command-line arguments and run ``infer`` with them.

    Prints the number of visible CUDA devices, then parses:

    * ``--model_name`` (required): passed to ``infer`` as ``args.model_name``.
    * ``--test_data`` (optional): parsed and forwarded on ``args``; nothing
      in this function reads it.
    * ``--output`` (required): passed to ``infer`` as ``args.output``.

    Raises:
        SystemExit: Raised by ``argparse`` when a required argument is
            missing or an argument cannot be parsed.
    """
    print("GPU COUNT:{}".format(torch.cuda.device_count()))

    parser = argparse.ArgumentParser()
    # --model_name and --output are mandatory: without them ``None`` would
    # be handed to the loading/saving calls and fail far from the real cause.
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--test_data",
        type=str,
        help="path of test data",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="path of output",
    )
    args = parser.parse_args()

    infer(args)
    # NOTE(review): the message says "fine-tuning" although this function
    # only calls infer(); kept unchanged so the log output stays the same.
    print("Completed fine-tuning")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

LOGS: 16 January 2025 17:18:06 | polyaxon-main | + python quantize_awq.py --model_name=/data/public/yasong/model_output/Qwen2-VL-7B-Instruct/v2-20250113-161832/checkpoint-750-merged --test_data=/data/public/yasong/Projects/pricing/label_studio/momoshop/labelstudio_202410301805_labeled_v1.jsonl --output=/data/public/yasong/model_output/Qwen2-VL-7B-Instruct/v2-20250113-161832/checkpoint-750-merged_quantize_awq_int4 16 January 2025 17:18:28 | polyaxon-main | Qwen2VLRotaryEmbedding can now be fully parameterized by passing the model config through the config argument. All other arguments will be removed in v4.46 16 January 2025 17:18:28 | polyaxon-main | GPU COUNT:1 16 January 2025 17:18:28 | polyaxon-main | Start loading model!! 16 January 2025 17:18:37 | polyaxon-main | Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.09s/it] 16 January 2025 17:18:40 | polyaxon-main | Repo card metadata block was not found. Setting CardData to empty. 16 January 2025 17:19:00 | polyaxon-main | Generating validation split: 100%|██████████| 214670/214670 [00:14<00:00, 14839.95 examples/s] 16 January 2025 17:19:00 | polyaxon-main | Token indices sequence length is longer than the specified maximum sequence length for this model (57053 > 32768). 
Running this sequence through the model will result in indexing errors 16 January 2025 17:19:04 | polyaxon-main | AWQ: 0%| | 0/28 [00:00<?, ?it/s] 16 January 2025 17:19:04 | polyaxon-main | Traceback (most recent call last): 16 January 2025 17:19:04 | polyaxon-main | File "/plx-context/artifacts/35a955e871d84853aa4405b58e866f15/uploads/scripts/quantize_awq.py", line 218, in 16 January 2025 17:19:04 | polyaxon-main | main() 16 January 2025 17:19:04 | polyaxon-main | File "/plx-context/artifacts/35a955e871d84853aa4405b58e866f15/uploads/scripts/quantize_awq.py", line 213, in main 16 January 2025 17:19:04 | polyaxon-main | infer(args) 16 January 2025 17:19:04 | polyaxon-main | File "/plx-context/artifacts/35a955e871d84853aa4405b58e866f15/uploads/scripts/quantize_awq.py", line 193, in infer 16 January 2025 17:19:04 | polyaxon-main | model.quantize(tokenizer, quant_config=quant_config) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context 16 January 2025 17:19:04 | polyaxon-main | return func(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/awq/models/base.py", line 238, in quantize 16 January 2025 17:19:04 | polyaxon-main | self.quantizer.quantize() 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/awq/quantize/quantizer.py", line 159, in quantize 16 January 2025 17:19:04 | polyaxon-main | input_feat = self._get_input_feat(self.modules[i], named_linears) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/awq/quantize/quantizer.py", line 633, in _get_input_feat 16 January 2025 17:19:04 | polyaxon-main | self.inps = self._module_forward(self.inps, layer, module_kwargs) 16 January 2025 17:19:04 | polyaxon-main | File 
"/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context 16 January 2025 17:19:04 | polyaxon-main | return func(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/awq/quantize/quantizer.py", line 247, in _module_forward 16 January 2025 17:19:04 | polyaxon-main | module_output = module(x, **module_kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl 16 January 2025 17:19:04 | polyaxon-main | return self._call_impl(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl 16 January 2025 17:19:04 | polyaxon-main | return forward_call(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 859, in forward 16 January 2025 17:19:04 | polyaxon-main | hidden_states, self_attn_weights, present_key_value = self.self_attn( 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl 16 January 2025 17:19:04 | polyaxon-main | return self._call_impl(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl 16 January 2025 17:19:04 | polyaxon-main | return forward_call(*args, **kwargs) 16 January 2025 17:19:04 | polyaxon-main | File "/data/public/yasong/python3.10/lib/python3.10/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 752, in forward 16 January 2025 17:19:04 | polyaxon-main | cos, sin = position_embeddings 16 January 
2025 17:19:04 | polyaxon-main | TypeError: cannot unpack non-iterable NoneType object

pip list WARNING: Ignoring invalid distribution -vidia-cudnn-cu12 (/data/public/yasong/python3.10/lib/python3.10/site-packages) Package Version Editable project location

absl-py 2.1.0 accelerate 1.1.1 addict 2.4.0 aiofiles 23.2.1 aiohttp 3.9.5 aiosignal 1.3.1 aliyun-python-sdk-core 2.16.0 aliyun-python-sdk-kms 2.16.5 annotated-types 0.7.0 anyio 4.6.2.post1 async-timeout 4.0.3 attrdict 2.0.1 attrs 23.2.0 auto_gptq 0.7.1 autoawq 0.2.7.post2 autoawq_kernels 0.0.9 av 13.1.0 binpacking 1.5.2 boto3 1.35.98 botocore 1.35.98 certifi 2024.6.2 cffi 1.17.1 charset-normalizer 3.3.2 click 8.1.7 cloudpickle 3.1.0 cmake 3.31.0.1 colorama 0.4.6 coloredlogs 15.0.1 compressed-tensors 0.6.0 contourpy 1.3.1 cpm-kernels 1.0.11 crcmod 1.7 cryptography 43.0.3 cycler 0.12.1 dacite 1.8.1 datasets 3.2.0 deepseek_vl2 1.0.0 /data/public/yasong/DeepSeek-VL2-main deepspeed 0.14.4 dill 0.3.8 diskcache 5.6.3 distro 1.9.0 dnspython 2.7.0 docstring_parser 0.16 einops 0.8.0 eventlet 0.38.2 exceptiongroup 1.2.2 fastapi 0.115.5 ffmpy 0.4.0 filelock 3.15.4 flash-attn 2.6.3 fonttools 4.54.1 frozenlist 1.4.1 fsspec 2024.5.0 future 1.0.0 gekko 1.2.1 gguf 0.10.0 gradio 5.5.0 gradio_client 1.4.2 greenlet 3.1.1 grpcio 1.64.1 h11 0.14.0 hjson 3.1.0 httpcore 1.0.6 httptools 0.6.4 httpx 0.27.2 huggingface-hub 0.27.1 humanfriendly 10.0 idna 3.7 importlib_metadata 8.5.0 interegular 0.3.3 jieba 0.42.1 Jinja2 3.1.4 jiter 0.7.1 jmespath 0.10.0 joblib 1.4.2 jsonschema 4.23.0 jsonschema-specifications 2024.10.1 kiwisolver 1.4.7 lark 1.2.2 llvmlite 0.43.0 lm-format-enforcer 0.10.6 lxml 5.3.0 Markdown 3.6 markdown-it-py 3.0.0 MarkupSafe 2.1.5 matplotlib 3.9.2 mdurl 0.1.2 metrics 0.3.3 mistral_common 1.4.4 modelscope 1.22.0 mpmath 1.3.0 ms-swift 3.0.2.post1 msgpack 1.1.0 msgspec 0.18.6 multidict 6.0.5 multiprocess 0.70.16 nest-asyncio 1.6.0 networkx 3.3 ninja 1.11.1.1 nltk 3.9.1 numba 0.60.0 numpy 1.26.4 nvidia-cublas-cu12 12.1.3.1 nvidia-cuda-cupti-cu12 12.1.105 nvidia-cuda-nvrtc-cu12 12.1.105 nvidia-cuda-runtime-cu12 12.1.105 nvidia-cudnn-cu12 9.1.0.70 nvidia-cufft-cu12 11.0.2.54 nvidia-curand-cu12 10.3.2.106 nvidia-cusolver-cu12 11.4.5.107 nvidia-cusparse-cu12 12.1.0.106 nvidia-ml-py 
12.555.43 nvidia-nccl-cu12 2.20.5 nvidia-nvjitlink-cu12 12.4.127 nvidia-nvtx-cu12 12.1.105 openai 1.54.4 OpenCC 1.1.9 opencv-python-headless 4.10.0.84 optimum 1.23.3 orjson 3.10.11 oss2 2.19.1 outlines 0.0.46 packaging 24.1 pandas 2.2.2 partial-json-parser 0.2.1.1.post4 PasteDeploy 3.1.0 pathlib2 2.3.7.post1 pathspec 0.5.5 peft 0.12.0 pillow 10.4.0 pip 23.0.1 portalocker 2.10.1 prometheus_client 0.21.0 prometheus-fastapi-instrumentator 7.0.0 protobuf 4.25.3 psutil 6.0.0 py-cpuinfo 9.0.0 pyairports 2.1.1 pyarrow 16.1.0 pyarrow-hotfix 0.6 pycountry 24.6.1 pycparser 2.22 pycryptodome 3.21.0 pydantic 2.9.2 pydantic_core 2.23.4 pydub 0.25.1 pyeclib 1.6.4 Pygments 2.18.0 pyparsing 3.2.0 python-dateutil 2.9.0.post0 python-dotenv 1.0.1 python-multipart 0.0.12 pytz 2024.1 PyYAML 6.0.1 pyzmq 26.2.0 qwen-vl-utils 0.0.8 ray 2.39.0 referencing 0.35.1 regex 2024.5.15 requests 2.32.3 rich 13.9.4 rouge 1.0.1 rpds-py 0.21.0 ruff 0.7.3 s3transfer 0.10.4 sacrebleu 2.4.3 safehttpx 0.1.1 safetensors 0.4.3 scikit-learn 1.5.1 scipy 1.14.0 semantic-version 2.10.0 sentence-transformers 3.2.1 sentencepiece 0.2.0 setuptools 69.5.1 shellingham 1.5.4 shtab 1.7.1 simplejson 3.19.3 six 1.16.0 sniffio 1.3.1 sortedcontainers 2.4.0 starlette 0.41.2 swift 2.34.0 sympy 1.13.1 tabulate 0.9.0 tensorboard 2.17.0 tensorboard-data-server 0.7.2 threadpoolctl 3.5.0 tiktoken 0.7.0 timm 1.0.13 tokenizers 0.21.0 tomlkit 0.12.0 torch 2.4.0 torchaudio 2.3.1 torchvision 0.19.0 tqdm 4.66.4 transformers 4.48.0 transformers-stream-generator 0.0.5 triton 3.0.0 trl 0.11.4 typer 0.13.0 typing_extensions 4.12.2 tyro 0.8.14 tzdata 2024.1 urllib3 2.2.2 uvicorn 0.32.0 uvloop 0.21.0 vllm 0.6.3.post1 vllm-flash-attn 2.5.9.post1 watchfiles 0.24.0 websockets 12.0 Werkzeug 3.0.3 xattr 1.1.0 xformers 0.0.27.post2 xxhash 3.4.1 yarl 1.9.4 zipp 3.21.0 zstandard 0.23.0

Jan 16 '25 09:01 songyang23

I'm not sure about this, but downgrading to transformers==4.47.1 might work.

https://github.com/casper-hansen/AutoAWQ/issues/690#issuecomment-2589262223

Jan 21 '25 10:01 DebarshiChanda

If you don't want to downgrade transformers, you should add position_embeddings to module_kwargs.

transformers==4.47.1 transformers==4.48.0

Feb 05 '25 05:02 seungwoos

Failed to convert Qwen2-VL-7B-Instruct LORA model

# -*- coding: utf-8 -*-