
The program hangs after using the debugger

Open kgboyko opened this issue 6 months ago • 2 comments

Type: Bug

After setting a breakpoint in a Jupyter notebook cell and then running the cell in debug mode, the program hangs in a subprocess.

Steps to Reproduce:

  1. Kaggle Docker GPU v160 is used (also tested on v158 and v159).
  2. Install the packages:
     ! pip install -U ipykernel
     ! pip install vllm==0.9.1  # (tested on versions 0.7.2, 0.8.5.post1, 0.9.1)
  3. Create a Jupyter notebook with the following code (runtime environment: Python 3.11.11).
  4. Set a breakpoint at the last line, then run the cell in debug mode (an NVIDIA 3xxx-series GPU or newer is required).

VS Code version: Code 1.101.0 (dfaf44141ea9deb3b4096f7cd6d24e00c147a4b1, 2025-06-11T15:00:50.123Z)
OS version: Windows_NT x64 10.0.26100
Modes:
Remote OS version: Linux x64 6.8.0-60-generic
Remote OS version: Linux x64 6.8.0-60-generic
System Info
Item Value
CPUs Intel(R) Core(TM) Ultra 5 125H (18 x 2995)
GPU Status 2d_canvas: enabled
canvas_oop_rasterization: enabled_on
direct_rendering_display_compositor: disabled_off_ok
gpu_compositing: enabled
multiple_raster_threads: enabled_on
opengl: enabled_on
rasterization: enabled
raw_draw: disabled_off_ok
skia_graphite: disabled_off
video_decode: enabled
video_encode: enabled
vulkan: disabled_off
webgl: enabled
webgl2: enabled
webgpu: enabled
webnn: disabled_off
Load (avg) undefined
Memory (System) 23.47GB (12.40GB free)
Process Argv --crash-reporter-id dc65549e-5a3d-4f34-a3ab-d2ec4803df70
Screen Reader no
VM 0%
Item Value
Remote SSH: Zoo
OS Linux x64 6.8.0-60-generic
CPUs 13th Gen Intel(R) Core(TM) i5-13600K (20 x 1006)
Memory (System) 125.54GB (119.80GB free)
VM 0%
Item Value
Remote Container gcr.io/kaggle-gpu-images/python:v160-cu125-pytorch-2.7.0_vllm_091 (kgl_gpu_160_1) @ Zoo
OS Linux x64 6.8.0-60-generic
CPUs 13th Gen Intel(R) Core(TM) i5-13600K (20 x 998)
Memory (System) 125.54GB (119.80GB free)
VM 0%
Extensions (23)
Extension Author (truncated) Version
jupyter-keymap ms- 1.1.2
remote-containers ms- 0.417.0
remote-ssh ms- 0.120.0
remote-ssh-edit ms- 0.87.0
vscode-remote-extensionpack ms- 0.26.0
remote-explorer ms- 0.5.0
remote-server ms- 1.5.2
docker doc 0.10.0
vscode-containers ms- 2.0.3
vscode-docker ms- 2.0.0
debugpy ms- 2025.8.0
python ms- 2025.6.1
vscode-pylance ms- 2025.5.1
datawrangler ms- 1.22.0
jupyter ms- 2025.4.1
jupyter-keymap ms- 1.1.2
jupyter-renderers ms- 1.1.0
vscode-jupyter-cell-tags ms- 0.1.9
vscode-jupyter-powertoys ms- 0.1.1
vscode-jupyter-slideshow ms- 0.1.6
markdown-preview-enhanced shd 0.8.18
intellicode-api-usage-examples Vis 0.2.9
vscodeintellicode Vis 1.3.2
A/B Experiments
vsliv368:30146709
vspor879:30202332
vspor708:30202333
vspor363:30204092
vscod805:30301674
binariesv615:30325510
c4g48928:30535728
azure-dev_surveyone:30548225
962ge761:30959799
h48ei257:31000450
pythontbext0:30879054
cppperfnew:31000557
dwnewjupytercf:31046870
pythonrstrctxt:31112756
nativeloc1:31192215
5fd0e150:31155592
dwcopilot:31170013
6074i472:31201624
dwoutputs:31242946
customenabled:31248079
hdaa2157:31222309
copilot_t_ci:31222730
e5gg6876:31282496
pythoneinst12:31285622
bgtreat:31268568
4gafe986:31271826
c7cif404:31314491
996jf627:31283433
pythonrdcb7:31303018
usemplatestapi:31297334
0aa6g176:31307128
7bj51361:31289155
747dc170:31275177
pylancecolor:31314202
aj953862:31281341
generatesymbolt:31295002
convertfstringf:31295003
gendocf:31295004
pylancequickfixf:31319675
0g0a1943:31327026

import os
import transformers; print('Transformers version:', transformers.__version__)
import torch; print('Torch version:', torch.__version__)
import vllm; print('vLLM version:', vllm.__version__)
from vllm import LLM

container_date = os.environ.get('BUILD_DATE', '').split('-')[0]
print(f"Kaggle Docker BUILD_DATE={container_date}")

!cat /etc/os-release | grep -oP "PRETTY_NAME=\"\K([^\"]*)" && uname -r
!free -h
!nv_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)" && echo "My NVIDIA driver version is '${nv_version}'."
!ls -l /usr/local | grep cuda

is_debug = True

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

if is_debug:
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['POLARS_ALLOW_FORKING_THREAD'] = '1'
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
    os.environ['TORCH_USE_CUDA_DSA'] = "1"
    os.environ['OPENBLAS_NUM_THREADS'] = '1'
    os.environ["NUM_INTER_THREADS"] = "1"
    os.environ["NUM_INTRA_THREADS"] = "1"
    os.environ["XLA_FLAGS"] = ("--xla_cpu_multi_thread_eigen=false "
                               "intra_op_parallelism_threads=1")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]='expandable_segments:True'

llm = LLM(
    model="Qwen/Qwen3-0.6B",
    max_num_seqs=1,
    max_model_len=1024,
    trust_remote_code=True,
    enable_prefix_caching = True,
    dtype = torch.half,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.96,
    enforce_eager=True,
    seed=2024,
)
tokenizer = llm.get_tokenizer()
print("LLM Started")
  5. When running with the debugger, the following message appears in the log (it does not appear when running without the debugger):
77.40s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
  6. When Ctrl+C is pressed, the following traceback appears; it shows that the hang occurs inside a subprocess call:
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_2264/2697110242.py in <cell line: 0>()
     34 os.environ["PYTORCH_CUDA_ALLOC_CONF"]='expandable_segments:True'
     35 
---> 36 llm = LLM(
     37     model="Qwen/Qwen3-0.6B",
     38     max_num_seqs=1,

/usr/local/lib/python3.11/dist-packages/vllm/entrypoints/llm.py in __init__(self, model, task, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, allowed_local_media_path, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, hf_token, hf_overrides, mm_processor_kwargs, override_pooler_config, compilation_config, **kwargs)
    241 
    242         # Create the Engine (autoselects V0 vs V1)
--> 243         self.llm_engine = LLMEngine.from_engine_args(
    244             engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
    245         self.engine_class = type(self.llm_engine)

/usr/local/lib/python3.11/dist-packages/vllm/engine/llm_engine.py in from_engine_args(cls, engine_args, usage_context, stat_loggers)
    492         """Creates an LLM engine from the engine arguments."""
    493         # Create the engine configs.
--> 494         vllm_config = engine_args.create_engine_config(usage_context)
    495 
    496         engine_cls = cls

/usr/local/lib/python3.11/dist-packages/vllm/engine/arg_utils.py in create_engine_config(self, usage_context)
   1016 
   1017         device_config = DeviceConfig(device=current_platform.device_type)
-> 1018         model_config = self.create_model_config()
   1019 
   1020         # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"

/usr/local/lib/python3.11/dist-packages/vllm/engine/arg_utils.py in create_model_config(self)
    908             self.load_format = LoadFormat.RUNAI_STREAMER
    909 
--> 910         return ModelConfig(
    911             model=self.model,
    912             hf_config_path=self.hf_config_path,

    [... skipping hidden 1 frame]

/usr/local/lib/python3.11/dist-packages/vllm/config.py in __post_init__(self)
    546             self.model, hf_token=self.hf_token, revision=self.revision)
    547 
--> 548         supported_tasks, task = self._resolve_task(self.task)
    549         self.supported_tasks = supported_tasks
    550         self.task = task

/usr/local/lib/python3.11/dist-packages/vllm/config.py in _resolve_task(self, task_option)
    796             # NOTE: Listed from highest to lowest priority,
    797             # in case the model supports multiple of them
--> 798             "transcription": registry.is_transcription_model(architectures),
    799             "generate": registry.is_text_generation_model(architectures),
    800             "pooling": registry.is_pooling_model(architectures),

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in is_transcription_model(self, architectures)
    556         architectures: Union[str, list[str]],
    557     ) -> bool:
--> 558         model_cls, _ = self.inspect_model_cls(architectures)
    559         return model_cls.supports_transcription
    560 

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in inspect_model_cls(self, architectures)
    470 
    471         for arch in architectures:
--> 472             model_info = self._try_inspect_model_cls(arch)
    473             if model_info is not None:
    474                 return (model_info, arch)

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in _try_inspect_model_cls(self, model_arch)
    443             return None
    444 
--> 445         return _try_inspect_model_cls(model_arch, self.models[model_arch])
    446 
    447     def _normalize_archs(

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in _try_inspect_model_cls(model_arch, model)
    363 ) -> Optional[_ModelInfo]:
    364     try:
--> 365         return model.inspect_model_cls()
    366     except Exception:
    367         logger.exception("Error in inspecting model architecture '%s'",

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in inspect_model_cls(self)
    334     # Performed in another process to avoid initializing CUDA
    335     def inspect_model_cls(self) -> _ModelInfo:
--> 336         return _run_in_subprocess(
    337             lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
    338 

/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py in _run_in_subprocess(fn)
    590         # cannot use `sys.executable __file__` here because the script
    591         # contains relative imports
--> 592         returned = subprocess.run(_SUBPROCESS_COMMAND,
    593                                   input=input_bytes,
    594                                   capture_output=True)

/usr/lib/python3.11/subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    548     with Popen(*popenargs, **kwargs) as process:
    549         try:
--> 550             stdout, stderr = process.communicate(input, timeout=timeout)
    551         except TimeoutExpired as exc:
    552             process.kill()

/usr/lib/python3.11/subprocess.py in communicate(self, input, timeout)
   1207 
   1208             try:
-> 1209                 stdout, stderr = self._communicate(input, endtime, timeout)
   1210             except KeyboardInterrupt:
   1211                 # https://bugs.python.org/issue25942

/usr/lib/python3.11/subprocess.py in _communicate(self, input, endtime, orig_timeout)
   2113                             'failed to raise TimeoutExpired.')
   2114 
-> 2115                     ready = selector.select(timeout)
   2116                     self._check_timeout(endtime, orig_timeout, stdout, stderr)
   2117 

/usr/lib/python3.11/selectors.py in select(self, timeout)
    413         ready = []
    414         try:
--> 415             fd_event_list = self._selector.poll(timeout)
    416         except InterruptedError:
    417             return ready

KeyboardInterrupt:
  7. Having previously tried all known-working versions of vLLM and Docker, I came to the firm conclusion that the problem is in VS Code (a minimal subprocess sketch for isolating this follows the log below).

  8. Log:

Transformers version: 4.51.3
Torch version: 2.7.0+cu126
INFO 06-12 21:02:13 [__init__.py:244] Automatically detected platform cuda.
2025-06-12 21:02:13.189772: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-12 21:02:13.197188: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1749762133.205913    4385 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749762133.208566    4385 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 21:02:13.217907: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
vLLM version: 0.9.1
Kaggle Docker BUILD_DATE=20250508
Ubuntu 22.04.4 LTS
6.8.0-60-generic
               total        used        free      shared  buff/cache   available
Mem:           125Gi       4.3Gi        17Gi       9.0Mi       103Gi       119Gi
Swap:          8.0Gi       0.0Ki       8.0Gi
My NVIDIA driver version is '560.35.03'.
lrwxrwxrwx 1 root root   22 Jul 10  2024 cuda -> /etc/alternatives/cuda
lrwxrwxrwx 1 root root   25 Jul 10  2024 cuda-12 -> /etc/alternatives/cuda-12
drwxr-xr-x 1 root root 4096 Jul 10  2024 cuda-12.5
INFO 06-12 21:02:23 [config.py:823] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.
WARNING 06-12 21:02:23 [config.py:3271] Casting torch.bfloat16 to torch.float16.
INFO 06-12 21:02:23 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
WARNING 06-12 21:02:23 [config.py:2232] max_num_batched_tokens (8192) exceeds max_num_seqs* max_model_len (1024). This may lead to unexpected behavior.
WARNING 06-12 21:02:23 [cuda.py:91] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
WARNING 06-12 21:02:25 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
INFO 06-12 21:02:27 [__init__.py:244] Automatically detected platform cuda.
2025-06-12 21:02:27.403391: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1749762147.413643   48564 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749762147.416761   48564 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 06-12 21:02:30 [core.py:455] Waiting for init message from front-end.
INFO 06-12 21:02:30 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='Qwen/Qwen3-0.6B', speculative_config=None, tokenizer='Qwen/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2024, served_model_name=Qwen/Qwen3-0.6B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=False, pooler_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":[],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":0,"local_cache_dir":null}
WARNING 06-12 21:02:30 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x710235449c10>
INFO 06-12 21:02:31 [parallel_state.py:1065] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
WARNING 06-12 21:02:31 [topk_topp_sampler.py:59] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
INFO 06-12 21:02:31 [gpu_model_runner.py:1595] Starting to load model Qwen/Qwen3-0.6B...
INFO 06-12 21:02:31 [gpu_model_runner.py:1600] Loading model from scratch...
INFO 06-12 21:02:31 [logger.py:59] Using Flash Attention backend on V1 engine.
INFO 06-12 21:02:32 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 06-12 21:02:32 [weight_utils.py:345] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.71it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.71it/s]

INFO 06-12 21:02:33 [default_loader.py:272] Loading weights took 0.60 seconds
INFO 06-12 21:02:33 [gpu_model_runner.py:1624] Model loading took 1.1201 GiB and 1.735851 seconds
INFO 06-12 21:02:34 [gpu_worker.py:227] Available KV cache memory: 21.20 GiB
INFO 06-12 21:02:34 [kv_cache_utils.py:715] GPU KV cache size: 198,448 tokens
INFO 06-12 21:02:34 [kv_cache_utils.py:719] Maximum concurrency for 1,024 tokens per request: 193.80x
INFO 06-12 21:02:34 [core.py:171] init engine (profile, create kv cache, warmup model) took 1.06 seconds
LLM Started
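
As a way to isolate the issue, the following minimal sketch (it only mimics the subprocess.run pattern that vllm.model_executor.models.registry._run_in_subprocess uses, with nothing vLLM-specific) can be run as a notebook cell under the debugger; if it also hangs only when a breakpoint is set, the problem is in the debugger's handling of child processes rather than in vLLM.

import subprocess
import sys

# Mimic vLLM's registry: run a child Python interpreter and capture its output.
# Set a breakpoint on the last line and run the cell in debug mode; the timeout
# makes the call fail with TimeoutExpired instead of hanging indefinitely.
result = subprocess.run(
    [sys.executable, "-c", "print('child process finished')"],
    capture_output=True,
    timeout=60,
)
print(result.stdout.decode().strip())
print("parent process finished")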

kgboyko commented Jun 12 '25 20:06

I considered whether something in Ubuntu might have been updated recently, for example the SSH library. I had a machine with an older Linux install that had not been updated for two months. For testing, I took the old Kaggle Docker GPU v158 container with vLLM 0.7.2; more than 100 different LLM models had previously been tested on that combination and everything worked.

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Calculating upgrade... Done
The following packages have been kept back:
  cloud-init
The following packages will be upgraded:
  apt distro-info-data docker-buildx-plugin docker-ce docker-ce-cli docker-ce-rootless-extras docker-compose-plugin grub-efi-amd64 grub-efi-amd64-bin grub-efi-amd64-signed initramfs-tools
  initramfs-tools-bin initramfs-tools-core libapt-pkg6.0 libldap-2.5-0 libldap-common libnvidia-container-tools libnvidia-container1 linux-base nvidia-container-toolkit
  nvidia-container-toolkit-base pci.ids python3-update-manager
23 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.
Need to get 103 MB of archives.
After this operation, 2981 kB of additional disk space will be used.
Do you want to continue? [Y/n] n
Abort.

For better reproducibility, I installed the Python libraries from previously saved .whl files.

! pip install -U ipykernel
! python -m pip install --no-index --find-links=/kaggle/input/whl/vllm/vllm-0-7-2 bitsandbytes vllm triton
! python -m pip install -U --no-dependencies --no-index --find-links=/kaggle/input/whl/flashinfer_python/flashinfer_python-0-2-1-post1-v02 \
    flashinfer-python==0.2.1.post1 -i https://flashinfer.ai/whl/cu121/torch2.5/

I run the example on the first GPU only.

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:01:00.0 Off |                  Off |
|  0%   32C    P8             26W /  450W |   23192MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090 Ti     Off |   00000000:05:00.0 Off |                  Off |
|  0%   44C    P8             15W /  450W |       2MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|

I took the older Qwen-based model (DeepSeek-R1-Distill-Qwen-14B AWQ):

import os
import transformers; print('Transformers version:', transformers.__version__)
import torch; print('Torch version:', torch.__version__)
import vllm; print('vLLM version:', vllm.__version__)
from vllm import LLM

container_date = os.environ.get('BUILD_DATE', '').split('-')[0]
print(f"Kaggle Docker BUILD_DATE={container_date}")

!cat /etc/os-release | grep -oP "PRETTY_NAME=\"\K([^\"]*)" && uname -r
!free -h
!nv_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)" && echo "My NVIDIA driver version is '${nv_version}'."
!ls -l /usr/local | grep cuda

is_debug = True

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

if is_debug:
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['POLARS_ALLOW_FORKING_THREAD'] = '1'
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
    os.environ['TORCH_USE_CUDA_DSA'] = "1"
    os.environ['OPENBLAS_NUM_THREADS'] = '1'
    os.environ["NUM_INTER_THREADS"] = "1"
    os.environ["NUM_INTRA_THREADS"] = "1"
    os.environ["XLA_FLAGS"] = ("--xla_cpu_multi_thread_eigen=false "
                               "intra_op_parallelism_threads=1")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]='expandable_segments:True'

# https://huggingface.co/casperhansen/deepseek-r1-distill-qwen-14b-awq
llm = LLM(
    model="/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen", # "Qwen/Qwen3-0.6B",
    max_num_seqs=1,
    max_model_len=1024,
    trust_remote_code=True,
    enable_prefix_caching = True,
    dtype = torch.half,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.96,
    enforce_eager=True,
    seed=2024,
)
tokenizer = llm.get_tokenizer()
print("LLM Started")
Transformers version: 4.48.3
Torch version: 2.5.1+cu121
2025-06-13 07:06:36.673294: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-13 07:06:36.679841: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-13 07:06:36.687425: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-13 07:06:36.689663: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-13 07:06:36.695956: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-13 07:06:37.511089: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
INFO 06-13 07:06:38 __init__.py:190] Automatically detected platform cuda.
vLLM version: 0.7.2
Kaggle Docker BUILD_DATE=20250219
13.74s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Ubuntu 22.04.3 LTS
5.15.0-136-generic
18.92s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
               total        used        free      shared  buff/cache   available
Mem:            62Gi       3.6Gi        16Gi       3.0Mi        42Gi        58Gi
Swap:          8.0Gi       4.0Mi       8.0Gi
24.07s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
My NVIDIA driver version is '560.35.03
560.35.03'.
29.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
lrwxrwxrwx 1 root root   22 Nov 10  2023 cuda -> /etc/alternatives/cuda
lrwxrwxrwx 1 root root   25 Nov 10  2023 cuda-12 -> /etc/alternatives/cuda-12
drwxr-xr-x 1 root root 4096 Nov 10  2023 cuda-12.2
WARNING 06-13 07:06:59 config.py:2386] Casting torch.bfloat16 to torch.float16.
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_2833/478751866.py in <cell line: 36>()
     34 os.environ["PYTORCH_CUDA_ALLOC_CONF"]='expandable_segments:True'
     35 
---> 36 llm = LLM(
     37     model="/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen", # "Qwen/Qwen3-0.6B",
     38     max_num_seqs=1,

/usr/local/lib/python3.10/dist-packages/vllm/utils.py in inner(*args, **kwargs)
   1049                     )
   1050 
-> 1051             return fn(*args, **kwargs)
   1052 
   1053         return inner  # type: ignore

/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py in __init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, allowed_local_media_path, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_seq_len_to_capture, disable_custom_all_reduce, disable_async_output_proc, hf_overrides, mm_processor_kwargs, task, override_pooler_config, compilation_config, **kwargs)
    240         # to avoid import order issues
    241         self.engine_class = self.get_engine_class()
--> 242         self.llm_engine = self.engine_class.from_engine_args(
    243             engine_args, usage_context=UsageContext.LLM_CLASS)
    244 

/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py in from_engine_args(cls, engine_args, usage_context, stat_loggers)
    479         """Creates an LLM engine from the engine arguments."""
    480         # Create the engine configs.
--> 481         engine_config = engine_args.create_engine_config(usage_context)
    482         executor_class = cls._get_executor_cls(engine_config)
    483         # Create the LLM engine.

/usr/local/lib/python3.10/dist-packages/vllm/engine/arg_utils.py in create_engine_config(self, usage_context)
   1073 
   1074         device_config = DeviceConfig(device=self.device)
-> 1075         model_config = self.create_model_config()
   1076 
   1077         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1

/usr/local/lib/python3.10/dist-packages/vllm/engine/arg_utils.py in create_model_config(self)
    996 
    997     def create_model_config(self) -> ModelConfig:
--> 998         return ModelConfig(
    999             model=self.model,
   1000             task=self.task,

/usr/local/lib/python3.10/dist-packages/vllm/config.py in __init__(self, model, task, tokenizer, tokenizer_mode, trust_remote_code, dtype, seed, allowed_local_media_path, revision, code_revision, rope_scaling, rope_theta, tokenizer_revision, max_model_len, spec_target_max_model_len, quantization, enforce_eager, max_seq_len_to_capture, max_logprobs, disable_sliding_window, skip_tokenizer_init, served_model_name, limit_mm_per_prompt, use_async_output_proc, config_format, hf_overrides, mm_processor_kwargs, disable_mm_preprocessor_cache, override_neuron_config, override_pooler_config, logits_processor_pattern, generation_config, enable_sleep_mode, override_generation_config, model_impl)
    362         self.served_model_name = get_served_model_name(model,
    363                                                        served_model_name)
--> 364         self.multimodal_config = self._init_multimodal_config(
    365             limit_mm_per_prompt)
    366         if not self.skip_tokenizer_init:

/usr/local/lib/python3.10/dist-packages/vllm/config.py in _init_multimodal_config(self, limit_mm_per_prompt)
    422     ) -> Optional["MultiModalConfig"]:
    423         architectures = getattr(self.hf_config, "architectures", [])
--> 424         if ModelRegistry.is_multimodal_model(architectures):
    425             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
    426 

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in is_multimodal_model(self, architectures)
    443         architectures: Union[str, List[str]],
    444     ) -> bool:
--> 445         model_cls, _ = self.inspect_model_cls(architectures)
    446         return model_cls.supports_multimodal
    447 

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in inspect_model_cls(self, architectures)
    399 
    400         for arch in architectures:
--> 401             model_info = self._try_inspect_model_cls(arch)
    402             if model_info is not None:
    403                 return (model_info, arch)

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in _try_inspect_model_cls(self, model_arch)
    374             return None
    375 
--> 376         return _try_inspect_model_cls(model_arch, self.models[model_arch])
    377 
    378     def _normalize_archs(

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in _try_inspect_model_cls(model_arch, model)
    302 ) -> Optional[_ModelInfo]:
    303     try:
--> 304         return model.inspect_model_cls()
    305     except Exception:
    306         logger.exception("Error in inspecting model architecture '%s'",

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in inspect_model_cls(self)
    273     # Performed in another process to avoid initializing CUDA
    274     def inspect_model_cls(self) -> _ModelInfo:
--> 275         return _run_in_subprocess(
    276             lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
    277 

/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/registry.py in _run_in_subprocess(fn)
    498         # cannot use `sys.executable __file__` here because the script
    499         # contains relative imports
--> 500         returned = subprocess.run(
    501             [sys.executable, "-m", "vllm.model_executor.models.registry"],
    502             input=input_bytes,

/usr/lib/python3.10/subprocess.py in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    503     with Popen(*popenargs, **kwargs) as process:
    504         try:
--> 505             stdout, stderr = process.communicate(input, timeout=timeout)
    506         except TimeoutExpired as exc:
    507             process.kill()

/usr/lib/python3.10/subprocess.py in communicate(self, input, timeout)
   1152 
   1153             try:
-> 1154                 stdout, stderr = self._communicate(input, endtime, timeout)
   1155             except KeyboardInterrupt:
   1156                 # https://bugs.python.org/issue25942

/usr/lib/python3.10/subprocess.py in _communicate(self, input, endtime, orig_timeout)
   2019                             'failed to raise TimeoutExpired.')
   2020 
-> 2021                     ready = selector.select(timeout)
   2022                     self._check_timeout(endtime, orig_timeout, stdout, stderr)
   2023 

/usr/lib/python3.10/selectors.py in select(self, timeout)
    414         ready = []
    415         try:
--> 416             fd_event_list = self._selector.poll(timeout)
    417         except InterruptedError:
    418             return ready

KeyboardInterrupt:

If you run it without a debugger, the log looks like this.

Transformers version: 4.48.3
Torch version: 2.5.1+cu121
2025-06-13 07:22:48.143368: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-13 07:22:48.149609: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-13 07:22:48.156711: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-13 07:22:48.158846: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-13 07:22:48.164362: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-13 07:22:48.651736: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
INFO 06-13 07:22:49 __init__.py:190] Automatically detected platform cuda.
vLLM version: 0.7.2
Kaggle Docker BUILD_DATE=20250219
Ubuntu 22.04.3 LTS
5.15.0-136-generic
               total        used        free      shared  buff/cache   available
Mem:            62Gi       3.5Gi        16Gi       3.0Mi        42Gi        58Gi
Swap:          8.0Gi       4.0Mi       8.0Gi
My NVIDIA driver version is '560.35.03
560.35.03'.
lrwxrwxrwx 1 root root   22 Nov 10  2023 cuda -> /etc/alternatives/cuda
lrwxrwxrwx 1 root root   25 Nov 10  2023 cuda-12 -> /etc/alternatives/cuda-12
drwxr-xr-x 1 root root 4096 Nov 10  2023 cuda-12.2
WARNING 06-13 07:22:49 config.py:2386] Casting torch.bfloat16 to torch.float16.
INFO 06-13 07:22:53 config.py:542] This model supports multiple tasks: {'generate', 'reward', 'embed', 'score', 'classify'}. Defaulting to 'generate'.
INFO 06-13 07:22:54 awq_marlin.py:111] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
WARNING 06-13 07:22:54 cuda.py:95] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
WARNING 06-13 07:22:54 config.py:678] Async output processing is not supported on the current platform type cuda.
INFO 06-13 07:22:54 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen', speculative_config=None, tokenizer='/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=2024, served_model_name=/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[],"max_capture_size":0}, use_cached_outputs=False, 
INFO 06-13 07:22:54 cuda.py:230] Using Flash Attention backend.
INFO 06-13 07:22:55 model_runner.py:1110] Starting to load model /kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen...
[W613 07:22:55.769702180 CUDAAllocatorConfig.h:28] Warning: expandable_segments not supported on this platform (function operator())

INFO 06-13 07:22:56 model_runner.py:1115] Loading model weights took 9.3280 GB
INFO 06-13 07:22:58 worker.py:267] Memory profiling takes 1.05 seconds
INFO 06-13 07:22:58 worker.py:267] the current vLLM instance can use total_gpu_memory (23.58GiB) x gpu_memory_utilization (0.96) = 22.63GiB
INFO 06-13 07:22:58 worker.py:267] model weights take 9.33GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 0.31GiB; the rest of the memory reserved for KV Cache is 12.94GiB.
INFO 06-13 07:22:58 executor_base.py:110] # CUDA blocks: 4416, # CPU blocks: 1365
INFO 06-13 07:22:58 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 69.00x
INFO 06-13 07:22:59 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 2.45 seconds
LLM Started

I just can't understand what the problem is.

I have seen freezes before on various deep-learning tasks, not only with LLMs: VS Code (or perhaps the Jupyter notebook) does not cope well when many processes are launched. So when debugging I added a flag to the program that disables every library feature that spawns extra processes, keeping everything in a single process, and that always fixed the debugger freezes in Jupyter notebooks under VS Code. Now it no longer helps, even though I deliberately went back to the old Docker image and the old libraries from my previous work. It does not work; everything still freezes.
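
For reference, a minimal sketch of how the is_debug flag from the cells above could be derived automatically instead of being hard-coded, using only standard-library checks for an attached debugger (a loaded debugpy/pydevd module or an active trace function):

import sys

def debugger_attached() -> bool:
    # True if debugpy/pydevd has been imported into this process, or if any
    # sys.settrace-based debugger is currently active.
    if "debugpy" in sys.modules or "pydevd" in sys.modules:
        return True
    return sys.gettrace() is not None

is_debug = debugger_attached()  # instead of is_debug = True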

I reduced the extensions to the smallest possible set and installed an older version of the debugger extension, but that did not help.

Extension Author (truncated) Version
jupyter-keymap ms- 1.1.2
remote-containers ms- 0.417.0
remote-ssh ms- 0.120.0
remote-ssh-edit ms- 0.87.0
vscode-remote-extensionpack ms- 0.26.0
remote-explorer ms- 0.5.0
remote-server ms- 1.5.2
vscode-containers ms- 2.0.3
vscode-docker ms- 2.0.0
debugpy ms- 2025.4.1
python ms- 2025.6.1
jupyter ms- 2025.5.0
jupyter-keymap ms- 1.1.2
jupyter-renderers ms- 1.1.0
vscode-jupyter-cell-tags ms- 0.1.9
vscode-jupyter-slideshow ms- 0.1.6

kgboyko commented Jun 13 '25 07:06

The debugger also refuses to step into the LLM() constructor, even though the "justMyCode": false parameter is set.

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [

        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
}

But if I convert the program from a Jupyter notebook into a regular Python script, everything works, and I can step into the LLM() constructor with the debugger.

import os
import transformers; print('Transformers version:', transformers.__version__)
import torch; print('Torch version:', torch.__version__)
import vllm; print('vLLM version:', vllm.__version__)
from vllm import LLM

if __name__ == '__main__':
    container_date = os.environ.get('BUILD_DATE', '').split('-')[0]
    print(f"Kaggle Docker BUILD_DATE={container_date}")

    #! cat /etc/os-release | grep -oP "PRETTY_NAME=\"\K([^\"]*)" && uname -r
    #! free -h
    #! nv_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)" && echo "My NVIDIA driver version is '${nv_version}'."
    #! ls -l /usr/local | grep cuda

    is_debug = True

    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
    os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

    if is_debug:
        os.environ['OMP_NUM_THREADS'] = '1'
        os.environ['MKL_NUM_THREADS'] = '1'
        os.environ['POLARS_ALLOW_FORKING_THREAD'] = '1'
        os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
        os.environ['TORCH_USE_CUDA_DSA'] = "1"
        os.environ['OPENBLAS_NUM_THREADS'] = '1'
        os.environ["NUM_INTER_THREADS"] = "1"
        os.environ["NUM_INTRA_THREADS"] = "1"
        os.environ["XLA_FLAGS"] = ("--xla_cpu_multi_thread_eigen=false "
                                "intra_op_parallelism_threads=1")

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["PYTORCH_CUDA_ALLOC_CONF"]='expandable_segments:True'

    llm = LLM(
        model="/kaggle/input/models-llm/Deepseek-AI/DeepSeek-R1-Distill/Qwen/Quant/qwen-14b-awq-casperhansen", # "Qwen/Qwen3-0.6B",
        max_num_seqs=1,
        max_model_len=8*1024,
        trust_remote_code=True,
        enable_prefix_caching = True,
        dtype = torch.half,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.96,
        enforce_eager=True,
        seed=2024,
    )
    tokenizer = llm.get_tokenizer()
    print("LLM Started")

I tried the "subProcess": false option when starting the debugger, but it made no difference.

HombreLluvia commented Jun 13 '25 14:06