v5litepod-8: metadata connection refused
🐛 Bug
I'm running vllm with pytorch_xla==2.5.0. I originally described this bug on the vllm repo, but it appears to be a bug in pytorch_xla itself.
To Reproduce
Steps to reproduce the behavior:
- clone and install vllm on a v5litepod-8 according to either their Docker or from-source instructions here
- vllm fails with a `torch_xla` error:
File "/usr/local/lib/python3.10/site-packages/torch_xla/_internal/tpu.py", line 187, in version
env = get_tpu_env()
- full exception:
ERROR 10-21 02:38:59 worker_base.py:464] requests.exceptions.ConnectionError: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/tpu-env (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f01780d21a0>: Failed to establish a new connection: [Errno 111] Connection refused'))
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 199, in _new_conn
sock = connection.create_connection(
File "/usr/local/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
raise err
File "/usr/local/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 789, in urlopen
response = self._make_request(
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 495, in _make_request
conn.request(
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 441, in request
self.endheaders()
File "/usr/local/lib/python3.10/http/client.py", line 1278, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.10/http/client.py", line 1038, in _send_output
self.send(msg)
File "/usr/local/lib/python3.10/http/client.py", line 976, in send
self.connect()
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 279, in connect
self.sock = self._new_conn()
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 214, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f01780d21a0>: Failed to establish a new connection: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
resp = conn.urlopen(
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 843, in urlopen
retries = retries.increment(
File "/usr/local/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/tpu-env (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f01780d21a0>: Failed to establish a new connection: [Errno 111] Connection refused'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 390, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 139, in from_engine_args
return cls(
File "/workspace/vllm/vllm/engine/multiprocessing/engine.py", line 78, in __init__
self.engine = LLMEngine(*args, **kwargs)
File "/workspace/vllm/vllm/engine/llm_engine.py", line 335, in __init__
self.model_executor = executor_class(
File "/workspace/vllm/vllm/executor/ray_tpu_executor.py", line 39, in __init__
super().__init__(*args, **kwargs)
File "/workspace/vllm/vllm/executor/executor_base.py", line 47, in __init__
self._init_executor()
File "/workspace/vllm/vllm/executor/ray_tpu_executor.py", line 51, in _init_executor
self._init_workers_ray(placement_group)
File "/workspace/vllm/vllm/executor/ray_tpu_executor.py", line 198, in _init_workers_ray
self._run_workers("load_model",
File "/workspace/vllm/vllm/executor/ray_tpu_executor.py", line 264, in _run_workers
driver_worker_output = self.driver_worker.execute_method(
File "/workspace/vllm/vllm/worker/worker_base.py", line 465, in execute_method
raise e
File "/workspace/vllm/vllm/worker/worker_base.py", line 456, in execute_method
return executor(*args, **kwargs)
File "/workspace/vllm/vllm/worker/tpu_worker.py", line 111, in load_model
self.model_runner.load_model()
File "/workspace/vllm/vllm/worker/tpu_model_runner.py", line 151, in load_model
model = get_model(
File "/workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 19, in get_model
return loader.load_model(model_config=model_config,
File "/workspace/vllm/vllm/model_executor/model_loader/loader.py", line 398, in load_model
model = _initialize_model(model_config, self.load_config,
File "/workspace/vllm/vllm/model_executor/model_loader/loader.py", line 175, in _initialize_model
return build_model(
File "/workspace/vllm/vllm/model_executor/model_loader/loader.py", line 160, in build_model
return model_class(config=hf_config,
File "/workspace/vllm/vllm/model_executor/models/llama.py", line 515, in __init__
self.model = LlamaModel(config,
File "/workspace/vllm/vllm/compilation/decorators.py", line 71, in __init__
old_init(self, *args, **kwargs)
File "/workspace/vllm/vllm/model_executor/models/llama.py", line 305, in __init__
self.start_layer, self.end_layer, self.layers = make_layers(
File "/workspace/vllm/vllm/model_executor/models/utils.py", line 419, in make_layers
[PPMissingLayer() for _ in range(start_layer)] + [
File "/workspace/vllm/vllm/model_executor/models/utils.py", line 420, in <listcomp>
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
File "/workspace/vllm/vllm/model_executor/models/llama.py", line 307, in <lambda>
lambda prefix: LlamaDecoderLayer(config=config,
File "/workspace/vllm/vllm/model_executor/models/llama.py", line 217, in __init__
self.self_attn = LlamaAttention(
File "/workspace/vllm/vllm/model_executor/models/llama.py", line 170, in __init__
self.attn = Attention(
File "/workspace/vllm/vllm/attention/layer.py", line 85, in __init__
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
File "/workspace/vllm/vllm/attention/backends/pallas.py", line 122, in __init__
if torch_xla.tpu.version() < 4:
File "/usr/local/lib/python3.10/site-packages/torch_xla/_internal/tpu.py", line 187, in version
env = get_tpu_env()
File "/usr/local/lib/python3.10/site-packages/torch_xla/_internal/tpu.py", line 181, in get_tpu_env
metadata = _get_metadata('tpu-env')
File "/usr/local/lib/python3.10/site-packages/torch_xla/_internal/tpu.py", line 89, in _get_metadata
resp = requests.get(path, headers={'Metadata-Flavor': 'Google'})
File "/usr/local/lib/python3.10/site-packages/requests/api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "/usr/local/lib/python3.10/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.10/site-packages/requests/adapters.py", line 700, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='metadata.google.internal', port=80): Max retries exceeded with url: /computeMetadata/v1/instance/attributes/tpu-env (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f01780d21a0>: Failed to establish a new connection: [Errno 111] Connection refused'))
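For reference, the failing call reduces to a single HTTP request against the GCE metadata server; the sketch below is reconstructed from the traceback above (torch_xla's `_get_metadata('tpu-env')` issues exactly this request). Running it in the same environment as the failing worker should show the same `Connection refused`, without involving vllm at all:

```python
# Minimal reproduction of the metadata lookup that fails inside the Ray
# worker; URL and header are taken verbatim from the traceback above.
import requests

resp = requests.get(
    "http://metadata.google.internal/computeMetadata/v1/instance/attributes/tpu-env",
    headers={"Metadata-Flavor": "Google"},
)
print(resp.status_code)
```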
I think it has to do with the Ray workers not having access to the GCE metadata server. A temporary workaround is to comment out the `torch_xla.tpu.version()` check and the megacore-mode detection in pallas.py:

```python
# if torch_xla.tpu.version() < 4:
#     raise NotImplementedError("TPU version must be 4 or higher.")
self.megacore_mode = None
# tpu_env = torch_xla.tpu.get_tpu_env()
# tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None)
#             or tpu_env.get("TYPE", None)
#             or tpu_env.get("TPU_ACCELERATOR_TYPE", None))
# assert tpu_type is not None
# tpu_type = tpu_type.lower()
# if "lite" not in tpu_type:
#     if self.num_kv_heads % 2 == 0:
#         self.megacore_mode = "kv_head"
#     else:
#         # NOTE(woosuk): If the batch size is not a multiple of 2, the
#         # megacore mode will be None.
#         self.megacore_mode = "batch"
```
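If you'd rather not delete the detection logic entirely, here is a minimal sketch of a gentler variant (assumptions: falling back to `megacore_mode = None` is acceptable when the metadata server is unreachable, and `detect_megacore_mode` is a hypothetical helper, not an existing vllm or torch_xla function):

```python
# Hypothetical helper (not part of vllm): same branching as the block
# commented out above, but it degrades to None when metadata.google.internal
# refuses the connection instead of crashing the worker.
from typing import Optional

import requests
from torch_xla._internal import tpu  # module shown in the traceback above


def detect_megacore_mode(num_kv_heads: int) -> Optional[str]:
    try:
        tpu_env = tpu.get_tpu_env()
    except requests.exceptions.ConnectionError:
        return None  # Ray worker cannot reach the GCE metadata server
    tpu_type = (tpu_env.get("ACCELERATOR_TYPE", None)
                or tpu_env.get("TYPE", None)
                or tpu_env.get("TPU_ACCELERATOR_TYPE", None))
    if tpu_type is None or "lite" in tpu_type.lower():
        return None
    # "kv_head" needs an even number of KV heads; otherwise fall back to
    # "batch" mode, which requires the batch size to be a multiple of 2.
    return "kv_head" if num_kv_heads % 2 == 0 else "batch"
```

pallas.py could then set `self.megacore_mode = detect_megacore_mode(self.num_kv_heads)` instead of hard-coding `None`.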