mars
mars copied to clipboard
[BUG] session creation error
Describe the bug: Session creation fails with an error when calling `mars.new_session()`.
To Reproduce: To help us reproduce this bug, please provide the information below:
- Python 3.7.13
- Mars 9.0
- //
- Full stack of the error.
ERROR:mars.services.cluster.uploader:Failed to upload node info
Traceback (most recent call last):
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py", line 122, in upload_node_info
self._info.env = await asyncio.to_thread(gather_node_env)
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\aio\_threads.py", line 36, in to_thread
return await loop.run_in_executor(None, func_call)
File "D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\gather.py", line 75, in gather_node_env
cuda_info = mars_resource.cuda_info()
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py", line 360, in cuda_info
products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py", line 360, in <listcomp>
products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\nvutils.py", line 343, in get_device_info
uuid=uuid.UUID(bytes=uuid_t.bytes),
File "D:\Compiler\Anaconda\envs\py37\lib\uuid.py", line 169, in __init__
raise ValueError('bytes is not a 16-char string')
ValueError: bytes is not a 16-char string
ERROR:mars.services.cluster.uploader:Failed to upload node info: bytes is not a 16-char string
`---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_39180\3232464671.py in <module>
----> 1 mars.new_session()
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in new_session(address, session_id, backend, default, new, **kwargs)
2038
2039 session = SyncSession.init(
-> 2040 address, session_id=session_id, backend=backend, new=new, **kwargs
2041 )
2042 if default:
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in init(cls, address, session_id, backend, new, **kwargs)
1634 coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs)
1635 fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop)
-> 1636 isolated_session = fut.result()
1637 return SyncSession(address, session_id, isolated_session, isolation)
1638
D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in init(cls, address, session_id, backend, new, timeout, **kwargs)
846 return (
847 await new_cluster_in_isolation(
--> 848 address, timeout=timeout, backend=backend, **kwargs
849 )
850 ).session
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in new_cluster_in_isolation(address, n_worker, n_cpu, mem_bytes, cuda_devices, subprocess_start_method, backend, config, web, timeout, n_supervisor_process)
89 n_supervisor_process,
90 )
---> 91 await cluster.start()
92 return await LocalClient.create(cluster, timeout)
93
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in start(self)
217 await self._start_worker_pools()
218 # start service
--> 219 await self._start_service()
220
221 if self._web:
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in _start_service(self)
263 async def _start_service(self):
264 self._web = await start_supervisor(
--> 265 self.supervisor_address, config=self._config, web=self._web
266 )
267 for worker_pool, band_to_resource in zip(
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\service.py in start_supervisor(address, lookup_address, modules, config, web)
40 config["modules"] = modules
41 try:
---> 42 await start_services(NodeRole.SUPERVISOR, config, address=address)
43 logger.debug("Mars supervisor started at %s", address)
44 except ImportError:
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\core.py in start_services(node_role, config, address, mark_ready)
172 for entries in svc_entries_list:
173 instances = [svc_entry.get_instance(address, config) for svc_entry in entries]
--> 174 await asyncio.gather(*[inst.start() for inst in instances])
175
176 if mark_ready and "cluster" in service_names:
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\supervisor\service.py in start(self)
65 interval=svc_config.get("node_check_interval"),
66 uid=NodeInfoUploaderActor.default_uid(),
---> 67 address=address,
68 )
69 await mo.create_actor(
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\api.py in create_actor(actor_cls, uid, address, *args, **kwargs)
25 async def create_actor(actor_cls, *args, uid=None, address=None, **kwargs) -> ActorRef:
26 ctx = get_context()
---> 27 return await ctx.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
28
29
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\context.py in create_actor(self, actor_cls, uid, address, *args, **kwargs)
110 future = await self._call(address, create_actor_message, wait=False)
111 result = await self._wait(future, address, create_actor_message)
--> 112 return self._process_result_message(result)
113
114 async def has_actor(self, actor_ref: ActorRef) -> bool:
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\context.py in _process_result_message(message)
74 return message.result
75 else:
---> 76 raise message.as_instanceof_cause()
77
78 async def _wait(self, future: asyncio.Future, address: str, message: _MessageBase):
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\pool.py in create_actor(self, message)
523 actor.address = address = self.external_address
524 self._actors[actor_id] = actor
--> 525 await self._run_coro(message.message_id, actor.__post_create__())
526
527 result = ActorRef(address, actor_id)
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\pool.py in _run_coro(self, message_id, coro)
341 self._process_messages[message_id] = asyncio.tasks.current_task()
342 try:
--> 343 return await coro
344 finally:
345 self._process_messages.pop(message_id, None)
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in __post_create__(self)
58 async def __post_create__(self):
59 self._upload_task = asyncio.create_task(self._periodical_upload_node_info())
---> 60 await self._uploaded_future
61
62 async def __pre_destroy__(self):
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in _periodical_upload_node_info(self)
84 while True:
85 try:
---> 86 await self.upload_node_info()
87 if not self._uploaded_future.done():
88 self._uploaded_future.set_result(None)
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in upload_node_info(self, status)
120 try:
121 if not self._info.env:
--> 122 self._info.env = await asyncio.to_thread(gather_node_env)
123 self._info.detail.update(
124 await asyncio.to_thread(
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\aio\_threads.py in to_thread(func, *args, **kwargs)
34 ctx = contextvars.copy_context()
35 func_call = functools.partial(ctx.run, func, *args, **kwargs)
---> 36 return await loop.run_in_executor(None, func_call)
D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\gather.py in gather_node_env()
73
74 try:
---> 75 cuda_info = mars_resource.cuda_info()
76 except NVError: # pragma: no cover
77 logger.exception("NVError encountered, cannot gather CUDA devices.")
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py in cuda_info()
358 driver_version=driver_info.driver_version,
359 cuda_version=driver_info.cuda_version,
--> 360 products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
361 gpu_count=gpu_count,
362 )
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py in <listcomp>(.0)
358 driver_version=driver_info.driver_version,
359 cuda_version=driver_info.cuda_version,
--> 360 products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
361 gpu_count=gpu_count,
362 )
D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\nvutils.py in get_device_info(dev_index)
341 info = _device_infos[dev_index] = _cu_device_info(
342 index=real_dev_index,
--> 343 uuid=uuid.UUID(bytes=uuid_t.bytes),
344 name=name_buf.value.decode(),
345 multiprocessors=cores.value,
D:\Compiler\Anaconda\envs\py37\lib\uuid.py in __init__(self, hex, bytes, bytes_le, fields, int, version, is_safe)
167 if bytes is not None:
168 if len(bytes) != 16:
--> 169 raise ValueError('bytes is not a 16-char string')
170 assert isinstance(bytes, bytes_), repr(bytes)
171 int = int_.from_bytes(bytes, byteorder='big')
ValueError: [address=127.0.0.1:8482, pid=39180] bytes is not a 16-char string`
mars.new_session()
Expected behavior: The session should be created successfully.
@wjsi Do you have time to look at this issue?
I ran into the same problem. Have you solved it?