mars icon indicating copy to clipboard operation
mars copied to clipboard

[BUG] session creation error

Open Ukon233 opened this issue 3 years ago • 2 comments

Describe the bug: Session creation fails with `ValueError: bytes is not a 16-char string`.

To Reproduce: To help us reproduce this bug, please provide the information below:

  1. Python 3.7.13
  2. Mars 9.0
  3. //
  4. Full stack trace of the error:
ERROR:mars.services.cluster.uploader:Failed to upload node info
Traceback (most recent call last):
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py", line 122, in upload_node_info
    self._info.env = await asyncio.to_thread(gather_node_env)
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\aio\_threads.py", line 36, in to_thread
    return await loop.run_in_executor(None, func_call)
  File "D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\gather.py", line 75, in gather_node_env
    cuda_info = mars_resource.cuda_info()
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py", line 360, in cuda_info
    products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py", line 360, in <listcomp>
    products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
  File "D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\nvutils.py", line 343, in get_device_info
    uuid=uuid.UUID(bytes=uuid_t.bytes),
  File "D:\Compiler\Anaconda\envs\py37\lib\uuid.py", line 169, in __init__
    raise ValueError('bytes is not a 16-char string')
ValueError: bytes is not a 16-char string
ERROR:mars.services.cluster.uploader:Failed to upload node info: bytes is not a 16-char string

`---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_39180\3232464671.py in <module>
----> 1 mars.new_session()

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in new_session(address, session_id, backend, default, new, **kwargs)
   2038 
   2039     session = SyncSession.init(
-> 2040         address, session_id=session_id, backend=backend, new=new, **kwargs
   2041     )
   2042     if default:

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in init(cls, address, session_id, backend, new, **kwargs)
   1634         coro = _IsolatedSession.init(address, session_id, backend, new=new, **kwargs)
   1635         fut = asyncio.run_coroutine_threadsafe(coro, isolation.loop)
-> 1636         isolated_session = fut.result()
   1637         return SyncSession(address, session_id, isolated_session, isolation)
   1638 

D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\session.py in init(cls, address, session_id, backend, new, timeout, **kwargs)
    846             return (
    847                 await new_cluster_in_isolation(
--> 848                     address, timeout=timeout, backend=backend, **kwargs
    849                 )
    850             ).session

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in new_cluster_in_isolation(address, n_worker, n_cpu, mem_bytes, cuda_devices, subprocess_start_method, backend, config, web, timeout, n_supervisor_process)
     89         n_supervisor_process,
     90     )
---> 91     await cluster.start()
     92     return await LocalClient.create(cluster, timeout)
     93 

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in start(self)
    217         await self._start_worker_pools()
    218         # start service
--> 219         await self._start_service()
    220 
    221         if self._web:

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\local.py in _start_service(self)
    263     async def _start_service(self):
    264         self._web = await start_supervisor(
--> 265             self.supervisor_address, config=self._config, web=self._web
    266         )
    267         for worker_pool, band_to_resource in zip(

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\deploy\oscar\service.py in start_supervisor(address, lookup_address, modules, config, web)
     40         config["modules"] = modules
     41     try:
---> 42         await start_services(NodeRole.SUPERVISOR, config, address=address)
     43         logger.debug("Mars supervisor started at %s", address)
     44     except ImportError:

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\core.py in start_services(node_role, config, address, mark_ready)
    172     for entries in svc_entries_list:
    173         instances = [svc_entry.get_instance(address, config) for svc_entry in entries]
--> 174         await asyncio.gather(*[inst.start() for inst in instances])
    175 
    176     if mark_ready and "cluster" in service_names:

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\supervisor\service.py in start(self)
     65             interval=svc_config.get("node_check_interval"),
     66             uid=NodeInfoUploaderActor.default_uid(),
---> 67             address=address,
     68         )
     69         await mo.create_actor(

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\api.py in create_actor(actor_cls, uid, address, *args, **kwargs)
     25 async def create_actor(actor_cls, *args, uid=None, address=None, **kwargs) -> ActorRef:
     26     ctx = get_context()
---> 27     return await ctx.create_actor(actor_cls, *args, uid=uid, address=address, **kwargs)
     28 
     29 

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\context.py in create_actor(self, actor_cls, uid, address, *args, **kwargs)
    110         future = await self._call(address, create_actor_message, wait=False)
    111         result = await self._wait(future, address, create_actor_message)
--> 112         return self._process_result_message(result)
    113 
    114     async def has_actor(self, actor_ref: ActorRef) -> bool:

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\context.py in _process_result_message(message)
     74             return message.result
     75         else:
---> 76             raise message.as_instanceof_cause()
     77 
     78     async def _wait(self, future: asyncio.Future, address: str, message: _MessageBase):

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\pool.py in create_actor(self, message)
    523             actor.address = address = self.external_address
    524             self._actors[actor_id] = actor
--> 525             await self._run_coro(message.message_id, actor.__post_create__())
    526 
    527             result = ActorRef(address, actor_id)

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\oscar\backends\pool.py in _run_coro(self, message_id, coro)
    341         self._process_messages[message_id] = asyncio.tasks.current_task()
    342         try:
--> 343             return await coro
    344         finally:
    345             self._process_messages.pop(message_id, None)

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in __post_create__(self)
     58     async def __post_create__(self):
     59         self._upload_task = asyncio.create_task(self._periodical_upload_node_info())
---> 60         await self._uploaded_future
     61 
     62     async def __pre_destroy__(self):

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in _periodical_upload_node_info(self)
     84         while True:
     85             try:
---> 86                 await self.upload_node_info()
     87                 if not self._uploaded_future.done():
     88                     self._uploaded_future.set_result(None)

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\uploader.py in upload_node_info(self, status)
    120         try:
    121             if not self._info.env:
--> 122                 self._info.env = await asyncio.to_thread(gather_node_env)
    123             self._info.detail.update(
    124                 await asyncio.to_thread(

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\aio\_threads.py in to_thread(func, *args, **kwargs)
     34     ctx = contextvars.copy_context()
     35     func_call = functools.partial(ctx.run, func, *args, **kwargs)
---> 36     return await loop.run_in_executor(None, func_call)

D:\Compiler\Anaconda\envs\py37\lib\concurrent\futures\thread.py in run(self)
     55 
     56         try:
---> 57             result = self.fn(*self.args, **self.kwargs)
     58         except BaseException as exc:
     59             self.future.set_exception(exc)

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\services\cluster\gather.py in gather_node_env()
     73 
     74     try:
---> 75         cuda_info = mars_resource.cuda_info()
     76     except NVError:  # pragma: no cover
     77         logger.exception("NVError encountered, cannot gather CUDA devices.")

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py in cuda_info()
    358         driver_version=driver_info.driver_version,
    359         cuda_version=driver_info.cuda_version,
--> 360         products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
    361         gpu_count=gpu_count,
    362     )

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\resource.py in <listcomp>(.0)
    358         driver_version=driver_info.driver_version,
    359         cuda_version=driver_info.cuda_version,
--> 360         products=[nvutils.get_device_info(idx).name for idx in range(gpu_count)],
    361         gpu_count=gpu_count,
    362     )

D:\Compiler\Anaconda\envs\py37\lib\site-packages\mars\lib\nvutils.py in get_device_info(dev_index)
    341     info = _device_infos[dev_index] = _cu_device_info(
    342         index=real_dev_index,
--> 343         uuid=uuid.UUID(bytes=uuid_t.bytes),
    344         name=name_buf.value.decode(),
    345         multiprocessors=cores.value,

D:\Compiler\Anaconda\envs\py37\lib\uuid.py in __init__(self, hex, bytes, bytes_le, fields, int, version, is_safe)
    167         if bytes is not None:
    168             if len(bytes) != 16:
--> 169                 raise ValueError('bytes is not a 16-char string')
    170             assert isinstance(bytes, bytes_), repr(bytes)
    171             int = int_.from_bytes(bytes, byteorder='big')

ValueError: [address=127.0.0.1:8482, pid=39180] bytes is not a 16-char string`
mars.new_session()

Expected behavior: The session should be created successfully.

Ukon233 avatar Jun 29 '22 13:06 Ukon233

@wjsi Do you have time to look at this issue?

qinxuye avatar Jun 30 '22 02:06 qinxuye

I ran into the same problem. Have you solved it?

HXuan-Wang avatar Jul 07 '22 12:07 HXuan-Wang