dask-cloudprovider
dask-cloudprovider copied to clipboard
Timeout Error
Hi,
After trying out the example on https://cloudprovider.dask.org/en/latest/ with:
from dask_cloudprovider import FargateCluster
cluster = FargateCluster()
I was getting a "rate exceeded" error, so I went ahead and applied the following code change locally: https://github.com/dask/dask-cloudprovider/pull/124
However, now I get the following error:
OSError: Timed out trying to connect to 'tcp://<sldkflkjdf>' after 10 s: Timed out trying to connect to 'tcp://<sldkflkjdf>' after 10 s: connect() didn't finish in time
Full error:
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/comm/core.py in connect(addr, timeout, deserialize, handshake_overrides, **connection_args)
321 if not comm:
--> 322 _raise(error)
323 except FatalCommClosedError:
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/comm/core.py in _raise(error)
274 )
--> 275 raise IOError(msg)
276
OSError: Timed out trying to connect to 'tcp://3.84.98.29:8786' after 10 s: connect() didn't finish in time
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-4-2c6d42e9ffff> in <module>
----> 1 cluster2 = FargateCluster()
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, **kwargs)
1216
1217 def __init__(self, **kwargs):
-> 1218 super().__init__(fargate_scheduler=True, fargate_workers=True, **kwargs)
1219
1220
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/dask_cloudprovider/providers/aws/ecs.py in __init__(self, fargate_scheduler, fargate_workers, image, scheduler_cpu, scheduler_mem, scheduler_timeout, scheduler_extra_args, worker_cpu, worker_mem, worker_gpu, worker_extra_args, n_workers, cluster_arn, cluster_name_template, execution_role_arn, task_role_arn, task_role_policies, cloudwatch_logs_group, cloudwatch_logs_stream_prefix, cloudwatch_logs_default_retention, vpc, subnets, security_groups, environment, tags, find_address_timeout, skip_cleanup, aws_access_key_id, aws_secret_access_key, region_name, platform_version, fargate_use_private_ip, mount_points, volumes, mount_volumes_on_scheduler, **kwargs)
668 self._lock = asyncio.Lock()
669 self.session = aiobotocore.get_session()
--> 670 super().__init__(**kwargs)
671
672 def _client(self, name: str):
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name)
274 if not self.asynchronous:
275 self._loop_runner.start()
--> 276 self.sync(self._start)
277 self.sync(self._correct_state)
278
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
181 return future
182 else:
--> 183 return sync(self.loop, func, *args, **kwargs)
184
185 def _log(self, log):
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/tornado/gen.py in run(self)
733
734 try:
--> 735 value = future.result()
736 except Exception:
737 exc_info = sys.exc_info()
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/dask_cloudprovider/providers/aws/ecs.py in _start(self)
859 "Hang tight! ",
860 ):
--> 861 await super()._start()
862
863 @property
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/deploy/spec.py in _start(self)
307 connection_args=self.security.get_connection_args("client"),
308 )
--> 309 await super()._start()
310
311 def _correct_state(self):
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/deploy/cluster.py in _start(self)
65
66 async def _start(self):
---> 67 comm = await self.scheduler_comm.live_comm()
68 await comm.write({"op": "subscribe_worker_status"})
69 self.scheduler_info = await comm.read()
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/core.py in live_comm(self)
766 del self.comms[s]
767 if not open or comm.closed():
--> 768 comm = await connect(
769 self.address,
770 self.timeout,
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/comm/core.py in connect(addr, timeout, deserialize, handshake_overrides, **connection_args)
332 backoff = min(backoff, 1) # wait at most one second
333 else:
--> 334 _raise(error)
335 else:
336 break
~/.pyenv/versions/miniconda3-4.3.30/envs/dask_test/lib/python3.8/site-packages/distributed/comm/core.py in _raise(error)
273 error,
274 )
--> 275 raise IOError(msg)
276
277 backoff = 0.01
OSError: Timed out trying to connect to 'tcp://<sldkflkjdf>' after 10 s: Timed out trying to connect to 'tcp://<sldkflkjdf>' after 10 s: connect() didn't finish in time
I can see that the scheduler and worker tasks are created in the AWS console, but I'm not sure what I should do next.
Thanks for raising this @hiqbal2.
It seems like your Python session is not able to connect to the IP address given to the scheduler. Can you confirm that you are able to open the dashboard in a web browser? For example, according to the traceback above, that would have been at 3.84.98.29:8787.
If it helps, here's the portforward_out_log.txt from one of my students.
(I'm afraid I know next to nothing about networking, so I have limited ability to parse it myself.)
Same problem here - no dashboard connection from the web browser. I'm currently using the Docker container recommended in the docs.
The timeout turned out to be a problem with differing Dask versions, but it seems I still can't connect to the dashboard via the web.
So is there a library that the web browser connection depends on that could be wrong, missing, or mismatched between the local environment and the container, and so cause a problem?
There aren't any local dependencies you would need to view the web page. I am wondering if this is some firewall rule or restriction on the AWS accounts you are using that blocks this traffic from being exposed to the internet.