hail
hail copied to clipboard
[batch] Too many open files when submitting a batch
What happened?
https://hail.zulipchat.com/#narrow/stream/223457-Hail-Batch-support/topic/Too.20many.20open.20files.20exception
- [ ] Replicate
- [ ] Attempt to reduce the `bounded_gather` parallelism from 150 to 75 and see if that resolves it.
Version
0.2.124
Relevant log output
Task exception was never retrieved
future: <Task finished name='Task-29959' coro=<BaseSession.post() done, defined at /usr/local/lib/python3.9/site-packages/hailtop/aiocloud/common/session.py:20> exception=ServerDisconnectedError('Server disconnected')>
Traceback (most recent call last):
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 986, in _wrap_create_connection
return await self._loop.create_connection(*args, **kwargs) # type: ignore[return-value] # noqa
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 1065, in create_connection
raise exceptions[0]
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 1050, in create_connection
sock = await self._connect_sock(
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 944, in _connect_sock
sock = socket.socket(family=family, type=type_, proto=proto)
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py", line 232, in __init__
_socket.socket.__init__(self, family, type, proto, fileno)
OSError: [Errno 24] Too many open files
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/weisburd/code/sma_finder/sma_finder_pipeline.py", line 473, in <module>
main()
File "/Users/weisburd/code/sma_finder/sma_finder_pipeline.py", line 393, in main
bp.run()
File "/Users/weisburd/code/step-pipeline/step_pipeline/batch.py", line 300, in run
result = self._run_batch_obj()
File "/Users/weisburd/code/step-pipeline/step_pipeline/batch.py", line 368, in _run_batch_obj
result = self._batch.run(
File "/usr/local/lib/python3.9/site-packages/hailtop/batch/batch.py", line 712, in run
run_result = self._backend._run(self, dry_run, verbose, delete_scratch_on_exit, **backend_kwargs) # pylint: disable=assignment-from-no-return
File "/usr/local/lib/python3.9/site-packages/hailtop/batch/backend.py", line 595, in _run
return async_to_blocking(
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 156, in async_to_blocking
return loop.run_until_complete(task)
File "/usr/local/lib/python3.9/site-packages/nest_asyncio.py", line 81, in run_until_complete
return f.result()
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/futures.py", line 201, in result
raise self._exception
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 256, in __step
result = coro.send(None)
File "/usr/local/lib/python3.9/site-packages/hailtop/batch/backend.py", line 698, in _async_run
used_remote_tmpdir_results = await bounded_gather(*[functools.partial(compile_job, j) for j in unsubmitted_jobs], parallelism=150)
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 187, in bounded_gather
return await gatherer.wait()
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 248, in wait
raise self._errors[0]
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 224, in _worker
res = await pf()
File "/usr/local/lib/python3.9/site-packages/hailtop/batch/backend.py", line 695, in compile_job
used_remote_tmpdir = await job._compile(local_tmpdir, batch_remote_tmpdir, dry_run=dry_run)
File "/usr/local/lib/python3.9/site-packages/hailtop/batch/job.py", line 876, in _compile
await self._batch._fs.write(code_path, job_command_bytes)
File "/usr/local/lib/python3.9/site-packages/hailtop/aiotools/fs/fs.py", line 298, in write
await retry_transient_errors(_write)
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 813, in retry_transient_errors
return await retry_transient_errors_with_debug_string('', 0, f, *args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/hailtop/utils/utils.py", line 825, in retry_transient_errors_with_debug_string
return await f(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/hailtop/aiotools/fs/fs.py", line 296, in _write
await f.write(data)
File "/usr/local/lib/python3.9/site-packages/hailtop/aiotools/fs/stream.py", line 119, in __aexit__
await self.wait_closed()
File "/usr/local/lib/python3.9/site-packages/hailtop/aiotools/fs/stream.py", line 104, in wait_closed
await self._wait_closed()
File "/usr/local/lib/python3.9/site-packages/hailtop/aiocloud/aiogoogle/client/storage_client.py", line 85, in _wait_closed
async with await self._request_task as resp:
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/futures.py", line 287, in __await__
return self.result() # May raise too.
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/futures.py", line 201, in result
raise self._exception
File "/usr/local/Cellar/python@3.9/3.9.17_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 256, in __step
result = coro.send(None)
File "/usr/local/lib/python3.9/site-packages/hailtop/aiocloud/common/session.py", line 21, in post
return await self.request('POST', url, **kwargs)
File "/usr/local/lib/python3.9/site-packages/hailtop/aiocloud/common/session.py", line 105, in request
return await self._http_session.request(method, url, **kwargs)
File "/usr/local/lib/python3.9/site-packages/hailtop/httpx.py", line 137, in request_and_raise_for_status
resp = await self.client_session._request(method, url, **kwargs)
File "/usr/local/lib/python3.9/site-packages/aiohttp/client.py", line 535, in _request
conn = await self._connector.connect(
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 542, in connect
proto = await self._create_connection(req, traces, timeout)
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 907, in _create_connection
_, proto = await self._create_direct_connection(req, traces, timeout)
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 1206, in _create_direct_connection
raise last_exc
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 1175, in _create_direct_connection
transp, proto = await self._wrap_create_connection(
File "/usr/local/lib/python3.9/site-packages/aiohttp/connector.py", line 992, in _wrap_create_connection
raise client_error(req.connection_key, exc) from exc
aiohttp.client_exceptions.ClientConnectorError: Cannot connect to host storage.googleapis.com:443 ssl:default [Too many open files]