Timeouts during batch disk addition using Juju storage at deploy time
The following logs were seen in a single-node MicroCeph deployment on the localhost substrate:
juju deploy ch:microceph -n 1 --config "enable-rgw"="*" --constraints "virt-type=virtual-machine root-disk=50G" --storage osd-standalone="loop,5G,3"
Logs:
unit-microceph-0: 16:14:42 ERROR juju.worker.dependency "api-caller" manifold worker returned unexpected error: api connection broken unexpectedly
unit-microceph-0: 16:14:47 ERROR unit.microceph/0.juju-log Exception raised in section 'storage': Command '['microceph', 'disk', 'add', '/dev/loop0', '/dev/loop1', '/dev/loop2']' timed out after 180 seconds
unit-microceph-0: 16:14:49 ERROR unit.microceph/0.juju-log Traceback (most recent call last):
File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops_sunbeam/guard.py", line 94, in guard
yield
File "/var/lib/juju/agents/unit-microceph-0/charm/src/storage.py", line 100, in _on_osd_standalone_attached
self._enroll_disks_in_batch(enroll)
File "/var/lib/juju/agents/unit-microceph-0/charm/src/storage.py", line 214, in _enroll_disks_in_batch
microceph.enroll_disks_as_osds(disk_paths)
File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 265, in enroll_disks_as_osds
add_batch_osds(available_disks)
File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 214, in add_batch_osds
_run_cmd(cmd)
File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 41, in _run_cmd
process = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=180)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/subprocess.py", line 550, in run
stdout, stderr = process.communicate(input, timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/subprocess.py", line 1209, in communicate
stdout, stderr = self._communicate(input, endtime, timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/subprocess.py", line 2116, in _communicate
self._check_timeout(endtime, orig_timeout, stdout, stderr)
File "/usr/lib/python3.12/subprocess.py", line 1253, in _check_timeout
raise TimeoutExpired(
subprocess.TimeoutExpired: Command '['microceph', 'disk', 'add', '/dev/loop0', '/dev/loop1', '/dev/loop2']' timed out after 180 seconds
unit-microceph-0: 16:14:52 INFO juju.worker.apicaller [6084c9] "unit-microceph-0" successfully connected to "wss://10.63.95.20:17070"
unit-microceph-0: 16:14:58 ERROR unit.microceph/0.juju-log Uncaught exception while in charm code:
Traceback (most recent call last):
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops_sunbeam/guard.py", line 94, in guard
    yield
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/storage.py", line 100, in _on_osd_standalone_attached
    self._enroll_disks_in_batch(enroll)
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/storage.py", line 214, in _enroll_disks_in_batch
    microceph.enroll_disks_as_osds(disk_paths)
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 265, in enroll_disks_as_osds
    add_batch_osds(available_disks)
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 214, in add_batch_osds
    _run_cmd(cmd)
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/microceph.py", line 41, in _run_cmd
    process = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=180)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/subprocess.py", line 550, in run
    stdout, stderr = process.communicate(input, timeout=timeout)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/subprocess.py", line 1209, in communicate
    stdout, stderr = self._communicate(input, endtime, timeout)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/subprocess.py", line 2116, in _communicate
    self._check_timeout(endtime, orig_timeout, stdout, stderr)
  File "/usr/lib/python3.12/subprocess.py", line 1253, in _check_timeout
    raise TimeoutExpired(
subprocess.TimeoutExpired: Command '['microceph', 'disk', 'add', '/dev/loop0', '/dev/loop1', '/dev/loop2']' timed out after 180 seconds

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/model.py", line 3320, in _run
    result = subprocess.run(args, **kwargs) # type: ignore
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/subprocess.py", line 571, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '('/var/lib/juju/tools/unit-microceph-0/status-set', '--application=False', 'blocked', "(workload) Error in charm (see logs): Command '['microceph', 'disk', 'add', '/dev/loop0', '/dev/loop1', '/dev/loop2']' timed out after 180 seconds")' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/var/lib/juju/agents/unit-microceph-0/charm/./src/charm.py", line 677, in <module>
    main(MicroCephCharm)
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/main.py", line 40, in main
    return _main.main(charm_class=charm_class, use_juju_for_storage=use_juju_for_storage)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/_main.py", line 558, in main
    manager.run()
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/_main.py", line 543, in run
    self._emit()
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/_main.py", line 499, in _emit
    self.framework.reemit()
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/framework.py", line 916, in reemit
    self._reemit()
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/framework.py", line 996, in _reemit
    custom_handler(event)
  File "/var/lib/juju/agents/unit-microceph-0/charm/src/storage.py", line 98, in _on_osd_standalone_attached
    with sunbeam_guard.guard(self.charm, self.name):
  File "/usr/lib/python3.12/contextlib.py", line 158, in __exit__
    self.gen.throw(value)
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops_sunbeam/guard.py", line 129, in guard
    charm.status.set(
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops_sunbeam/compound_status.py", line 101, in set
    self.on_update()
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops_sunbeam/compound_status.py", line 270, in on_update
    self._charm.unit.status = status
    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/model.py", line 619, in status
    self._backend.status_set(
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/model.py", line 3529, in status_set
    self._run('status-set', f'--application={is_app}', status, message)
  File "/var/lib/juju/agents/unit-microceph-0/charm/venv/ops/model.py", line 3322, in _run
    raise ModelError(e.stderr) from e
ops.model.ModelError: ERROR connection is shut down
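
For context on the timeout itself: the traceback shows the charm's _run_cmd helper in microceph.py wrapping subprocess.run with a fixed 180-second timeout, so any hang in the batch "microceph disk add" call surfaces as TimeoutExpired. A minimal sketch of that failure mode follows; the sleep command and the timeout parameter are stand-ins for illustration, not the charm's actual code.

import subprocess

def _run_cmd(cmd: list[str], timeout: int = 180) -> str:
    # Mirrors the pattern visible in the traceback: subprocess.run with a hard
    # timeout, so a hanging external command raises subprocess.TimeoutExpired.
    process = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=timeout)
    return process.stdout

try:
    # Stand-in for the hanging ['microceph', 'disk', 'add', ...] batch call.
    _run_cmd(["sleep", "300"], timeout=3)
except subprocess.TimeoutExpired as exc:
    print(f"timed out: {exc}")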
Thank you for reporting your feedback to us!
The internal ticket has been created: https://warthogs.atlassian.net/browse/CEPH-1190.
This message was autogenerated
@UtkarshBhatthere we'd need system logs to be able to debug this further -- this timeout points to some hang. Is this something reproducible, you reckon?
FTR, "ops.model.ModelError: ERROR connection is shut down" means the Juju agent lost connectivity to the controller, which probably tripped up charm-microceph here.
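
To illustrate that last point: status-set is a hook tool executed through the Juju agent, so once the agent loses its controller connection the status update itself fails, and the resulting ops.model.ModelError masks the original TimeoutExpired. A minimal defensive sketch, assuming a hypothetical set_status_safely helper (this is not charm-microceph or ops_sunbeam code):

import logging
import ops

logger = logging.getLogger(__name__)

def set_status_safely(unit: ops.Unit, status: ops.StatusBase) -> None:
    # status-set runs via the juju agent; if the agent has lost its controller
    # connection it fails with "ERROR connection is shut down", which ops
    # re-raises as ModelError.
    try:
        unit.status = status
    except ops.ModelError:
        logger.exception("Could not set unit status; is the agent connection down?")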