test_stop_sim[using_scheduler] fails nightly test
Describe the bug
test_stop_sim[using_scheduler] fails with a timeout (>360 s) in the nightly test run. Logs: https://github.com/equinor/komodo-releases/actions/runs/8088980330/job/22120233675
________________________ test_stop_sim[using_scheduler] ________________________
[gw3] linux -- Python 3.8.18 /tmp/f_scout_ci/actions-runner-01/_temp/test-kenv/root/bin/python
copy_case = <function fixture_copy_case.<locals>._copy_case at 0x7f4de2293e50>
storage = <ert.storage.local_storage.LocalStorage object at 0x7f4dc07e36a0>
@pytest.mark.usefixtures("using_scheduler")
def test_stop_sim(copy_case, storage):
copy_case("batch_sim")
with open("sleepy_time.ert", "a", encoding="utf-8") as f:
f.write(
"""
LOAD_WORKFLOW_JOB workflows/jobs/REALIZATION_NUMBER
LOAD_WORKFLOW workflows/REALIZATION_NUMBER_WORKFLOW
HOOK_WORKFLOW REALIZATION_NUMBER_WORKFLOW PRE_SIMULATION
LOAD_WORKFLOW_JOB workflows/jobs/REALIZATION_NUMBER
"""
)
ert_config = ErtConfig.from_file("sleepy_time.ert")
rsim = BatchSimulator(
ert_config,
{"WELL_ORDER": ["W1", "W2", "W3"], "WELL_ON_OFF": ["W1", "W2", "W3"]},
["ORDER", "ON_OFF"],
)
case_name = "MyCaseName_123"
case_data = [
(
2,
{
"WELL_ORDER": {"W1": 1, "W2": 2, "W3": 3},
"WELL_ON_OFF": {"W1": 4, "W2": 5, "W3": 6},
},
),
(
1,
{
"WELL_ORDER": {"W1": 7, "W2": 8, "W3": 9},
"WELL_ON_OFF": {"W1": 10, "W2": 11, "W3": 12},
},
),
]
# Starting a simulation which should actually run through.
ctx = rsim.start(case_name, case_data, storage=storage)
> ctx.stop()
/tmp/f_scout_ci/actions-runner-01/_temp/test_root/tests/unit_tests/simulator/test_batch_sim.py:395:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/tmp/f_scout_ci/actions-runner-01/_temp/test-kenv/root/lib64/python3.8/site-packages/ert/simulator/simulation_context.py:250: in stop
self._sim_thread.join()
/opt/rh/rh-python38/root/usr/lib64/python3.8/threading.py:1011: in join
self._wait_for_tstate_lock()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ErtThread(Thread-733, started 139969888691968)>, block = True
timeout = -1
def _wait_for_tstate_lock(self, block=True, timeout=-1):
# Issue #18808: wait for the thread state to be gone.
# At the end of the thread's life, after all knowledge of the thread
# is removed from C data structures, C code releases our _tstate_lock.
# This method passes its arguments to _tstate_lock.acquire().
# If the lock is acquired, the C code is done, and self._stop() is
# called. That sets ._is_stopped to True, and ._tstate_lock to None.
lock = self._tstate_lock
if lock is None: # already determined that the C code is done
assert self._is_stopped
> elif lock.acquire(block, timeout):
E Failed: Timeout >360.0s
/opt/rh/rh-python38/root/usr/lib64/python3.8/threading.py:1027: Failed
To reproduce TBD
Expected behaviour The test should pass without timing out.
Screenshots N/A
Environment
- OS: RHEL7
- ERT/Komodo release: bleeding
- Python version: 3.8
- Remote/HPC execution involved: [yes|no]
Additional context N/A
Reproduced locally with pytest -vs tests/unit_tests/simulator/test_batch_sim.py::test_stop_sim --count=1000 -k using_scheduler
tests/unit_tests/simulator/test_batch_sim.py::test_stop_sim[using_scheduler-153-1000]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Timeout ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Stack of Thread-153 (<lambda>) (140569485747968) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
File "/usr/lib64/python3.11/threading.py", line 995, in _bootstrap
self._bootstrap_inner()
File "/usr/lib64/python3.11/threading.py", line 1038, in _bootstrap_inner
self.run()
File "/data/workspace/ert/src/_ert/threading.py", line 31, in run
super().run()
File "/usr/lib64/python3.11/threading.py", line 975, in run
self._target(*self._args, **self._kwargs)
File "/data/workspace/ert/src/ert/simulator/simulation_context.py", line 147, in <lambda>
target=lambda: _run_forward_model(
File "/data/workspace/ert/src/ert/simulator/simulation_context.py", line 43, in _run_forward_model
asyncio.run(_submit_and_run_jobqueue(ert, job_queue, run_context))
File "/usr/lib64/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
File "/usr/lib64/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
File "/usr/lib64/python3.11/asyncio/base_events.py", line 640, in run_until_complete
self.run_forever()
File "/usr/lib64/python3.11/asyncio/base_events.py", line 607, in run_forever
self._run_once()
File "/usr/lib64/python3.11/asyncio/base_events.py", line 1884, in _run_once
event_list = self._selector.select(timeout)
File "/usr/lib64/python3.11/selectors.py", line 468, in select
fd_event_list = self._selector.poll(timeout, max_ev)
The code hangs here: https://github.com/equinor/ert/blob/759f431ec6827a149663ece6691672c8b5122057/src/ert/simulator/simulation_context.py#L249
#7336 will avoid this bug for simulation_context, but with that change the test test_max_runtime_while_killing() fails. It is possible that #7336 is the correct solution, and that the code exercised by that other test is what should be changed.
Could this have been fixed by some other change?
I ran with --count=2000 both locally and on an rgs node, and both runs passed.
Reproduced locally with
pytest -vs tests/unit_tests/simulator/test_batch_sim.py::test_stop_sim --count=1000 -k using_scheduler
tests/unit_tests/simulator/test_batch_sim.py::test_stop_sim[using_scheduler-153-1000]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Timeout ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
File "/usr/lib64/python3.11/threading.py", line 995, in _bootstrap
self._bootstrap_inner()
File "/usr/lib64/python3.11/threading.py", line 1038, in _bootstrap_inner
self.run()
File "/data/workspace/ert/src/_ert/threading.py", line 31, in run
super().run()
File "/usr/lib64/python3.11/threading.py", line 975, in run
self._target(*self._args, **self._kwargs)
File "/data/workspace/ert/src/ert/simulator/simulation_context.py", line 147, in <lambda>
target=lambda: _run_forward_model(
File "/data/workspace/ert/src/ert/simulator/simulation_context.py", line 43, in _run_forward_model
asyncio.run(_submit_and_run_jobqueue(ert, job_queue, run_context))
File "/usr/lib64/python3.11/asyncio/runners.py", line 190, in run
return runner.run(main)
File "/usr/lib64/python3.11/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
File "/usr/lib64/python3.11/asyncio/base_events.py", line 640, in run_until_complete
self.run_forever()
File "/usr/lib64/python3.11/asyncio/base_events.py", line 607, in run_forever
self._run_once()
File "/usr/lib64/python3.11/asyncio/base_events.py", line 1884, in _run_once
event_list = self._selector.select(timeout)
File "/usr/lib64/python3.11/selectors.py", line 468, in select
fd_event_list = self._selector.poll(timeout, max_ev)
Can't reproduce either. Closing.