mirror of
https://github.com/home-assistant/supervisor.git
synced 2026-07-04 12:25:02 +01:00
95bb8fe6ab
* Make Core.shutdown idempotent and safe to call concurrently After #6887, Core.shutdown() now runs in the SIGTERM path during host shutdown in addition to the existing host.control reboot/shutdown and backup restore paths. Multiple concurrent callers were possible (e.g. SIGTERM arriving while a reboot API call is mid-flight), so __main__.py debounced the signal handler by stashing the in-flight task in a single-element list and bailing out on the second SIGTERM. Move the idempotency into Core.shutdown() itself, where it belongs: - A second call while shutdown is in progress awaits the in-flight shutdown via an asyncio.Event rather than re-running the sequence. - Calls during STOPPING/CLOSE return early (Supervisor is already going away; the work is moot). - Calls during STARTING_STATES (INITIALIZE/STARTUP/SETUP) return early too. There is nothing coherent to gracefully stop before startup completes, and on the SIGTERM-during-startup path the caller cancels startup_task first, so waiting for it to complete would deadlock. - The sequence is wrapped in try/finally so the completion event is set even when an inner step raises. With that in place the closure workaround in __main__.py collapses to a plain coresys.create_task(stop_supervisor()): repeat SIGTERMs spawn extra tasks but each just observes the in-flight shutdown and waits. Tests cover the four state branches and confirm the event is reset between repeated shutdown cycles (backup restore re-enters RUNNING). * Split Core.shutdown() into teardown_services + shutdown PR feedback (@mdegat01) flagged the "supports repeated use" comment on _shutdown_event.clear() as describing a use case that does not exist. Investigating, the real source of confusion is that the old shutdown() did two different things stitched together: - Stop user-facing containers (add-ons + Home Assistant Core), which backup restore uses while leaving the host alone.
- Run the full shutdown ceremony (state transition to SHUTDOWN, stop plugins), which only the SIGTERM signal handler and the host reboot/power-off API want. That dispatch was implemented with an asymmetric state transition ("only set SHUTDOWN if state == RUNNING") and a plugin-shutdown gate ("only stop plugins if state in (STOPPING, SHUTDOWN)"). It worked but made the intent of each branch hard to read, broke the reentrancy guard on the restore path (state never reaches SHUTDOWN, so concurrent callers fall through every early return), and forced the misleading "repeated cycles" framing on the event handling. Split into two methods with one job each: - teardown_services(): stop add-ons + Home Assistant Core. Does not change Core state and does not stop plugins. Backup restore calls this directly so HA Core's watchdog stays registered (it only disables on transitions into CLOSING_STATES) and plugins keep running for the restore body to use. - shutdown(): real shutdown ceremony. Unconditionally transitions to SHUTDOWN, calls teardown_services(), then stops plugins. The reentrancy guard (state == SHUTDOWN -> await event) now works correctly because every caller transitions state on entry. One-shot per process lifetime; no clear() needed. Update backups/manager.py:867 to call teardown_services() instead. remove_homeassistant_container moves to teardown_services() since restore is the only caller that passes it. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * Release shutdown waiters when set_state() is cancelled PR feedback from Copilot: set_state() updates Core._state synchronously (line 84 in core.py) before awaiting _write_run_state(). If the shutdown task is cancelled while awaiting that write, in-memory state is already SHUTDOWN but the function exits before entering the try/finally that sets _shutdown_event. Any concurrent or later shutdown() caller then sees state == SHUTDOWN and blocks forever on _shutdown_event.wait(). 
Move the set_state(SHUTDOWN) call inside the try so finally always runs and releases waiters. CancelledError still propagates to the caller after finally as expected; we just no longer leak the lock. Add a regression test that simulates cancellation inside _write_run_state() and asserts both that state has moved to SHUTDOWN and that _shutdown_event is set. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
368 lines
13 KiB
Python
368 lines
13 KiB
Python
"""Testing handling with CoreState."""
|
|
|
|
# pylint: disable=W0212
|
|
import asyncio
|
|
from contextlib import suppress
|
|
import datetime
|
|
import errno
|
|
from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch
|
|
|
|
import pytest
|
|
|
|
from supervisor.const import CoreState
|
|
from supervisor.coresys import CoreSys
|
|
from supervisor.exceptions import AppFileReadError, HassioError, WhoamiSSLError
|
|
from supervisor.host.control import SystemControl
|
|
from supervisor.host.info import InfoCenter
|
|
from supervisor.resolution.const import IssueType, SuggestionType, UnhealthyReason
|
|
from supervisor.supervisor import Supervisor
|
|
from supervisor.utils.whoami import WhoamiData
|
|
|
|
from tests.dbus_service_mocks.base import DBusServiceMock
|
|
from tests.dbus_service_mocks.systemd import Systemd as SystemdService
|
|
from tests.dbus_service_mocks.systemd_unit import SystemdUnit as SystemdUnitService
|
|
|
|
|
|
@pytest.mark.parametrize("run_supervisor_state", ["test_file"], indirect=True)
async def test_write_state(run_supervisor_state: MagicMock, coresys: CoreSys):
    """Check that each core state change is written to /run/supervisor."""
    run_supervisor_state.reset_mock()

    # RUNNING first, then SHUTDOWN: after each transition the most recent
    # write must carry the string form of the new state, encoded as UTF-8.
    for target_state in (CoreState.RUNNING, CoreState.SHUTDOWN):
        await coresys.core.set_state(target_state)
        run_supervisor_state.write_text.assert_called_with(
            str(target_state), encoding="utf-8"
        )
|
|
|
|
|
|
async def test_adjust_system_datetime(coresys: CoreSys, websession: MagicMock):
    """Test _adjust_system_datetime method with successful retrieve_whoami.

    retrieve_whoami is patched to report "Europe/Zurich" together with the
    current UTC time. The first call must adopt that timezone; a second call
    must not query whoami again.
    """
    # Take a genuinely timezone-aware UTC instant. Stamping tzinfo=UTC onto a
    # naive local now() would shift the value by the local UTC offset on any
    # machine whose timezone is not UTC.
    utc_ts = datetime.datetime.now(datetime.UTC)
    with patch(
        "supervisor.core.retrieve_whoami",
        new_callable=AsyncMock,
        side_effect=[WhoamiData("Europe/Zurich", utc_ts)],
    ) as mock_retrieve_whoami:
        await coresys.core._adjust_system_datetime()
        mock_retrieve_whoami.assert_called_once()
        assert coresys.core.sys_config.timezone == "Europe/Zurich"

        # Validate we don't retrieve whoami once timezone has been set
        mock_retrieve_whoami.reset_mock()
        await coresys.core._adjust_system_datetime()
        mock_retrieve_whoami.assert_not_called()
|
|
|
|
|
|
async def test_adjust_system_datetime_without_ssl(
    coresys: CoreSys, websession: MagicMock
):
    """Test _adjust_system_datetime method when retrieve_whoami raises WhoamiSSLError.

    The first patched whoami call raises WhoamiSSLError and the second one
    succeeds. The test expects exactly two calls, with a truthy second
    positional argument on the first call and a falsy one on the retry.
    """
    # Use a genuinely timezone-aware UTC instant; now().replace(tzinfo=UTC)
    # would mislabel local wall-clock time as UTC on non-UTC machines.
    utc_ts = datetime.datetime.now(datetime.UTC)
    with patch(
        "supervisor.core.retrieve_whoami",
        new_callable=AsyncMock,
        side_effect=[WhoamiSSLError("SSL error"), WhoamiData("Europe/Zurich", utc_ts)],
    ) as mock_retrieve_whoami:
        await coresys.core._adjust_system_datetime()
        assert mock_retrieve_whoami.call_count == 2
        # Second positional argument flips between the first call and the retry
        assert mock_retrieve_whoami.call_args_list[0].args[1]
        assert not mock_retrieve_whoami.call_args_list[1].args[1]
        assert coresys.core.sys_config.timezone == "Europe/Zurich"
|
|
|
|
|
|
async def test_adjust_system_datetime_if_time_behind(
    coresys: CoreSys,
    websession: MagicMock,
    all_dbus_services: dict[str, DBusServiceMock | dict[str, DBusServiceMock]],
):
    """Test _adjust_system_datetime method when system time is behind by more than 1 hour.

    retrieve_whoami is patched to report a time 1 hour and 1 minute ahead of
    the current UTC time, while the host is patched to report NTP in use but
    the clock not synchronized. The test expects timesyncd to be stopped, the
    date/time and timezone to be set, connectivity to be re-checked, and an
    NTP_SYNC_FAILED issue with an ENABLE_NTP suggestion to be raised.
    """
    systemd_service: SystemdService = all_dbus_services["systemd"]
    systemd_service.StopUnit.calls.clear()
    systemd_unit_service: SystemdUnitService = all_dbus_services["systemd_unit"]
    systemd_unit_service.active_state = "active"

    # Start from a genuinely timezone-aware UTC instant. Using
    # now().replace(tzinfo=UTC) would mislabel local wall-clock time as UTC,
    # so on a machine west of UTC the reported time might not be ahead of the
    # system clock by the intended 1h01m at all.
    utc_ts = datetime.datetime.now(datetime.UTC) + datetime.timedelta(
        hours=1, minutes=1
    )
    with (
        patch(
            "supervisor.core.retrieve_whoami",
            new_callable=AsyncMock,
            side_effect=[WhoamiData("Europe/Zurich", utc_ts)],
        ) as mock_retrieve_whoami,
        patch.object(SystemControl, "set_datetime") as mock_set_datetime,
        patch.object(SystemControl, "set_timezone") as mock_set_timezone,
        patch.object(
            InfoCenter, "dt_synchronized", new=PropertyMock(return_value=False)
        ),
        patch.object(InfoCenter, "use_ntp", new=PropertyMock(return_value=True)),
        patch.object(
            Supervisor, "check_and_update_connectivity"
        ) as mock_check_connectivity,
    ):
        # Start the time adjustment which will wait for timesyncd to stop
        task = asyncio.create_task(coresys.core._adjust_system_datetime())
        await asyncio.sleep(0.1)
        # Simulate timesyncd stopping via D-Bus signal
        systemd_unit_service.emit_properties_changed({"ActiveState": "inactive"})
        await task

        mock_retrieve_whoami.assert_called_once()
        mock_set_datetime.assert_called_once()
        mock_check_connectivity.assert_called_once()
        mock_set_timezone.assert_called_once_with("Europe/Zurich")

        # Verify timesyncd was stopped before setting time
        assert systemd_service.StopUnit.calls == [
            ("systemd-timesyncd.service", "replace")
        ]

        # Verify issue was created
        assert any(
            issue.type == IssueType.NTP_SYNC_FAILED
            for issue in coresys.resolution.issues
        )
        assert any(
            suggestion.type == SuggestionType.ENABLE_NTP
            for suggestion in coresys.resolution.suggestions
        )
|
|
|
|
|
|
async def test_adjust_system_datetime_sync_timezone_to_host(
    coresys: CoreSys, websession: MagicMock
):
    """Check that a configured timezone differing from the host's is pushed to the host."""
    await coresys.core.sys_config.set_timezone("Europe/Prague")

    # The host is patched to report a different timezone than the configured one
    host_timezone = PropertyMock(return_value="Etc/UTC")
    with (
        patch.object(SystemControl, "set_timezone") as set_timezone_mock,
        patch.object(InfoCenter, "timezone", new=host_timezone),
    ):
        await coresys.core._adjust_system_datetime()
        set_timezone_mock.assert_called_once_with("Europe/Prague")
|
|
|
|
|
|
async def test_write_state_failure(
    run_supervisor_state: MagicMock, coresys: CoreSys, caplog: pytest.LogCaptureFixture
):
    """Check that a failed write to /run/supervisor is logged and state still changes."""
    # Make every write of the state file fail with EBADMSG
    write_error = OSError()
    write_error.errno = errno.EBADMSG
    run_supervisor_state.write_text.side_effect = write_error

    await coresys.core.set_state(CoreState.RUNNING)

    # The failure is reported in the log, and the in-memory state is updated
    assert "Can't update the Supervisor state" in caplog.text
    assert coresys.core.state == CoreState.RUNNING
|
|
|
|
|
|
# Components whose load() method is awaited from Core.setup().
|
|
_SETUP_LOAD_COMPONENTS = (
|
|
"api",
|
|
"hardware",
|
|
"dbus",
|
|
"host",
|
|
"os",
|
|
"mounts",
|
|
"docker",
|
|
"updater",
|
|
"plugins",
|
|
"homeassistant",
|
|
"arch",
|
|
"store",
|
|
"apps",
|
|
"backups",
|
|
"services",
|
|
"discovery",
|
|
"ingress",
|
|
"resolution",
|
|
)
|
|
|
|
|
|
@pytest.fixture
def mocked_setup_loads(coresys: CoreSys):
    """Swap every load() call made by Core.setup() for an AsyncMock.

    Also replaces init_websession, the Supervisor connectivity check and
    Core._adjust_system_datetime with AsyncMock for the fixture's lifetime.
    """
    with (
        patch.object(coresys, "init_websession", new=AsyncMock()),
        patch.object(Supervisor, "check_and_update_connectivity", new=AsyncMock()),
        patch.object(coresys.core, "_adjust_system_datetime", new=AsyncMock()),
    ):
        # One patcher per component listed in _SETUP_LOAD_COMPONENTS
        load_patchers = []
        for component_name in _SETUP_LOAD_COMPONENTS:
            component = getattr(coresys, component_name)
            load_patchers.append(patch.object(component, "load", new=AsyncMock()))

        for patcher in load_patchers:
            patcher.start()
        try:
            yield
        finally:
            # Restore the real load() methods once the test is finished
            for patcher in load_patchers:
                patcher.stop()
|
|
|
|
|
|
@pytest.mark.usefixtures("mocked_setup_loads")
async def test_setup_app_file_read_error_not_captured(
    coresys: CoreSys, caplog: pytest.LogCaptureFixture
):
    """Check that AppFileReadError in setup is not sent to Sentry but marks unhealthy."""
    # apps.load is already an AsyncMock (via the fixture); make it fail
    read_error = AppFileReadError(app="local_example", error="[Errno 74] Bad message")
    coresys.apps.load.side_effect = read_error

    with patch("supervisor.core.async_capture_exception") as sentry_capture:
        await coresys.core.setup()

    sentry_capture.assert_not_called()
    # Logged as a plain load error, not as a fatal one
    assert "Fatal error happening on load Task" not in caplog.text
    assert "Error on load Task" in caplog.text
    assert UnhealthyReason.SETUP in coresys.resolution.unhealthy
|
|
|
|
|
|
@pytest.mark.usefixtures("mocked_setup_loads")
async def test_setup_unhandled_exception_captured(
    coresys: CoreSys, caplog: pytest.LogCaptureFixture
):
    """Check that an unhandled exception in setup is sent to Sentry and marks unhealthy."""
    # apps.load is already an AsyncMock (via the fixture); make it fail
    load_failure = HassioError("boom")
    coresys.apps.load.side_effect = load_failure

    with patch("supervisor.core.async_capture_exception") as sentry_capture:
        await coresys.core.setup()

    sentry_capture.assert_called_once()
    assert "Fatal error happening on load Task" in caplog.text
    assert UnhealthyReason.SETUP in coresys.resolution.unhealthy
|
|
|
|
|
|
async def test_shutdown_reentrant_waits(coresys: CoreSys):
    """A second shutdown() issued mid-shutdown waits for the first instead of re-running."""
    seen_levels = []
    first_call_entered = asyncio.Event()
    release = asyncio.Event()

    real_app_shutdown = coresys.apps.shutdown

    async def slow_app_shutdown(startup):
        # Record the call, signal that shutdown is underway, then block until
        # the test releases it before delegating to the real implementation.
        seen_levels.append(startup)
        first_call_entered.set()
        await release.wait()
        return await real_app_shutdown(startup)

    await coresys.core.set_state(CoreState.RUNNING)

    with patch.object(coresys.apps, "shutdown", side_effect=slow_app_shutdown):
        first = asyncio.create_task(coresys.core.shutdown())
        await first_call_entered.wait()

        # Second call should wait, not start a new shutdown
        second = asyncio.create_task(coresys.core.shutdown())
        await asyncio.sleep(0.05)

        release.set()
        await asyncio.gather(first, second)

    # AppStartup has 4 levels (APPLICATION/SERVICES/SYSTEM/INITIALIZE); a single
    # shutdown call iterates them. A re-entered shutdown would double the count.
    assert len(seen_levels) == 4
    assert coresys.core._shutdown_event.is_set()
|
|
|
|
|
|
async def test_shutdown_releases_event_when_set_state_cancelled(coresys: CoreSys):
    """Waiters must be released even if shutdown is cancelled inside set_state().

    set_state() assigns Core._state first and only then awaits the run-state
    file write. A cancellation landing in that await leaves the in-memory
    state at SHUTDOWN already. If set_state() were not covered by the
    try/finally, _shutdown_event would stay unset and concurrent callers
    would block on wait() forever.
    """
    await coresys.core.set_state(CoreState.RUNNING)

    write_entered = asyncio.Event()

    async def hang_in_write(*_args, **_kwargs):
        # Announce that the write has begun, then park until cancelled
        write_entered.set()
        await asyncio.sleep(3600)  # wait long enough to be cancelled

    with patch.object(coresys.core, "_write_run_state", side_effect=hang_in_write):
        shutdown_task = asyncio.create_task(coresys.core.shutdown())
        await write_entered.wait()
        shutdown_task.cancel()
        with suppress(asyncio.CancelledError):
            await shutdown_task

    # In-memory state moved to SHUTDOWN before the cancellation point
    assert coresys.core.state == CoreState.SHUTDOWN
    # finally must have run so any future caller does not deadlock
    assert coresys.core._shutdown_event.is_set()
|
|
|
|
|
|
async def test_shutdown_transitions_state(coresys: CoreSys):
    """shutdown() leaves Core in SHUTDOWN state so HA Core/WS observers react."""
    core = coresys.core
    await core.set_state(CoreState.RUNNING)

    await core.shutdown()

    assert core.state == CoreState.SHUTDOWN
|
|
|
|
|
|
async def test_teardown_services_does_not_change_state(coresys: CoreSys):
    """teardown_services() keeps Core state untouched; callers (e.g. backup restore) own it."""
    core = coresys.core
    await core.set_state(CoreState.FREEZE)

    await core.teardown_services()

    assert core.state == CoreState.FREEZE
|
|
|
|
|
|
async def test_teardown_services_does_not_stop_plugins(coresys: CoreSys):
    """teardown_services() must leave plugins running so restore can talk to them."""
    core = coresys.core
    await core.set_state(CoreState.FREEZE)

    with patch.object(coresys.plugins, "shutdown") as plugins_shutdown_mock:
        await core.teardown_services()
        plugins_shutdown_mock.assert_not_called()
|
|
|
|
|
|
async def test_shutdown_stops_plugins(coresys: CoreSys):
    """A real shutdown() stops plugins exactly once."""
    core = coresys.core
    await core.set_state(CoreState.RUNNING)

    with patch.object(coresys.plugins, "shutdown") as plugins_shutdown_mock:
        await core.shutdown()
        plugins_shutdown_mock.assert_called_once()
|
|
|
|
|
|
@pytest.mark.parametrize(
    "state", [CoreState.STOPPING, CoreState.CLOSE], ids=["stopping", "close"]
)
async def test_shutdown_ignored_during_stop(
    coresys: CoreSys, caplog: pytest.LogCaptureFixture, state: CoreState
):
    """shutdown() is a no-op while Supervisor is already stopping."""
    core = coresys.core
    await core.set_state(state)

    with patch.object(coresys.apps, "shutdown") as app_shutdown_mock:
        await core.shutdown()

    # No app was asked to shut down, and the request was logged as ignored
    app_shutdown_mock.assert_not_called()
    assert "Ignoring shutdown request, Supervisor is already stopping" in caplog.text
|
|
|
|
|
|
@pytest.mark.parametrize(
    "state",
    [CoreState.INITIALIZE, CoreState.STARTUP, CoreState.SETUP],
    ids=["initialize", "startup", "setup"],
)
async def test_shutdown_skipped_during_startup(
    coresys: CoreSys, caplog: pytest.LogCaptureFixture, state: CoreState
):
    """shutdown() returns early while Supervisor has not finished starting."""
    core = coresys.core
    await core.set_state(state)

    with patch.object(coresys.apps, "shutdown") as app_shutdown_mock:
        await core.shutdown()

    # No app was asked to shut down, and the request was logged as ignored
    app_shutdown_mock.assert_not_called()
    expected_log = "Ignoring shutdown request, Supervisor has not finished starting"
    assert expected_log in caplog.text
|