diff --git a/supervisor/dbus/const.py b/supervisor/dbus/const.py index e7e53dfe5..bd4d2d43a 100644 --- a/supervisor/dbus/const.py +++ b/supervisor/dbus/const.py @@ -50,6 +50,7 @@ DBUS_SIGNAL_NM_CONNECTION_ACTIVE_CHANGED = ( ) DBUS_SIGNAL_PROPERTIES_CHANGED = "org.freedesktop.DBus.Properties.PropertiesChanged" DBUS_SIGNAL_RAUC_INSTALLER_COMPLETED = "de.pengutronix.rauc.Installer.Completed" +DBUS_SIGNAL_SYSTEMD_JOB_REMOVED = "org.freedesktop.systemd1.Manager.JobRemoved" DBUS_OBJECT_BASE = "/" DBUS_OBJECT_DNS = "/org/freedesktop/NetworkManager/DnsManager" diff --git a/supervisor/dbus/systemd.py b/supervisor/dbus/systemd.py index 66776a41e..dd53acb87 100644 --- a/supervisor/dbus/systemd.py +++ b/supervisor/dbus/systemd.py @@ -1,5 +1,6 @@ """Interface to Systemd over D-Bus.""" +from collections.abc import Callable from functools import wraps import logging from typing import NamedTuple @@ -47,6 +48,34 @@ class ExecStartEntry(NamedTuple): ignore_failure: bool +def job_removed_filter(get_job_path: Callable[[], str | None]) -> Callable[..., bool]: + """Build a DBusSignalWrapper filter that matches a specific JobRemoved. + + JobRemoved has signature `(u id, o job, s unit, s result)`. The filter + returns True only for the signal whose `job` path matches the one + returned by get_job_path() — used to wait for a specific systemd job + we dispatched, ignoring concurrent jobs on the system bus. + + The getter indirection exists because race-free use requires + subscribing *before* dispatching the job (so we can't miss its + JobRemoved), which means the job path isn't known at filter + construction time. The closure reads it lazily at wait time: + + job_path: str | None = None + async with systemd.connected_dbus.signal( + DBUS_SIGNAL_SYSTEMD_JOB_REMOVED, + job_removed_filter(lambda: job_path), + ) as signal: + job_path = await systemd.restart_unit(...) + _id, _path, _unit, result = await signal.wait_for_signal() + """ + + def _match(_id: int, path: str, _unit: str, _result: str) -> bool: + return path == get_job_path() + + return _match + + def systemd_errors(func): """Wrap systemd dbus methods to handle its specific error types.""" @@ -128,7 +157,7 @@ class Systemd(DBusInterfaceProxy): await super().connect(bus) except DBusError: _LOGGER.warning("Can't connect to systemd") - except (DBusServiceUnkownError, DBusInterfaceError): + except DBusServiceUnkownError, DBusInterfaceError: _LOGGER.warning( "No systemd support on the host. Host control has been disabled." ) diff --git a/supervisor/mounts/mount.py b/supervisor/mounts/mount.py index 1788a3606..88dbb6d20 100644 --- a/supervisor/mounts/mount.py +++ b/supervisor/mounts/mount.py @@ -2,10 +2,12 @@ from abc import ABC, abstractmethod import asyncio -from collections.abc import Callable +from collections.abc import Awaitable, Callable from functools import cached_property import logging +import os from pathlib import Path, PurePath +import time from dbus_fast import Variant from voluptuous import Coerce @@ -17,11 +19,12 @@ from ..dbus.const import ( DBUS_ATTR_TIMEOUT_USEC, DBUS_ATTR_TYPE, DBUS_ATTR_WHAT, + DBUS_SIGNAL_SYSTEMD_JOB_REMOVED, StartUnitMode, StopUnitMode, UnitActiveState, ) -from ..dbus.systemd import SystemdUnit +from ..dbus.systemd import SystemdUnit, job_removed_filter from ..docker.const import PATH_MEDIA, PATH_SHARE from ..exceptions import ( DBusError, @@ -38,6 +41,25 @@ from .validate import MountData _LOGGER: logging.Logger = logging.getLogger(__name__) + +def _probe_network_mount(path: Path) -> bool: + """Verify `path` is a live mount on a reachable server. + + Run inside an executor — both syscalls share one thread and + benefit from the kernel's warm session state on a real mount. + + Raises OSError (typically ETIMEDOUT / EHOSTDOWN / ECONNABORTED) + when the server is unreachable. Returns False when statvfs + succeeded but the path is not actually a mount point (the + "ghost mount" case where statvfs returns the underlying root + filesystem's stats). Returns True only when statvfs forced an + RPC that the server answered AND the path actually crosses a + filesystem boundary. + """ + os.statvfs(path) + return path.stat().st_dev != path.parent.stat().st_dev + + # Three layered timeouts cooperate to keep the host alive when an NFS server # becomes unreachable while a `.mount` unit is being reloaded (see #6827): # @@ -52,6 +74,13 @@ _LOGGER: logging.Logger = logging.getLogger(__name__) # task) — the safe path. MOUNT_UNIT_TIMEOUT_USEC = 35 * 1_000_000 UPDATE_STATE_TIMEOUT = 40 +# Maximum time to wait for a systemd job (mount/unmount/reload/restart) +# to leave the queue. Sized to cover RestartUnit = stop + start, each +# bounded by MOUNT_UNIT_TIMEOUT_USEC (35 s), with headroom for systemd's +# queue dispatch. A single-phase job (mount/unmount/reload) typically +# completes within ~40 s; the larger budget is just so the RestartUnit +# case doesn't time out one second before JobRemoved fires. +SYSTEMD_JOB_TIMEOUT = 90 COERCE_MOUNT_TYPE: Callable[[str], MountType] = Coerce(MountType) COERCE_MOUNT_USAGE: Callable[[str], MountUsage] = Coerce(MountUsage) @@ -229,20 +258,26 @@ class Mount(CoreSysAttributes, ABC): await self._update_state(unit) - # If active, dismiss corresponding failed mount issue if found - if (mounted := await self.is_mounted()) and ( - issue := self.sys_resolution.get_issue_if_present(self.failed_issue) - ): - self.sys_resolution.dismiss_issue(issue) + if not await self.is_mounted(): + return False - return mounted + if issue := self.sys_resolution.get_issue_if_present(self.failed_issue): + self.sys_resolution.dismiss_issue(issue) + return True async def _update_state_await( self, unit: SystemdUnit, expected_states: set[UnitActiveState] | None = None, ) -> None: - """Update state info about mount from dbus. Wait for one of expected_states to appear.""" + """Update state info about mount from dbus. Wait for one of expected_states to appear. + + Used for the initial `load()` observation where no systemd job is + in flight — we're just polling for the unit to settle out of any + transitional state. Job-dispatching paths (mount/unmount/reload/ + restart) instead subscribe to JobRemoved before dispatching and + wait for that signal — see `_run_systemd_job`. + """ if expected_states is None: expected_states = { UnitActiveState.ACTIVE, @@ -261,6 +296,46 @@ class Mount(CoreSysAttributes, ABC): UPDATE_STATE_TIMEOUT, ) + async def _run_systemd_job( + self, + op_name: str, + dispatch: Awaitable[str], + ) -> str | None: + """Dispatch a systemd job and wait for its JobRemoved signal. + + Subscribing before dispatching closes the race where a fast job + could complete (and emit JobRemoved) before we set up the signal + match. The returned result string is the systemd job outcome + ("done", "failed", "canceled", "timeout", "dependency", "skipped"). + + Returns None on timeout — callers should re-read state to decide + what to do next. + """ + # Late-bound: dispatch hasn't run yet when we subscribe; the + # filter reads job_path each time it's evaluated, so it picks + # up the assignment below before any JobRemoved is consumed. + job_path: str | None = None + async with self.sys_dbus.systemd.connected_dbus.signal( + DBUS_SIGNAL_SYSTEMD_JOB_REMOVED, + job_removed_filter(lambda: job_path), + ) as signal: + job_path = await dispatch + try: + async with asyncio.timeout(SYSTEMD_JOB_TIMEOUT): + _id, _path, _unit, result = await signal.wait_for_signal() + except TimeoutError: + _LOGGER.warning( + "Systemd %s job for mount %s did not complete within %d seconds", + op_name, + self.name, + SYSTEMD_JOB_TIMEOUT, + ) + return None + _LOGGER.debug( + "Systemd %s job for mount %s completed: %s", op_name, self.name, result + ) + return result + async def mount(self) -> None: """Mount using systemd.""" @@ -283,27 +358,25 @@ class Mount(CoreSysAttributes, ABC): await self.sys_run_in_executor(ensure_empty_folder) - try: - options = ( - [(DBUS_ATTR_OPTIONS, Variant("s", ",".join(self.options)))] - if self.options - else [] - ) - if self.type != MountType.BIND: - options += [(DBUS_ATTR_TYPE, Variant("s", self.type))] + options = ( + [(DBUS_ATTR_OPTIONS, Variant("s", ",".join(self.options)))] + if self.options + else [] + ) + if self.type != MountType.BIND: + options += [(DBUS_ATTR_TYPE, Variant("s", self.type))] + properties = options + [ + (DBUS_ATTR_DESCRIPTION, Variant("s", self.description)), + (DBUS_ATTR_WHAT, Variant("s", self.what)), + (DBUS_ATTR_TIMEOUT_USEC, Variant("t", MOUNT_UNIT_TIMEOUT_USEC)), + ] - await self.sys_dbus.systemd.start_transient_unit( - self.unit_name, - StartUnitMode.FAIL, - options - + [ - (DBUS_ATTR_DESCRIPTION, Variant("s", self.description)), - (DBUS_ATTR_WHAT, Variant("s", self.what)), - ( - DBUS_ATTR_TIMEOUT_USEC, - Variant("t", MOUNT_UNIT_TIMEOUT_USEC), - ), - ], + try: + await self._run_systemd_job( + "start_transient_unit", + self.sys_dbus.systemd.start_transient_unit( + self.unit_name, StartUnitMode.FAIL, properties + ), ) except DBusError as err: raise MountError( @@ -311,7 +384,7 @@ class Mount(CoreSysAttributes, ABC): ) from err if unit := await self._update_unit(): - await self._update_state_await(unit) + await self._update_state(unit) if not await self.is_mounted(): raise MountActivationError( @@ -328,11 +401,11 @@ class Mount(CoreSysAttributes, ABC): await self._update_state(unit) try: if self.state != UnitActiveState.FAILED: - await self.sys_dbus.systemd.stop_unit(self.unit_name, StopUnitMode.FAIL) - - await self._update_state_await( - unit, {UnitActiveState.INACTIVE, UnitActiveState.FAILED} - ) + await self._run_systemd_job( + "stop_unit", + self.sys_dbus.systemd.stop_unit(self.unit_name, StopUnitMode.FAIL), + ) + await self._update_state(unit) if self.state == UnitActiveState.FAILED: await self.sys_dbus.systemd.reset_failed_unit(self.unit_name) @@ -345,56 +418,96 @@ class Mount(CoreSysAttributes, ABC): self._state = None async def reload(self) -> None: - """Reload or restart mount unit to re-mount.""" + """Verify the mount is reachable; reload/restart as needed. + + `is_mounted()` is the source of truth here: for network mounts it + runs a statvfs probe that forces an RPC, so a passing return value + proves the share is live right now — systemd's "active/mounted" + state alone does not (CIFS reload is local-only — smb3_reconfigure + never contacts the server). The reload → restart escalation only + runs when the probe fails. + """ + if await self.is_mounted(): + self._dismiss_failed_issue() + return + try: - await self.sys_dbus.systemd.reload_unit(self.unit_name, StartUnitMode.FAIL) + result = await self._run_systemd_job( + "reload_or_restart_unit", + self.sys_dbus.systemd.reload_unit(self.unit_name, StartUnitMode.FAIL), + ) except DBusSystemdNoSuchUnit: _LOGGER.info( "Mount %s is not mounted, mounting instead of reloading", self.name ) await self.mount() + return except DBusError as err: _LOGGER.error( "Could not reload mount %s due to: %s. Trying a restart", self.name, err ) await self._restart() + self._dismiss_failed_issue() + return + + if unit := await self._update_unit(): + await self._update_state(unit) + + # Safety net for #6827: with the layered timeouts above + # (RPC < TimeoutSec < state-await) the unit should always have + # left RELOADING by the time we get here. If it has not, the + # systemd-side cleanup did not complete in time; escalating to + # RestartUnit while a mount/umount helper is still pinned in the + # kernel is the destructive pattern that wedges PID 1, so refuse + # to escalate and surface the failure instead. + if self.state == UnitActiveState.RELOADING: + raise MountActivationError( + f"Reloading {self.name} did not complete in time and the " + f"unit is still in RELOADING. Refusing to escalate to a " + f"restart while the mount helper may be pinned in the " + f"kernel — this should not happen with the configured " + f"unit timeout. Check host logs for the systemd unit " + f"{self.unit_name} for details.", + _LOGGER.critical, + ) + + # When systemd already reports the reload as failed (or timed + # out so we never saw JobRemoved), skip the post-reload probe + # and escalate directly. The probe on a dead NFS share can + # take 90+ seconds in the kernel-side reconnect churn that + # follows a killed mount helper — and it would just confirm + # what we already know. For CIFS, smb3_reconfigure is + # local-only and always returns "done" even against a dead + # server, so we still need the probe in the success branch. + if result == "done": + if await self.is_mounted(): + self._dismiss_failed_issue() + return + _LOGGER.info( + "Mount %s reload reported success but probe failed. Trying a restart", + self.name, + ) else: - if unit := await self._update_unit(): - await self._update_state_await(unit) + _LOGGER.info( + "Mount %s reload did not complete (systemd result: %s). Trying a restart", + self.name, + result, + ) + await self._restart() + self._dismiss_failed_issue() - # Safety net for #6827: with the layered timeouts above - # (RPC < TimeoutSec < state-await) the unit should always have - # left RELOADING by the time we get here. If it has not, the - # systemd-side cleanup did not complete in time; escalating to - # RestartUnit while a mount/umount helper is still pinned in the - # kernel is the destructive pattern that wedges PID 1, so refuse - # to escalate and surface the failure instead. - if self.state == UnitActiveState.RELOADING: - raise MountActivationError( - f"Reloading {self.name} did not complete in time and the " - f"unit is still in RELOADING. Refusing to escalate to a " - f"restart while the mount helper may be pinned in the " - f"kernel — this should not happen with the configured " - f"unit timeout. Check host logs for the systemd unit " - f"{self.unit_name} for details.", - _LOGGER.critical, - ) - - if not await self.is_mounted(): - _LOGGER.info( - "Mount %s not correctly mounted after a reload. Trying a restart", - self.name, - ) - await self._restart() - - # If it is mounted now, dismiss corresponding issue if present + def _dismiss_failed_issue(self) -> None: + """Dismiss the failed-mount resolution issue if present.""" if issue := self.sys_resolution.get_issue_if_present(self.failed_issue): self.sys_resolution.dismiss_issue(issue) async def _restart(self) -> None: """Restart mount unit to re-mount.""" try: - await self.sys_dbus.systemd.restart_unit(self.unit_name, StartUnitMode.FAIL) + result = await self._run_systemd_job( + "restart_unit", + self.sys_dbus.systemd.restart_unit(self.unit_name, StartUnitMode.FAIL), + ) except DBusSystemdNoSuchUnit: _LOGGER.info( "Mount %s is not mounted, mounting instead of restarting", self.name @@ -407,11 +520,17 @@ class Mount(CoreSysAttributes, ABC): ) from err if unit := await self._update_unit(): - await self._update_state_await(unit) + await self._update_state(unit) - if not await self.is_mounted(): + # If systemd already reports the restart job as failed (or we + # timed out waiting for JobRemoved), don't bother probing — we + # know the mount isn't healthy and the probe would just add + # another 30-90s on a dead network share for no diagnostic gain. + if result != "done" or not await self.is_mounted(): raise MountActivationError( - f"Restarting {self.name} did not succeed. Check host logs for errors from mount or systemd unit {self.unit_name} for details.", + f"Restarting {self.name} did not succeed (systemd result: {result}). " + f"Check host logs for errors from mount or systemd unit " + f"{self.unit_name} for details.", _LOGGER.error, ) @@ -450,10 +569,61 @@ class NetworkMount(Mount, ABC): return options async def is_mounted(self) -> bool: - """Return true if successfully mounted and available.""" - return self.state == UnitActiveState.ACTIVE and await self.sys_run_in_executor( - self.local_where.is_mount - ) + """Return true if the mount is active and the server actually answers. + + Three checks compose the verdict: + + 1. systemd reports the unit ACTIVE — cheap, may be stale. + 2. `os.statvfs()` forces an RPC for both NFS (FSSTAT) and CIFS + (QUERY_FS_INFO). Those per-filesystem fields aren't cached + client-side, so the kernel must reach the server or fail + with ETIMEDOUT / EHOSTDOWN / ECONNABORTED. We do this + first because on a dead server it fails fast within the + protocol budget (~30s, bounded by softerr,timeo=100,retrans=2 + for NFS and soft,echo_interval=10 for CIFS) — and on a + ghost mount (path no longer mounted, e.g. after a failed + restart whose umount succeeded but mount step failed) it + returns the underlying root filesystem's stats without + touching the network. + 3. A parent vs. path `st_dev` comparison distinguishes the + ghost-mount case from a real live mount: statvfs succeeds + for both, but only a real mount crosses a filesystem + boundary. These stat calls are cheap on success — the + attrs were just revalidated by the successful statvfs. + + Both syscalls run in a single executor hop so they share one + thread and the kernel's warm session state. No asyncio + timeout — the kernel-side bound is authoritative. + """ + if self.state != UnitActiveState.ACTIVE: + return False + + local_where = self.local_where + _LOGGER.debug("Probing mount %s at %s", self.name, local_where) + start = time.monotonic() + try: + is_real_mount = await self.sys_run_in_executor( + _probe_network_mount, local_where + ) + except OSError as err: + _LOGGER.debug( + "Probe of mount %s failed after %.2fs: %s", + self.name, + time.monotonic() - start, + err, + ) + return False + elapsed = time.monotonic() - start + if not is_real_mount: + _LOGGER.debug( + "Mount %s reported active but %s is not a mount point (probe %.2fs)", + self.name, + local_where, + elapsed, + ) + return False + _LOGGER.debug("Probe of mount %s succeeded in %.2fs", self.name, elapsed) + return True class CIFSMount(NetworkMount): @@ -501,7 +671,22 @@ class CIFSMount(NetworkMount): @property def options(self) -> list[str]: """Options to use to mount.""" - options = super().options + ["noserverino"] + # soft + echo_interval=10 + retrans=0 give a ~30s per-operation budget + # before the kernel reports the server unreachable (3 × echo_interval + # since last server response). This roughly matches the NFS budget + # from `softerr,timeo=100,retrans=2` so the userspace probe behaves + # symmetrically across both protocols. On give-up the syscall + # returns EHOSTDOWN / ECONNABORTED rather than blocking forever, + # which is what makes the statvfs probe a reliable health check. + # `soft` is the kernel default but is set explicitly so the + # behavior is part of the recorded mount options rather than an + # implicit assumption. + options = super().options + [ + "noserverino", + "soft", + "echo_interval=10", + "retrans=0", + ] if self.version: options.append(f"vers={self.version}") @@ -565,7 +750,7 @@ class NFSMount(NetworkMount): @property def options(self) -> list[str]: """Options to use to mount.""" - return super().options + ["soft", "timeo=100", "retrans=2"] + return super().options + ["softerr", "timeo=100", "retrans=2"] class BindMount(Mount): diff --git a/supervisor/utils/dbus.py b/supervisor/utils/dbus.py index 0a0d1daf9..3988891df 100644 --- a/supervisor/utils/dbus.py +++ b/supervisor/utils/dbus.py @@ -298,9 +298,20 @@ class DBus: self._signal_monitors = {} - def signal(self, signal_member: str) -> DBusSignalWrapper: - """Get signal context manager for this object.""" - return DBusSignalWrapper(self, signal_member) + def signal( + self, + signal_member: str, + message_filter: Callable[..., bool] | None = None, + ) -> DBusSignalWrapper: + """Get signal context manager for this object. + + message_filter is invoked with the signal body unpacked positionally + (`filter(*msg.body)`) and only signals where it returns True are + delivered to `wait_for_signal`. Useful when subscribing to a + broadcast signal but only caring about specific payloads (e.g. + systemd's JobRemoved, where we want one specific job path). + """ + return DBusSignalWrapper(self, signal_member, message_filter) def _add_signal_monitor( self, interface: str, dbus_name: str, callback: Callable @@ -481,7 +492,12 @@ class DBusCallWrapper: class DBusSignalWrapper: """Wrapper for D-Bus Signal.""" - def __init__(self, dbus: DBus, signal_member: str) -> None: + def __init__( + self, + dbus: DBus, + signal_member: str, + message_filter: Callable[..., bool] | None = None, + ) -> None: """Initialize wrapper.""" self._dbus: DBus = dbus signal_parts = signal_member.split(".") @@ -489,6 +505,7 @@ class DBusSignalWrapper: self._member = signal_parts[-1] self._match: str = f"type='signal',interface={self._interface},member={self._member},path={self._dbus.object_path}" self._messages: asyncio.Queue[Message] = asyncio.Queue() + self._filter = message_filter def _message_handler(self, msg: Message): if msg.message_type != MessageType.SIGNAL: @@ -528,9 +545,15 @@ class DBusSignalWrapper: return self async def wait_for_signal(self) -> Any: - """Wait for signal and returns signal payload.""" - msg = await self._messages.get() - return msg.body + """Wait for signal and returns signal payload. + + If a message_filter was provided at construction, non-matching + signals are discarded and the wait continues. + """ + while True: + msg = await self._messages.get() + if self._filter is None or self._filter(*msg.body): + return msg.body async def __aexit__(self, exc_t, exc_v, exc_tb): """Stop collecting signal messages and remove match for signals.""" diff --git a/tests/api/test_backups.py b/tests/api/test_backups.py index 813e9cca2..97dd75aca 100644 --- a/tests/api/test_backups.py +++ b/tests/api/test_backups.py @@ -1,6 +1,7 @@ """Test backups API.""" import asyncio +import errno from pathlib import Path, PurePath from shutil import copy from typing import Any @@ -220,14 +221,18 @@ async def test_backup_to_down_mount_returns_400( await coresys.mounts.create_mount(mount) coresys.mounts.default_backup_mount = mount - mock_is_mount.return_value = False await coresys.core.set_state(CoreState.RUNNING) coresys.hardware.disk.get_disk_free_space = lambda x: 5000 - resp = await api_client.post( - "/backups/new/full", - json={"name": "Mount test", "location": "backup_test"}, - ) + # Simulate the mount being down: probe (statvfs) fails with EHOSTDOWN. + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ): + resp = await api_client.post( + "/backups/new/full", + json={"name": "Mount test", "location": "backup_test"}, + ) assert resp.status == 400 body = await resp.json() assert body["result"] == "error" diff --git a/tests/api/test_mounts.py b/tests/api/test_mounts.py index ace1d3876..f17af49aa 100644 --- a/tests/api/test_mounts.py +++ b/tests/api/test_mounts.py @@ -1,6 +1,7 @@ """Test mounts API.""" import asyncio +import errno from unittest.mock import patch from aiohttp.test_utils import TestClient @@ -382,9 +383,21 @@ async def test_api_reload_mount( systemd_service: SystemdService = all_dbus_services["systemd"] systemd_service.ReloadOrRestartUnit.calls.clear() + # Healthy mount (probe passes): API reload completes without touching + # systemd — the periodic refresh + probe-as-fast-path means a healthy + # mount only gets reloaded when the share has actually gone bad. resp = await api_client.post(f"{prefix}/mounts/backup_test/reload") result = await resp.json() assert result["result"] == "ok" + assert systemd_service.ReloadOrRestartUnit.calls == [] + + # Probe failure forces the reload to actually go to systemd. + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ): + resp = await api_client.post(f"{prefix}/mounts/backup_test/reload") + await resp.json() assert systemd_service.ReloadOrRestartUnit.calls == [ ("mnt-data-supervisor-mounts-backup_test.mount", "fail") diff --git a/tests/backups/test_manager.py b/tests/backups/test_manager.py index cce224782..ea9119b39 100644 --- a/tests/backups/test_manager.py +++ b/tests/backups/test_manager.py @@ -800,16 +800,20 @@ async def test_backup_to_down_mount_error(coresys: CoreSys, mock_is_mount: Magic assert "backup_test" in coresys.backups.backup_locations assert coresys.backups.backup_locations["backup_test"] == mount_dir - # Attempt to make a backup which fails because is_mount on directory is false - mock_is_mount.return_value = False + # Attempt to make a backup which fails because the probe (statvfs) + # surfaces the server as unreachable. await coresys.core.set_state(CoreState.RUNNING) coresys.hardware.disk.get_disk_free_space = lambda x: 5000 - with pytest.raises(BackupMountDownError): - await coresys.backups.do_backup_full("test", location=mount) - with pytest.raises(BackupMountDownError): - await coresys.backups.do_backup_partial( - "test", location=mount, homeassistant=True - ) + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ): + with pytest.raises(BackupMountDownError): + await coresys.backups.do_backup_full("test", location=mount) + with pytest.raises(BackupMountDownError): + await coresys.backups.do_backup_partial( + "test", location=mount, homeassistant=True + ) @pytest.mark.usefixtures( @@ -906,12 +910,18 @@ async def test_backup_to_default_mount_down_error( await coresys.mounts.create_mount(mount) coresys.mounts.default_backup_mount = mount - # Attempt to make a backup which fails because is_mount on directory is false - mock_is_mount.return_value = False + # Attempt to make a backup which fails because the probe (statvfs) + # surfaces the server as unreachable. await coresys.core.set_state(CoreState.RUNNING) coresys.hardware.disk.get_disk_free_space = lambda x: 5000 - with pytest.raises(BackupMountDownError): + with ( + patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ), + pytest.raises(BackupMountDownError), + ): await coresys.backups.do_backup_partial("test", homeassistant=True) diff --git a/tests/conftest.py b/tests/conftest.py index fecdc22e4..4c9c3f298 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1021,9 +1021,18 @@ def mock_aarch64_arch_supported(coresys: CoreSys) -> None: @pytest.fixture def mock_is_mount() -> MagicMock: - """Mock is_mount in mounts.""" - with patch("supervisor.mounts.mount.Path.is_mount", return_value=True) as is_mount: - yield is_mount + """Mock the network-mount probe to report a healthy mount. + + Patches `_probe_network_mount` (the executor-side syscall combo + of statvfs + st_dev comparison) so existing tests don't need a + real filesystem mount to look healthy. Tests that simulate a + broken mount override with `side_effect=OSError(...)` for the + unreachable case or `return_value=False` for the ghost case. + """ + with patch( + "supervisor.mounts.mount._probe_network_mount", return_value=True + ) as probe: + yield probe @pytest.fixture diff --git a/tests/dbus_service_mocks/systemd.py b/tests/dbus_service_mocks/systemd.py index 4ebb3f035..5b5c2270e 100644 --- a/tests/dbus_service_mocks/systemd.py +++ b/tests/dbus_service_mocks/systemd.py @@ -1,7 +1,7 @@ """Mock of systemd dbus service.""" from dbus_fast import DBusError -from dbus_fast.service import PropertyAccess, dbus_property +from dbus_fast.service import PropertyAccess, dbus_property, signal from .base import DBusServiceMock, dbus_method from .systemd_unit import SystemdUnit @@ -660,12 +660,29 @@ class Systemd(DBusServiceMock): def Subscribe(self) -> None: """Subscribe to systemd signals.""" + @signal() + def JobRemoved( # noqa: PLR0913 + self, + job_id: "u", + job: "o", + unit: "s", + result: "s", + ) -> "uoss": + """Emit JobRemoved signal with (id, job_path, unit_name, result).""" + return [job_id, job, unit, result] + + def _emit_job_removed(self, job_path: str, unit: str, result: str = "done") -> None: + """Emit JobRemoved with a deterministic id placeholder.""" + self.JobRemoved(0, job_path, unit, result) + @dbus_method() def StartUnit(self, name: "s", mode: "s") -> "o": """Start a service unit.""" if self.mock_systemd_unit: self.mock_systemd_unit.active_state = "active" - return "/org/freedesktop/systemd1/job/7623" + job_path = "/org/freedesktop/systemd1/job/7623" + self._emit_job_removed(job_path, name) + return job_path @dbus_method() def StopUnit(self, name: "s", mode: "s") -> "o": @@ -674,6 +691,7 @@ class Systemd(DBusServiceMock): raise self.response_stop_unit # pylint: disable=raising-bad-type if self.mock_systemd_unit: self.mock_systemd_unit.active_state = "inactive" + self._emit_job_removed(self.response_stop_unit, name) return self.response_stop_unit @dbus_method() @@ -683,6 +701,7 @@ class Systemd(DBusServiceMock): raise self.response_reload_or_restart_unit # pylint: disable=raising-bad-type if self.mock_systemd_unit: self.mock_systemd_unit.active_state = "active" + self._emit_job_removed(self.response_reload_or_restart_unit, name) return self.response_reload_or_restart_unit @dbus_method() @@ -692,6 +711,7 @@ class Systemd(DBusServiceMock): raise self.response_restart_unit # pylint: disable=raising-bad-type if self.mock_systemd_unit: self.mock_systemd_unit.active_state = "active" + self._emit_job_removed(self.response_restart_unit, name) return self.response_restart_unit @dbus_method() @@ -703,6 +723,7 @@ class Systemd(DBusServiceMock): raise self.response_start_transient_unit # pylint: disable=raising-bad-type if self.mock_systemd_unit: self.mock_systemd_unit.active_state = "active" + self._emit_job_removed(self.response_start_transient_unit, name) return self.response_start_transient_unit @dbus_method() diff --git a/tests/mounts/test_manager.py b/tests/mounts/test_manager.py index 0c0d04a88..99924cead 100644 --- a/tests/mounts/test_manager.py +++ b/tests/mounts/test_manager.py @@ -1,8 +1,10 @@ """Tests for mount manager.""" +import errno import json import os from pathlib import Path +from unittest.mock import patch from unittest.util import unorderable_list_difference from dbus_fast import DBusError, ErrorType, Variant @@ -119,7 +121,12 @@ async def test_load( "mnt-data-supervisor-mounts-backup_test.mount", "fail", [ - ("Options", Variant("s", "noserverino,guest")), + ( + "Options", + Variant( + "s", "noserverino,soft,echo_interval=10,retrans=0,guest" + ), + ), ("Type", Variant("s", "cifs")), ("Description", Variant("s", "Supervisor cifs mount: backup_test")), ("What", Variant("s", "//backup.local/backups")), @@ -131,7 +138,7 @@ async def test_load( "mnt-data-supervisor-mounts-media_test.mount", "fail", [ - ("Options", Variant("s", "soft,timeo=100,retrans=2")), + ("Options", Variant("s", "softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: media_test")), ("What", Variant("s", "media.local:/media")), @@ -201,7 +208,7 @@ async def test_load_share_mount( "mnt-data-supervisor-mounts-share_test.mount", "fail", [ - ("Options", Variant("s", "soft,timeo=100,retrans=2")), + ("Options", Variant("s", "softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: share_test")), ("What", Variant("s", "share.local:/share")), @@ -418,21 +425,42 @@ async def test_update_mount( ] -async def test_reload_mount( +async def test_reload_mount_healthy_skips_systemd( coresys: CoreSys, all_dbus_services: dict[str, DBusServiceMock], mount: Mount, ): - """Test reloading a mount.""" + """A healthy mount (active + probe passes) skips the systemd reload.""" systemd_service: SystemdService = all_dbus_services["systemd"] systemd_service.ReloadOrRestartUnit.calls.clear() - # Reload the mount - systemd_service.response_get_unit = [ - "/org/freedesktop/systemd1/unit/tmp_2dyellow_2emount" - ] await coresys.mounts.reload_mount(mount.name) + assert systemd_service.ReloadOrRestartUnit.calls == [] + + +async def test_reload_mount_probe_failure_triggers_systemd_reload( + coresys: CoreSys, + all_dbus_services: dict[str, DBusServiceMock], + mount: Mount, +): + """A failed probe drives the systemd reload.""" + systemd_service: SystemdService = all_dbus_services["systemd"] + systemd_service.ReloadOrRestartUnit.calls.clear() + + systemd_service.response_get_unit = [ + "/org/freedesktop/systemd1/unit/tmp_2dyellow_2emount", + "/org/freedesktop/systemd1/unit/tmp_2dyellow_2emount", + ] + with ( + patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ), + pytest.raises(MountActivationError), + ): + await coresys.mounts.reload_mount(mount.name) + assert len(systemd_service.ReloadOrRestartUnit.calls) == 1 assert ( systemd_service.ReloadOrRestartUnit.calls[0][0] @@ -640,7 +668,7 @@ async def test_reload_mounts_attempts_initial_mount( "mnt-data-supervisor-mounts-media_test.mount", "fail", [ - ("Options", Variant("s", "soft,timeo=100,retrans=2")), + ("Options", Variant("s", "softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: media_test")), ("What", Variant("s", "media.local:/media")), diff --git a/tests/mounts/test_mount.py b/tests/mounts/test_mount.py index 1763a71a6..95ca8653c 100644 --- a/tests/mounts/test_mount.py +++ b/tests/mounts/test_mount.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +import errno import os from pathlib import Path import stat @@ -84,7 +85,12 @@ async def test_cifs_mount( assert mount.what == "//test.local/camera" assert mount.where == Path("/mnt/data/supervisor/mounts/test") assert mount.local_where == tmp_supervisor_data / "mounts" / "test" - assert mount.options == ["noserverino"] + expected_options + [ + assert mount.options == [ + "noserverino", + "soft", + "echo_interval=10", + "retrans=0", + ] + expected_options + [ "credentials=/mnt/data/supervisor/.mounts_credentials/test", ] @@ -110,7 +116,12 @@ async def test_cifs_mount( Variant( "s", ",".join( - ["noserverino"] + [ + "noserverino", + "soft", + "echo_interval=10", + "retrans=0", + ] + expected_options + [ "credentials=/mnt/data/supervisor/.mounts_credentials/test" @@ -178,7 +189,12 @@ async def test_cifs_mount_read_only( "mnt-data-supervisor-mounts-test.mount", "fail", [ - ("Options", Variant("s", "ro,noserverino,guest")), + ( + "Options", + Variant( + "s", "ro,noserverino,soft,echo_interval=10,retrans=0,guest" + ), + ), ("Type", Variant("s", "cifs")), ("Description", Variant("s", "Supervisor cifs mount: test")), ("What", Variant("s", "//test.local/camera")), @@ -223,7 +239,7 @@ async def test_nfs_mount( assert mount.what == "test.local:/media/camera" assert mount.where == Path("/mnt/data/supervisor/mounts/test") assert mount.local_where == tmp_supervisor_data / "mounts" / "test" - assert mount.options == ["port=1234", "soft", "timeo=100", "retrans=2"] + assert mount.options == ["port=1234", "softerr", "timeo=100", "retrans=2"] assert not mount.local_where.exists() assert mount.to_dict() == mount_data @@ -239,7 +255,7 @@ async def test_nfs_mount( "mnt-data-supervisor-mounts-test.mount", "fail", [ - ("Options", Variant("s", "port=1234,soft,timeo=100,retrans=2")), + ("Options", Variant("s", "port=1234,softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: test")), ("What", Variant("s", "test.local:/media/camera")), @@ -286,7 +302,7 @@ async def test_nfs_mount_read_only( "mnt-data-supervisor-mounts-test.mount", "fail", [ - ("Options", Variant("s", "ro,port=1234,soft,timeo=100,retrans=2")), + ("Options", Variant("s", "ro,port=1234,softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: test")), ("What", Variant("s", "test.local:/media/camera")), @@ -335,7 +351,10 @@ async def test_load( "mnt-data-supervisor-mounts-test.mount", "fail", [ - ("Options", Variant("s", "noserverino,guest")), + ( + "Options", + Variant("s", "noserverino,soft,echo_interval=10,retrans=0,guest"), + ), ("Type", Variant("s", "cifs")), ("Description", Variant("s", "Supervisor cifs mount: test")), ("What", Variant("s", "//test.local/share")), @@ -373,6 +392,11 @@ async def test_load( ] # Load waits up to 30 seconds if it finds a unit in the activating state + # (the wait happens inside _update_state_await driven by PropertiesChanged). + # Once the state settles to FAILED, load triggers a reload, and the reload + # is driven to completion by the mock-emitted JobRemoved signal — which + # also flips active_state to "active" via mock_systemd_unit. + systemd_service.mock_systemd_unit = systemd_unit_service systemd_service.ReloadOrRestartUnit.calls.clear() systemd_unit_service.active_state = "activating" mount = Mount.from_dict(coresys, mount_data) @@ -380,8 +404,6 @@ async def test_load( load_task = asyncio.create_task(mount.load()) await asyncio.sleep(0.1) systemd_unit_service.emit_properties_changed({"ActiveState": "failed"}) - await asyncio.sleep(0.1) - systemd_unit_service.emit_properties_changed({"ActiveState": "active"}) await load_task assert mount.state == UnitActiveState.ACTIVE @@ -471,16 +493,15 @@ async def test_mount_failure( assert len(systemd_service.StartTransientUnit.calls) == 1 assert len(systemd_service.GetUnit.calls) == 1 - # If state is 'activating', wait it out and raise error if it does not become 'active' + # When the post-dispatch state is not 'active' the mount call raises. + # With JobRemoved as the completion signal, supervisor trusts that the + # job is done by the time the signal fires — the systemd-side state + # await happens inside systemd, not in supervisor. systemd_service.StartTransientUnit.calls.clear() systemd_service.GetUnit.calls.clear() - systemd_unit_service.active_state = "activating" - - load_task = asyncio.create_task(mount.mount()) - await asyncio.sleep(0.1) - systemd_unit_service.emit_properties_changed({"ActiveState": "failed"}) + systemd_unit_service.active_state = "failed" with pytest.raises(MountError): - await load_task + await mount.mount() assert mount.state == UnitActiveState.FAILED assert len(systemd_service.StartTransientUnit.calls) == 1 @@ -561,7 +582,12 @@ async def test_reload_failure( "/org/freedesktop/systemd1/job/7623" ) systemd_service.response_restart_unit = "/org/freedesktop/systemd1/job/7623" - with patch("supervisor.mounts.mount.Path.is_mount", side_effect=[False, True]): + # Probe fails after reload (server still unreachable) but succeeds + # after restart — exercises the reload -> restart escalation path. + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=[OSError(errno.EHOSTDOWN, "Host is down"), True], + ): await mount.reload() assert mount.state == UnitActiveState.ACTIVE @@ -575,7 +601,15 @@ async def test_reload_failure( systemd_service.RestartUnit.calls.clear() systemd_service.GetUnit.calls.clear() systemd_unit_service.active_state = "failed" - with pytest.raises(MountError): + # Force the fast-path probe to fail so reload actually exercises the + # reload -> restart escalation we're testing here. + with ( + patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ), + pytest.raises(MountError), + ): await mount.reload() assert mount.state == UnitActiveState.FAILED @@ -723,7 +757,7 @@ async def test_update_clears_issue(coresys: CoreSys, path_extern, mock_is_mount) async def test_update_leaves_issue_if_down( coresys: CoreSys, mock_is_mount: MagicMock, path_extern ): - """Test issue is left if system is down after update (is_mount is false).""" + """Test issue is left if system is down after update (probe fails).""" mount = Mount.from_dict( coresys, { @@ -747,8 +781,11 @@ async def test_update_leaves_issue_if_down( assert mount.failed_issue in coresys.resolution.issues assert len(coresys.resolution.suggestions_for_issue(mount.failed_issue)) == 2 - mock_is_mount.return_value = False - assert (await mount.update()) is False + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ): + assert (await mount.update()) is False assert mount.state == UnitActiveState.ACTIVE assert mount.failed_issue in coresys.resolution.issues @@ -762,7 +799,7 @@ async def test_mount_fails_if_down( mock_is_mount: MagicMock, path_extern, ): - """Test mount fails if system is down (is_mount is false).""" + """Test mount fails if system is down (probe fails after activation).""" systemd_service: SystemdService = all_dbus_services["systemd"] systemd_service.StartTransientUnit.calls.clear() @@ -777,8 +814,13 @@ async def test_mount_fails_if_down( } mount: NFSMount = Mount.from_dict(coresys, mount_data) - mock_is_mount.return_value = False - with pytest.raises(MountActivationError): + with ( + patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ), + pytest.raises(MountActivationError), + ): await mount.mount() assert mount.state == UnitActiveState.ACTIVE @@ -790,7 +832,7 @@ async def test_mount_fails_if_down( "mnt-data-supervisor-mounts-test.mount", "fail", [ - ("Options", Variant("s", "port=1234,soft,timeo=100,retrans=2")), + ("Options", Variant("s", "port=1234,softerr,timeo=100,retrans=2")), ("Type", Variant("s", "nfs")), ("Description", Variant("s", "Supervisor nfs mount: test")), ("What", Variant("s", "test.local:/media/camera")), diff --git a/tests/resolution/fixup/test_mount_execute_reload.py b/tests/resolution/fixup/test_mount_execute_reload.py index 6fa8ebc5b..bd95b2e6d 100644 --- a/tests/resolution/fixup/test_mount_execute_reload.py +++ b/tests/resolution/fixup/test_mount_execute_reload.py @@ -1,6 +1,7 @@ """Test fixup mount reload.""" -from unittest.mock import MagicMock +import errno +from unittest.mock import MagicMock, patch from supervisor.coresys import CoreSys from supervisor.mounts.mount import Mount @@ -51,9 +52,11 @@ async def test_fixup( assert coresys.resolution.issues == [] assert coresys.resolution.suggestions == [] assert "test" in coresys.mounts - assert systemd_service.ReloadOrRestartUnit.calls == [ - ("mnt-data-supervisor-mounts-test.mount", "fail") - ] + # Mount is reachable (probe passes via mock_is_mount); the fixup + # clears the issue without needing to touch systemd. A user invoking + # the fixup on a still-broken mount would fail the probe, exercising + # the reload->restart path covered by test_fixup_error_after_reload. + assert systemd_service.ReloadOrRestartUnit.calls == [] async def test_fixup_error_after_reload( @@ -84,12 +87,17 @@ async def test_fixup_error_after_reload( reference="test", suggestions=[SuggestionType.EXECUTE_RELOAD, SuggestionType.EXECUTE_REMOVE], ) - mock_is_mount.return_value = False - # Fixup catches the MountError and re-raises ResolutionFixupError, which - # FixupBase.__call__ swallows to skip issue cleanup. Caller sees no error. - await mount_execute_reload() + # Probe (statvfs) fails — the mount stays unreachable through the + # reload -> restart cycle. Fixup catches MountError and re-raises + # ResolutionFixupError, which FixupBase.__call__ swallows to skip + # issue cleanup. Caller sees no error. + with patch( + "supervisor.mounts.mount._probe_network_mount", + side_effect=OSError(errno.EHOSTDOWN, "Host is down"), + ): + await mount_execute_reload() - # Since is_mount is false, issue remains + # Probe never succeeds, issue remains. assert ( Issue(IssueType.MOUNT_FAILED, ContextType.MOUNT, reference="test") in coresys.resolution.issues