From 8a82b98e5b74add50c1f8b11a34092e2d6fa3881 Mon Sep 17 00:00:00 2001 From: Mike Degatano Date: Wed, 13 Aug 2025 12:05:27 -0400 Subject: [PATCH] Improved error handling for docker image pulls (#6095) * Improved error handling for docker image pulls * Fix mocking in tests due to api use change --- supervisor/const.py | 1 + supervisor/docker/interface.py | 5 +- supervisor/docker/manager.py | 105 +++++++++++++++++- supervisor/exceptions.py | 8 ++ tests/addons/test_addon.py | 7 +- tests/addons/test_manager.py | 15 +-- tests/conftest.py | 13 ++- tests/docker/test_interface.py | 179 ++++++++++++++++++++++++++++-- tests/homeassistant/test_core.py | 20 ++-- tests/plugins/test_plugin_base.py | 5 +- 10 files changed, 320 insertions(+), 38 deletions(-) diff --git a/supervisor/const.py b/supervisor/const.py index 2441fbddf..bd4c90588 100644 --- a/supervisor/const.py +++ b/supervisor/const.py @@ -502,6 +502,7 @@ class BusEvent(StrEnum): """Bus event type.""" DOCKER_CONTAINER_STATE_CHANGE = "docker_container_state_change" + DOCKER_IMAGE_PULL_UPDATE = "docker_image_pull_update" HARDWARE_NEW_DEVICE = "hardware_new_device" HARDWARE_REMOVE_DEVICE = "hardware_remove_device" SUPERVISOR_CONNECTIVITY_CHANGE = "supervisor_connectivity_change" diff --git a/supervisor/docker/interface.py b/supervisor/docker/interface.py index 838a86c0f..574bdef1d 100644 --- a/supervisor/docker/interface.py +++ b/supervisor/docker/interface.py @@ -244,8 +244,9 @@ class DockerInterface(JobGroup, ABC): # Pull new image docker_image = await self.sys_run_in_executor( - self.sys_docker.images.pull, - f"{image}:{version!s}", + self.sys_docker.pull_image, + image, + str(version), platform=MAP_ARCH[image_arch], ) diff --git a/supervisor/docker/manager.py b/supervisor/docker/manager.py index ae4ad7827..1c58f8d19 100644 --- a/supervisor/docker/manager.py +++ b/supervisor/docker/manager.py @@ -1,7 +1,10 @@ """Manager for Supervisor Docker.""" +from __future__ import annotations + import asyncio from contextlib import suppress +from dataclasses import dataclass from functools import partial from ipaddress import IPv4Address import logging @@ -30,9 +33,16 @@ from ..const import ( ENV_SUPERVISOR_CPU_RT, FILE_HASSIO_DOCKER, SOCKET_DOCKER, + BusEvent, +) +from ..coresys import CoreSys, CoreSysAttributes +from ..exceptions import ( + DockerAPIError, + DockerError, + DockerNoSpaceOnDevice, + DockerNotFound, + DockerRequestError, ) -from ..coresys import CoreSys -from ..exceptions import DockerAPIError, DockerError, DockerNotFound, DockerRequestError from ..utils.common import FileConfiguration from ..validate import SCHEMA_DOCKER_CONFIG from .const import LABEL_MANAGED @@ -88,6 +98,68 @@ class DockerInfo: return bool(os.environ.get(ENV_SUPERVISOR_CPU_RT) == "1") +@dataclass(frozen=True, slots=True) +class PullProgressDetail: + """Progress detail information for pull. + + Documentation lacking but both of these seem to be in bytes when populated. + """ + + current: int | None = None + total: int | None = None + + @classmethod + def from_pull_log_dict(cls, value: dict[str, int]) -> PullProgressDetail: + """Convert pull progress log dictionary into instance.""" + return cls(current=value.get("current"), total=value.get("total")) + + +@dataclass(frozen=True, slots=True) +class PullLogEntry: + """Details for a entry in pull log. + + Not seeing documentation on this structure. Notes from exploration: + 1. All entries have status except errors + 2. Nearly all (but not all) entries have an id + 3. Most entries have progress but it may be empty string and dictionary + 4. Status is not an enum. It includes dynamic data like the image name + 5. Progress is what you see in the CLI. It's for humans, progressDetail is for machines + + Omitted field - errorDetail. It seems to be a dictionary with one field "message" that + exactly matches "error". As that is redundant, skipping for now. + """ + + id: str | None = None + status: str | None = None + progress: str | None = None + progress_detail: PullProgressDetail | None = None + error: str | None = None + + @classmethod + def from_pull_log_dict(cls, value: dict[str, Any]) -> PullLogEntry: + """Convert pull progress log dictionary into instance.""" + return cls( + id=value.get("id"), + status=value.get("status"), + progress=value.get("progress"), + progress_detail=PullProgressDetail.from_pull_log_dict( + value["progressDetail"] + ) + if "progressDetail" in value + else None, + error=value.get("error"), + ) + + @property + def exception(self) -> DockerError: + """Converts error message into a raisable exception. Raises RuntimeError if there is no error.""" + if not self.error: + raise RuntimeError("No error to convert to exception!") + if self.error.endswith("no space left on device"): + return DockerNoSpaceOnDevice(_LOGGER.error) + return DockerError(self.error, _LOGGER.error) + + class DockerConfig(FileConfiguration): """Home Assistant core object for Docker configuration.""" @@ -121,7 +193,7 @@ class DockerConfig(FileConfiguration): return self._data.get(ATTR_REGISTRIES, {}) -class DockerAPI: +class DockerAPI(CoreSysAttributes): """Docker Supervisor wrapper. This class is not AsyncIO safe! @@ -129,6 +201,7 @@ class DockerAPI: def __init__(self, coresys: CoreSys): """Initialize Docker base wrapper.""" + self.coresys = coresys self._docker: DockerClient | None = None self._network: DockerNetwork | None = None self._info: DockerInfo | None = None @@ -302,6 +375,32 @@ class DockerAPI: return container + def pull_image( + self, repository: str, tag: str = "latest", platform: str | None = None + ) -> Image: + """Pull the specified image and return it. + + This mimics the high level API of images.pull but provides better error handling by raising + based on a docker error on pull. Whereas the high level API ignores all errors on pull and + raises only if the get fails afterwards. Additionally it fires progress reports for the pull + on the bus so listeners can use that to update status for users. + + Must be run in executor. + """ + pull_log = self.docker.api.pull( + repository, tag=tag, platform=platform, stream=True, decode=True + ) + for e in pull_log: + entry = PullLogEntry.from_pull_log_dict(e) + if entry.error: + raise entry.exception + self.sys_loop.call_soon_threadsafe( + self.sys_bus.fire_event, BusEvent.DOCKER_IMAGE_PULL_UPDATE, entry + ) + + sep = "@" if tag.startswith("sha256:") else ":" + return self.images.get(f"{repository}{sep}{tag}") + def run_command( self, image: str, diff --git a/supervisor/exceptions.py b/supervisor/exceptions.py index 7abfa9365..2b68fcf58 100644 --- a/supervisor/exceptions.py +++ b/supervisor/exceptions.py @@ -556,6 +556,14 @@ class DockerNotFound(DockerError): """Docker object don't Exists.""" +class DockerNoSpaceOnDevice(DockerError): + """Raise if a docker pull fails due to available space.""" + + def __init__(self, logger: Callable[..., None] | None = None) -> None: + """Raise & log.""" + super().__init__("No space left on disk", logger=logger) + + class DockerJobError(DockerError, JobException): """Error executing docker job.""" diff --git a/tests/addons/test_addon.py b/tests/addons/test_addon.py index 0bea2cc54..4c453bbbd 100644 --- a/tests/addons/test_addon.py +++ b/tests/addons/test_addon.py @@ -232,9 +232,12 @@ async def test_listener_attached_on_install( ): await coresys.addons.install(TEST_ADDON_SLUG) + # Normally this would be defaulted to False on start of the addon but test skips that + coresys.addons.get_local_only(TEST_ADDON_SLUG).watchdog = False + _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING) await asyncio.sleep(0) - assert coresys.addons.get(TEST_ADDON_SLUG).state == AddonState.STARTUP + assert coresys.addons.get(TEST_ADDON_SLUG).state == AddonState.STARTED @pytest.mark.parametrize( @@ -954,7 +957,7 @@ async def test_addon_load_succeeds_with_docker_errors( # Image pull failure install_addon_ssh.data["image"] = "test/amd64-addon-ssh" coresys.docker.images.build.reset_mock(side_effect=True) - coresys.docker.images.pull.side_effect = DockerException() + coresys.docker.pull_image.side_effect = DockerException() caplog.clear() await install_addon_ssh.load() assert "Unknown error with test/amd64-addon-ssh:9.2.1" in caplog.text diff --git a/tests/addons/test_manager.py b/tests/addons/test_manager.py index f5843d001..0c02b156f 100644 --- a/tests/addons/test_manager.py +++ b/tests/addons/test_manager.py @@ -561,11 +561,12 @@ async def test_shared_image_kept_on_update( docker.images.get.side_effect = [image_new, image_old] docker.images.list.return_value = [image_new, image_old] - await coresys.addons.update("local_example2") - docker.images.remove.assert_not_called() - assert example_2.version == "1.3.0" + with patch.object(DockerAPI, "pull_image", return_value=image_new): + await coresys.addons.update("local_example2") + docker.images.remove.assert_not_called() + assert example_2.version == "1.3.0" - docker.images.get.side_effect = [image_new] - await coresys.addons.update("local_example_image") - docker.images.remove.assert_called_once_with("image_old", force=True) - assert install_addon_example_image.version == "1.3.0" + docker.images.get.side_effect = [image_new] + await coresys.addons.update("local_example_image") + docker.images.remove.assert_called_once_with("image_old", force=True) + assert install_addon_example_image.version == "1.3.0" diff --git a/tests/conftest.py b/tests/conftest.py index 6625d7601..da1976151 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,7 +116,10 @@ async def docker() -> DockerAPI: patch( "supervisor.docker.manager.DockerAPI.containers", return_value=MagicMock() ), - patch("supervisor.docker.manager.DockerAPI.api", return_value=MagicMock()), + patch( + "supervisor.docker.manager.DockerAPI.api", + return_value=(api_mock := MagicMock()), + ), patch("supervisor.docker.manager.DockerAPI.images.get", return_value=image), patch("supervisor.docker.manager.DockerAPI.images.list", return_value=images), patch( @@ -134,6 +137,9 @@ async def docker() -> DockerAPI: docker_obj.info.storage = "overlay2" docker_obj.info.version = AwesomeVersion("1.0.0") + # Need an iterable for logs + api_mock.pull.return_value = [] + yield docker_obj @@ -386,6 +392,7 @@ async def coresys( # Mock docker coresys_obj._docker = docker + coresys_obj.docker.coresys = coresys_obj coresys_obj.docker._monitor = DockerMonitor(coresys_obj) # Set internet state @@ -804,11 +811,11 @@ async def container(docker: DockerAPI) -> MagicMock: """Mock attrs and status for container on attach.""" docker.containers.get.return_value = addon = MagicMock() docker.containers.create.return_value = addon - docker.images.pull.return_value = addon docker.images.build.return_value = (addon, "") addon.status = "stopped" addon.attrs = {"State": {"ExitCode": 0}} - yield addon + with patch.object(DockerAPI, "pull_image", return_value=addon): + yield addon @pytest.fixture diff --git a/tests/docker/test_interface.py b/tests/docker/test_interface.py index c9a5a3407..d028b14fd 100644 --- a/tests/docker/test_interface.py +++ b/tests/docker/test_interface.py @@ -16,10 +16,12 @@ from supervisor.const import BusEvent, CpuArch from supervisor.coresys import CoreSys from supervisor.docker.const import ContainerState from supervisor.docker.interface import DockerInterface +from supervisor.docker.manager import PullLogEntry, PullProgressDetail from supervisor.docker.monitor import DockerContainerStateEvent from supervisor.exceptions import ( DockerAPIError, DockerError, + DockerNoSpaceOnDevice, DockerNotFound, DockerRequestError, ) @@ -52,13 +54,15 @@ async def test_docker_image_platform( ): """Test platform set correctly from arch.""" with patch.object( - coresys.docker.images, "pull", return_value=Mock(id="test:1.2.3") - ) as pull: + coresys.docker.images, "get", return_value=Mock(id="test:1.2.3") + ) as get: await test_docker_interface.install( AwesomeVersion("1.2.3"), "test", arch=cpu_arch ) - assert pull.call_count == 1 - assert pull.call_args == call("test:1.2.3", platform=platform) + coresys.docker.docker.api.pull.assert_called_once_with( + "test", tag="1.2.3", platform=platform, stream=True, decode=True + ) + get.assert_called_once_with("test:1.2.3") async def test_docker_image_default_platform( @@ -70,12 +74,14 @@ async def test_docker_image_default_platform( type(coresys.supervisor), "arch", PropertyMock(return_value="i386") ), patch.object( - coresys.docker.images, "pull", return_value=Mock(id="test:1.2.3") - ) as pull, + coresys.docker.images, "get", return_value=Mock(id="test:1.2.3") + ) as get, ): await test_docker_interface.install(AwesomeVersion("1.2.3"), "test") - assert pull.call_count == 1 - assert pull.call_args == call("test:1.2.3", platform="linux/386") + coresys.docker.docker.api.pull.assert_called_once_with( + "test", tag="1.2.3", platform="linux/386", stream=True, decode=True + ) + get.assert_called_once_with("test:1.2.3") @pytest.mark.parametrize( @@ -256,7 +262,7 @@ async def test_image_pull_fail( coresys: CoreSys, capture_exception: Mock, err: Exception ): """Test failure to pull image.""" - coresys.docker.images.pull.side_effect = err + coresys.docker.images.get.side_effect = err with pytest.raises(DockerError): await coresys.homeassistant.core.instance.install( AwesomeVersion("2022.7.3"), arch=CpuArch.AMD64 @@ -281,3 +287,158 @@ async def test_run_missing_image( await install_addon_ssh.instance.run() capture_exception.assert_called_once() + + +async def test_install_fires_progress_events( + coresys: CoreSys, test_docker_interface: DockerInterface +): + """Test progress events are fired during an install for listeners.""" + # This is from a sample pull. Filtered log to just one per unique status for test + coresys.docker.docker.api.pull.return_value = [ + { + "status": "Pulling from home-assistant/odroid-n2-homeassistant", + "id": "2025.7.2", + }, + {"status": "Already exists", "progressDetail": {}, "id": "6e771e15690e"}, + {"status": "Pulling fs layer", "progressDetail": {}, "id": "1578b14a573c"}, + {"status": "Waiting", "progressDetail": {}, "id": "2488d0e401e1"}, + { + "status": "Downloading", + "progressDetail": {"current": 1378, "total": 1486}, + "progress": "[==============================================> ] 1.378kB/1.486kB", + "id": "1578b14a573c", + }, + {"status": "Download complete", "progressDetail": {}, "id": "1578b14a573c"}, + { + "status": "Extracting", + "progressDetail": {"current": 1486, "total": 1486}, + "progress": "[==================================================>] 1.486kB/1.486kB", + "id": "1578b14a573c", + }, + {"status": "Pull complete", "progressDetail": {}, "id": "1578b14a573c"}, + {"status": "Verifying Checksum", "progressDetail": {}, "id": "6a1e931d8f88"}, + { + "status": "Digest: sha256:490080d7da0f385928022927990e04f604615f7b8c622ef3e58253d0f089881d" + }, + { + "status": "Status: Downloaded newer image for ghcr.io/home-assistant/odroid-n2-homeassistant:2025.7.2" + }, + ] + + events: list[PullLogEntry] = [] + + async def capture_log_entry(event: PullLogEntry) -> None: + events.append(event) + + coresys.bus.register_event(BusEvent.DOCKER_IMAGE_PULL_UPDATE, capture_log_entry) + + with ( + patch.object( + type(coresys.supervisor), "arch", PropertyMock(return_value="i386") + ), + ): + await test_docker_interface.install(AwesomeVersion("1.2.3"), "test") + coresys.docker.docker.api.pull.assert_called_once_with( + "test", tag="1.2.3", platform="linux/386", stream=True, decode=True + ) + coresys.docker.images.get.assert_called_once_with("test:1.2.3") + + await asyncio.sleep(1) + assert events == [ + PullLogEntry( + status="Pulling from home-assistant/odroid-n2-homeassistant", + id="2025.7.2", + ), + PullLogEntry( + status="Already exists", + progress_detail=PullProgressDetail(), + id="6e771e15690e", + ), + PullLogEntry( + status="Pulling fs layer", + progress_detail=PullProgressDetail(), + id="1578b14a573c", + ), + PullLogEntry( + status="Waiting", progress_detail=PullProgressDetail(), id="2488d0e401e1" + ), + PullLogEntry( + status="Downloading", + progress_detail=PullProgressDetail(current=1378, total=1486), + progress="[==============================================> ] 1.378kB/1.486kB", + id="1578b14a573c", + ), + PullLogEntry( + status="Download complete", + progress_detail=PullProgressDetail(), + id="1578b14a573c", + ), + PullLogEntry( + status="Extracting", + progress_detail=PullProgressDetail(current=1486, total=1486), + progress="[==================================================>] 1.486kB/1.486kB", + id="1578b14a573c", + ), + PullLogEntry( + status="Pull complete", + progress_detail=PullProgressDetail(), + id="1578b14a573c", + ), + PullLogEntry( + status="Verifying Checksum", + progress_detail=PullProgressDetail(), + id="6a1e931d8f88", + ), + PullLogEntry( + status="Digest: sha256:490080d7da0f385928022927990e04f604615f7b8c622ef3e58253d0f089881d" + ), + PullLogEntry( + status="Status: Downloaded newer image for ghcr.io/home-assistant/odroid-n2-homeassistant:2025.7.2" + ), + ] + + +@pytest.mark.parametrize( + ("error_log", "exc_type", "exc_msg"), + [ + ( + { + "errorDetail": { + "message": "write /mnt/data/docker/tmp/GetImageBlob2228293192: no space left on device" + }, + "error": "write /mnt/data/docker/tmp/GetImageBlob2228293192: no space left on device", + }, + DockerNoSpaceOnDevice, + "No space left on disk", + ), + ( + {"errorDetail": {"message": "failure"}, "error": "failure"}, + DockerError, + "failure", + ), + ], +) +async def test_install_raises_on_pull_error( + coresys: CoreSys, + test_docker_interface: DockerInterface, + error_log: dict[str, Any], + exc_type: type[DockerError], + exc_msg: str, +): + """Test exceptions raised from errors in pull log.""" + coresys.docker.docker.api.pull.return_value = [ + { + "status": "Pulling from home-assistant/odroid-n2-homeassistant", + "id": "2025.7.2", + }, + { + "status": "Downloading", + "progressDetail": {"current": 1378, "total": 1486}, + "progress": "[==============================================> ] 1.378kB/1.486kB", + "id": "1578b14a573c", + }, + error_log, + ] + + with pytest.raises(exc_type, match=exc_msg): + await test_docker_interface.install(AwesomeVersion("1.2.3"), "test") diff --git a/tests/homeassistant/test_core.py b/tests/homeassistant/test_core.py index c53d0cb99..54e2804f2 100644 --- a/tests/homeassistant/test_core.py +++ b/tests/homeassistant/test_core.py @@ -69,7 +69,7 @@ async def test_install_landingpage_docker_error( ), patch("supervisor.homeassistant.core.asyncio.sleep") as sleep, ): - coresys.docker.images.pull.side_effect = [APIError("fail"), MagicMock()] + coresys.docker.images.get.side_effect = [APIError("fail"), MagicMock()] await coresys.homeassistant.core.install_landingpage() sleep.assert_awaited_once_with(30) @@ -81,7 +81,7 @@ async def test_install_landingpage_other_error( coresys: CoreSys, capture_exception: Mock, caplog: pytest.LogCaptureFixture ): """Test install landing page fails due to other error.""" - coresys.docker.images.pull.side_effect = [(err := OSError()), MagicMock()] + coresys.docker.images.get.side_effect = [(err := OSError()), MagicMock()] with ( patch.object(DockerHomeAssistant, "attach", side_effect=DockerError), @@ -123,7 +123,7 @@ async def test_install_docker_error( ), patch("supervisor.homeassistant.core.asyncio.sleep") as sleep, ): - coresys.docker.images.pull.side_effect = [APIError("fail"), MagicMock()] + coresys.docker.images.get.side_effect = [APIError("fail"), MagicMock()] await coresys.homeassistant.core.install() sleep.assert_awaited_once_with(30) @@ -135,7 +135,7 @@ async def test_install_other_error( coresys: CoreSys, capture_exception: Mock, caplog: pytest.LogCaptureFixture ): """Test install fails due to other error.""" - coresys.docker.images.pull.side_effect = [(err := OSError()), MagicMock()] + coresys.docker.images.get.side_effect = [(err := OSError()), MagicMock()] with ( patch.object(HomeAssistantCore, "start"), @@ -407,8 +407,9 @@ async def test_core_loads_wrong_image_for_machine( "image": "ghcr.io/home-assistant/odroid-n2-homeassistant:2024.4.0", "force": True, } - coresys.docker.images.pull.assert_called_once_with( - "ghcr.io/home-assistant/qemux86-64-homeassistant:2024.4.0", + coresys.docker.pull_image.assert_called_once_with( + "ghcr.io/home-assistant/qemux86-64-homeassistant", + "2024.4.0", platform="linux/amd64", ) assert ( @@ -427,7 +428,7 @@ async def test_core_load_allows_image_override(coresys: CoreSys, container: Magi container.remove.assert_not_called() coresys.docker.images.remove.assert_not_called() - coresys.docker.images.pull.assert_not_called() + coresys.docker.images.get.assert_not_called() assert ( coresys.homeassistant.image == "ghcr.io/home-assistant/odroid-n2-homeassistant" ) @@ -454,8 +455,9 @@ async def test_core_loads_wrong_image_for_architecture( "image": "ghcr.io/home-assistant/qemux86-64-homeassistant:2024.4.0", "force": True, } - coresys.docker.images.pull.assert_called_once_with( - "ghcr.io/home-assistant/qemux86-64-homeassistant:2024.4.0", + coresys.docker.pull_image.assert_called_once_with( + "ghcr.io/home-assistant/qemux86-64-homeassistant", + "2024.4.0", platform="linux/amd64", ) assert ( diff --git a/tests/plugins/test_plugin_base.py b/tests/plugins/test_plugin_base.py index 45a939914..68f0fff94 100644 --- a/tests/plugins/test_plugin_base.py +++ b/tests/plugins/test_plugin_base.py @@ -372,9 +372,8 @@ async def test_load_with_incorrect_image( "image": f"{old_image}:2024.4.0", "force": True, } - coresys.docker.images.pull.assert_called_once_with( - f"{correct_image}:2024.4.0", - platform="linux/amd64", + coresys.docker.pull_image.assert_called_once_with( + correct_image, "2024.4.0", platform="linux/amd64" ) assert plugin.image == correct_image