1
0
mirror of https://github.com/home-assistant/core.git synced 2026-02-15 07:36:16 +00:00

Add timeout to B2 metadata downloads to prevent backup hang (#162562)

This commit is contained in:
ElCruncharino
2026-02-12 09:26:50 -05:00
committed by GitHub
parent ccb3b35694
commit 7d583be8e1
3 changed files with 254 additions and 20 deletions

View File

@@ -16,12 +16,18 @@ CONNECTION_TIMEOUT = 120 # 2 minutes
# Default TIMEOUT_FOR_UPLOAD is 128 seconds, which is too short for large backups
TIMEOUT_FOR_UPLOAD = 43200 # 12 hours
# Reduced retry count for download operations
# Default is 20 retries with exponential backoff, which can hang for 30+ minutes
# when there are persistent connection errors (e.g., SSL failures)
TRY_COUNT_DOWNLOAD = 3
class B2Http(BaseB2Http): # type: ignore[misc]
    """B2Http with extended timeouts for backup operations."""

    # Override the b2sdk defaults with the module-level values defined above:
    # a longer upload timeout (the library default of 128s is too short for
    # large backups) and a reduced download retry count (the default of 20
    # retries with exponential backoff can hang for 30+ minutes on persistent
    # connection errors such as SSL failures).
    CONNECTION_TIMEOUT = CONNECTION_TIMEOUT
    TIMEOUT_FOR_UPLOAD = TIMEOUT_FOR_UPLOAD
    TRY_COUNT_DOWNLOAD = TRY_COUNT_DOWNLOAD
class B2Session(BaseB2Session): # type: ignore[misc]

View File

@@ -40,6 +40,10 @@ CACHE_TTL = 300
# This prevents uploads from hanging indefinitely
UPLOAD_TIMEOUT = 43200 # 12 hours (matches B2 HTTP timeout)
# Timeout for metadata download operations (in seconds)
# This prevents the backup system from hanging when B2 connections fail
METADATA_DOWNLOAD_TIMEOUT = 60
def suggested_filenames(backup: AgentBackup) -> tuple[str, str]:
"""Return the suggested filenames for the backup and metadata files."""
@@ -413,12 +417,21 @@ class BackblazeBackupAgent(BackupAgent):
backups = {}
for file_name, file_version in all_files_in_prefix.items():
if file_name.endswith(METADATA_FILE_SUFFIX):
backup = await self._hass.async_add_executor_job(
self._process_metadata_file_sync,
file_name,
file_version,
all_files_in_prefix,
)
try:
backup = await asyncio.wait_for(
self._hass.async_add_executor_job(
self._process_metadata_file_sync,
file_name,
file_version,
all_files_in_prefix,
),
timeout=METADATA_DOWNLOAD_TIMEOUT,
)
except TimeoutError:
_LOGGER.warning(
"Timeout downloading metadata file %s", file_name
)
continue
if backup:
backups[backup.backup_id] = backup
self._backup_list_cache = backups
@@ -442,10 +455,18 @@ class BackblazeBackupAgent(BackupAgent):
if not file or not metadata_file_version:
raise BackupNotFound(f"Backup {backup_id} not found")
metadata_content = await self._hass.async_add_executor_job(
self._download_and_parse_metadata_sync,
metadata_file_version,
)
try:
metadata_content = await asyncio.wait_for(
self._hass.async_add_executor_job(
self._download_and_parse_metadata_sync,
metadata_file_version,
),
timeout=METADATA_DOWNLOAD_TIMEOUT,
)
except TimeoutError:
raise BackupAgentError(
f"Timeout downloading metadata for backup {backup_id}"
) from None
_LOGGER.debug(
"Successfully retrieved metadata for backup ID %s from file %s",
@@ -468,16 +489,27 @@ class BackblazeBackupAgent(BackupAgent):
# Process metadata files sequentially to avoid exhausting executor pool
for file_name, file_version in all_files_in_prefix.items():
if file_name.endswith(METADATA_FILE_SUFFIX):
(
result_backup_file,
result_metadata_file_version,
) = await self._hass.async_add_executor_job(
self._process_metadata_file_for_id_sync,
file_name,
file_version,
backup_id,
all_files_in_prefix,
)
try:
(
result_backup_file,
result_metadata_file_version,
) = await asyncio.wait_for(
self._hass.async_add_executor_job(
self._process_metadata_file_for_id_sync,
file_name,
file_version,
backup_id,
all_files_in_prefix,
),
timeout=METADATA_DOWNLOAD_TIMEOUT,
)
except TimeoutError:
_LOGGER.warning(
"Timeout downloading metadata file %s while searching for backup %s",
file_name,
backup_id,
)
continue
if result_backup_file and result_metadata_file_version:
return result_backup_file, result_metadata_file_version

View File

@@ -955,3 +955,199 @@ async def test_upload_cancelled(
# CancelledError propagates up and causes a 500 error
assert resp.status == 500
assert any("cancelled" in msg for msg in caplog.messages)
async def test_metadata_download_timeout_during_list(
    hass: HomeAssistant,
    hass_ws_client: WebSocketGenerator,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that metadata download timeout during list is handled gracefully."""
    ws_client = await hass_ws_client(hass)

    # One metadata/tar pair whose metadata download will never complete.
    meta_file = Mock()
    meta_file.file_name = "testprefix/slow.metadata.json"
    tar_file = Mock()
    tar_file.file_name = "testprefix/slow.tar"
    tar_file.size = TEST_BACKUP.size

    def fake_ls(_self, _prefix=""):
        return iter([(meta_file, None), (tar_file, None)])

    with (
        patch.object(BucketSimulator, "ls", fake_ls),
        patch(
            "homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
            side_effect=TimeoutError,
        ),
        caplog.at_level(logging.WARNING),
    ):
        await ws_client.send_json_auto_id({"type": "backup/info"})
        response = await ws_client.receive_json()

    assert response["success"]
    # The timed-out backup is skipped rather than failing the whole listing.
    assert len(response["result"]["backups"]) == 0
    assert any("Timeout downloading metadata file" in msg for msg in caplog.messages)
async def test_metadata_download_timeout_during_find_by_id(
    hass: HomeAssistant,
    hass_ws_client: WebSocketGenerator,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that metadata download timeout during find by ID is handled gracefully."""
    ws_client = await hass_ws_client(hass)

    meta_file = Mock()
    meta_file.file_name = f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
    tar_file = Mock()
    tar_file.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
    tar_file.size = TEST_BACKUP.size

    def fake_ls(_self, _prefix=""):
        return iter([(meta_file, None), (tar_file, None)])

    with (
        patch.object(BucketSimulator, "ls", fake_ls),
        patch(
            "homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
            side_effect=TimeoutError,
        ),
        caplog.at_level(logging.WARNING),
    ):
        await ws_client.send_json_auto_id(
            {"type": "backup/details", "backup_id": TEST_BACKUP.backup_id}
        )
        response = await ws_client.receive_json()

    assert response["success"]
    # A timeout while resolving the ID means the backup is reported as missing.
    assert response["result"]["backup"] is None
    assert any(
        "Timeout downloading metadata file" in msg
        and "while searching for backup" in msg
        for msg in caplog.messages
    )
async def test_metadata_timeout_does_not_block_healthy_backups(
    hass: HomeAssistant,
    hass_ws_client: WebSocketGenerator,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a timed out metadata download doesn't prevent listing other backups."""
    ws_client = await hass_ws_client(hass)

    # First pair: its metadata download raises a B2 error (the "hanging" one).
    bad_meta = Mock()
    bad_meta.file_name = "testprefix/hanging_backup.metadata.json"
    bad_meta.download = Mock(side_effect=B2Error("SSL failure"))
    bad_tar = Mock()
    bad_tar.file_name = "testprefix/hanging_backup.tar"
    bad_tar.size = 1000

    # Second pair: its metadata downloads normally.
    good_meta = Mock()
    good_meta.file_name = (
        f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
    )
    good_download = Mock()
    good_download.response = Mock()
    good_download.response.content = json.dumps(BACKUP_METADATA).encode()
    good_meta.download = Mock(return_value=good_download)
    good_tar = Mock()
    good_tar.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
    good_tar.size = TEST_BACKUP.size

    def fake_ls(_self, _prefix=""):
        return iter(
            [
                (bad_meta, None),
                (bad_tar, None),
                (good_meta, None),
                (good_tar, None),
            ]
        )

    state = {"calls": 0}
    real_wait_for = asyncio.wait_for

    async def flaky_wait_for(awaitable, *, timeout=None):
        # Only the very first wait_for call (the hanging metadata) times out;
        # subsequent calls delegate to the real implementation.
        state["calls"] += 1
        if state["calls"] == 1:
            raise TimeoutError
        return await real_wait_for(awaitable, timeout=timeout)

    with (
        patch.object(BucketSimulator, "ls", fake_ls),
        patch(
            "homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
            flaky_wait_for,
        ),
        caplog.at_level(logging.WARNING),
    ):
        await ws_client.send_json_auto_id({"type": "backup/info"})
        response = await ws_client.receive_json()

    assert response["success"]
    listed = response["result"]["backups"]
    assert len(listed) == 1
    assert listed[0]["backup_id"] == TEST_BACKUP.backup_id
    assert any("Timeout downloading metadata file" in msg for msg in caplog.messages)
async def test_metadata_download_timeout_during_get_backup(
    hass: HomeAssistant,
    hass_ws_client: WebSocketGenerator,
    mock_config_entry: MockConfigEntry,
) -> None:
    """Test timeout on metadata re-download after file is found."""
    ws_client = await hass_ws_client(hass)

    meta_file = Mock()
    meta_file.file_name = f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
    download_result = Mock()
    download_result.response = Mock()
    download_result.response.content = json.dumps(BACKUP_METADATA).encode()
    meta_file.download = Mock(return_value=download_result)
    tar_file = Mock()
    tar_file.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
    tar_file.size = TEST_BACKUP.size

    def fake_ls(_self, _prefix=""):
        return iter([(meta_file, None), (tar_file, None)])

    state = {"calls": 0}
    real_wait_for = asyncio.wait_for

    async def second_call_times_out(awaitable, *, timeout=None):
        # Let the first wait_for (locating the file) succeed, then time out
        # on every later call, i.e. the metadata re-download.
        state["calls"] += 1
        if state["calls"] >= 2:
            raise TimeoutError
        return await real_wait_for(awaitable, timeout=timeout)

    with (
        patch.object(BucketSimulator, "ls", fake_ls),
        patch(
            "homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
            second_call_times_out,
        ),
        patch("homeassistant.components.backblaze_b2.backup.CACHE_TTL", 0),
    ):
        await ws_client.send_json_auto_id(
            {"type": "backup/details", "backup_id": TEST_BACKUP.backup_id}
        )
        response = await ws_client.receive_json()

    assert response["success"]
    # The timeout surfaces as an agent error rather than a hung request.
    assert (
        f"{DOMAIN}.{mock_config_entry.entry_id}" in response["result"]["agent_errors"]
    )