mirror of
https://github.com/home-assistant/core.git
synced 2026-02-15 07:36:16 +00:00
Add timeout to B2 metadata downloads to prevent backup hang (#162562)
This commit is contained in:
@@ -16,12 +16,18 @@ CONNECTION_TIMEOUT = 120 # 2 minutes
|
||||
# Default TIMEOUT_FOR_UPLOAD is 128 seconds, which is too short for large backups
|
||||
TIMEOUT_FOR_UPLOAD = 43200 # 12 hours
|
||||
|
||||
# Reduced retry count for download operations
|
||||
# Default is 20 retries with exponential backoff, which can hang for 30+ minutes
|
||||
# when there are persistent connection errors (e.g., SSL failures)
|
||||
TRY_COUNT_DOWNLOAD = 3
|
||||
|
||||
|
||||
class B2Http(BaseB2Http): # type: ignore[misc]
|
||||
"""B2Http with extended timeouts for backup operations."""
|
||||
|
||||
CONNECTION_TIMEOUT = CONNECTION_TIMEOUT
|
||||
TIMEOUT_FOR_UPLOAD = TIMEOUT_FOR_UPLOAD
|
||||
TRY_COUNT_DOWNLOAD = TRY_COUNT_DOWNLOAD
|
||||
|
||||
|
||||
class B2Session(BaseB2Session): # type: ignore[misc]
|
||||
|
||||
@@ -40,6 +40,10 @@ CACHE_TTL = 300
|
||||
# This prevents uploads from hanging indefinitely
|
||||
UPLOAD_TIMEOUT = 43200 # 12 hours (matches B2 HTTP timeout)
|
||||
|
||||
# Timeout for metadata download operations (in seconds)
|
||||
# This prevents the backup system from hanging when B2 connections fail
|
||||
METADATA_DOWNLOAD_TIMEOUT = 60
|
||||
|
||||
|
||||
def suggested_filenames(backup: AgentBackup) -> tuple[str, str]:
|
||||
"""Return the suggested filenames for the backup and metadata files."""
|
||||
@@ -413,12 +417,21 @@ class BackblazeBackupAgent(BackupAgent):
|
||||
backups = {}
|
||||
for file_name, file_version in all_files_in_prefix.items():
|
||||
if file_name.endswith(METADATA_FILE_SUFFIX):
|
||||
backup = await self._hass.async_add_executor_job(
|
||||
self._process_metadata_file_sync,
|
||||
file_name,
|
||||
file_version,
|
||||
all_files_in_prefix,
|
||||
)
|
||||
try:
|
||||
backup = await asyncio.wait_for(
|
||||
self._hass.async_add_executor_job(
|
||||
self._process_metadata_file_sync,
|
||||
file_name,
|
||||
file_version,
|
||||
all_files_in_prefix,
|
||||
),
|
||||
timeout=METADATA_DOWNLOAD_TIMEOUT,
|
||||
)
|
||||
except TimeoutError:
|
||||
_LOGGER.warning(
|
||||
"Timeout downloading metadata file %s", file_name
|
||||
)
|
||||
continue
|
||||
if backup:
|
||||
backups[backup.backup_id] = backup
|
||||
self._backup_list_cache = backups
|
||||
@@ -442,10 +455,18 @@ class BackblazeBackupAgent(BackupAgent):
|
||||
if not file or not metadata_file_version:
|
||||
raise BackupNotFound(f"Backup {backup_id} not found")
|
||||
|
||||
metadata_content = await self._hass.async_add_executor_job(
|
||||
self._download_and_parse_metadata_sync,
|
||||
metadata_file_version,
|
||||
)
|
||||
try:
|
||||
metadata_content = await asyncio.wait_for(
|
||||
self._hass.async_add_executor_job(
|
||||
self._download_and_parse_metadata_sync,
|
||||
metadata_file_version,
|
||||
),
|
||||
timeout=METADATA_DOWNLOAD_TIMEOUT,
|
||||
)
|
||||
except TimeoutError:
|
||||
raise BackupAgentError(
|
||||
f"Timeout downloading metadata for backup {backup_id}"
|
||||
) from None
|
||||
|
||||
_LOGGER.debug(
|
||||
"Successfully retrieved metadata for backup ID %s from file %s",
|
||||
@@ -468,16 +489,27 @@ class BackblazeBackupAgent(BackupAgent):
|
||||
# Process metadata files sequentially to avoid exhausting executor pool
|
||||
for file_name, file_version in all_files_in_prefix.items():
|
||||
if file_name.endswith(METADATA_FILE_SUFFIX):
|
||||
(
|
||||
result_backup_file,
|
||||
result_metadata_file_version,
|
||||
) = await self._hass.async_add_executor_job(
|
||||
self._process_metadata_file_for_id_sync,
|
||||
file_name,
|
||||
file_version,
|
||||
backup_id,
|
||||
all_files_in_prefix,
|
||||
)
|
||||
try:
|
||||
(
|
||||
result_backup_file,
|
||||
result_metadata_file_version,
|
||||
) = await asyncio.wait_for(
|
||||
self._hass.async_add_executor_job(
|
||||
self._process_metadata_file_for_id_sync,
|
||||
file_name,
|
||||
file_version,
|
||||
backup_id,
|
||||
all_files_in_prefix,
|
||||
),
|
||||
timeout=METADATA_DOWNLOAD_TIMEOUT,
|
||||
)
|
||||
except TimeoutError:
|
||||
_LOGGER.warning(
|
||||
"Timeout downloading metadata file %s while searching for backup %s",
|
||||
file_name,
|
||||
backup_id,
|
||||
)
|
||||
continue
|
||||
if result_backup_file and result_metadata_file_version:
|
||||
return result_backup_file, result_metadata_file_version
|
||||
|
||||
|
||||
@@ -955,3 +955,199 @@ async def test_upload_cancelled(
|
||||
# CancelledError propagates up and causes a 500 error
|
||||
assert resp.status == 500
|
||||
assert any("cancelled" in msg for msg in caplog.messages)
|
||||
|
||||
|
||||
async def test_metadata_download_timeout_during_list(
|
||||
hass: HomeAssistant,
|
||||
hass_ws_client: WebSocketGenerator,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""Test that metadata download timeout during list is handled gracefully."""
|
||||
client = await hass_ws_client(hass)
|
||||
|
||||
mock_metadata = Mock()
|
||||
mock_metadata.file_name = "testprefix/slow.metadata.json"
|
||||
|
||||
mock_tar = Mock()
|
||||
mock_tar.file_name = "testprefix/slow.tar"
|
||||
mock_tar.size = TEST_BACKUP.size
|
||||
|
||||
def mock_ls(_self, _prefix=""):
|
||||
return iter([(mock_metadata, None), (mock_tar, None)])
|
||||
|
||||
with (
|
||||
patch.object(BucketSimulator, "ls", mock_ls),
|
||||
patch(
|
||||
"homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
|
||||
side_effect=TimeoutError,
|
||||
),
|
||||
caplog.at_level(logging.WARNING),
|
||||
):
|
||||
await client.send_json_auto_id({"type": "backup/info"})
|
||||
response = await client.receive_json()
|
||||
|
||||
assert response["success"]
|
||||
# The backup should not appear in the list due to timeout
|
||||
assert len(response["result"]["backups"]) == 0
|
||||
assert any("Timeout downloading metadata file" in msg for msg in caplog.messages)
|
||||
|
||||
|
||||
async def test_metadata_download_timeout_during_find_by_id(
|
||||
hass: HomeAssistant,
|
||||
hass_ws_client: WebSocketGenerator,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""Test that metadata download timeout during find by ID is handled gracefully."""
|
||||
client = await hass_ws_client(hass)
|
||||
|
||||
mock_metadata = Mock()
|
||||
mock_metadata.file_name = f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
|
||||
|
||||
mock_tar = Mock()
|
||||
mock_tar.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
|
||||
mock_tar.size = TEST_BACKUP.size
|
||||
|
||||
def mock_ls(_self, _prefix=""):
|
||||
return iter([(mock_metadata, None), (mock_tar, None)])
|
||||
|
||||
with (
|
||||
patch.object(BucketSimulator, "ls", mock_ls),
|
||||
patch(
|
||||
"homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
|
||||
side_effect=TimeoutError,
|
||||
),
|
||||
caplog.at_level(logging.WARNING),
|
||||
):
|
||||
await client.send_json_auto_id(
|
||||
{"type": "backup/details", "backup_id": TEST_BACKUP.backup_id}
|
||||
)
|
||||
response = await client.receive_json()
|
||||
|
||||
assert response["success"]
|
||||
# The backup should not be found due to timeout
|
||||
assert response["result"]["backup"] is None
|
||||
assert any(
|
||||
"Timeout downloading metadata file" in msg
|
||||
and "while searching for backup" in msg
|
||||
for msg in caplog.messages
|
||||
)
|
||||
|
||||
|
||||
async def test_metadata_timeout_does_not_block_healthy_backups(
|
||||
hass: HomeAssistant,
|
||||
hass_ws_client: WebSocketGenerator,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""Test that a timed out metadata download doesn't prevent listing other backups."""
|
||||
client = await hass_ws_client(hass)
|
||||
|
||||
mock_hanging_metadata = Mock()
|
||||
mock_hanging_metadata.file_name = "testprefix/hanging_backup.metadata.json"
|
||||
mock_hanging_metadata.download = Mock(side_effect=B2Error("SSL failure"))
|
||||
|
||||
mock_hanging_tar = Mock()
|
||||
mock_hanging_tar.file_name = "testprefix/hanging_backup.tar"
|
||||
mock_hanging_tar.size = 1000
|
||||
|
||||
mock_healthy_metadata = Mock()
|
||||
mock_healthy_metadata.file_name = (
|
||||
f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
|
||||
)
|
||||
mock_healthy_download = Mock()
|
||||
mock_healthy_response = Mock()
|
||||
mock_healthy_response.content = json.dumps(BACKUP_METADATA).encode()
|
||||
mock_healthy_download.response = mock_healthy_response
|
||||
mock_healthy_metadata.download = Mock(return_value=mock_healthy_download)
|
||||
|
||||
mock_healthy_tar = Mock()
|
||||
mock_healthy_tar.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
|
||||
mock_healthy_tar.size = TEST_BACKUP.size
|
||||
|
||||
def mock_ls(_self, _prefix=""):
|
||||
return iter(
|
||||
[
|
||||
(mock_hanging_metadata, None),
|
||||
(mock_hanging_tar, None),
|
||||
(mock_healthy_metadata, None),
|
||||
(mock_healthy_tar, None),
|
||||
]
|
||||
)
|
||||
|
||||
call_count = 0
|
||||
original_wait_for = asyncio.wait_for
|
||||
|
||||
async def wait_for_first_timeout(coro, *, timeout=None):
|
||||
nonlocal call_count
|
||||
call_count += 1
|
||||
if call_count == 1:
|
||||
raise TimeoutError
|
||||
return await original_wait_for(coro, timeout=timeout)
|
||||
|
||||
with (
|
||||
patch.object(BucketSimulator, "ls", mock_ls),
|
||||
patch(
|
||||
"homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
|
||||
wait_for_first_timeout,
|
||||
),
|
||||
caplog.at_level(logging.WARNING),
|
||||
):
|
||||
await client.send_json_auto_id({"type": "backup/info"})
|
||||
response = await client.receive_json()
|
||||
|
||||
assert response["success"]
|
||||
backups = response["result"]["backups"]
|
||||
assert len(backups) == 1
|
||||
assert backups[0]["backup_id"] == TEST_BACKUP.backup_id
|
||||
assert any("Timeout downloading metadata file" in msg for msg in caplog.messages)
|
||||
|
||||
|
||||
async def test_metadata_download_timeout_during_get_backup(
|
||||
hass: HomeAssistant,
|
||||
hass_ws_client: WebSocketGenerator,
|
||||
mock_config_entry: MockConfigEntry,
|
||||
) -> None:
|
||||
"""Test timeout on metadata re-download after file is found."""
|
||||
client = await hass_ws_client(hass)
|
||||
|
||||
mock_metadata = Mock()
|
||||
mock_metadata.file_name = f"testprefix/{TEST_BACKUP.backup_id}.metadata.json"
|
||||
mock_download = Mock()
|
||||
mock_response = Mock()
|
||||
mock_response.content = json.dumps(BACKUP_METADATA).encode()
|
||||
mock_download.response = mock_response
|
||||
mock_metadata.download = Mock(return_value=mock_download)
|
||||
|
||||
mock_tar = Mock()
|
||||
mock_tar.file_name = f"testprefix/{TEST_BACKUP.backup_id}.tar"
|
||||
mock_tar.size = TEST_BACKUP.size
|
||||
|
||||
def mock_ls(_self, _prefix=""):
|
||||
return iter([(mock_metadata, None), (mock_tar, None)])
|
||||
|
||||
call_count = 0
|
||||
original_wait_for = asyncio.wait_for
|
||||
|
||||
async def wait_for_second_timeout(coro, *, timeout=None):
|
||||
nonlocal call_count
|
||||
call_count += 1
|
||||
if call_count >= 2:
|
||||
raise TimeoutError
|
||||
return await original_wait_for(coro, timeout=timeout)
|
||||
|
||||
with (
|
||||
patch.object(BucketSimulator, "ls", mock_ls),
|
||||
patch(
|
||||
"homeassistant.components.backblaze_b2.backup.asyncio.wait_for",
|
||||
wait_for_second_timeout,
|
||||
),
|
||||
patch("homeassistant.components.backblaze_b2.backup.CACHE_TTL", 0),
|
||||
):
|
||||
await client.send_json_auto_id(
|
||||
{"type": "backup/details", "backup_id": TEST_BACKUP.backup_id}
|
||||
)
|
||||
response = await client.receive_json()
|
||||
|
||||
assert response["success"]
|
||||
assert (
|
||||
f"{DOMAIN}.{mock_config_entry.entry_id}" in response["result"]["agent_errors"]
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user