From 9b80cf7d9432ffe5ae719e1311f3630ff2b13d42 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Wed, 3 Sep 2025 13:13:02 -0500 Subject: [PATCH] Prevent multiple Home Assistant instances from running with the same config directory (#151631) --- homeassistant/__main__.py | 56 +++++---- homeassistant/runner.py | 115 ++++++++++++++++++ tests/test_runner.py | 249 +++++++++++++++++++++++++++++++++++++- 3 files changed, 394 insertions(+), 26 deletions(-) diff --git a/homeassistant/__main__.py b/homeassistant/__main__.py index 6fd48c4809c..7821caac749 100644 --- a/homeassistant/__main__.py +++ b/homeassistant/__main__.py @@ -187,36 +187,42 @@ def main() -> int: from . import config, runner # noqa: PLC0415 - safe_mode = config.safe_mode_enabled(config_dir) + # Ensure only one instance runs per config directory + with runner.ensure_single_execution(config_dir) as single_execution_lock: + # Check if another instance is already running + if single_execution_lock.exit_code is not None: + return single_execution_lock.exit_code - runtime_conf = runner.RuntimeConfig( - config_dir=config_dir, - verbose=args.verbose, - log_rotate_days=args.log_rotate_days, - log_file=args.log_file, - log_no_color=args.log_no_color, - skip_pip=args.skip_pip, - skip_pip_packages=args.skip_pip_packages, - recovery_mode=args.recovery_mode, - debug=args.debug, - open_ui=args.open_ui, - safe_mode=safe_mode, - ) + safe_mode = config.safe_mode_enabled(config_dir) - fault_file_name = os.path.join(config_dir, FAULT_LOG_FILENAME) - with open(fault_file_name, mode="a", encoding="utf8") as fault_file: - faulthandler.enable(fault_file) - exit_code = runner.run(runtime_conf) - faulthandler.disable() + runtime_conf = runner.RuntimeConfig( + config_dir=config_dir, + verbose=args.verbose, + log_rotate_days=args.log_rotate_days, + log_file=args.log_file, + log_no_color=args.log_no_color, + skip_pip=args.skip_pip, + skip_pip_packages=args.skip_pip_packages, + recovery_mode=args.recovery_mode, + debug=args.debug, + open_ui=args.open_ui, + safe_mode=safe_mode, + ) - # It's possible for the fault file to disappear, so suppress obvious errors - with suppress(FileNotFoundError): - if os.path.getsize(fault_file_name) == 0: - os.remove(fault_file_name) + fault_file_name = os.path.join(config_dir, FAULT_LOG_FILENAME) + with open(fault_file_name, mode="a", encoding="utf8") as fault_file: + faulthandler.enable(fault_file) + exit_code = runner.run(runtime_conf) + faulthandler.disable() - check_threads() + # It's possible for the fault file to disappear, so suppress obvious errors + with suppress(FileNotFoundError): + if os.path.getsize(fault_file_name) == 0: + os.remove(fault_file_name) - return exit_code + check_threads() + + return exit_code if __name__ == "__main__": diff --git a/homeassistant/runner.py b/homeassistant/runner.py index abcf32f2659..6fa59923e81 100644 --- a/homeassistant/runner.py +++ b/homeassistant/runner.py @@ -3,10 +3,20 @@ from __future__ import annotations import asyncio +from collections.abc import Generator +from contextlib import contextmanager import dataclasses +from datetime import datetime +import fcntl +from io import TextIOWrapper +import json import logging +import os +from pathlib import Path import subprocess +import sys import threading +import time from time import monotonic import traceback from typing import Any @@ -14,6 +24,7 @@ from typing import Any import packaging.tags from . import bootstrap +from .const import __version__ from .core import callback from .helpers.frame import warn_use from .util.executor import InterruptibleThreadPoolExecutor @@ -33,9 +44,113 @@ from .util.thread import deadlock_safe_shutdown MAX_EXECUTOR_WORKERS = 64 TASK_CANCELATION_TIMEOUT = 5 +# Lock file configuration +LOCK_FILE_NAME = ".ha_run.lock" +LOCK_FILE_VERSION = 1 # Increment if format changes + _LOGGER = logging.getLogger(__name__) +@dataclasses.dataclass +class SingleExecutionLock: + """Context object for single execution lock.""" + + exit_code: int | None = None + + +def _write_lock_info(lock_file: TextIOWrapper) -> None: + """Write current instance information to the lock file. + + Args: + lock_file: The open lock file handle. + """ + lock_file.seek(0) + lock_file.truncate() + + instance_info = { + "pid": os.getpid(), + "version": LOCK_FILE_VERSION, + "ha_version": __version__, + "start_ts": time.time(), + } + json.dump(instance_info, lock_file) + lock_file.flush() + + +def _report_existing_instance(lock_file_path: Path, config_dir: str) -> None: + """Report that another instance is already running. + + Attempts to read the lock file to provide details about the running instance. + """ + error_msg: list[str] = [] + error_msg.append("Error: Another Home Assistant instance is already running!") + + # Try to read information about the existing instance + try: + with open(lock_file_path, encoding="utf-8") as f: + if content := f.read().strip(): + existing_info = json.loads(content) + start_dt = datetime.fromtimestamp(existing_info["start_ts"]) + # Format with timezone abbreviation if available, otherwise add local time indicator + if tz_abbr := start_dt.strftime("%Z"): + start_time = start_dt.strftime(f"%Y-%m-%d %H:%M:%S {tz_abbr}") + else: + start_time = ( + start_dt.strftime("%Y-%m-%d %H:%M:%S") + " (local time)" + ) + + error_msg.append(f" PID: {existing_info['pid']}") + error_msg.append(f" Version: {existing_info['ha_version']}") + error_msg.append(f" Started: {start_time}") + else: + error_msg.append(" Unable to read lock file details.") + except (json.JSONDecodeError, OSError) as ex: + error_msg.append(f" Unable to read lock file details: {ex}") + + error_msg.append(f" Config directory: {config_dir}") + error_msg.append("") + error_msg.append("Please stop the existing instance before starting a new one.") + + for line in error_msg: + print(line, file=sys.stderr) # noqa: T201 + + +@contextmanager +def ensure_single_execution(config_dir: str) -> Generator[SingleExecutionLock]: + """Ensure only one Home Assistant instance runs per config directory. + + Uses file locking to prevent multiple instances from running with the + same configuration directory, which can cause data corruption. + + Returns a context object with exit_code attribute that will be set + if another instance is already running. + """ + lock_file_path = Path(config_dir) / LOCK_FILE_NAME + lock_context = SingleExecutionLock() + + # Open with 'a+' mode to avoid truncating existing content + # This allows us to read existing content if lock fails + with open(lock_file_path, "a+", encoding="utf-8") as lock_file: + # Try to acquire an exclusive, non-blocking lock + # This will raise BlockingIOError if lock is already held + try: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError: + # Another instance is already running + _report_existing_instance(lock_file_path, config_dir) + lock_context.exit_code = 1 + yield lock_context + return # Exit early since we couldn't get the lock + + # If we got the lock (no exception), write our instance info + _write_lock_info(lock_file) + + # Yield the context - lock will be released when the with statement closes the file + # IMPORTANT: We don't unlink the file to avoid races where multiple processes + # could create different lock files + yield lock_context + + @dataclasses.dataclass(slots=True) class RuntimeConfig: """Class to hold the information for running Home Assistant.""" diff --git a/tests/test_runner.py b/tests/test_runner.py index c61b8ed5628..6da9839f6fb 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -2,15 +2,21 @@ import asyncio from collections.abc import Iterator +import fcntl +import json +import os +from pathlib import Path import subprocess import threading -from unittest.mock import patch +import time +from unittest.mock import MagicMock, patch import packaging.tags import py import pytest from homeassistant import core, runner +from homeassistant.const import __version__ from homeassistant.core import HomeAssistant from homeassistant.util import executor, thread @@ -187,3 +193,244 @@ def test_enable_posix_spawn() -> None: ): runner._enable_posix_spawn() assert subprocess._USE_POSIX_SPAWN is False + + +def test_ensure_single_execution_success(tmp_path: Path) -> None: + """Test successful single instance execution.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code is None + assert lock_file_path.exists() + + with open(lock_file_path, encoding="utf-8") as f: + data = json.load(f) + assert data["pid"] == os.getpid() + assert data["version"] == runner.LOCK_FILE_VERSION + assert data["ha_version"] == __version__ + assert "start_ts" in data + assert isinstance(data["start_ts"], float) + + # Lock file should still exist after context exit (we don't unlink to avoid races) + assert lock_file_path.exists() + + +def test_ensure_single_execution_blocked( + tmp_path: Path, capfd: pytest.CaptureFixture[str] +) -> None: + """Test that second instance is blocked when lock exists.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + # Create and lock the file to simulate another instance + with open(lock_file_path, "w+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + + instance_info = { + "pid": 12345, + "version": 1, + "ha_version": "2025.1.0", + "start_ts": time.time() - 3600, # Started 1 hour ago + } + json.dump(instance_info, lock_file) + lock_file.flush() + + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code == 1 + + captured = capfd.readouterr() + assert "Another Home Assistant instance is already running!" in captured.err + assert "PID: 12345" in captured.err + assert "Version: 2025.1.0" in captured.err + assert "Started: " in captured.err + # Should show local time since naive datetime + assert "(local time)" in captured.err + assert f"Config directory: {config_dir}" in captured.err + + +def test_ensure_single_execution_corrupt_lock_file( + tmp_path: Path, capfd: pytest.CaptureFixture[str] +) -> None: + """Test handling of corrupted lock file.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + with open(lock_file_path, "w+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + lock_file.write("not valid json{]") + lock_file.flush() + + # Try to acquire lock (should set exit_code but handle corrupt file gracefully) + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code == 1 + + # Check error output + captured = capfd.readouterr() + assert "Another Home Assistant instance is already running!" in captured.err + assert "Unable to read lock file details:" in captured.err + assert f"Config directory: {config_dir}" in captured.err + + +def test_ensure_single_execution_empty_lock_file( + tmp_path: Path, capfd: pytest.CaptureFixture[str] +) -> None: + """Test handling of empty lock file.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + with open(lock_file_path, "w+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + # Don't write anything - leave it empty + lock_file.flush() + + # Try to acquire lock (should set exit_code but handle empty file gracefully) + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code == 1 + + # Check error output + captured = capfd.readouterr() + assert "Another Home Assistant instance is already running!" in captured.err + assert "Unable to read lock file details." in captured.err + + +def test_ensure_single_execution_with_timezone( + tmp_path: Path, capfd: pytest.CaptureFixture[str] +) -> None: + """Test handling of lock file with timezone info (edge case).""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + # Note: This tests an edge case - our code doesn't create timezone-aware timestamps, + # but we handle them if they exist + with open(lock_file_path, "w+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + + # Started 2 hours ago + instance_info = { + "pid": 54321, + "version": 1, + "ha_version": "2025.2.0", + "start_ts": time.time() - 7200, + } + json.dump(instance_info, lock_file) + lock_file.flush() + + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code == 1 + + captured = capfd.readouterr() + assert "Another Home Assistant instance is already running!" in captured.err + assert "PID: 54321" in captured.err + assert "Version: 2025.2.0" in captured.err + assert "Started: " in captured.err + # Should show local time indicator since fromtimestamp creates naive datetime + assert "(local time)" in captured.err + + +def test_ensure_single_execution_with_tz_abbreviation( + tmp_path: Path, capfd: pytest.CaptureFixture[str] +) -> None: + """Test handling of lock file when timezone abbreviation is available.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + with open(lock_file_path, "w+", encoding="utf-8") as lock_file: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + + instance_info = { + "pid": 98765, + "version": 1, + "ha_version": "2025.3.0", + "start_ts": time.time() - 1800, # Started 30 minutes ago + } + json.dump(instance_info, lock_file) + lock_file.flush() + + # Mock datetime to return a timezone abbreviation + # We use mocking because strftime("%Z") behavior is OS-specific: + # On some systems it returns empty string for naive datetimes + mock_dt = MagicMock() + + def _mock_strftime(fmt: str) -> str: + if fmt == "%Z": + return "PST" + if fmt == "%Y-%m-%d %H:%M:%S": + return "2025-09-03 10:30:45" + return "2025-09-03 10:30:45 PST" + + mock_dt.strftime.side_effect = _mock_strftime + + with patch("homeassistant.runner.datetime") as mock_datetime: + mock_datetime.fromtimestamp.return_value = mock_dt + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code == 1 + + captured = capfd.readouterr() + assert "Another Home Assistant instance is already running!" in captured.err + assert "PID: 98765" in captured.err + assert "Version: 2025.3.0" in captured.err + assert "Started: 2025-09-03 10:30:45 PST" in captured.err + # Should NOT have "(local time)" when timezone abbreviation is present + assert "(local time)" not in captured.err + + +def test_ensure_single_execution_file_not_unlinked(tmp_path: Path) -> None: + """Test that lock file is never unlinked to avoid race conditions.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + # First run creates the lock file + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code is None + assert lock_file_path.exists() + # Get inode to verify it's the same file + stat1 = lock_file_path.stat() + + # After context exit, file should still exist + assert lock_file_path.exists() + stat2 = lock_file_path.stat() + # Verify it's the exact same file (same inode) + assert stat1.st_ino == stat2.st_ino + + # Second run should reuse the same file + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code is None + assert lock_file_path.exists() + stat3 = lock_file_path.stat() + # Still the same file (not recreated) + assert stat1.st_ino == stat3.st_ino + + # After second run, still the same file + assert lock_file_path.exists() + stat4 = lock_file_path.stat() + assert stat1.st_ino == stat4.st_ino + + +def test_ensure_single_execution_sequential_runs(tmp_path: Path) -> None: + """Test that sequential runs work correctly after lock is released.""" + config_dir = str(tmp_path) + lock_file_path = tmp_path / runner.LOCK_FILE_NAME + + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code is None + assert lock_file_path.exists() + with open(lock_file_path, encoding="utf-8") as f: + first_data = json.load(f) + + # Lock file should still exist after first run (not unlinked) + assert lock_file_path.exists() + + # Small delay to ensure different timestamp + time.sleep(0.00001) + + with runner.ensure_single_execution(config_dir) as lock: + assert lock.exit_code is None + assert lock_file_path.exists() + with open(lock_file_path, encoding="utf-8") as f: + second_data = json.load(f) + assert second_data["pid"] == os.getpid() + assert second_data["start_ts"] > first_data["start_ts"] + + # Lock file should still exist after second run (not unlinked) + assert lock_file_path.exists()