mirror of
https://github.com/home-assistant/core.git
synced 2026-06-02 21:54:27 +01:00
8c8620c511
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: Martin Hjelmare <marhje52@gmail.com>
266 lines
8.7 KiB
Python
266 lines
8.7 KiB
Python
"""PyPI metadata + PEP 740 provenance attestation lookups."""
|
|
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
|
|
# Module-level logger, named after this module via `__name__`.
_LOGGER = logging.getLogger(__name__)
|
|
|
|
# Characters that could escape markdown / HTML in the rendered comment or the
|
|
# prompt fence used to ship the artifact to the agent. PyPI maintainers are
|
|
# upstream-untrusted, so we strip these from any value we lift from PyPI
|
|
# metadata before it enters the artifact.
|
|
_UNSAFE = re.compile(r"[`\n\r<>]")
|
|
|
|
|
|
def _safe(text: str | None) -> str | None:
|
|
"""Strip characters that could escape markdown / HTML / a prompt fence."""
|
|
if text is None:
|
|
return None
|
|
return _UNSAFE.sub("", text)
|
|
|
|
|
|
# Order matters — first hit wins.
|
|
_REPO_URL_KEYS = (
|
|
"source",
|
|
"source code",
|
|
"repository",
|
|
"code",
|
|
"github",
|
|
"homepage",
|
|
)
|
|
|
|
# Known CI publishers that appear in PEP 740 attestation bundles. Matched
|
|
# case-insensitively. Anything else is treated as inconclusive (NEEDS_AGENT).
|
|
_KNOWN_CI_PUBLISHERS = (
|
|
"github", # "GitHub" / "GitHub Actions"
|
|
"gitlab",
|
|
"google cloud",
|
|
"activestate",
|
|
)
|
|
|
|
# Repository host suffixes we accept as a valid `repo_url` answer for Step 3.
|
|
# Matched against the URL's netloc (not substring of the full URL) to avoid
|
|
# accepting `https://evil.com/?x=github.com` as a code-host URL.
|
|
_REPO_HOST_SUFFIXES = (
|
|
"github.com",
|
|
"gitlab.com",
|
|
)
|
|
|
|
|
|
def _is_code_host_url(url: str) -> bool:
|
|
"""True if `url`'s host is (or ends with) a known code-host suffix."""
|
|
host = urlparse(url).netloc.lower().removeprefix("www.")
|
|
if not host:
|
|
return False
|
|
return any(
|
|
host == suffix or host.endswith(f".{suffix}") for suffix in _REPO_HOST_SUFFIXES
|
|
)
|
|
|
|
|
|
_HEADERS = {
|
|
"User-Agent": "home-assistant-check-requirements/1.0",
|
|
"Accept": "application/json",
|
|
}
|
|
_TIMEOUT = 30.0
|
|
|
|
|
|
@dataclass(slots=True, frozen=True)
|
|
class Vulnerability:
|
|
"""One advisory entry for a specific package version (OSV / PyPA / GHSA)."""
|
|
|
|
id: str
|
|
aliases: tuple[str, ...]
|
|
summary: str
|
|
fixed_in: tuple[str, ...]
|
|
link: str
|
|
|
|
|
|
@dataclass(slots=True)
class PypiPackageInfo:
    """PyPI metadata for one package version, reduced to the fields we use."""

    # `info.project_urls` mapping (label -> URL) as returned by PyPI.
    project_urls: dict[str, str]
    # Source-repository URL picked from `project_urls`, or None if none fits.
    repo_url: str | None
    # Provenance URLs to probe for an attestation; may be empty.
    file_provenance_urls: list[str]
    # False if the version doesn't exist on PyPI.
    found: bool
    # Yank status and the (sanitized) reason given for it, if any.
    yanked: bool = False
    yanked_reason: str | None = None
    # Advisories PyPI lists for this version.
    vulnerabilities: list[Vulnerability] = field(default_factory=list)
|
|
|
|
|
|
@dataclass(slots=True)
|
|
class ProvenanceResult:
|
|
"""Parsed PEP 740 attestation status."""
|
|
|
|
has_attestation: bool
|
|
publisher_kind: str | None
|
|
recognized_publisher: bool
|
|
detail: str
|
|
|
|
|
|
def _get_json(url: str) -> dict[str, Any] | None:
    """Fetch `url` and return its JSON object, or None if it is unavailable.

    None is returned when:

    * the request itself fails (logged),
    * the server answers 404 (not logged),
    * the server answers with any other non-OK status (logged),
    * the body is not valid JSON (logged),
    * the JSON document's root is not an object (logged) — callers rely on
      `.get(...)`, so a list or scalar payload must not be handed back.
    """
    try:
        response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    except requests.RequestException as err:
        _LOGGER.warning("Failed to fetch %s: %s", url, err)
        return None
    if response.status_code == 404:
        return None
    if not response.ok:
        _LOGGER.warning("HTTP %s fetching %s", response.status_code, url)
        return None
    try:
        payload = response.json()
    except ValueError as err:
        _LOGGER.warning("Invalid JSON at %s: %s", url, err)
        return None
    if not isinstance(payload, dict):
        _LOGGER.warning(
            "Unexpected JSON root type %s at %s", type(payload).__name__, url
        )
        return None
    return payload
|
|
|
|
|
|
def _pick_repo_url(project_urls: dict[str, str]) -> str | None:
    """Choose the source-repository URL from `info.project_urls`, if any.

    Labels listed in `_REPO_URL_KEYS` are tried first, in that order and
    case-insensitively; failing that, the first value of any label that points
    at a known code host is used. The chosen URL is sanitized before being
    returned. Returns None when nothing qualifies.
    """
    if not project_urls:
        return None

    by_label = {label.lower(): target for label, target in project_urls.items()}
    for candidate in (by_label.get(label) for label in _REPO_URL_KEYS):
        if candidate and _is_code_host_url(candidate):
            return _safe(candidate)

    # No preferred label matched: fall back to any code-host URL at all.
    fallback = next(
        (target for target in project_urls.values() if _is_code_host_url(target)),
        None,
    )
    return _safe(fallback)
|
|
|
|
|
|
def fetch_package_info(name: str, version: str) -> PypiPackageInfo:
    """Collect the PyPI metadata we need for `name` at exactly `version`.

    When the versioned lookup yields nothing, the unversioned project JSON is
    consulted only to recover `project_urls`, and the result is marked
    `found=False` with no provenance URLs.
    """
    release = _get_json(f"https://pypi.org/pypi/{name}/{version}/json")
    if release is None:
        project = _get_json(f"https://pypi.org/pypi/{name}/json") or {}
        fallback_urls = (project.get("info") or {}).get("project_urls") or {}
        return PypiPackageInfo(
            project_urls=fallback_urls,
            repo_url=_pick_repo_url(fallback_urls),
            file_provenance_urls=[],
            found=False,
        )

    metadata = release.get("info") or {}
    urls = metadata.get("project_urls") or {}

    # PyPI's `urls[].provenance` field is unreliable — it can be null even when
    # an attestation bundle exists at /integrity/.../provenance. So the
    # integrity URL is built here from the filename and probed later by
    # check_provenance. All files in a release share a publisher, which is why
    # only the first file that has a filename is used.
    filename = next(
        (
            candidate
            for candidate in (item.get("filename") for item in release.get("urls") or [])
            if candidate
        ),
        None,
    )
    provenance_urls: list[str] = (
        [f"https://pypi.org/integrity/{name}/{version}/{filename}/provenance"]
        if filename
        else []
    )

    return PypiPackageInfo(
        project_urls=urls,
        repo_url=_pick_repo_url(urls),
        file_provenance_urls=provenance_urls,
        found=True,
        yanked=bool(metadata.get("yanked")),
        yanked_reason=_safe(metadata.get("yanked_reason")),
        vulnerabilities=_parse_vulnerabilities(release.get("vulnerabilities")),
    )
|
|
|
|
|
|
def _clean_text(value: Any) -> str:
    """Return `value` as a sanitized string; "" for None and other falsy values."""
    if not value:
        return ""
    return _safe(str(value)) or ""


def _clean_tuple(values: Any) -> tuple[str, ...]:
    """Return the non-empty sanitized strings found in a list-like `values`."""
    if isinstance(values, str):
        # A bare string is one item, not a sequence of characters.
        values = [values]
    elif not isinstance(values, (list, tuple)):
        return ()
    return tuple(text for text in (_clean_text(item) for item in values) if text)


def _parse_vulnerabilities(raw: Any) -> list[Vulnerability]:
    """Extract non-withdrawn advisories from the PyPI `vulnerabilities` field.

    The payload is treated as untrusted: anything that is not a list yields an
    empty result, non-dict entries and entries without an id are skipped, and
    every retained value is coerced to `str` and sanitized, so a non-string
    field cannot raise.
    """
    if not isinstance(raw, list):
        return []
    out: list[Vulnerability] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        if entry.get("withdrawn"):
            # Withdrawn means the advisory was removed by the maintainer
            # and should not be treated as valid.
            continue
        vid = _clean_text(entry.get("id"))
        if not vid:
            continue
        out.append(
            Vulnerability(
                id=vid,
                aliases=_clean_tuple(entry.get("aliases")),
                summary=_clean_text(entry.get("summary")),
                fixed_in=_clean_tuple(entry.get("fixed_in")),
                link=_clean_text(entry.get("link")),
            )
        )
    return out
|
|
|
|
|
|
def check_provenance(pkg: PypiPackageInfo) -> ProvenanceResult:
    """Turn a package's provenance attestation, if any, into a Step 2b verdict.

    The first attestation bundle that names a publisher kind decides the
    result. Without one, the detail text distinguishes "a provenance document
    was fetched but held no usable publisher" from "nothing could be fetched".
    """

    def _absent(detail: str) -> ProvenanceResult:
        # Every negative verdict differs only in its explanation.
        return ProvenanceResult(
            has_attestation=False,
            publisher_kind=None,
            recognized_publisher=False,
            detail=detail,
        )

    if not pkg.found:
        return _absent("Version not found on PyPI; cannot verify provenance.")

    allowlist = {token.strip().lower() for token in _KNOWN_CI_PUBLISHERS}
    fetched_any = False
    # Inspect any one file's attestation; all files of a release share a publisher.
    for url in pkg.file_provenance_urls:
        bundle = _get_json(url)
        if not bundle:
            continue
        fetched_any = True
        for entry in bundle.get("attestation_bundles") or []:
            kind = (entry.get("publisher") or {}).get("kind")
            if not kind:
                continue
            safe_kind = _safe(kind) or ""
            # Case-insensitive but otherwise exact match against the allowlist.
            recognized = safe_kind.strip().lower() in allowlist
            if recognized:
                detail = f"Trusted Publisher attestation found ({safe_kind})."
            else:
                detail = (
                    f"Attestation present but publisher kind '{safe_kind}' is not in "
                    "the recognized-CI allowlist."
                )
            return ProvenanceResult(
                has_attestation=True,
                publisher_kind=safe_kind,
                recognized_publisher=recognized,
                detail=detail,
            )

    if fetched_any:
        return _absent(
            "Provenance URL was present but the attestation could not be parsed."
        )
    return _absent(
        "No PEP 740 provenance attestation present on PyPI. Upload method "
        "cannot be verified from PyPI alone."
    )
|