1
0
mirror of https://github.com/home-assistant/core.git synced 2026-06-02 21:54:27 +01:00
Files
core/script/check_requirements/pypi.py
T
Robert Resch 8c8620c511 Add check requirements yanked and CVE check (#171641)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Martin Hjelmare <marhje52@gmail.com>
2026-05-21 12:54:15 +02:00

266 lines
8.7 KiB
Python

"""PyPI metadata + PEP 740 provenance attestation lookups."""
from dataclasses import dataclass, field
import logging
import re
from typing import Any
from urllib.parse import urlparse
import requests
_LOGGER = logging.getLogger(__name__)
# Characters that could escape markdown / HTML in the rendered comment or the
# prompt fence used to ship the artifact to the agent. PyPI maintainers are
# upstream-untrusted, so we strip these from any value we lift from PyPI
# metadata before it enters the artifact.
_UNSAFE = re.compile(r"[`\n\r<>]")
def _safe(text: str | None) -> str | None:
"""Strip characters that could escape markdown / HTML / a prompt fence."""
if text is None:
return None
return _UNSAFE.sub("", text)
# Order matters — first hit wins.
_REPO_URL_KEYS = (
"source",
"source code",
"repository",
"code",
"github",
"homepage",
)
# Known CI publishers that appear in PEP 740 attestation bundles. Matched
# case-insensitively. Anything else is treated as inconclusive (NEEDS_AGENT).
_KNOWN_CI_PUBLISHERS = (
"github", # "GitHub" / "GitHub Actions"
"gitlab",
"google cloud",
"activestate",
)
# Repository host suffixes we accept as a valid `repo_url` answer for Step 3.
# Matched against the URL's netloc (not substring of the full URL) to avoid
# accepting `https://evil.com/?x=github.com` as a code-host URL.
_REPO_HOST_SUFFIXES = (
"github.com",
"gitlab.com",
)
def _is_code_host_url(url: str) -> bool:
"""True if `url`'s host is (or ends with) a known code-host suffix."""
host = urlparse(url).netloc.lower().removeprefix("www.")
if not host:
return False
return any(
host == suffix or host.endswith(f".{suffix}") for suffix in _REPO_HOST_SUFFIXES
)
_HEADERS = {
"User-Agent": "home-assistant-check-requirements/1.0",
"Accept": "application/json",
}
_TIMEOUT = 30.0
@dataclass(slots=True, frozen=True)
class Vulnerability:
"""One advisory entry for a specific package version (OSV / PyPA / GHSA)."""
id: str
aliases: tuple[str, ...]
summary: str
fixed_in: tuple[str, ...]
link: str
@dataclass(slots=True)
class PypiPackageInfo:
    """PyPI metadata retained for one specific package version."""

    # The `info.project_urls` mapping as returned by PyPI.
    project_urls: dict[str, str]
    # Source-repository URL chosen from `project_urls`, if any qualified.
    repo_url: str | None
    # Integrity/provenance URLs to probe for this release; may be empty.
    file_provenance_urls: list[str]
    # False if the version doesn't exist on PyPI.
    found: bool
    # Whether PyPI reports this version as yanked, and the stated reason.
    yanked: bool = False
    yanked_reason: str | None = None
    # Advisories PyPI reports for this version.
    vulnerabilities: list[Vulnerability] = field(default_factory=list)
@dataclass(slots=True)
class ProvenanceResult:
"""Parsed PEP 740 attestation status."""
has_attestation: bool
publisher_kind: str | None
recognized_publisher: bool
detail: str
def _get_json(url: str) -> dict[str, Any] | None:
    """Fetch `url` and return its decoded JSON object.

    Returns None when the request fails, the server answers 404 or any other
    non-success status, the body is not valid JSON, or the decoded JSON is
    not an object. Every case except a plain 404 is logged as a warning.
    """
    try:
        response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    except requests.RequestException as err:
        _LOGGER.warning("Failed to fetch %s: %s", url, err)
        return None

    # A 404 is an expected answer (unknown version / no attestation), so it
    # is not logged.
    if response.status_code == 404:
        return None
    if not response.ok:
        _LOGGER.warning("HTTP %s fetching %s", response.status_code, url)
        return None

    try:
        payload = response.json()
    except ValueError as err:
        _LOGGER.warning("Invalid JSON at %s: %s", url, err)
        return None

    # Callers use `.get(...)` on the result, so a JSON array or scalar must
    # not be passed through as if it were an object.
    if not isinstance(payload, dict):
        _LOGGER.warning(
            "Unexpected JSON payload (%s) at %s", type(payload).__name__, url
        )
        return None
    return payload
def _pick_repo_url(project_urls: dict[str, str]) -> str | None:
    """Choose the most plausible source-repository URL from `project_urls`.

    Labels listed in `_REPO_URL_KEYS` are tried first, in order and ignoring
    case. If none of them points at a known code host, the first value in
    `project_urls` that does is used. The result is sanitized, or None when
    nothing qualifies.
    """
    if not project_urls:
        return None

    by_label = {label.lower(): target for label, target in project_urls.items()}
    for label in _REPO_URL_KEYS:
        candidate = by_label.get(label)
        if not candidate or not _is_code_host_url(candidate):
            continue
        return _safe(candidate)

    # No preferred label matched: fall back to any code-host URL at all.
    fallback = next(
        (target for target in project_urls.values() if _is_code_host_url(target)),
        None,
    )
    return _safe(fallback)
def fetch_package_info(name: str, version: str) -> PypiPackageInfo:
    """Look up PyPI metadata for `name` at exactly `version`.

    When the versioned lookup yields nothing, the project-level JSON is
    consulted only to recover `project_urls`, and the result is marked
    `found=False` with no provenance URLs.
    """
    release = _get_json(f"https://pypi.org/pypi/{name}/{version}/json")

    if release is not None:
        metadata = release.get("info") or {}
        urls = metadata.get("project_urls") or {}

        # PyPI's `urls[].provenance` field is unreliable - it can be null even
        # when an attestation bundle exists at /integrity/.../provenance - so
        # the integrity URL is built from the filename and probed later by
        # check_provenance. All files in a release share a publisher, hence
        # only the first file that has a filename is used.
        first_filename = next(
            (
                fname
                for item in release.get("urls") or []
                if (fname := item.get("filename"))
            ),
            None,
        )
        integrity_urls: list[str] = []
        if first_filename:
            integrity_urls.append(
                f"https://pypi.org/integrity/{name}/{version}/{first_filename}/provenance"
            )

        return PypiPackageInfo(
            project_urls=urls,
            repo_url=_pick_repo_url(urls),
            file_provenance_urls=integrity_urls,
            found=True,
            yanked=bool(metadata.get("yanked")),
            yanked_reason=_safe(metadata.get("yanked_reason")),
            vulnerabilities=_parse_vulnerabilities(release.get("vulnerabilities")),
        )

    # Versioned lookup failed: salvage project URLs from the project-level JSON.
    project = _get_json(f"https://pypi.org/pypi/{name}/json") or {}
    urls = (project.get("info") or {}).get("project_urls") or {}
    return PypiPackageInfo(
        project_urls=urls,
        repo_url=_pick_repo_url(urls),
        file_provenance_urls=[],
        found=False,
    )
def _advisory_text(value: Any) -> str:
    """Return one untrusted advisory field as a sanitized string ("" if absent)."""
    if not value:
        return ""
    # The payload is untrusted JSON, so the value may not be a string.
    return _safe(str(value)) or ""


def _advisory_texts(raw: Any) -> tuple[str, ...]:
    """Sanitize a list-valued advisory field, dropping empty items."""
    if not isinstance(raw, list):
        return ()
    cleaned = (_advisory_text(item) for item in raw)
    return tuple(text for text in cleaned if text)


def _parse_vulnerabilities(raw: Any) -> list[Vulnerability]:
    """Extract non-withdrawn advisories from the PyPI `vulnerabilities` field.

    `raw` is whatever the JSON payload carried under `vulnerabilities`.
    Anything that is not a list yields an empty result. Entries that are not
    objects, are marked withdrawn, or have no usable `id` are skipped. Every
    field is coerced to `str` and sanitized, so unexpected value types cannot
    raise while parsing.
    """
    if not isinstance(raw, list):
        return []

    out: list[Vulnerability] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        if entry.get("withdrawn"):
            # Withdrawn means the advisory was removed by the maintainer
            # and should not be treated as valid.
            continue
        vid = _advisory_text(entry.get("id"))
        if not vid:
            continue
        out.append(
            Vulnerability(
                id=vid,
                aliases=_advisory_texts(entry.get("aliases")),
                summary=_advisory_text(entry.get("summary")),
                fixed_in=_advisory_texts(entry.get("fixed_in")),
                link=_advisory_text(entry.get("link")),
            )
        )
    return out
def check_provenance(pkg: PypiPackageInfo) -> ProvenanceResult:
    """Resolve the provenance attestation, if any, to a Step 2b verdict.

    Each URL in `pkg.file_provenance_urls` is fetched in turn. The first
    attestation entry that names a publisher kind decides the result: the kind
    is sanitized and compared, ignoring case, against `_KNOWN_CI_PUBLISHERS`.
    Malformed attestation JSON is reported as "could not be parsed" rather
    than raising, because the payload is untrusted.
    """
    if not pkg.found:
        return ProvenanceResult(
            has_attestation=False,
            publisher_kind=None,
            recognized_publisher=False,
            detail="Version not found on PyPI; cannot verify provenance.",
        )

    # Build the allowlist lookup once instead of once per attestation entry.
    known_kinds = {token.strip().lower() for token in _KNOWN_CI_PUBLISHERS}

    # Inspect any one file's attestation; all files of a release share a publisher.
    any_bundle_fetched = False
    for url in pkg.file_provenance_urls:
        bundle = _get_json(url)
        if not bundle:
            continue
        any_bundle_fetched = True
        if not isinstance(bundle, dict):
            continue
        entries = bundle.get("attestation_bundles")
        if not isinstance(entries, list):
            continue
        for entry in entries:
            publisher = entry.get("publisher") if isinstance(entry, dict) else None
            if not isinstance(publisher, dict):
                continue
            kind = publisher.get("kind")
            if not kind:
                continue
            # `kind` is untrusted and may not be a string.
            safe_kind = _safe(str(kind)) or ""
            recognized = safe_kind.strip().lower() in known_kinds
            if recognized:
                detail = f"Trusted Publisher attestation found ({safe_kind})."
            else:
                detail = (
                    f"Attestation present but publisher kind '{safe_kind}' is not in "
                    "the recognized-CI allowlist."
                )
            return ProvenanceResult(
                has_attestation=True,
                publisher_kind=safe_kind,
                recognized_publisher=recognized,
                detail=detail,
            )

    if any_bundle_fetched:
        return ProvenanceResult(
            has_attestation=False,
            publisher_kind=None,
            recognized_publisher=False,
            detail="Provenance URL was present but the attestation could not be parsed.",
        )
    return ProvenanceResult(
        has_attestation=False,
        publisher_kind=None,
        recognized_publisher=False,
        detail=(
            "No PEP 740 provenance attestation present on PyPI. Upload method "
            "cannot be verified from PyPI alone."
        ),
    )