# Source code for constellation_utils.recording_hash

"""Content-addressed `recording_hash` computation for the Constellation catalog.

See ENG-1070 and the Constellation Research Stack architecture doc §3 for the
contract this module implements: a recording's catalog identity is the SHA-256
hex of a canonical manifest representation combined with the recording's
`start_time`. The canonical form is producer-agnostic — data-engine computes
it at finalize time, and backfill tools compute the same hex from the same
manifest on disk.
"""

from __future__ import annotations

import hashlib
import json
from typing import Any, Mapping

__all__ = ["compute_recording_hash"]


# Manifest top-level keys deliberately absent from the canonical hashed view.
# NOTE(review): this set is not referenced by any code visible in this module —
# exclusion happens implicitly because the reduced view is built additively
# from ``start_time`` + ``workers`` only, so the constant serves as executable
# documentation of the contract. Confirm no external consumer imports it
# before considering removal.
_EXCLUDED_TOP_LEVEL = frozenset(
    {
        "recording_id",
        "participant",
        "testing_mode",
        "started_at",
        "ended_at",
        "recording_hash",
    }
)


def _canonical_file(entry: Mapping[str, Any]) -> dict[str, str]:
    return {"path": str(entry["path"]), "sha256": str(entry["sha256"])}


def _canonical_workers(workers: list[Any]) -> list[dict[str, Any]]:
    reduced: list[dict[str, Any]] = []
    for w in workers:
        files = sorted(
            (_canonical_file(f) for f in w.get("files", [])),
            key=lambda f: f["path"],
        )
        reduced.append({"worker_id": str(w["worker_id"]), "files": files})
    reduced.sort(key=lambda w: w["worker_id"])
    return reduced


def compute_recording_hash(manifest: Mapping[str, Any], *, start_time: str) -> str:
    """Return the 64-char SHA-256 hex of the canonical manifest + start_time.

    The manifest must already contain per-file sha256s for every raw segment
    (data-engine's `_finalize_session` is responsible for this). `start_time`
    is a non-None ISO-8601 string, passed explicitly so backfill can supply it
    from a source other than the manifest's ``started_at`` field if needed.
    Passing ``None`` raises ``TypeError`` — callers must guard upstream.

    The format of ``start_time`` is not validated. Callers must ensure it is a
    consistent ISO-8601 representation (including timezone offset) for the
    hash to be reproducible.

    Canonicalization rules (locked — cross-language reimplementations must
    match exactly):

    * Reduced view is ``{"start_time": <given>, "workers": [...]}``.
    * Each worker is ``{"worker_id": <str>, "files": [...]}`` and each file
      entry is exactly ``{"path": <str>, "sha256": <hex>}``. ``bytes`` is
      intentionally omitted (the sha256 already encodes content length).
    * ``path`` is consumed verbatim — the producer relativizes against its
      own session_dir before serializing.
    * Files are sorted by ``path`` within each worker; workers are sorted by
      ``worker_id``.
    * Top-level keys ``recording_id``, ``participant``, ``testing_mode``,
      ``started_at``, ``ended_at``, ``recording_hash``, and any key whose
      name starts with ``_`` are excluded from the canonical view. In
      particular ``started_at`` is excluded so it isn't double-encoded — the
      only source of start time in the hash is the ``start_time`` argument.
    * Serialized with ``json.dumps(reduced, sort_keys=True,
      separators=(",", ":"), ensure_ascii=True)`` then UTF-8 encoded and
      SHA-256 hashed. ``ensure_ascii=True`` is pinned: non-ASCII characters
      in file paths become ``\\uXXXX`` escapes.
    """
    # Explicit guard: a silent None would serialize as JSON null and produce
    # a "valid" but meaningless hash.
    if start_time is None:
        raise TypeError("start_time must be a non-None str")
    workers = manifest.get("workers", [])
    reduced = {"start_time": start_time, "workers": _canonical_workers(list(workers))}
    # Compact separators + sorted keys + ASCII escapes pin one byte-exact
    # serialization per logical manifest, so the digest is reproducible.
    serialized = json.dumps(reduced, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()