# Source code for constellation_utils.recording_hash
"""Content-addressed `recording_hash` computation for the Constellation catalog.
See ENG-1070 and the Constellation Research Stack architecture doc §3 for the
contract this module implements: a recording's catalog identity is the SHA-256
hex of a canonical manifest representation combined with the recording's
`start_time`. The canonical form is producer-agnostic — data-engine computes
it at finalize time, and backfill tools compute the same hex from the same
manifest on disk.
"""
from __future__ import annotations
import hashlib
import json
from typing import Any, Mapping
__all__ = ["compute_recording_hash"]
# Top-level manifest keys excluded from the canonical hashed view, mirroring
# the exclusion list in `compute_recording_hash`'s canonicalization rules.
# NOTE(review): nothing in this module reads this set — the reduced view is
# built constructively from "workers" + start_time, so the exclusions hold by
# construction. Presumably kept as machine-readable documentation of the
# contract (or for external reimplementations); confirm before removing.
_EXCLUDED_TOP_LEVEL = frozenset(
    {
        "recording_id",
        "participant",
        "testing_mode",
        "started_at",
        "ended_at",
        "recording_hash",
    }
)
def _canonical_file(entry: Mapping[str, Any]) -> dict[str, str]:
return {"path": str(entry["path"]), "sha256": str(entry["sha256"])}
def _canonical_workers(workers: list[Any]) -> list[dict[str, Any]]:
    """Reduce worker entries to canonical form with deterministic ordering.

    Each worker becomes ``{"worker_id": <str>, "files": [...]}`` with its
    files reduced via ``_canonical_file`` and sorted by ``path``; the workers
    themselves are returned sorted by ``worker_id``. A worker with no
    ``files`` key yields an empty file list.
    """
    canonical = [
        {
            "worker_id": str(entry["worker_id"]),
            "files": sorted(
                [_canonical_file(item) for item in entry.get("files", [])],
                key=lambda rec: rec["path"],
            ),
        }
        for entry in workers
    ]
    return sorted(canonical, key=lambda rec: rec["worker_id"])
def compute_recording_hash(manifest: Mapping[str, Any], *, start_time: str) -> str:
    """Return the 64-char SHA-256 hex of the canonical manifest + start_time.

    The manifest must already carry per-file sha256s for every raw segment
    (data-engine's `_finalize_session` is responsible for that). `start_time`
    is a non-None ISO-8601 string, passed explicitly so backfill can supply
    it from a source other than the manifest's ``started_at`` field if
    needed. Passing ``None`` raises ``TypeError`` — callers must guard
    upstream. The format of ``start_time`` is not validated; callers must
    use a consistent ISO-8601 representation (including timezone offset) or
    the hash will not be reproducible.

    Canonicalization rules (locked — cross-language reimplementations must
    match exactly):

    * Reduced view is ``{"start_time": <given>, "workers": [...]}``.
    * Each worker is ``{"worker_id": <str>, "files": [...]}`` and each file
      entry is exactly ``{"path": <str>, "sha256": <hex>}``. ``bytes`` is
      intentionally omitted (the sha256 already encodes content length).
    * ``path`` is consumed verbatim — the producer relativizes against its
      own session_dir before serializing.
    * Files are sorted by ``path`` within each worker; workers are sorted
      by ``worker_id``.
    * Top-level keys ``recording_id``, ``participant``, ``testing_mode``,
      ``started_at``, ``ended_at``, ``recording_hash``, and any key whose
      name starts with ``_`` are excluded from the canonical view. In
      particular ``started_at`` is excluded so it isn't double-encoded —
      the only source of start time in the hash is the ``start_time``
      argument.
    * Serialized with ``json.dumps(reduced, sort_keys=True,
      separators=(",", ":"), ensure_ascii=True)`` then UTF-8 encoded and
      SHA-256 hashed. ``ensure_ascii=True`` is pinned: non-ASCII characters
      in file paths become ``\\uXXXX`` escapes.
    """
    # Explicit None guard: the contract promises TypeError rather than a
    # silently-wrong hash built from the string "None".
    if start_time is None:
        raise TypeError("start_time must be a non-None str")
    # Build the reduced view constructively — only start_time and the
    # canonicalized workers ever enter the hash, which is how the top-level
    # exclusion rule above is enforced.
    canonical = {
        "start_time": start_time,
        "workers": _canonical_workers(list(manifest.get("workers", []))),
    }
    payload = json.dumps(
        canonical, sort_keys=True, separators=(",", ":"), ensure_ascii=True
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()