scripts: dry-run-first backfill script to hash-suffix pre-hash figs

Reads each legacy <stem>.metrics.json for its embed_args, computes the
same sha1-8 digest main.py uses, renames the .html and its sidecars in
place. Skips Reference figs (no embed_args) and any fig lacking a
metrics.json (can't recover the hash from a missing sidecar).
This commit is contained in:
Michael Pilosov 2026-04-22 15:57:09 -06:00
parent fe49565651
commit a1d242ae36

119
scripts/backfill_hashes.py Normal file
View File

@ -0,0 +1,119 @@
"""Rename pre-hash embedder figs to include the embed_args hash suffix.
Walks figs/ for `.html` files matching the old stem shape (no hash tail) that
represent an embedder run (not Reference), reads the sibling
`<stem>.metrics.json` to recover `meta.embed_args`, computes the hash, and
renames the .html + .metrics.json in place.
Default is a dry-run; pass `--apply` to actually rename. Reference files are
left alone (they have no embed_args). A missing metrics.json warns and skips.
Target-name collisions warn and skip.
Usage:
.venv/bin/python scripts/backfill_hashes.py [--apply] [--figs-dir PATH]
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# Reach up to the project root so we can reuse the canonical hash helper.
_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_ROOT))
from app.web.main import embed_args_hash # noqa: E402
# Anchored pattern for the pre-hash stem shape, e.g.
# "make_<algo>_<dataset>_N<points>_T<steps>_J<jitter>_s<seed>".
# Because it is anchored at both ends and ends at the seed field, a stem
# that already carries an extra hash tail after "_s<seed>" fails to match
# and is skipped — presumably the sha1-8 suffix; confirm against main.py.
_LEGACY_STEM = re.compile(
    r"^(?P<base>make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+)$"
)
def plan_renames(figs_dir: Path):
    """Yield one triple per legacy ``.html`` fig under *figs_dir*.

    Each yielded triple is ``(html_path, new_stem, reason)`` with exactly
    one of *new_stem* / *reason* populated: a planned rename carries the
    hash-suffixed stem and ``reason is None``; a skip carries ``new_stem
    is None`` and a human-readable reason string.
    """
    for html in sorted(figs_dir.glob("*.html")):
        stem = html.stem
        if not _LEGACY_STEM.match(stem):
            # Either already hashed or doesn't match our scheme at all.
            continue
        # Skip Reference runs — they have no embed_args.
        if "_Reference_" in stem:
            continue
        metrics = figs_dir / f"{stem}.metrics.json"
        if not metrics.is_file():
            yield (html, None, "missing metrics.json — can't compute hash")
            continue
        try:
            ea = json.loads(metrics.read_text(encoding="utf-8"))["meta"]["embed_args"]
        except (KeyError, TypeError, json.JSONDecodeError, OSError) as e:
            # KeyError: "meta"/"embed_args" absent; TypeError: top-level
            # JSON value isn't a dict; OSError: sidecar unreadable. All
            # are reported as skips rather than crashing the whole scan.
            yield (html, None, f"bad metrics.json: {e}")
            continue
        new_stem = f"{stem}_{embed_args_hash(ea)}"
        new_html = figs_dir / f"{new_stem}.html"
        if new_html.exists():
            yield (html, None, f"target exists: {new_html.name}")
            continue
        yield (html, new_stem, None)
def apply_rename(figs_dir: Path, old_stem: str, new_stem: str) -> list[str]:
    """Rename every sidecar sharing *old_stem* to *new_stem*.

    Returns human-readable ``"src -> dst"`` lines for each file actually
    renamed. Unlike a bare ``Path.rename`` (which silently overwrites an
    existing destination on POSIX), a sidecar whose target name already
    exists is left in place and noted with a ``SKIP`` line — the planner
    only collision-checks the ``.html`` target, not the sidecars.
    """
    renamed: list[str] = []
    for suffix in (".html", ".metrics.json", ".frames.json"):
        src = figs_dir / f"{old_stem}{suffix}"
        if not src.exists():
            continue
        dst = figs_dir / f"{new_stem}{suffix}"
        if dst.exists():
            # Refuse to clobber; surface the collision to the caller.
            renamed.append(f"SKIP {src.name} (target exists: {dst.name})")
            continue
        src.rename(dst)
        renamed.append(f"{src.name} -> {dst.name}")
    return renamed
def main() -> int:
    """CLI entry point: scan figs/, print the rename plan, optionally apply.

    Exit codes: 0 on success (including a no-op or dry run), 2 when the
    figs directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--apply", action="store_true", help="actually rename (default: dry-run)")
    parser.add_argument("--figs-dir", default=str(_ROOT / "figs"), help="path to figs/ directory")
    opts = parser.parse_args()

    figs_dir = Path(opts.figs_dir).resolve()
    if not figs_dir.is_dir():
        print(f"no such directory: {figs_dir}", file=sys.stderr)
        return 2

    # Split the plan into actionable renames and skips-with-reasons.
    planned: list[tuple[str, str]] = []
    skipped: list[tuple[str, str]] = []
    for html, new_stem, reason in plan_renames(figs_dir):
        if new_stem is not None:
            planned.append((html.stem, new_stem))
        else:
            skipped.append((html.name, reason))

    print(f"scanning {figs_dir}")
    print(f" {len(planned)} to rename, {len(skipped)} skipped\n")
    for old, new in planned:
        print(f" rename {old} -> {new}")
    if skipped:
        print("\n skipped:")
        for name, reason in skipped:
            print(f" {name} ({reason})")

    if not planned:
        return 0
    if not opts.apply:
        print("\n(dry run — pass --apply to rename)")
        return 0

    print("\napplying...")
    for old, new in planned:
        for line in apply_rename(figs_dir, old, new):
            print(f" {line}")
    print(f"done — renamed {len(planned)} run(s)")
    return 0
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    sys.exit(main())