scripts: dry-run-first backfill script to hash-suffix pre-hash figs

Reads each legacy <stem>.metrics.json for its embed_args, computes the
same sha1-8 digest main.py uses, renames the .html and its sidecars in
place. Skips Reference figs (no embed_args) and any fig lacking a
metrics.json (can't recover the hash from a missing sidecar).
This commit is contained in:
Michael Pilosov 2026-04-22 15:57:09 -06:00
parent fe49565651
commit a1d242ae36

119
scripts/backfill_hashes.py Normal file
View File

@ -0,0 +1,119 @@
"""Rename pre-hash embedder figs to include the embed_args hash suffix.
Walks figs/ for `.html` files matching the old stem shape (no hash tail) that
represent an embedder run (not Reference), reads the sibling
`<stem>.metrics.json` to recover `meta.embed_args`, computes the hash, and
renames the .html + .metrics.json in place.
Default is a dry-run; pass `--apply` to actually rename. Reference files are
left alone (they have no embed_args). A missing metrics.json warns and skips.
Target-name collisions warn and skip.
Usage:
.venv/bin/python scripts/backfill_hashes.py [--apply] [--figs-dir PATH]
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# Reach up to the project root so we can reuse the canonical hash helper.
_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_ROOT))
from app.web.main import embed_args_hash # noqa: E402
# Anchored pattern for the pre-hash stem shape, e.g.
# "make_<algo>_<dataset>_N<points>_T<steps>_J<jitter>_s<seed>".
# Because it is anchored at both ends and ends at the seed field, a stem
# that already carries an extra hash tail after "_s<seed>" fails to match
# and is skipped — presumably the sha1-8 suffix; confirm against main.py.
_LEGACY_STEM = re.compile(
    r"^(?P<base>make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+)$"
)
def plan_renames(figs_dir: Path):
    """Yield one triple per legacy ``.html`` fig under *figs_dir*.

    Each yielded triple is ``(html_path, new_stem, reason)`` with exactly
    one of *new_stem* / *reason* populated: a planned rename carries the
    hash-suffixed stem and ``reason is None``; a skip carries ``new_stem
    is None`` and a human-readable reason string.
    """
    for html in sorted(figs_dir.glob("*.html")):
        stem = html.stem
        if not _LEGACY_STEM.match(stem):
            # Either already hashed or doesn't match our scheme at all.
            continue
        # Skip Reference runs — they have no embed_args.
        if "_Reference_" in stem:
            continue
        metrics = figs_dir / f"{stem}.metrics.json"
        if not metrics.is_file():
            yield (html, None, "missing metrics.json — can't compute hash")
            continue
        try:
            ea = json.loads(metrics.read_text(encoding="utf-8"))["meta"]["embed_args"]
        except (KeyError, TypeError, json.JSONDecodeError, OSError) as e:
            # KeyError: "meta"/"embed_args" absent; TypeError: top-level
            # JSON value isn't a dict; OSError: sidecar unreadable. All
            # are reported as skips rather than crashing the whole scan.
            yield (html, None, f"bad metrics.json: {e}")
            continue
        new_stem = f"{stem}_{embed_args_hash(ea)}"
        new_html = figs_dir / f"{new_stem}.html"
        if new_html.exists():
            yield (html, None, f"target exists: {new_html.name}")
            continue
        yield (html, new_stem, None)
def apply_rename(figs_dir: Path, old_stem: str, new_stem: str) -> list[str]:
    """Rename every sidecar sharing *old_stem* to *new_stem*.

    Returns human-readable ``"src -> dst"`` lines for each file actually
    renamed. Unlike a bare ``Path.rename`` (which silently overwrites an
    existing destination on POSIX), a sidecar whose target name already
    exists is left in place and noted with a ``SKIP`` line — the planner
    only collision-checks the ``.html`` target, not the sidecars.
    """
    renamed: list[str] = []
    for suffix in (".html", ".metrics.json", ".frames.json"):
        src = figs_dir / f"{old_stem}{suffix}"
        if not src.exists():
            continue
        dst = figs_dir / f"{new_stem}{suffix}"
        if dst.exists():
            # Refuse to clobber; surface the collision to the caller.
            renamed.append(f"SKIP {src.name} (target exists: {dst.name})")
            continue
        src.rename(dst)
        renamed.append(f"{src.name} -> {dst.name}")
    return renamed
def main() -> int:
    """CLI entry point: scan figs/, print the rename plan, optionally apply.

    Exit codes: 0 on success (including a no-op or dry run), 2 when the
    figs directory does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--apply", action="store_true", help="actually rename (default: dry-run)")
    parser.add_argument("--figs-dir", default=str(_ROOT / "figs"), help="path to figs/ directory")
    opts = parser.parse_args()

    figs_dir = Path(opts.figs_dir).resolve()
    if not figs_dir.is_dir():
        print(f"no such directory: {figs_dir}", file=sys.stderr)
        return 2

    # Split the plan into actionable renames and skips-with-reasons.
    planned: list[tuple[str, str]] = []
    skipped: list[tuple[str, str]] = []
    for html, new_stem, reason in plan_renames(figs_dir):
        if new_stem is not None:
            planned.append((html.stem, new_stem))
        else:
            skipped.append((html.name, reason))

    print(f"scanning {figs_dir}")
    print(f" {len(planned)} to rename, {len(skipped)} skipped\n")
    for old, new in planned:
        print(f" rename {old} -> {new}")
    if skipped:
        print("\n skipped:")
        for name, reason in skipped:
            print(f" {name} ({reason})")

    if not planned:
        return 0
    if not opts.apply:
        print("\n(dry run — pass --apply to rename)")
        return 0

    print("\napplying...")
    for old, new in planned:
        for line in apply_rename(figs_dir, old, new):
            print(f" {line}")
    print(f"done — renamed {len(planned)} run(s)")
    return 0
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    sys.exit(main())