diff --git a/scripts/backfill_hashes.py b/scripts/backfill_hashes.py new file mode 100644 index 0000000..729fc9e --- /dev/null +++ b/scripts/backfill_hashes.py @@ -0,0 +1,119 @@ +"""Rename pre-hash embedder figs to include the embed_args hash suffix. + +Walks figs/ for `.html` files matching the old stem shape (no hash tail) that +represent an embedder run (not Reference), reads the sibling +`.metrics.json` to recover `meta.embed_args`, computes the hash, and +renames the .html + .metrics.json in place. + +Default is a dry-run — pass `--apply` to actually rename. Reference files are +left alone (they have no embed_args). Missing metrics.json → warn and skip. +Target-name collision → warn and skip. + +Usage: + .venv/bin/python scripts/backfill_hashes.py [--apply] [--figs-dir PATH] +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path + +# Reach up to the project root so we can reuse the canonical hash helper. +_ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(_ROOT)) +from app.web.main import embed_args_hash # noqa: E402 + +_LEGACY_STEM = re.compile( + r"^(?Pmake_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+)$" +) + + +def plan_renames(figs_dir: Path): + for html in sorted(figs_dir.glob("*.html")): + stem = html.stem + m = _LEGACY_STEM.match(stem) + if not m: + # Either already hashed or doesn't match our scheme at all. + continue + # Skip Reference runs — they have no embed_args. + if "_Reference_" in stem: + continue + metrics = figs_dir / f"{stem}.metrics.json" + if not metrics.is_file(): + yield (html, None, "missing metrics.json — can't compute hash") + continue + try: + ea = json.loads(metrics.read_text(encoding="utf-8"))["meta"]["embed_args"] + except (KeyError, json.JSONDecodeError) as e: + yield (html, None, f"bad metrics.json: {e}") + continue + new_stem = f"{stem}_{embed_args_hash(ea)}" + new_html = figs_dir / f"{new_stem}.html" + if new_html.exists(): + yield (html, None, f"target exists: {new_html.name}") + continue + yield (html, new_stem, None) + + +def apply_rename(figs_dir: Path, old_stem: str, new_stem: str) -> list[str]: + """Rename every sidecar sharing the old stem. Returns the renamed files.""" + renamed = [] + for suffix in (".html", ".metrics.json", ".frames.json"): + src = figs_dir / f"{old_stem}{suffix}" + if not src.exists(): + continue + dst = figs_dir / f"{new_stem}{suffix}" + src.rename(dst) + renamed.append(f"{src.name} -> {dst.name}") + return renamed + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--apply", action="store_true", help="actually rename (default: dry-run)") + ap.add_argument("--figs-dir", default=str(_ROOT / "figs"), help="path to figs/ directory") + args = ap.parse_args() + + figs_dir = Path(args.figs_dir).resolve() + if not figs_dir.is_dir(): + print(f"no such directory: {figs_dir}", file=sys.stderr) + return 2 + + planned, skipped = [], [] + for html, new_stem, reason in plan_renames(figs_dir): + if new_stem is None: + skipped.append((html.name, reason)) + else: + planned.append((html.stem, new_stem)) + + print(f"scanning {figs_dir}") + print(f" {len(planned)} to rename, {len(skipped)} skipped\n") + + for old, new in planned: + print(f" rename {old} -> {new}") + if skipped: + print("\n skipped:") + for name, reason in skipped: + print(f" {name} ({reason})") + + if not planned: + return 0 + + if not args.apply: + print("\n(dry run — pass --apply to rename)") + return 0 + + print("\napplying...") + for old, new in planned: + moved = apply_rename(figs_dir, old, new) + for line in moved: + print(f" {line}") + print(f"done — renamed {len(planned)} run(s)") + return 0 + + +if __name__ == "__main__": + sys.exit(main())