stems: include embed_args hash in output filename + emit frames.json sidecar
The stem gains an 8-hex sha1 digest of the (key-sorted) embed_args dict, so runs differing only in embed_args (e.g. UMAP n_neighbors=5 vs 15) now produce distinct figs. The stem regex and parser both accept an optional _<hash> tail, so pre-hash figs still render in the runs list and on the compare page; the legacy filename is resolved via an on-disk fallback. The duplicate-submission check now rejects against BOTH the hashed and the legacy hashless variant, so users can't accidentally duplicate an old run either. The flow additionally writes a <stem>.frames.json sidecar next to the plotly HTML (same shape as app/web/plotly_parse returns). The server prefers the sidecar when present and falls back to parsing the HTML for older runs. Sidecar emission is non-critical — any failure just logs and keeps going.
This commit is contained in:
parent
36e217f51e
commit
fe49565651
@ -10,6 +10,7 @@ framework; hand-written styles.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex fingerprint of *embed_args*, treating None as {}.

    The dict is serialized with sorted keys, so logically-equal dicts
    always produce the same digest. Output stems incorporate this tag so
    runs that differ only in embed_args land in distinct files.
    """
    canonical = json.dumps(embed_args if embed_args else {}, sort_keys=True, default=str)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()[:8]
||||||
def synthesize_output_paths(
|
def synthesize_output_paths(
|
||||||
generator_path: str,
|
generator_path: str,
|
||||||
embedder: str,
|
embedder: str,
|
||||||
@ -456,14 +464,34 @@ def synthesize_output_paths(
|
|||||||
num_timesteps: int,
|
num_timesteps: int,
|
||||||
jitter_scale: float,
|
jitter_scale: float,
|
||||||
seed: int,
|
seed: int,
|
||||||
|
embed_args: Optional[Dict[str, Any]] = None,
|
||||||
) -> Tuple[str, str]:
|
) -> Tuple[str, str]:
|
||||||
gen = generator_path.split(".")[-1]
|
gen = generator_path.split(".")[-1]
|
||||||
emb = embedder.split(".")[-1]
|
emb = embedder.split(".")[-1]
|
||||||
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||||
embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
|
||||||
|
if embed_args is None:
|
||||||
|
embf = f"{base}.html"
|
||||||
|
else:
|
||||||
|
embf = f"{base}_{embed_args_hash(embed_args)}.html"
|
||||||
return ref, embf
|
return ref, embf
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_emb_file(synthesized: str) -> str:
    """Map a synthesized (possibly hashed) fig filename to what's on disk.

    Prefers the synthesized name when that file exists. Otherwise, if the
    name carries a hash tail and the older hash-less sibling exists,
    returns the sibling so pre-hash runs keep rendering in the UI. Falls
    through to the synthesized name when neither file is present.
    """
    if (FIGS_DIR / synthesized).exists():
        return synthesized

    # A hashed name ends in _<8 hex chars>.html; dropping that tail
    # yields the legacy filename used before embed_args hashing existed.
    match = re.match(r"^(?P<base>.+)_[0-9a-f]{8}\.html$", synthesized)
    if match is not None:
        legacy_name = match.group("base") + ".html"
        if (FIGS_DIR / legacy_name).exists():
            return legacy_name

    return synthesized  # new / still-running run; let emb_exists resolve
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Prefect client
|
# Prefect client
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
|
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
|
||||||
float(params.get("jitter_scale", 0.01)),
|
float(params.get("jitter_scale", 0.01)),
|
||||||
int(params.get("seed", 42)),
|
int(params.get("seed", 42)),
|
||||||
|
embed_args=params.get("embed_args") or {},
|
||||||
)
|
)
|
||||||
|
# Older runs may lack the hash suffix; prefer legacy name on disk.
|
||||||
|
emb_file = _resolve_emb_file(emb_file)
|
||||||
except Exception:
|
except Exception:
|
||||||
ref_file, emb_file = None, None
|
ref_file, emb_file = None, None
|
||||||
|
|
||||||
@ -752,19 +783,22 @@ async def submit(request: Request) -> HTMLResponse:
|
|||||||
embed_args = build_embed_args(reducer, data)
|
embed_args = build_embed_args(reducer, data)
|
||||||
|
|
||||||
# Reject submissions whose output path would overwrite an existing fig.
|
# Reject submissions whose output path would overwrite an existing fig.
|
||||||
# The stem is fully determined by (generator, embedder, N, T, J, seed) —
|
# The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5)
|
||||||
# embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and
|
# and UMAP(n_neighbors=15) produce distinct files. Check both the hashed
|
||||||
# UMAP(n_neighbors=15) with otherwise-matching params collide too. Force
|
# path (new runs) and the legacy hashless path (pre-hash runs) so users
|
||||||
# the user to change a stem-shaping param (seed is usually cheapest) or
|
# can't accidentally duplicate against a pre-hash fig either.
|
||||||
# delete the existing fig first.
|
_, hashed_emb = synthesize_output_paths(
|
||||||
_, emb_file = synthesize_output_paths(
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
embed_args=embed_args,
|
||||||
)
|
)
|
||||||
if (FIGS_DIR / emb_file).exists():
|
_, legacy_emb = synthesize_output_paths(
|
||||||
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
|
)
|
||||||
|
for candidate in (hashed_emb, legacy_emb):
|
||||||
|
if (FIGS_DIR / candidate).exists():
|
||||||
return HTMLResponse(
|
return HTMLResponse(
|
||||||
f"<div class='flash err'>a run with these params already exists "
|
f"<div class='flash err'>a run with matching params already "
|
||||||
f"(<code>{emb_file}</code>). change <code>seed</code> / "
|
f"exists (<code>{candidate}</code>). change a param or delete "
|
||||||
f"<code>jitter</code> / <code>N</code> / <code>T</code>, or delete "
|
|
||||||
f"the fig first.</div>",
|
f"the fig first.</div>",
|
||||||
status_code=409,
|
status_code=409,
|
||||||
)
|
)
|
||||||
@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse:
|
|||||||
)
|
)
|
||||||
|
|
||||||
ref_file, emb_file = synthesize_output_paths(
|
ref_file, emb_file = synthesize_output_paths(
|
||||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
|
embed_args=embed_args,
|
||||||
)
|
)
|
||||||
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
|
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
|
||||||
|
|
||||||
@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse:
|
|||||||
|
|
||||||
|
|
||||||
# Validates run stems of the form make_<gen>_<Emb>_N<int>_T<int>_J<float>_s<int>,
# with an optional trailing _<8 hex> embed_args digest. The tail is optional so
# pre-hash (legacy) stems are still accepted.
_STEM_RE = re.compile(
    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
)
||||||
|
|
||||||
# Map short generator name ("make_blobs") to its DATASET_META entry.
|
# Map short generator name ("make_blobs") to its DATASET_META entry.
|
||||||
@ -885,7 +920,13 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
|
|
||||||
@lru_cache(maxsize=32)
|
@lru_cache(maxsize=32)
|
||||||
def _cached_frames(stem: str) -> str:
|
def _cached_frames(stem: str) -> str:
|
||||||
"""Parse <stem>.html and return the frames dict as a JSON string."""
|
"""Return the frames dict as a JSON string. Prefers a <stem>.frames.json
|
||||||
|
sidecar (emitted by new flow runs); falls back to parsing <stem>.html
|
||||||
|
(for pre-sidecar runs). Either way, enriches with dataset labels."""
|
||||||
|
sidecar = FIGS_DIR / f"{stem}.frames.json"
|
||||||
|
if sidecar.is_file():
|
||||||
|
d = json.loads(sidecar.read_text(encoding="utf-8"))
|
||||||
|
else:
|
||||||
html = FIGS_DIR / f"{stem}.html"
|
html = FIGS_DIR / f"{stem}.html"
|
||||||
d = parse_plotly_run(html)
|
d = parse_plotly_run(html)
|
||||||
d = _enrich_with_labels(d)
|
d = _enrich_with_labels(d)
|
||||||
@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str:
|
|||||||
async def run_frames(stem: str) -> Response:
|
async def run_frames(stem: str) -> Response:
|
||||||
if not _STEM_RE.match(stem):
|
if not _STEM_RE.match(stem):
|
||||||
raise HTTPException(400, f"malformed stem: {stem!r}")
|
raise HTTPException(400, f"malformed stem: {stem!r}")
|
||||||
html = FIGS_DIR / f"{stem}.html"
|
if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file():
|
||||||
if not html.is_file():
|
|
||||||
raise HTTPException(404, f"no such run: {stem}")
|
raise HTTPException(404, f"no such run: {stem}")
|
||||||
try:
|
try:
|
||||||
payload = _cached_frames(stem)
|
payload = _cached_frames(stem)
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
# Parses a fig stem into named parameter groups. The optional <h> group is the
# 8-hex embed_args digest; it is absent on pre-hash (legacy) stems, which this
# pattern still matches.
_STEM_RE = re.compile(
    r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
    r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
)
||||||
|
|
||||||
# plotly's typed-array dtype -> (struct format char, item size bytes)
|
# plotly's typed-array dtype -> (struct format char, item size bytes)
|
||||||
|
|||||||
@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
|
|||||||
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
|
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
|
||||||
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex fingerprint of *ea*, with None treated as {}.

    Keys are sorted before hashing so equal dicts always map to the same
    tag; the output stem carries this tag, giving runs that differ only
    in embed_args distinct output files. NOTE(review): this must stay
    digest-compatible with the server-side hash of the same dict.
    """
    payload = json.dumps(ea if ea else {}, sort_keys=True, default=str)
    return hashlib.sha1(payload.encode()).hexdigest()[:8]
||||||
from prefect import flow, task
|
from prefect import flow, task
|
||||||
from prefect.artifacts import create_markdown_artifact, create_table_artifact
|
from prefect.artifacts import create_markdown_artifact, create_table_artifact
|
||||||
from prefect.cache_policies import INPUTS, NO_CACHE
|
from prefect.cache_policies import INPUTS, NO_CACHE
|
||||||
@ -279,10 +287,12 @@ def embedding_flow(
|
|||||||
output_ref: str = (
|
output_ref: str = (
|
||||||
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||||
)
|
)
|
||||||
|
_args_tag = _embed_args_hash(embed_args)
|
||||||
output_embed: str = (
|
output_embed: str = (
|
||||||
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
|
||||||
)
|
)
|
||||||
output_metrics: str = output_embed[:-5] + ".metrics.json"
|
output_metrics: str = output_embed[:-5] + ".metrics.json"
|
||||||
|
output_frames: str = output_embed[:-5] + ".frames.json"
|
||||||
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
|
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
|
||||||
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
|
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
|
||||||
|
|
||||||
@ -378,7 +388,28 @@ def embedding_flow(
|
|||||||
k=10,
|
k=10,
|
||||||
)
|
)
|
||||||
|
|
||||||
return (ref_path.result(), emb_path.result(), metrics_path.result())
|
emb_path_result = emb_path.result()
|
||||||
|
metrics_path_result = metrics_path.result()
|
||||||
|
|
||||||
|
# Emit a frames.json sidecar so the compare page doesn't have to parse
|
||||||
|
# the 5 MB plotly HTML on every first request. Non-critical — the server
|
||||||
|
# falls back to HTML parsing when the sidecar is absent.
|
||||||
|
try:
|
||||||
|
import sys as _sys
|
||||||
|
_root = str(Path(__file__).resolve().parent.parent)
|
||||||
|
if _root not in _sys.path:
|
||||||
|
_sys.path.insert(0, _root)
|
||||||
|
from app.web.plotly_parse import parse_plotly_run
|
||||||
|
frames = parse_plotly_run(emb_path_result)
|
||||||
|
Path(output_frames).write_text(
|
||||||
|
json.dumps(frames, separators=(",", ":")), encoding="utf-8"
|
||||||
|
)
|
||||||
|
except Exception as _sidecar_err:
|
||||||
|
import traceback as _tb
|
||||||
|
print(f"[frames-sidecar] skipped: {_sidecar_err}")
|
||||||
|
_tb.print_exc()
|
||||||
|
|
||||||
|
return (ref_path.result(), emb_path_result, metrics_path_result)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user