stems: include embed_args hash in output filename + emit frames.json sidecar

Stem grows an 8-hex sha1 digest of the (keys-sorted) embed_args dict, so
runs differing only in embed_args (e.g. UMAP n_neighbors=5 vs 15) now
produce distinct figs. The stem regex and parser both accept an optional
_<hash> tail so pre-hash figs still render in the runs list and compare
page; legacy filename is resolved on disk fallback.

The duplicate-submission check now tests BOTH the hashed and the legacy
hashless filename, so users can't accidentally duplicate a pre-hash run
either.

Flow additionally writes a <stem>.frames.json sidecar next to the plotly
HTML (same shape as app/web/plotly_parse returns). Server prefers the
sidecar when present; falls back to parsing HTML for older runs. Sidecar
emission is non-critical — any failure just logs and keeps going.
This commit is contained in:
Michael Pilosov 2026-04-22 15:52:39 -06:00
parent 36e217f51e
commit fe49565651
3 changed files with 97 additions and 26 deletions

View File

@ -10,6 +10,7 @@ framework; hand-written styles.
from __future__ import annotations from __future__ import annotations
import hashlib
import importlib.util import importlib.util
import json import json
import os import os
@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex sha1 tag for *embed_args* (keys sorted).

    Output stems incorporate this tag so runs that differ only in their
    embed_args land in distinct files. ``None`` hashes identically to ``{}``.
    Non-JSON-serializable values are stringified (``default=str``), which is
    deliberate: the tag only needs to be stable, not reversible.
    """
    payload = embed_args or {}
    canonical = json.dumps(payload, sort_keys=True, default=str)
    return hashlib.sha1(canonical.encode()).hexdigest()[:8]
def synthesize_output_paths( def synthesize_output_paths(
generator_path: str, generator_path: str,
embedder: str, embedder: str,
@ -456,14 +464,34 @@ def synthesize_output_paths(
num_timesteps: int, num_timesteps: int,
jitter_scale: float, jitter_scale: float,
seed: int, seed: int,
embed_args: Optional[Dict[str, Any]] = None,
) -> Tuple[str, str]: ) -> Tuple[str, str]:
gen = generator_path.split(".")[-1] gen = generator_path.split(".")[-1]
emb = embedder.split(".")[-1] emb = embedder.split(".")[-1]
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
if embed_args is None:
embf = f"{base}.html"
else:
embf = f"{base}_{embed_args_hash(embed_args)}.html"
return ref, embf return ref, embf
def _resolve_emb_file(synthesized: str) -> str:
    """Disk-backed fallback for figure filenames.

    Prefer the synthesized (hashed) name when it exists on disk. Otherwise,
    if the name carries a trailing ``_<8hex>`` tag and the hash-less legacy
    variant exists, return that so pre-hash runs still render in the UI.
    New / still-running runs fall through unchanged and are resolved later
    by the emb_exists check.
    """
    if (FIGS_DIR / synthesized).exists():
        return synthesized
    suffix = ".html"
    if synthesized.endswith(suffix):
        stem = synthesized[: -len(suffix)]
        base, sep, tag = stem.rpartition("_")
        # A legacy candidate exists only when the tail is exactly 8 lowercase
        # hex digits (the embed_args tag) with a non-empty base before it.
        hex_digits = "0123456789abcdef"
        if base and sep and len(tag) == 8 and all(c in hex_digits for c in tag):
            legacy = base + suffix
            if (FIGS_DIR / legacy).exists():
                return legacy
    return synthesized
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Prefect client # Prefect client
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]:
int(params.get("num_timesteps", params.get("num_snapshots", 48))), int(params.get("num_timesteps", params.get("num_snapshots", 48))),
float(params.get("jitter_scale", 0.01)), float(params.get("jitter_scale", 0.01)),
int(params.get("seed", 42)), int(params.get("seed", 42)),
embed_args=params.get("embed_args") or {},
) )
# Older runs may lack the hash suffix; prefer legacy name on disk.
emb_file = _resolve_emb_file(emb_file)
except Exception: except Exception:
ref_file, emb_file = None, None ref_file, emb_file = None, None
@ -752,22 +783,25 @@ async def submit(request: Request) -> HTMLResponse:
embed_args = build_embed_args(reducer, data) embed_args = build_embed_args(reducer, data)
# Reject submissions whose output path would overwrite an existing fig. # Reject submissions whose output path would overwrite an existing fig.
# The stem is fully determined by (generator, embedder, N, T, J, seed) — # The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5)
# embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and # and UMAP(n_neighbors=15) produce distinct files. Check both the hashed
# UMAP(n_neighbors=15) with otherwise-matching params collide too. Force # path (new runs) and the legacy hashless path (pre-hash runs) so users
# the user to change a stem-shaping param (seed is usually cheapest) or # can't accidentally duplicate against a pre-hash fig either.
# delete the existing fig first. _, hashed_emb = synthesize_output_paths(
_, emb_file = synthesize_output_paths( generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed embed_args=embed_args,
) )
if (FIGS_DIR / emb_file).exists(): _, legacy_emb = synthesize_output_paths(
return HTMLResponse( generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
f"<div class='flash err'>a run with these params already exists " )
f"(<code>{emb_file}</code>). change <code>seed</code> / " for candidate in (hashed_emb, legacy_emb):
f"<code>jitter</code> / <code>N</code> / <code>T</code>, or delete " if (FIGS_DIR / candidate).exists():
f"the fig first.</div>", return HTMLResponse(
status_code=409, f"<div class='flash err'>a run with matching params already "
) f"exists (<code>{candidate}</code>). change a param or delete "
f"the fig first.</div>",
status_code=409,
)
parameters: Dict[str, Any] = { parameters: Dict[str, Any] = {
"num_points": num_points, "num_points": num_points,
@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse:
) )
ref_file, emb_file = synthesize_output_paths( ref_file, emb_file = synthesize_output_paths(
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
embed_args=embed_args,
) )
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file} RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse:
_STEM_RE = re.compile( _STEM_RE = re.compile(
r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+$" r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
) )
# Map short generator name ("make_blobs") to its DATASET_META entry. # Map short generator name ("make_blobs") to its DATASET_META entry.
@ -885,9 +920,15 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]:
@lru_cache(maxsize=32) @lru_cache(maxsize=32)
def _cached_frames(stem: str) -> str: def _cached_frames(stem: str) -> str:
"""Parse <stem>.html and return the frames dict as a JSON string.""" """Return the frames dict as a JSON string. Prefers a <stem>.frames.json
html = FIGS_DIR / f"{stem}.html" sidecar (emitted by new flow runs); falls back to parsing <stem>.html
d = parse_plotly_run(html) (for pre-sidecar runs). Either way, enriches with dataset labels."""
sidecar = FIGS_DIR / f"{stem}.frames.json"
if sidecar.is_file():
d = json.loads(sidecar.read_text(encoding="utf-8"))
else:
html = FIGS_DIR / f"{stem}.html"
d = parse_plotly_run(html)
d = _enrich_with_labels(d) d = _enrich_with_labels(d)
return json.dumps(d, separators=(",", ":")) return json.dumps(d, separators=(",", ":"))
@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str:
async def run_frames(stem: str) -> Response: async def run_frames(stem: str) -> Response:
if not _STEM_RE.match(stem): if not _STEM_RE.match(stem):
raise HTTPException(400, f"malformed stem: {stem!r}") raise HTTPException(400, f"malformed stem: {stem!r}")
html = FIGS_DIR / f"{stem}.html" if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file():
if not html.is_file():
raise HTTPException(404, f"no such run: {stem}") raise HTTPException(404, f"no such run: {stem}")
try: try:
payload = _cached_frames(stem) payload = _cached_frames(stem)

View File

@ -20,7 +20,7 @@ from pathlib import Path
_STEM_RE = re.compile( _STEM_RE = re.compile(
r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)" r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
r"_J(?P<j>[\d.]+)_s(?P<s>\d+)$" r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
) )
# plotly's typed-array dtype -> (struct format char, item size bytes) # plotly's typed-array dtype -> (struct format char, item size bytes)

View File

@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_NUM_THREADS", "1") os.environ.setdefault("NUMBA_NUM_THREADS", "1")
from datetime import timedelta from datetime import timedelta
import hashlib
import json import json
import math import math
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str:
"""8-hex digest of embed_args (keys sorted) — output stem includes this
so runs differing only in embed_args get distinct files."""
s = json.dumps(ea or {}, sort_keys=True, default=str)
return hashlib.sha1(s.encode()).hexdigest()[:8]
from prefect import flow, task from prefect import flow, task
from prefect.artifacts import create_markdown_artifact, create_table_artifact from prefect.artifacts import create_markdown_artifact, create_table_artifact
from prefect.cache_policies import INPUTS, NO_CACHE from prefect.cache_policies import INPUTS, NO_CACHE
@ -279,10 +287,12 @@ def embedding_flow(
output_ref: str = ( output_ref: str = (
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
) )
_args_tag = _embed_args_hash(embed_args)
output_embed: str = ( output_embed: str = (
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
) )
output_metrics: str = output_embed[:-5] + ".metrics.json" output_metrics: str = output_embed[:-5] + ".metrics.json"
output_frames: str = output_embed[:-5] + ".frames.json"
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise" title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise" title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
@ -378,7 +388,28 @@ def embedding_flow(
k=10, k=10,
) )
return (ref_path.result(), emb_path.result(), metrics_path.result()) emb_path_result = emb_path.result()
metrics_path_result = metrics_path.result()
# Emit a frames.json sidecar so the compare page doesn't have to parse
# the 5 MB plotly HTML on every first request. Non-critical — the server
# falls back to HTML parsing when the sidecar is absent.
try:
import sys as _sys
_root = str(Path(__file__).resolve().parent.parent)
if _root not in _sys.path:
_sys.path.insert(0, _root)
from app.web.plotly_parse import parse_plotly_run
frames = parse_plotly_run(emb_path_result)
Path(output_frames).write_text(
json.dumps(frames, separators=(",", ":")), encoding="utf-8"
)
except Exception as _sidecar_err:
import traceback as _tb
print(f"[frames-sidecar] skipped: {_sidecar_err}")
_tb.print_exc()
return (ref_path.result(), emb_path_result, metrics_path_result)
if __name__ == "__main__": if __name__ == "__main__":