diff --git a/app/web/main.py b/app/web/main.py
index 551fd34..77d38d5 100644
--- a/app/web/main.py
+++ b/app/web/main.py
@@ -10,6 +10,7 @@ framework; hand-written styles.
from __future__ import annotations
+import hashlib
import importlib.util
import json
import os
@@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
# ---------------------------------------------------------------------------
+def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
+ """First 8 hex chars of SHA-1 over embed_args as sorted-key JSON (None == {}).
+ Keep identical to _embed_args_hash in flows/embedding_flow.py, which builds the output names."""
+ s = json.dumps(embed_args or {}, sort_keys=True, default=str)  # default=str: non-JSON values go through str()
+ return hashlib.sha1(s.encode()).hexdigest()[:8]
+
+
def synthesize_output_paths(
generator_path: str,
embedder: str,
@@ -456,14 +464,34 @@ def synthesize_output_paths(
num_timesteps: int,
jitter_scale: float,
seed: int,
+ embed_args: Optional[Dict[str, Any]] = None,
) -> Tuple[str, str]:
gen = generator_path.split(".")[-1]
emb = embedder.split(".")[-1]
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
- embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
+ base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
+ if embed_args is None:
+ embf = f"{base}.html"
+ else:
+ embf = f"{base}_{embed_args_hash(embed_args)}.html"
return ref, embf
+def _resolve_emb_file(synthesized: str) -> str:
+    """Pick the embed figure name to show for a run: the synthesized (hashed)
+    name if it is on disk, else the pre-hash legacy name if that is on disk,
+    else the synthesized name unchanged (run is new or still in progress)."""
+    if (FIGS_DIR / synthesized).exists():
+        return synthesized
+    # Drop the trailing _<8hex> tag to recover the hash-less legacy name.
+    m = re.match(r"^(?P<base>.+)_[0-9a-f]{8}\.html$", synthesized)
+    if m:
+        legacy = m.group("base") + ".html"
+        if (FIGS_DIR / legacy).exists():
+            return legacy
+    return synthesized  # new / still-running run; let emb_exists resolve
+
+
# ---------------------------------------------------------------------------
# Prefect client
# ---------------------------------------------------------------------------
@@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]:
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
float(params.get("jitter_scale", 0.01)),
int(params.get("seed", 42)),
+ embed_args=params.get("embed_args") or {},
)
+ # Older runs may lack the hash suffix; fall back to the legacy name when only it is on disk.
+ emb_file = _resolve_emb_file(emb_file)
except Exception:
ref_file, emb_file = None, None
@@ -752,22 +783,25 @@ async def submit(request: Request) -> HTMLResponse:
embed_args = build_embed_args(reducer, data)
# Reject submissions whose output path would overwrite an existing fig.
- # The stem is fully determined by (generator, embedder, N, T, J, seed) —
- # embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and
- # UMAP(n_neighbors=15) with otherwise-matching params collide too. Force
- # the user to change a stem-shaping param (seed is usually cheapest) or
- # delete the existing fig first.
- _, emb_file = synthesize_output_paths(
- generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
+ # The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5)
+ # and UMAP(n_neighbors=15) produce distinct files. Check both the hashed
+ # path (new runs) and the legacy hashless path (pre-hash runs) so users
+ # can't accidentally duplicate against a pre-hash fig either.
+ _, hashed_emb = synthesize_output_paths(
+ generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
+ embed_args=embed_args,
)
- if (FIGS_DIR / emb_file).exists():
- return HTMLResponse(
- f"
a run with these params already exists "
- f"({emb_file}). change seed / "
- f"jitter / N / T, or delete "
- f"the fig first.
",
- status_code=409,
- )
+ _, legacy_emb = synthesize_output_paths(
+ generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
+ )
+ for candidate in (hashed_emb, legacy_emb):
+ if (FIGS_DIR / candidate).exists():
+ return HTMLResponse(
+ f"a run with matching params already "
+ f"exists ({candidate}). change a param or delete "
+ f"the fig first.
",
+ status_code=409,
+ )
parameters: Dict[str, Any] = {
"num_points": num_points,
@@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse:
)
ref_file, emb_file = synthesize_output_paths(
- generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
+ generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
+ embed_args=embed_args,
)
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
@@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse:
_STEM_RE = re.compile(
- r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+$"
+ r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
)
# Map short generator name ("make_blobs") to its DATASET_META entry.
@@ -885,9 +920,15 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]:
@lru_cache(maxsize=32)
def _cached_frames(stem: str) -> str:
- """Parse .html and return the frames dict as a JSON string."""
- html = FIGS_DIR / f"{stem}.html"
- d = parse_plotly_run(html)
+ """Return the frames dict as a JSON string. Prefers a .frames.json
+ sidecar (emitted by new flow runs); falls back to parsing .html
+ (for pre-sidecar runs). Either way, enriches with dataset labels."""
+ sidecar = FIGS_DIR / f"{stem}.frames.json"
+ if sidecar.is_file():
+ d = json.loads(sidecar.read_text(encoding="utf-8"))
+ else:
+ html = FIGS_DIR / f"{stem}.html"
+ d = parse_plotly_run(html)
d = _enrich_with_labels(d)
return json.dumps(d, separators=(",", ":"))
@@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str:
async def run_frames(stem: str) -> Response:
if not _STEM_RE.match(stem):
raise HTTPException(400, f"malformed stem: {stem!r}")
- html = FIGS_DIR / f"{stem}.html"
- if not html.is_file():
+ if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file():
raise HTTPException(404, f"no such run: {stem}")
try:
payload = _cached_frames(stem)
diff --git a/app/web/plotly_parse.py b/app/web/plotly_parse.py
index 84d4561..5e0006c 100644
--- a/app/web/plotly_parse.py
+++ b/app/web/plotly_parse.py
@@ -20,7 +20,7 @@ from pathlib import Path
_STEM_RE = re.compile(
r"^(?Pmake_.+?)_(?P[A-Za-z]+)_N(?P\d+)_T(?P\d+)"
- r"_J(?P[\d.]+)_s(?P\d+)$"
+ r"_J(?P[\d.]+)_s(?P\d+)(?:_(?P[0-9a-f]{8}))?$"
)
# plotly's typed-array dtype -> (struct format char, item size bytes)
diff --git a/flows/embedding_flow.py b/flows/embedding_flow.py
index 96cd995..dd71725 100644
--- a/flows/embedding_flow.py
+++ b/flows/embedding_flow.py
@@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
from datetime import timedelta
+import hashlib
import json
import math
from pathlib import Path
from typing import Any, Dict, List, Optional
+
+def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str:
+ """First 8 hex chars of SHA-1 over embed_args as sorted-key JSON (None == {}).
+ embedding_flow appends it to the embed output stem; embed_args_hash in app/web/main.py mirrors it."""
+ s = json.dumps(ea or {}, sort_keys=True, default=str)  # default=str: non-JSON values go through str()
+ return hashlib.sha1(s.encode()).hexdigest()[:8]
+
from prefect import flow, task
from prefect.artifacts import create_markdown_artifact, create_table_artifact
from prefect.cache_policies import INPUTS, NO_CACHE
@@ -279,10 +287,12 @@ def embedding_flow(
output_ref: str = (
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
)
+ _args_tag = _embed_args_hash(embed_args)
output_embed: str = (
- f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
+ f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
)
output_metrics: str = output_embed[:-5] + ".metrics.json"
+ output_frames: str = output_embed[:-5] + ".frames.json"
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
@@ -378,7 +388,28 @@ def embedding_flow(
k=10,
)
- return (ref_path.result(), emb_path.result(), metrics_path.result())
+ emb_path_result = emb_path.result()
+ metrics_path_result = metrics_path.result()
+
+ # Emit a frames.json sidecar so the compare page doesn't have to parse
+ # the 5 MB plotly HTML on every first request. Non-critical — the server
+ # falls back to HTML parsing when the sidecar is absent.
+ try:
+ import sys as _sys
+ _root = str(Path(__file__).resolve().parent.parent)
+ if _root not in _sys.path:
+ _sys.path.insert(0, _root)
+ from app.web.plotly_parse import parse_plotly_run
+ frames = parse_plotly_run(emb_path_result)
+ Path(output_frames).write_text(
+ json.dumps(frames, separators=(",", ":")), encoding="utf-8"
+ )
+ except Exception as _sidecar_err:
+ import traceback as _tb
+ print(f"[frames-sidecar] skipped: {_sidecar_err}")
+ _tb.print_exc()
+
+ return (ref_path.result(), emb_path_result, metrics_path_result)
if __name__ == "__main__":