diff --git a/app/web/main.py b/app/web/main.py index 551fd34..77d38d5 100644 --- a/app/web/main.py +++ b/app/web/main.py @@ -10,6 +10,7 @@ framework; hand-written styles. from __future__ import annotations +import hashlib import importlib.util import json import os @@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]: # --------------------------------------------------------------------------- +def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str: + """8-hex digest of an embed_args dict (keys sorted). Stems incorporate + this so runs that differ only in embed_args get distinct output files.""" + s = json.dumps(embed_args or {}, sort_keys=True, default=str) + return hashlib.sha1(s.encode()).hexdigest()[:8] + + def synthesize_output_paths( generator_path: str, embedder: str, @@ -456,14 +464,34 @@ def synthesize_output_paths( num_timesteps: int, jitter_scale: float, seed: int, + embed_args: Optional[Dict[str, Any]] = None, ) -> Tuple[str, str]: gen = generator_path.split(".")[-1] emb = embedder.split(".")[-1] ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" - embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" + base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}" + if embed_args is None: + embf = f"{base}.html" + else: + embf = f"{base}_{embed_args_hash(embed_args)}.html" return ref, embf +def _resolve_emb_file(synthesized: str) -> str: + """Disk-backed fallback: prefer the synthesized (hashed) name; if that + doesn't exist on disk but an older hash-less variant does, return that + so pre-hash runs still render in the UI.""" + if (FIGS_DIR / synthesized).exists(): + return synthesized + # Strip trailing _<8hex>.html to get the legacy name. 
+ m = re.match(r"^(?P<base>.+)_[0-9a-f]{8}\.html$", synthesized) + if m: + legacy = m.group("base") + ".html" + if (FIGS_DIR / legacy).exists(): + return legacy + return synthesized # new / still-running run; let emb_exists resolve + + # --------------------------------------------------------------------------- # Prefect client # --------------------------------------------------------------------------- @@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]: int(params.get("num_timesteps", params.get("num_snapshots", 48))), float(params.get("jitter_scale", 0.01)), int(params.get("seed", 42)), + embed_args=params.get("embed_args") or {}, ) + # Older runs may lack the hash suffix; prefer legacy name on disk. + emb_file = _resolve_emb_file(emb_file) except Exception: ref_file, emb_file = None, None @@ -752,22 +783,25 @@ async def submit(request: Request) -> HTMLResponse: embed_args = build_embed_args(reducer, data) # Reject submissions whose output path would overwrite an existing fig. - # The stem is fully determined by (generator, embedder, N, T, J, seed) — - # embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and - # UMAP(n_neighbors=15) with otherwise-matching params collide too. Force - # the user to change a stem-shaping param (seed is usually cheapest) or - # delete the existing fig first. - _, emb_file = synthesize_output_paths( - generator_path, reducer, num_points, num_timesteps, jitter_scale, seed + # The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5) + # and UMAP(n_neighbors=15) produce distinct files. Check both the hashed + # path (new runs) and the legacy hashless path (pre-hash runs) so users + # can't accidentally duplicate against a pre-hash fig either. + _, hashed_emb = synthesize_output_paths( + generator_path, reducer, num_points, num_timesteps, jitter_scale, seed, + embed_args=embed_args, ) - if (FIGS_DIR / emb_file).exists(): - return HTMLResponse( - f"
a run with these params already exists " - f"({emb_file}). change seed / " - f"jitter / N / T, or delete " - f"the fig first.
", - status_code=409, - ) + _, legacy_emb = synthesize_output_paths( + generator_path, reducer, num_points, num_timesteps, jitter_scale, seed, + ) + for candidate in (hashed_emb, legacy_emb): + if (FIGS_DIR / candidate).exists(): + return HTMLResponse( + f"
a run with matching params already " + f"exists ({candidate}). change a param or delete " + f"the fig first.
", + status_code=409, + ) parameters: Dict[str, Any] = { "num_points": num_points, @@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse: ) ref_file, emb_file = synthesize_output_paths( - generator_path, reducer, num_points, num_timesteps, jitter_scale, seed + generator_path, reducer, num_points, num_timesteps, jitter_scale, seed, + embed_args=embed_args, ) RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file} @@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse: _STEM_RE = re.compile( - r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+$" + r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$" ) # Map short generator name ("make_blobs") to its DATASET_META entry. @@ -885,9 +920,15 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]: @lru_cache(maxsize=32) def _cached_frames(stem: str) -> str: - """Parse .html and return the frames dict as a JSON string.""" - html = FIGS_DIR / f"{stem}.html" - d = parse_plotly_run(html) + """Return the frames dict as a JSON string. Prefers a .frames.json + sidecar (emitted by new flow runs); falls back to parsing .html + (for pre-sidecar runs). 
Either way, enriches with dataset labels.""" + sidecar = FIGS_DIR / f"{stem}.frames.json" + if sidecar.is_file(): + d = json.loads(sidecar.read_text(encoding="utf-8")) + else: + html = FIGS_DIR / f"{stem}.html" + d = parse_plotly_run(html) d = _enrich_with_labels(d) return json.dumps(d, separators=(",", ":")) @@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str: async def run_frames(stem: str) -> Response: if not _STEM_RE.match(stem): raise HTTPException(400, f"malformed stem: {stem!r}") - html = FIGS_DIR / f"{stem}.html" - if not html.is_file(): + if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file(): raise HTTPException(404, f"no such run: {stem}") try: payload = _cached_frames(stem) diff --git a/app/web/plotly_parse.py b/app/web/plotly_parse.py index 84d4561..5e0006c 100644 --- a/app/web/plotly_parse.py +++ b/app/web/plotly_parse.py @@ -20,7 +20,7 @@ from pathlib import Path _STEM_RE = re.compile( r"^(?Pmake_.+?)_(?P[A-Za-z]+)_N(?P\d+)_T(?P\d+)" - r"_J(?P[\d.]+)_s(?P\d+)$" + r"_J(?P[\d.]+)_s(?P\d+)(?:_(?P[0-9a-f]{8}))?$" ) # plotly's typed-array dtype -> (struct format char, item size bytes) diff --git a/flows/embedding_flow.py b/flows/embedding_flow.py index 96cd995..dd71725 100644 --- a/flows/embedding_flow.py +++ b/flows/embedding_flow.py @@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1") os.environ.setdefault("NUMBA_NUM_THREADS", "1") from datetime import timedelta +import hashlib import json import math from pathlib import Path from typing import Any, Dict, List, Optional + +def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str: + """8-hex digest of embed_args (keys sorted) — output stem includes this + so runs differing only in embed_args get distinct files.""" + s = json.dumps(ea or {}, sort_keys=True, default=str) + return hashlib.sha1(s.encode()).hexdigest()[:8] + from prefect import flow, task from prefect.artifacts import create_markdown_artifact, create_table_artifact from 
prefect.cache_policies import INPUTS, NO_CACHE @@ -279,10 +287,12 @@ def embedding_flow( output_ref: str = ( f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" ) + _args_tag = _embed_args_hash(embed_args) output_embed: str = ( - f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" + f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html" ) output_metrics: str = output_embed[:-5] + ".metrics.json" + output_frames: str = output_embed[:-5] + ".frames.json" title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise" title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise" @@ -378,7 +388,28 @@ def embedding_flow( k=10, ) - return (ref_path.result(), emb_path.result(), metrics_path.result()) + emb_path_result = emb_path.result() + metrics_path_result = metrics_path.result() + + # Emit a frames.json sidecar so the compare page doesn't have to parse + # the 5 MB plotly HTML on every first request. Non-critical — the server + # falls back to HTML parsing when the sidecar is absent. + try: + import sys as _sys + _root = str(Path(__file__).resolve().parent.parent) + if _root not in _sys.path: + _sys.path.insert(0, _root) + from app.web.plotly_parse import parse_plotly_run + frames = parse_plotly_run(emb_path_result) + Path(output_frames).write_text( + json.dumps(frames, separators=(",", ":")), encoding="utf-8" + ) + except Exception as _sidecar_err: + import traceback as _tb + print(f"[frames-sidecar] skipped: {_sidecar_err}") + _tb.print_exc() + + return (ref_path.result(), emb_path_result, metrics_path_result) if __name__ == "__main__":