stems: include embed_args hash in output filename + emit frames.json sidecar
The stem now includes an 8-hex SHA-1 digest of the (key-sorted) embed_args dict, so runs differing only in embed_args (e.g. UMAP n_neighbors=5 vs 15) now produce distinct figs. The stem regex and parser both accept an optional _<hash> tail so pre-hash figs still render in the runs list and compare page; when the hashed file is missing, the legacy hashless filename is resolved via an on-disk fallback. The duplicate-submission check now rejects against BOTH the hashed and the legacy hashless variant, so users can't accidentally duplicate an old run either. The flow additionally writes a <stem>.frames.json sidecar next to the plotly HTML (same shape as app/web/plotly_parse returns). The server prefers the sidecar when present and falls back to parsing the HTML for older runs. Sidecar emission is non-critical — any failure is just logged and the flow keeps going.
This commit is contained in:
parent
36e217f51e
commit
fe49565651
@ -10,6 +10,7 @@ framework; hand-written styles.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex-character fingerprint of an ``embed_args`` dict.

    Output stems incorporate this digest so that runs differing only in
    ``embed_args`` get distinct output files.

    Args:
        embed_args: Reducer keyword arguments. Any falsy value (``None``,
            ``{}``) is hashed as an empty dict.

    Returns:
        The first 8 hex characters of the SHA-1 of the JSON dump of
        ``embed_args`` with keys sorted.
    """
    # Sorted keys make the digest independent of insertion order;
    # ``default=str`` stringifies values that json cannot encode natively.
    canonical = json.dumps(embed_args or {}, sort_keys=True, default=str)
    # SHA-1 is only a filename fingerprint here, not a security primitive.
    # Declaring that (Python 3.9+) keeps the call usable on FIPS-restricted
    # builds and leaves the resulting digest unchanged.
    digest = hashlib.sha1(canonical.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:8]
|
||||
|
||||
|
||||
def synthesize_output_paths(
|
||||
generator_path: str,
|
||||
embedder: str,
|
||||
@ -456,14 +464,34 @@ def synthesize_output_paths(
|
||||
num_timesteps: int,
|
||||
jitter_scale: float,
|
||||
seed: int,
|
||||
embed_args: Optional[Dict[str, Any]] = None,
|
||||
) -> Tuple[str, str]:
|
||||
gen = generator_path.split(".")[-1]
|
||||
emb = embedder.split(".")[-1]
|
||||
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||
embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||
base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
|
||||
if embed_args is None:
|
||||
embf = f"{base}.html"
|
||||
else:
|
||||
embf = f"{base}_{embed_args_hash(embed_args)}.html"
|
||||
return ref, embf
|
||||
|
||||
|
||||
def _resolve_emb_file(synthesized: str) -> str:
    """Pick the embedding filename to use for a run, checking the disk.

    Resolution order:
      1. ``synthesized`` itself, when it exists under ``FIGS_DIR``;
      2. the same name without its trailing ``_<8hex>`` tag, when that
         hash-less file exists (keeps pre-hash runs rendering in the UI);
      3. ``synthesized`` unchanged otherwise.
    """
    if not (FIGS_DIR / synthesized).exists():
        # Drop a trailing _<8hex>.html to recover the legacy name.
        hit = re.match(r"^(?P<base>.+)_[0-9a-f]{8}\.html$", synthesized)
        if hit is not None:
            hashless = hit.group("base") + ".html"
            if (FIGS_DIR / hashless).exists():
                return hashless
    # Either the synthesized file exists, or this is a new / still-running
    # run; in the latter case emb_exists is left to resolve it.
    return synthesized
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prefect client
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]:
|
||||
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
|
||||
float(params.get("jitter_scale", 0.01)),
|
||||
int(params.get("seed", 42)),
|
||||
embed_args=params.get("embed_args") or {},
|
||||
)
|
||||
# Older runs may lack the hash suffix; fall back to the legacy name when only it exists on disk.
|
||||
emb_file = _resolve_emb_file(emb_file)
|
||||
except Exception:
|
||||
ref_file, emb_file = None, None
|
||||
|
||||
@ -752,19 +783,22 @@ async def submit(request: Request) -> HTMLResponse:
|
||||
embed_args = build_embed_args(reducer, data)
|
||||
|
||||
# Reject submissions whose output path would overwrite an existing fig.
|
||||
# The stem is fully determined by (generator, embedder, N, T, J, seed) —
|
||||
# embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and
|
||||
# UMAP(n_neighbors=15) with otherwise-matching params collide too. Force
|
||||
# the user to change a stem-shaping param (seed is usually cheapest) or
|
||||
# delete the existing fig first.
|
||||
_, emb_file = synthesize_output_paths(
|
||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
||||
# The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5)
|
||||
# and UMAP(n_neighbors=15) produce distinct files. Check both the hashed
|
||||
# path (new runs) and the legacy hashless path (pre-hash runs) so users
|
||||
# can't accidentally duplicate against a pre-hash fig either.
|
||||
_, hashed_emb = synthesize_output_paths(
|
||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||
embed_args=embed_args,
|
||||
)
|
||||
if (FIGS_DIR / emb_file).exists():
|
||||
_, legacy_emb = synthesize_output_paths(
|
||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||
)
|
||||
for candidate in (hashed_emb, legacy_emb):
|
||||
if (FIGS_DIR / candidate).exists():
|
||||
return HTMLResponse(
|
||||
f"<div class='flash err'>a run with these params already exists "
|
||||
f"(<code>{emb_file}</code>). change <code>seed</code> / "
|
||||
f"<code>jitter</code> / <code>N</code> / <code>T</code>, or delete "
|
||||
f"<div class='flash err'>a run with matching params already "
|
||||
f"exists (<code>{candidate}</code>). change a param or delete "
|
||||
f"the fig first.</div>",
|
||||
status_code=409,
|
||||
)
|
||||
@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse:
|
||||
)
|
||||
|
||||
ref_file, emb_file = synthesize_output_paths(
|
||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||
embed_args=embed_args,
|
||||
)
|
||||
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
|
||||
|
||||
@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse:
|
||||
|
||||
|
||||
_STEM_RE = re.compile(
|
||||
r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+$"
|
||||
r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
|
||||
)
|
||||
|
||||
# Map short generator name ("make_blobs") to its DATASET_META entry.
|
||||
@ -885,7 +920,13 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]:
|
||||
|
||||
@lru_cache(maxsize=32)
|
||||
def _cached_frames(stem: str) -> str:
|
||||
"""Parse <stem>.html and return the frames dict as a JSON string."""
|
||||
"""Return the frames dict as a JSON string. Prefers a <stem>.frames.json
|
||||
sidecar (emitted by new flow runs); falls back to parsing <stem>.html
|
||||
(for pre-sidecar runs). Either way, enriches with dataset labels."""
|
||||
sidecar = FIGS_DIR / f"{stem}.frames.json"
|
||||
if sidecar.is_file():
|
||||
d = json.loads(sidecar.read_text(encoding="utf-8"))
|
||||
else:
|
||||
html = FIGS_DIR / f"{stem}.html"
|
||||
d = parse_plotly_run(html)
|
||||
d = _enrich_with_labels(d)
|
||||
@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str:
|
||||
async def run_frames(stem: str) -> Response:
|
||||
if not _STEM_RE.match(stem):
|
||||
raise HTTPException(400, f"malformed stem: {stem!r}")
|
||||
html = FIGS_DIR / f"{stem}.html"
|
||||
if not html.is_file():
|
||||
if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file():
|
||||
raise HTTPException(404, f"no such run: {stem}")
|
||||
try:
|
||||
payload = _cached_frames(stem)
|
||||
|
||||
@ -20,7 +20,7 @@ from pathlib import Path
|
||||
|
||||
_STEM_RE = re.compile(
|
||||
r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
|
||||
r"_J(?P<j>[\d.]+)_s(?P<s>\d+)$"
|
||||
r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
|
||||
)
|
||||
|
||||
# plotly's typed-array dtype -> (struct format char, item size bytes)
|
||||
|
||||
@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
|
||||
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
|
||||
|
||||
from datetime import timedelta
|
||||
import hashlib
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex-character fingerprint of an ``embed_args`` dict.

    The output stem includes this digest so that runs differing only in
    ``embed_args`` get distinct files.

    Args:
        ea: Embedder keyword arguments. Any falsy value (``None``, ``{}``)
            is hashed as an empty dict.

    Returns:
        The first 8 hex characters of the SHA-1 of the JSON dump of ``ea``
        with keys sorted.
    """
    # Sorted keys make the digest independent of insertion order;
    # ``default=str`` stringifies values that json cannot encode natively.
    canonical = json.dumps(ea or {}, sort_keys=True, default=str)
    # SHA-1 is only a filename fingerprint here, not a security primitive.
    # Declaring that (Python 3.9+) keeps the call usable on FIPS-restricted
    # builds and leaves the resulting digest unchanged.
    digest = hashlib.sha1(canonical.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:8]
|
||||
|
||||
from prefect import flow, task
|
||||
from prefect.artifacts import create_markdown_artifact, create_table_artifact
|
||||
from prefect.cache_policies import INPUTS, NO_CACHE
|
||||
@ -279,10 +287,12 @@ def embedding_flow(
|
||||
output_ref: str = (
|
||||
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||
)
|
||||
_args_tag = _embed_args_hash(embed_args)
|
||||
output_embed: str = (
|
||||
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
|
||||
)
|
||||
output_metrics: str = output_embed[:-5] + ".metrics.json"
|
||||
output_frames: str = output_embed[:-5] + ".frames.json"
|
||||
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
|
||||
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
|
||||
|
||||
@ -378,7 +388,28 @@ def embedding_flow(
|
||||
k=10,
|
||||
)
|
||||
|
||||
return (ref_path.result(), emb_path.result(), metrics_path.result())
|
||||
emb_path_result = emb_path.result()
|
||||
metrics_path_result = metrics_path.result()
|
||||
|
||||
# Emit a frames.json sidecar so the compare page doesn't have to parse
|
||||
# the 5 MB plotly HTML on every first request. Non-critical — the server
|
||||
# falls back to HTML parsing when the sidecar is absent.
|
||||
try:
|
||||
import sys as _sys
|
||||
_root = str(Path(__file__).resolve().parent.parent)
|
||||
if _root not in _sys.path:
|
||||
_sys.path.insert(0, _root)
|
||||
from app.web.plotly_parse import parse_plotly_run
|
||||
frames = parse_plotly_run(emb_path_result)
|
||||
Path(output_frames).write_text(
|
||||
json.dumps(frames, separators=(",", ":")), encoding="utf-8"
|
||||
)
|
||||
except Exception as _sidecar_err:
|
||||
import traceback as _tb
|
||||
print(f"[frames-sidecar] skipped: {_sidecar_err}")
|
||||
_tb.print_exc()
|
||||
|
||||
return (ref_path.result(), emb_path_result, metrics_path_result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Loading…
Reference in New Issue
Block a user