stems: include embed_args hash in output filename + emit frames.json sidecar
The stem gains an 8-hex sha1 digest of the (key-sorted) embed_args dict, so runs differing only in embed_args (e.g. UMAP n_neighbors=5 vs 15) now produce distinct figs. The stem regex and parser both accept an optional _<hash> tail, so pre-hash figs still render in the runs list and on the compare page; the legacy filename is resolved via an on-disk fallback. The duplicate-submission check now rejects against BOTH the hashed and the legacy hashless variant, so users can't accidentally duplicate an old run either. The flow additionally writes a <stem>.frames.json sidecar next to the plotly HTML (same shape as app/web/plotly_parse returns). The server prefers the sidecar when present and falls back to parsing the HTML for older runs. Sidecar emission is non-critical — any failure just logs and keeps going.
This commit is contained in:
parent
36e217f51e
commit
fe49565651
@ -10,6 +10,7 @@ framework; hand-written styles.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@ -449,6 +450,13 @@ def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex fingerprint of *embed_args*, treating None as {}.

    The dict is serialized with sorted keys, so logically-equal dicts
    always produce the same digest. Output stems incorporate this tag so
    runs that differ only in embed_args land in distinct files.
    """
    canonical = json.dumps(embed_args if embed_args else {}, sort_keys=True, default=str)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()[:8]
||||||
def synthesize_output_paths(
|
def synthesize_output_paths(
|
||||||
generator_path: str,
|
generator_path: str,
|
||||||
embedder: str,
|
embedder: str,
|
||||||
@ -456,14 +464,34 @@ def synthesize_output_paths(
|
|||||||
num_timesteps: int,
|
num_timesteps: int,
|
||||||
jitter_scale: float,
|
jitter_scale: float,
|
||||||
seed: int,
|
seed: int,
|
||||||
|
embed_args: Optional[Dict[str, Any]] = None,
|
||||||
) -> Tuple[str, str]:
|
) -> Tuple[str, str]:
|
||||||
gen = generator_path.split(".")[-1]
|
gen = generator_path.split(".")[-1]
|
||||||
emb = embedder.split(".")[-1]
|
emb = embedder.split(".")[-1]
|
||||||
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||||
embf = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
|
||||||
|
if embed_args is None:
|
||||||
|
embf = f"{base}.html"
|
||||||
|
else:
|
||||||
|
embf = f"{base}_{embed_args_hash(embed_args)}.html"
|
||||||
return ref, embf
|
return ref, embf
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_emb_file(synthesized: str) -> str:
    """Map a synthesized (possibly hashed) fig filename to what's on disk.

    Prefers the synthesized name when that file exists. Otherwise, if the
    name carries a hash tail and the older hash-less sibling exists,
    returns the sibling so pre-hash runs keep rendering in the UI. Falls
    through to the synthesized name when neither file is present.
    """
    if (FIGS_DIR / synthesized).exists():
        return synthesized

    # A hashed name ends in _<8 hex chars>.html; dropping that tail
    # yields the legacy filename used before embed_args hashing existed.
    match = re.match(r"^(?P<base>.+)_[0-9a-f]{8}\.html$", synthesized)
    if match is not None:
        legacy_name = match.group("base") + ".html"
        if (FIGS_DIR / legacy_name).exists():
            return legacy_name

    return synthesized  # new / still-running run; let emb_exists resolve
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Prefect client
|
# Prefect client
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@ -586,7 +614,10 @@ def _run_view(run: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
|
int(params.get("num_timesteps", params.get("num_snapshots", 48))),
|
||||||
float(params.get("jitter_scale", 0.01)),
|
float(params.get("jitter_scale", 0.01)),
|
||||||
int(params.get("seed", 42)),
|
int(params.get("seed", 42)),
|
||||||
|
embed_args=params.get("embed_args") or {},
|
||||||
)
|
)
|
||||||
|
# Older runs may lack the hash suffix; prefer legacy name on disk.
|
||||||
|
emb_file = _resolve_emb_file(emb_file)
|
||||||
except Exception:
|
except Exception:
|
||||||
ref_file, emb_file = None, None
|
ref_file, emb_file = None, None
|
||||||
|
|
||||||
@ -752,19 +783,22 @@ async def submit(request: Request) -> HTMLResponse:
|
|||||||
embed_args = build_embed_args(reducer, data)
|
embed_args = build_embed_args(reducer, data)
|
||||||
|
|
||||||
# Reject submissions whose output path would overwrite an existing fig.
|
# Reject submissions whose output path would overwrite an existing fig.
|
||||||
# The stem is fully determined by (generator, embedder, N, T, J, seed) —
|
# The stem now includes an 8-hex hash of embed_args, so UMAP(n_neighbors=5)
|
||||||
# embed_args don't affect the filename, so e.g. UMAP(n_neighbors=5) and
|
# and UMAP(n_neighbors=15) produce distinct files. Check both the hashed
|
||||||
# UMAP(n_neighbors=15) with otherwise-matching params collide too. Force
|
# path (new runs) and the legacy hashless path (pre-hash runs) so users
|
||||||
# the user to change a stem-shaping param (seed is usually cheapest) or
|
# can't accidentally duplicate against a pre-hash fig either.
|
||||||
# delete the existing fig first.
|
_, hashed_emb = synthesize_output_paths(
|
||||||
_, emb_file = synthesize_output_paths(
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
embed_args=embed_args,
|
||||||
)
|
)
|
||||||
if (FIGS_DIR / emb_file).exists():
|
_, legacy_emb = synthesize_output_paths(
|
||||||
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
|
)
|
||||||
|
for candidate in (hashed_emb, legacy_emb):
|
||||||
|
if (FIGS_DIR / candidate).exists():
|
||||||
return HTMLResponse(
|
return HTMLResponse(
|
||||||
f"<div class='flash err'>a run with these params already exists "
|
f"<div class='flash err'>a run with matching params already "
|
||||||
f"(<code>{emb_file}</code>). change <code>seed</code> / "
|
f"exists (<code>{candidate}</code>). change a param or delete "
|
||||||
f"<code>jitter</code> / <code>N</code> / <code>T</code>, or delete "
|
|
||||||
f"the fig first.</div>",
|
f"the fig first.</div>",
|
||||||
status_code=409,
|
status_code=409,
|
||||||
)
|
)
|
||||||
@ -798,7 +832,8 @@ async def submit(request: Request) -> HTMLResponse:
|
|||||||
)
|
)
|
||||||
|
|
||||||
ref_file, emb_file = synthesize_output_paths(
|
ref_file, emb_file = synthesize_output_paths(
|
||||||
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed
|
generator_path, reducer, num_points, num_timesteps, jitter_scale, seed,
|
||||||
|
embed_args=embed_args,
|
||||||
)
|
)
|
||||||
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
|
RUN_OUTPUTS[run["id"]] = {"ref": ref_file, "embed": emb_file}
|
||||||
|
|
||||||
@ -845,7 +880,7 @@ async def metrics_json() -> JSONResponse:
|
|||||||
|
|
||||||
|
|
||||||
# Validates run stems of the form make_<gen>_<Emb>_N<int>_T<int>_J<float>_s<int>,
# with an optional trailing _<8 hex> embed_args digest. The tail is optional so
# pre-hash (legacy) stems are still accepted.
_STEM_RE = re.compile(
    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
)
||||||
|
|
||||||
# Map short generator name ("make_blobs") to its DATASET_META entry.
|
# Map short generator name ("make_blobs") to its DATASET_META entry.
|
||||||
@ -885,7 +920,13 @@ def _enrich_with_labels(d: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
|
|
||||||
@lru_cache(maxsize=32)
|
@lru_cache(maxsize=32)
|
||||||
def _cached_frames(stem: str) -> str:
|
def _cached_frames(stem: str) -> str:
|
||||||
"""Parse <stem>.html and return the frames dict as a JSON string."""
|
"""Return the frames dict as a JSON string. Prefers a <stem>.frames.json
|
||||||
|
sidecar (emitted by new flow runs); falls back to parsing <stem>.html
|
||||||
|
(for pre-sidecar runs). Either way, enriches with dataset labels."""
|
||||||
|
sidecar = FIGS_DIR / f"{stem}.frames.json"
|
||||||
|
if sidecar.is_file():
|
||||||
|
d = json.loads(sidecar.read_text(encoding="utf-8"))
|
||||||
|
else:
|
||||||
html = FIGS_DIR / f"{stem}.html"
|
html = FIGS_DIR / f"{stem}.html"
|
||||||
d = parse_plotly_run(html)
|
d = parse_plotly_run(html)
|
||||||
d = _enrich_with_labels(d)
|
d = _enrich_with_labels(d)
|
||||||
@ -896,8 +937,7 @@ def _cached_frames(stem: str) -> str:
|
|||||||
async def run_frames(stem: str) -> Response:
|
async def run_frames(stem: str) -> Response:
|
||||||
if not _STEM_RE.match(stem):
|
if not _STEM_RE.match(stem):
|
||||||
raise HTTPException(400, f"malformed stem: {stem!r}")
|
raise HTTPException(400, f"malformed stem: {stem!r}")
|
||||||
html = FIGS_DIR / f"{stem}.html"
|
if not (FIGS_DIR / f"{stem}.frames.json").is_file() and not (FIGS_DIR / f"{stem}.html").is_file():
|
||||||
if not html.is_file():
|
|
||||||
raise HTTPException(404, f"no such run: {stem}")
|
raise HTTPException(404, f"no such run: {stem}")
|
||||||
try:
|
try:
|
||||||
payload = _cached_frames(stem)
|
payload = _cached_frames(stem)
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
# Parses a fig stem into named parameter groups. The optional <h> group is the
# 8-hex embed_args digest; it is absent on pre-hash (legacy) stems, which this
# pattern still matches.
_STEM_RE = re.compile(
    r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
    r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
)
||||||
|
|
||||||
# plotly's typed-array dtype -> (struct format char, item size bytes)
|
# plotly's typed-array dtype -> (struct format char, item size bytes)
|
||||||
|
|||||||
@ -20,11 +20,19 @@ os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
|
|||||||
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
|
os.environ.setdefault("NUMBA_NUM_THREADS", "1")
|
||||||
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def _embed_args_hash(ea: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex fingerprint of *ea*, with None treated as {}.

    Keys are sorted before hashing so equal dicts always map to the same
    tag; the output stem carries this tag, giving runs that differ only
    in embed_args distinct output files. NOTE(review): this must stay
    digest-compatible with the server-side hash of the same dict.
    """
    payload = json.dumps(ea if ea else {}, sort_keys=True, default=str)
    return hashlib.sha1(payload.encode()).hexdigest()[:8]
||||||
from prefect import flow, task
|
from prefect import flow, task
|
||||||
from prefect.artifacts import create_markdown_artifact, create_table_artifact
|
from prefect.artifacts import create_markdown_artifact, create_table_artifact
|
||||||
from prefect.cache_policies import INPUTS, NO_CACHE
|
from prefect.cache_policies import INPUTS, NO_CACHE
|
||||||
@ -279,10 +287,12 @@ def embedding_flow(
|
|||||||
output_ref: str = (
|
output_ref: str = (
|
||||||
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
||||||
)
|
)
|
||||||
|
_args_tag = _embed_args_hash(embed_args)
|
||||||
output_embed: str = (
|
output_embed: str = (
|
||||||
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
|
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
|
||||||
)
|
)
|
||||||
output_metrics: str = output_embed[:-5] + ".metrics.json"
|
output_metrics: str = output_embed[:-5] + ".metrics.json"
|
||||||
|
output_frames: str = output_embed[:-5] + ".frames.json"
|
||||||
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
|
title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
|
||||||
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
|
title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
|
||||||
|
|
||||||
@ -378,7 +388,28 @@ def embedding_flow(
|
|||||||
k=10,
|
k=10,
|
||||||
)
|
)
|
||||||
|
|
||||||
return (ref_path.result(), emb_path.result(), metrics_path.result())
|
emb_path_result = emb_path.result()
|
||||||
|
metrics_path_result = metrics_path.result()
|
||||||
|
|
||||||
|
# Emit a frames.json sidecar so the compare page doesn't have to parse
|
||||||
|
# the 5 MB plotly HTML on every first request. Non-critical — the server
|
||||||
|
# falls back to HTML parsing when the sidecar is absent.
|
||||||
|
try:
|
||||||
|
import sys as _sys
|
||||||
|
_root = str(Path(__file__).resolve().parent.parent)
|
||||||
|
if _root not in _sys.path:
|
||||||
|
_sys.path.insert(0, _root)
|
||||||
|
from app.web.plotly_parse import parse_plotly_run
|
||||||
|
frames = parse_plotly_run(emb_path_result)
|
||||||
|
Path(output_frames).write_text(
|
||||||
|
json.dumps(frames, separators=(",", ":")), encoding="utf-8"
|
||||||
|
)
|
||||||
|
except Exception as _sidecar_err:
|
||||||
|
import traceback as _tb
|
||||||
|
print(f"[frames-sidecar] skipped: {_sidecar_err}")
|
||||||
|
_tb.print_exc()
|
||||||
|
|
||||||
|
return (ref_path.result(), emb_path_result, metrics_path_result)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user