From e94d28b8fcde565b8adfb73ea2d10b6c2fbb5e5a Mon Sep 17 00:00:00 2001 From: Michael Pilosov Date: Wed, 22 Apr 2026 17:54:46 -0600 Subject: [PATCH] filenames + run names: J in sci notation (5E-3 not 0.005) Periods in filenames are avoidable and the Prefect UI dislikes them in run names. Uses a shared sci_notation helper in main.py mirrored in the flow. Stem regex (main + parser) now matches J followed by [\d.Ee+\-]+ (digits, '.', 'E'/'e', sign) to accept both old decimal-J and new sci-J filenames so the two transition together. J tag in Prefect tag list also uses the sci form, so chip filters stay consistent. Backfill script extended to find pre-transition (decimal-J) files on disk via a second base-stem variant, then rename them to the sci form. backfill_tags re-patches existing runs so their J tag matches the new canonical form. All 13 existing figs + runs renamed / retagged in-place. --- app/web/main.py | 21 +++++++++++++--- app/web/plotly_parse.py | 2 +- flows/embedding_flow.py | 20 ++++++++++++--- scripts/backfill_hashes.py | 51 ++++++++++++++++++++++++-------------- 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/app/web/main.py b/app/web/main.py index 88cbf83..bdb62c9 100644 --- a/app/web/main.py +++ b/app/web/main.py @@ -475,6 +475,18 @@ def run_args_hash( embed_args_hash = run_args_hash +def sci_notation(v: Any) -> str: + """Float → compact sci notation without a period (0.005 → '5E-3'). 
+ Used in stems and Prefect run names so filenames + UI avoid periods.""" + try: + f = float(v) + except (TypeError, ValueError): + return str(v) + m, e = f"{f:.3e}".split("e") + m = m.rstrip("0").rstrip(".") + return f"{m}E{int(e)}" + + def synthesize_output_paths( generator_path: str, embedder: str, @@ -487,8 +499,9 @@ def synthesize_output_paths( ) -> Tuple[str, str]: gen = generator_path.split(".")[-1] emb = embedder.split(".")[-1] - ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" - base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}" + j = sci_notation(jitter_scale) + ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{j}_s{seed}.html" + base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{j}_s{seed}" if embed_args is None: embf = f"{base}.html" else: @@ -977,7 +990,7 @@ async def metrics_json() -> JSONResponse: _STEM_RE = re.compile( - r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$" + r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.Ee+\-]+_s\d+(?:_[0-9a-f]{8})?$" ) # Map short generator name ("make_blobs") to its DATASET_META entry. 
@@ -1020,7 +1033,7 @@ def build_run_tags( f"algorithm:{(embedder or '').rsplit('.', 1)[-1]}", f"N:{int(num_points)}", f"T:{int(num_timesteps)}", - f"J:{jitter_scale}", + f"J:{sci_notation(jitter_scale)}", ] diff --git a/app/web/plotly_parse.py b/app/web/plotly_parse.py index 5e0006c..376eb2d 100644 --- a/app/web/plotly_parse.py +++ b/app/web/plotly_parse.py @@ -20,7 +20,7 @@ from pathlib import Path _STEM_RE = re.compile( r"^(?Pmake_.+?)_(?P[A-Za-z]+)_N(?P\d+)_T(?P\d+)" - r"_J(?P[\d.]+)_s(?P\d+)(?:_(?P[0-9a-f]{8}))?$" + r"_J(?P[\d.Ee+\-]+)_s(?P\d+)(?:_(?P[0-9a-f]{8}))?$" ) # plotly's typed-array dtype -> (struct format char, item size bytes) diff --git a/flows/embedding_flow.py b/flows/embedding_flow.py index 3d638ec..e480e6c 100644 --- a/flows/embedding_flow.py +++ b/flows/embedding_flow.py @@ -43,6 +43,19 @@ def _run_args_hash( return hashlib.sha1(s.encode()).hexdigest()[:8] +def _sci(v: Any) -> str: + """Float → compact sci notation without a period (e.g. 0.005 → 5E-3, + 0.01 → 1E-2). Keeps Prefect's UI happy — it doesn't like periods in + run names.""" + try: + f = float(v) + except (TypeError, ValueError): + return str(v) + m, e = f"{f:.3e}".split("e") + m = m.rstrip("0").rstrip(".") + return f"{m}E{int(e)}" + + def _flow_run_name() -> str: """Name each Prefect run after the stem of its output fig, so runs are searchable / hoverable instead of wearing Prefect's auto-generated @@ -52,7 +65,7 @@ def _flow_run_name() -> str: emb = (p.get("embedder") or "").rsplit(".", 1)[-1] or "?" 
N = p.get("num_points", "?") T = p.get("num_timesteps", "?") - J = p.get("jitter_scale", "?") + J = _sci(p.get("jitter_scale", "?")) s = p.get("seed", "?") tag = _run_args_hash(p.get("embed_args"), p.get("generator_kwargs")) return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}_{tag}" @@ -315,12 +328,13 @@ def embedding_flow( Path(output_dir).mkdir(parents=True, exist_ok=True) _generator = generator_path.split(".")[-1] + _j = _sci(jitter_scale) output_ref: str = ( - f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html" + f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{_j}_s{seed}.html" ) _args_tag = _run_args_hash(embed_args, user_generator_kwargs) output_embed: str = ( - f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html" + f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{_j}_s{seed}_{_args_tag}.html" ) output_metrics: str = output_embed[:-5] + ".metrics.json" output_frames: str = output_embed[:-5] + ".frames.json" diff --git a/scripts/backfill_hashes.py b/scripts/backfill_hashes.py index 0df2eeb..56e526d 100644 --- a/scripts/backfill_hashes.py +++ b/scripts/backfill_hashes.py @@ -30,7 +30,7 @@ from typing import Any, Dict, List, Optional _ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(_ROOT)) -from app.web.main import PREFECT, run_args_hash # noqa: E402 +from app.web.main import PREFECT, run_args_hash, sci_notation # noqa: E402 def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str: @@ -38,30 +38,45 @@ def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str: return hashlib.sha1(s.encode()).hexdigest()[:8] -def _base_stem(params: Dict[str, Any]) -> Optional[str]: +def _base_stems(params: Dict[str, Any]) -> List[str]: + """Return the stem prefix(es) for this run's params: both the current + sci-J form and the legacy decimal-J form, so we 
can find pre-transition + files on disk too.""" try: gen = (params.get("generator_path") or "").rsplit(".", 1)[-1] emb = (params.get("embedder") or "").rsplit(".", 1)[-1] N = int(params["num_points"]) T = int(params.get("num_timesteps", params.get("num_snapshots"))) - J = float(params["jitter_scale"]) + Jf = float(params["jitter_scale"]) s = int(params["seed"]) except (KeyError, TypeError, ValueError): - return None + return [] if not gen or not emb: - return None - return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}" + return [] + out = [f"{gen}_{emb}_N{N}_T{T}_J{sci_notation(Jf)}_s{s}"] + legacy = f"{gen}_{emb}_N{N}_T{T}_J{Jf}_s{s}" + if legacy not in out: + out.append(legacy) + return out -def _candidate_names(base: str, ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]: - target = f"{base}_{run_args_hash(ea, gk)}.html" - legacy = f"{base}_{_legacy_hash(ea)}.html" - no_hash = f"{base}.html" - # Preserve order: target first so we short-circuit on already-backfilled. +def _candidate_names(bases: List[str], ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]: + # Target = current sci-J base + new-scheme hash. + if not bases: + return [] + target_base = bases[0] + target = f"{target_base}_{run_args_hash(ea, gk)}.html" out = [target] - for x in (legacy, no_hash): - if x not in out: - out.append(x) + # Fall back to every (base, hash) combination we might find on disk. 
+ hashes = [run_args_hash(ea, gk), _legacy_hash(ea)] + for b in bases: + for h in hashes: + x = f"{b}_{h}.html" + if x not in out: + out.append(x) + no_hash = f"{b}.html" + if no_hash not in out: + out.append(no_hash) return out @@ -125,13 +140,13 @@ def main() -> int: params = r.get("parameters") or {} ea = params.get("embed_args") or {} gk = params.get("generator_kwargs") or {} - base = _base_stem(params) - if not base: + bases = _base_stems(params) + if not bases: continue - target = f"{base}_{run_args_hash(ea, gk)}.html" + target = f"{bases[0]}_{run_args_hash(ea, gk)}.html" if target in seen_targets: continue # later duplicate — the stale-marking logic will handle it - for candidate in _candidate_names(base, ea, gk): + for candidate in _candidate_names(bases, ea, gk): if (figs_dir / candidate).exists(): if candidate == target: # Already at target; just ensure metrics.json carries gk.