filenames + run names: J in sci notation (5E-3 not 0.005)

Periods in filenames are avoidable, and the Prefect UI dislikes them in run names. J is now formatted by a shared sci_notation helper in main.py, mirrored as _sci in the flow. The stem regexes (main + parser) now match J<digits.Ee+-> so that they accept both the old decimal-J and the new sci-J filenames, letting the two forms coexist during the transition. The J tag in the Prefect tag list also uses the sci form, so chip filters stay consistent. The backfill script is extended to find pre-transition (decimal-J) files on disk via a second base-stem variant and then rename them to the sci form. backfill_tags re-patches existing runs so their J tag matches the new canonical form. All 13 existing figs + runs were renamed / retagged in place.
2026-04-22 17:54:46 -06:00 · 2026-04-22 17:54:46 -06:00 · e94d28b8fc
commit e94d28b8fc
parent 56279dbb1b
4 changed files with 68 additions and 26 deletions
--- a/app/web/main.py
+++ b/app/web/main.py
@ -475,6 +475,18 @@ def run_args_hash(
 embed_args_hash = run_args_hash


+def sci_notation(v: Any) -> str:
+    """Float → compact sci notation without a period (0.005 → '5E-3').
+    Used in stems and Prefect run names so filenames + UI avoid periods."""
+    try:
+        f = float(v)
+    except (TypeError, ValueError):
+        return str(v)
+    m, e = f"{f:.3e}".split("e")
+    m = m.rstrip("0").rstrip(".")
+    return f"{m}E{int(e)}"
+
+
 def synthesize_output_paths(
    generator_path: str,
    embedder: str,
@ -487,8 +499,9 @@ def synthesize_output_paths(
 ) -> Tuple[str, str]:
    gen = generator_path.split(".")[-1]
    emb = embedder.split(".")[-1]
-    ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
-    base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
+    j = sci_notation(jitter_scale)
+    ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{j}_s{seed}.html"
+    base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{j}_s{seed}"
    if embed_args is None:
        embf = f"{base}.html"
    else:
@ -977,7 +990,7 @@ async def metrics_json() -> JSONResponse:


 _STEM_RE = re.compile(
-    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
+    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.Ee+\-]+_s\d+(?:_[0-9a-f]{8})?$"
 )

 # Map short generator name ("make_blobs") to its DATASET_META entry.
@ -1020,7 +1033,7 @@ def build_run_tags(
        f"algorithm:{(embedder or '').rsplit('.', 1)[-1]}",
        f"N:{int(num_points)}",
        f"T:{int(num_timesteps)}",
-        f"J:{jitter_scale}",
+        f"J:{sci_notation(jitter_scale)}",
    ]


--- a/app/web/plotly_parse.py
+++ b/app/web/plotly_parse.py
@ -20,7 +20,7 @@ from pathlib import Path

 _STEM_RE = re.compile(
    r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
-    r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
+    r"_J(?P<j>[\d.Ee+\-]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
 )

 # plotly's typed-array dtype -> (struct format char, item size bytes)
--- a/flows/embedding_flow.py
+++ b/flows/embedding_flow.py
@ -43,6 +43,19 @@ def _run_args_hash(
    return hashlib.sha1(s.encode()).hexdigest()[:8]


+def _sci(v: Any) -> str:
+    """Float → compact sci notation without a period (e.g. 0.005 → 5E-3,
+    0.01 → 1E-2). Keeps Prefect's UI happy — it doesn't like periods in
+    run names."""
+    try:
+        f = float(v)
+    except (TypeError, ValueError):
+        return str(v)
+    m, e = f"{f:.3e}".split("e")
+    m = m.rstrip("0").rstrip(".")
+    return f"{m}E{int(e)}"
+
+
 def _flow_run_name() -> str:
    """Name each Prefect run after the stem of its output fig, so runs are
    searchable / hoverable instead of wearing Prefect's auto-generated
@ -52,7 +65,7 @@ def _flow_run_name() -> str:
    emb = (p.get("embedder") or "").rsplit(".", 1)[-1] or "?"
    N = p.get("num_points", "?")
    T = p.get("num_timesteps", "?")
-    J = p.get("jitter_scale", "?")
+    J = _sci(p.get("jitter_scale", "?"))
    s = p.get("seed", "?")
    tag = _run_args_hash(p.get("embed_args"), p.get("generator_kwargs"))
    return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}_{tag}"
@ -315,12 +328,13 @@ def embedding_flow(

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    _generator = generator_path.split(".")[-1]
+    _j = _sci(jitter_scale)
    output_ref: str = (
-        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
+        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{_j}_s{seed}.html"
    )
    _args_tag = _run_args_hash(embed_args, user_generator_kwargs)
    output_embed: str = (
-        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
+        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{_j}_s{seed}_{_args_tag}.html"
    )
    output_metrics: str = output_embed[:-5] + ".metrics.json"
    output_frames: str = output_embed[:-5] + ".frames.json"
--- a/scripts/backfill_hashes.py
+++ b/scripts/backfill_hashes.py
@ -30,7 +30,7 @@ from typing import Any, Dict, List, Optional

 _ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(_ROOT))
-from app.web.main import PREFECT, run_args_hash  # noqa: E402
+from app.web.main import PREFECT, run_args_hash, sci_notation  # noqa: E402


 def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
@ -38,30 +38,45 @@ def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
    return hashlib.sha1(s.encode()).hexdigest()[:8]


-def _base_stem(params: Dict[str, Any]) -> Optional[str]:
+def _base_stems(params: Dict[str, Any]) -> List[str]:
+    """Return the stem prefix(es) for this run's params: both the current
+    sci-J form and the legacy decimal-J form, so we can find pre-transition
+    files on disk too."""
    try:
        gen = (params.get("generator_path") or "").rsplit(".", 1)[-1]
        emb = (params.get("embedder") or "").rsplit(".", 1)[-1]
        N = int(params["num_points"])
        T = int(params.get("num_timesteps", params.get("num_snapshots")))
-        J = float(params["jitter_scale"])
+        Jf = float(params["jitter_scale"])
        s = int(params["seed"])
    except (KeyError, TypeError, ValueError):
-        return None
+        return []
    if not gen or not emb:
-        return None
-    return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}"
+        return []
+    out = [f"{gen}_{emb}_N{N}_T{T}_J{sci_notation(Jf)}_s{s}"]
+    legacy = f"{gen}_{emb}_N{N}_T{T}_J{Jf}_s{s}"
+    if legacy not in out:
+        out.append(legacy)
+    return out


-def _candidate_names(base: str, ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
-    target = f"{base}_{run_args_hash(ea, gk)}.html"
-    legacy = f"{base}_{_legacy_hash(ea)}.html"
-    no_hash = f"{base}.html"
-    # Preserve order: target first so we short-circuit on already-backfilled.
+def _candidate_names(bases: List[str], ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
+    # Target = current sci-J base + new-scheme hash.
+    if not bases:
+        return []
+    target_base = bases[0]
+    target = f"{target_base}_{run_args_hash(ea, gk)}.html"
    out = [target]
-    for x in (legacy, no_hash):
-        if x not in out:
-            out.append(x)
+    # Fall back to every (base, hash) combination we might find on disk.
+    hashes = [run_args_hash(ea, gk), _legacy_hash(ea)]
+    for b in bases:
+        for h in hashes:
+            x = f"{b}_{h}.html"
+            if x not in out:
+                out.append(x)
+        no_hash = f"{b}.html"
+        if no_hash not in out:
+            out.append(no_hash)
    return out


@ -125,13 +140,13 @@ def main() -> int:
        params = r.get("parameters") or {}
        ea = params.get("embed_args") or {}
        gk = params.get("generator_kwargs") or {}
-        base = _base_stem(params)
-        if not base:
+        bases = _base_stems(params)
+        if not bases:
            continue
-        target = f"{base}_{run_args_hash(ea, gk)}.html"
+        target = f"{bases[0]}_{run_args_hash(ea, gk)}.html"
        if target in seen_targets:
            continue  # later duplicate — the stale-marking logic will handle it
-        for candidate in _candidate_names(base, ea, gk):
+        for candidate in _candidate_names(bases, ea, gk):
            if (figs_dir / candidate).exists():
                if candidate == target:
                    # Already at target; just ensure metrics.json carries gk.