From e94d28b8fcde565b8adfb73ea2d10b6c2fbb5e5a Mon Sep 17 00:00:00 2001
From: Michael Pilosov <consistentbayes@gmail.com>
Date: Wed, 22 Apr 2026 17:54:46 -0600
Subject: [PATCH] filenames + run names: J in sci notation (5E-3 not 0.005)

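Periods in filenames are avoidable, and the Prefect UI dislikes them in
run names, so J (jitter_scale) is now written in scientific notation.
Adds a sci_notation helper in app/web/main.py, duplicated as _sci in
flows/embedding_flow.py (the flow does not import from the web app).
Values are formatted to 4 significant digits; a period remains only
when the mantissa needs more than one digit (e.g. 0.015 -> 1.5E-2).
Both stem regexes (main + parser) now match J[\d.Ee+-]+, accepting old
decimal-J and new sci-J filenames alike, so the two stay in sync through
the transition. The J tag in the Prefect tag list also uses the sci
form, so chip filters stay consistent.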

The backfill script is extended to find pre-transition (decimal-J) files
on disk via a second base-stem variant and rename them to the sci form.
backfill_tags re-patches existing runs so their J tag matches the new
canonical form.

All 13 existing figs + runs were renamed / retagged in place.
---
 app/web/main.py            | 21 +++++++++++++---
 app/web/plotly_parse.py    |  2 +-
 flows/embedding_flow.py    | 20 ++++++++++++---
 scripts/backfill_hashes.py | 51 ++++++++++++++++++++++++--------------
 4 files changed, 68 insertions(+), 26 deletions(-)
diff --git a/app/web/main.py b/app/web/main.py
index 88cbf83..bdb62c9 100644
--- a/app/web/main.py
+++ b/app/web/main.py
@@ -475,6 +475,18 @@ def run_args_hash(
 embed_args_hash = run_args_hash
 
 
+def sci_notation(v: Any) -> str:
+    """Float → compact sci notation without a period (0.005 → '5E-3').
+    Used in stems and Prefect run names so filenames + UI avoid periods."""
+    try:
+        f = float(v)
+    except (TypeError, ValueError):
+        return str(v)
+    m, e = f"{f:.3e}".split("e")
+    m = m.rstrip("0").rstrip(".")
+    return f"{m}E{int(e)}"
+
+
 def synthesize_output_paths(
     generator_path: str,
     embedder: str,
@@ -487,8 +499,9 @@ def synthesize_output_paths(
 ) -> Tuple[str, str]:
     gen = generator_path.split(".")[-1]
     emb = embedder.split(".")[-1]
-    ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
-    base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
+    j = sci_notation(jitter_scale)
+    ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{j}_s{seed}.html"
+    base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{j}_s{seed}"
     if embed_args is None:
         embf = f"{base}.html"
     else:
@@ -977,7 +990,7 @@ async def metrics_json() -> JSONResponse:
 
 
 _STEM_RE = re.compile(
-    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
+    r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.Ee+\-]+_s\d+(?:_[0-9a-f]{8})?$"
 )
 
 # Map short generator name ("make_blobs") to its DATASET_META entry.
@@ -1020,7 +1033,7 @@ def build_run_tags(
         f"algorithm:{(embedder or '').rsplit('.', 1)[-1]}",
         f"N:{int(num_points)}",
         f"T:{int(num_timesteps)}",
-        f"J:{jitter_scale}",
+        f"J:{sci_notation(jitter_scale)}",
     ]
 
 
diff --git a/app/web/plotly_parse.py b/app/web/plotly_parse.py
index 5e0006c..376eb2d 100644
--- a/app/web/plotly_parse.py
+++ b/app/web/plotly_parse.py
@@ -20,7 +20,7 @@ from pathlib import Path
 
 _STEM_RE = re.compile(
     r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
-    r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
+    r"_J(?P<j>[\d.Ee+\-]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
 )
 
 # plotly's typed-array dtype -> (struct format char, item size bytes)
diff --git a/flows/embedding_flow.py b/flows/embedding_flow.py
index 3d638ec..e480e6c 100644
--- a/flows/embedding_flow.py
+++ b/flows/embedding_flow.py
@@ -43,6 +43,19 @@ def _run_args_hash(
     return hashlib.sha1(s.encode()).hexdigest()[:8]
 
 
+def _sci(v: Any) -> str:
+    """Float → compact sci notation without a period (e.g. 0.005 → 5E-3,
+    0.01 → 1E-2). Keeps Prefect's UI happy — it doesn't like periods in
+    run names."""
+    try:
+        f = float(v)
+    except (TypeError, ValueError):
+        return str(v)
+    m, e = f"{f:.3e}".split("e")
+    m = m.rstrip("0").rstrip(".")
+    return f"{m}E{int(e)}"
+
+
 def _flow_run_name() -> str:
     """Name each Prefect run after the stem of its output fig, so runs are
     searchable / hoverable instead of wearing Prefect's auto-generated
@@ -52,7 +65,7 @@ def _flow_run_name() -> str:
     emb = (p.get("embedder") or "").rsplit(".", 1)[-1] or "?"
     N = p.get("num_points", "?")
     T = p.get("num_timesteps", "?")
-    J = p.get("jitter_scale", "?")
+    J = _sci(p.get("jitter_scale", "?"))
     s = p.get("seed", "?")
     tag = _run_args_hash(p.get("embed_args"), p.get("generator_kwargs"))
     return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}_{tag}"
@@ -315,12 +328,13 @@ def embedding_flow(
 
     Path(output_dir).mkdir(parents=True, exist_ok=True)
     _generator = generator_path.split(".")[-1]
+    _j = _sci(jitter_scale)
     output_ref: str = (
-        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
+        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{_j}_s{seed}.html"
     )
     _args_tag = _run_args_hash(embed_args, user_generator_kwargs)
     output_embed: str = (
-        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
+        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{_j}_s{seed}_{_args_tag}.html"
     )
     output_metrics: str = output_embed[:-5] + ".metrics.json"
     output_frames: str = output_embed[:-5] + ".frames.json"
diff --git a/scripts/backfill_hashes.py b/scripts/backfill_hashes.py
index 0df2eeb..56e526d 100644
--- a/scripts/backfill_hashes.py
+++ b/scripts/backfill_hashes.py
@@ -30,7 +30,7 @@ from typing import Any, Dict, List, Optional
 
 _ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(_ROOT))
-from app.web.main import PREFECT, run_args_hash  # noqa: E402
+from app.web.main import PREFECT, run_args_hash, sci_notation  # noqa: E402
 
 
 def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
@@ -38,30 +38,45 @@ def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
     return hashlib.sha1(s.encode()).hexdigest()[:8]
 
 
-def _base_stem(params: Dict[str, Any]) -> Optional[str]:
+def _base_stems(params: Dict[str, Any]) -> List[str]:
+    """Return the stem prefix(es) for this run's params: both the current
+    sci-J form and the legacy decimal-J form, so we can find pre-transition
+    files on disk too."""
     try:
         gen = (params.get("generator_path") or "").rsplit(".", 1)[-1]
         emb = (params.get("embedder") or "").rsplit(".", 1)[-1]
         N = int(params["num_points"])
         T = int(params.get("num_timesteps", params.get("num_snapshots")))
-        J = float(params["jitter_scale"])
+        Jf = float(params["jitter_scale"])
         s = int(params["seed"])
     except (KeyError, TypeError, ValueError):
-        return None
+        return []
     if not gen or not emb:
-        return None
-    return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}"
+        return []
+    out = [f"{gen}_{emb}_N{N}_T{T}_J{sci_notation(Jf)}_s{s}"]
+    legacy = f"{gen}_{emb}_N{N}_T{T}_J{Jf}_s{s}"
+    if legacy not in out:
+        out.append(legacy)
+    return out
 
 
-def _candidate_names(base: str, ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
-    target = f"{base}_{run_args_hash(ea, gk)}.html"
-    legacy = f"{base}_{_legacy_hash(ea)}.html"
-    no_hash = f"{base}.html"
-    # Preserve order: target first so we short-circuit on already-backfilled.
+def _candidate_names(bases: List[str], ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
+    # Target = current sci-J base + new-scheme hash.
+    if not bases:
+        return []
+    target_base = bases[0]
+    target = f"{target_base}_{run_args_hash(ea, gk)}.html"
     out = [target]
-    for x in (legacy, no_hash):
-        if x not in out:
-            out.append(x)
+    # Fall back to every (base, hash) combination we might find on disk.
+    hashes = [run_args_hash(ea, gk), _legacy_hash(ea)]
+    for b in bases:
+        for h in hashes:
+            x = f"{b}_{h}.html"
+            if x not in out:
+                out.append(x)
+        no_hash = f"{b}.html"
+        if no_hash not in out:
+            out.append(no_hash)
     return out
 
 
@@ -125,13 +140,13 @@ def main() -> int:
         params = r.get("parameters") or {}
         ea = params.get("embed_args") or {}
         gk = params.get("generator_kwargs") or {}
-        base = _base_stem(params)
-        if not base:
+        bases = _base_stems(params)
+        if not bases:
             continue
-        target = f"{base}_{run_args_hash(ea, gk)}.html"
+        target = f"{bases[0]}_{run_args_hash(ea, gk)}.html"
         if target in seen_targets:
             continue  # later duplicate — the stale-marking logic will handle it
-        for candidate in _candidate_names(base, ea, gk):
+        for candidate in _candidate_names(bases, ea, gk):
             if (figs_dir / candidate).exists():
                 if candidate == target:
                     # Already at target; just ensure metrics.json carries gk.