filenames + run names: J in sci notation (5E-3 not 0.005)

Periods in filenames are avoidable, and the Prefect UI dislikes them in
run names. This change adds a sci_notation helper in main.py, mirrored
as _sci in the flow. The stem regex (main + parser) now matches
J<digits.Ee+-> so it accepts both old decimal-J and new sci-J filenames
and the two can coexist during the transition. The J tag in the Prefect
tag list also uses the sci form, so chip filters stay consistent.

Backfill script extended to find pre-transition (decimal-J) files on
disk via a second base-stem variant, then rename them to the sci form.
backfill_tags re-patches existing runs so their J tag matches the new
canonical form.

All 13 existing figs + runs renamed / retagged in-place.
This commit is contained in:
Michael Pilosov 2026-04-22 17:54:46 -06:00
parent 56279dbb1b
commit e94d28b8fc
4 changed files with 68 additions and 26 deletions

View File

@ -475,6 +475,18 @@ def run_args_hash(
embed_args_hash = run_args_hash
def sci_notation(v: Any) -> str:
    """Float → compact sci notation (0.005 → '5E-3', 0.01 → '1E-2').

    Used in stems and Prefect run names so filenames + UI avoid periods.
    The mantissa is rounded to four significant digits and trailing zeros
    are stripped, so a period only survives when more than one significant
    digit is needed (0.0125 → '1.25E-2').

    Args:
        v: Value to format; anything ``float()`` accepts.

    Returns:
        ``'<mantissa>E<exponent>'`` for finite values. Values ``float()``
        rejects are returned as ``str(v)``; non-finite values come back as
        'inf', '-inf' or 'nan'.
    """
    try:
        f = float(v)
    except (TypeError, ValueError):
        return str(v)
    formatted = f"{f:.3e}"
    mantissa, sep, exponent = formatted.partition("e")
    if not sep:
        # inf / nan format without an exponent part; a two-way unpack of
        # split("e") would raise ValueError here instead of degrading.
        return formatted
    mantissa = mantissa.rstrip("0").rstrip(".")
    # int() drops the sign padding and leading zero ("-03" → -3, "+01" → 1).
    return f"{mantissa}E{int(exponent)}"
def synthesize_output_paths(
generator_path: str,
embedder: str,
@ -487,8 +499,9 @@ def synthesize_output_paths(
) -> Tuple[str, str]:
gen = generator_path.split(".")[-1]
emb = embedder.split(".")[-1]
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
j = sci_notation(jitter_scale)
ref = f"{gen}_Reference_N{num_points}_T{num_timesteps}_J{j}_s{seed}.html"
base = f"{gen}_{emb}_N{num_points}_T{num_timesteps}_J{j}_s{seed}"
if embed_args is None:
embf = f"{base}.html"
else:
@ -977,7 +990,7 @@ async def metrics_json() -> JSONResponse:
_STEM_RE = re.compile(
r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.]+_s\d+(?:_[0-9a-f]{8})?$"
r"^make_[A-Za-z_]+?_[A-Za-z]+_N\d+_T\d+_J[\d.Ee+\-]+_s\d+(?:_[0-9a-f]{8})?$"
)
# Map short generator name ("make_blobs") to its DATASET_META entry.
@ -1020,7 +1033,7 @@ def build_run_tags(
f"algorithm:{(embedder or '').rsplit('.', 1)[-1]}",
f"N:{int(num_points)}",
f"T:{int(num_timesteps)}",
f"J:{jitter_scale}",
f"J:{sci_notation(jitter_scale)}",
]

View File

@ -20,7 +20,7 @@ from pathlib import Path
_STEM_RE = re.compile(
r"^(?P<gen>make_.+?)_(?P<emb>[A-Za-z]+)_N(?P<n>\d+)_T(?P<t>\d+)"
r"_J(?P<j>[\d.]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
r"_J(?P<j>[\d.Ee+\-]+)_s(?P<s>\d+)(?:_(?P<h>[0-9a-f]{8}))?$"
)
# plotly's typed-array dtype -> (struct format char, item size bytes)

View File

@ -43,6 +43,19 @@ def _run_args_hash(
return hashlib.sha1(s.encode()).hexdigest()[:8]
def _sci(v: Any) -> str:
"""Float → compact sci notation without a period (e.g. 0.005 → 5E-3,
0.01 1E-2). Keeps Prefect's UI happy — it doesn't like periods in
run names."""
try:
f = float(v)
except (TypeError, ValueError):
return str(v)
m, e = f"{f:.3e}".split("e")
m = m.rstrip("0").rstrip(".")
return f"{m}E{int(e)}"
def _flow_run_name() -> str:
"""Name each Prefect run after the stem of its output fig, so runs are
searchable / hoverable instead of wearing Prefect's auto-generated
@ -52,7 +65,7 @@ def _flow_run_name() -> str:
emb = (p.get("embedder") or "").rsplit(".", 1)[-1] or "?"
N = p.get("num_points", "?")
T = p.get("num_timesteps", "?")
J = p.get("jitter_scale", "?")
J = _sci(p.get("jitter_scale", "?"))
s = p.get("seed", "?")
tag = _run_args_hash(p.get("embed_args"), p.get("generator_kwargs"))
return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}_{tag}"
@ -315,12 +328,13 @@ def embedding_flow(
Path(output_dir).mkdir(parents=True, exist_ok=True)
_generator = generator_path.split(".")[-1]
_j = _sci(jitter_scale)
output_ref: str = (
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}.html"
f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_T{num_timesteps}_J{_j}_s{seed}.html"
)
_args_tag = _run_args_hash(embed_args, user_generator_kwargs)
output_embed: str = (
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}_{_args_tag}.html"
f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_T{num_timesteps}_J{_j}_s{seed}_{_args_tag}.html"
)
output_metrics: str = output_embed[:-5] + ".metrics.json"
output_frames: str = output_embed[:-5] + ".frames.json"

View File

@ -30,7 +30,7 @@ from typing import Any, Dict, List, Optional
_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_ROOT))
from app.web.main import PREFECT, run_args_hash # noqa: E402
from app.web.main import PREFECT, run_args_hash, sci_notation # noqa: E402
def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
@ -38,30 +38,45 @@ def _legacy_hash(ea: Optional[Dict[str, Any]]) -> str:
return hashlib.sha1(s.encode()).hexdigest()[:8]
def _base_stem(params: Dict[str, Any]) -> Optional[str]:
def _base_stems(params: Dict[str, Any]) -> List[str]:
"""Return the stem prefix(es) for this run's params: both the current
sci-J form and the legacy decimal-J form, so we can find pre-transition
files on disk too."""
try:
gen = (params.get("generator_path") or "").rsplit(".", 1)[-1]
emb = (params.get("embedder") or "").rsplit(".", 1)[-1]
N = int(params["num_points"])
T = int(params.get("num_timesteps", params.get("num_snapshots")))
J = float(params["jitter_scale"])
Jf = float(params["jitter_scale"])
s = int(params["seed"])
except (KeyError, TypeError, ValueError):
return None
return []
if not gen or not emb:
return None
return f"{gen}_{emb}_N{N}_T{T}_J{J}_s{s}"
return []
out = [f"{gen}_{emb}_N{N}_T{T}_J{sci_notation(Jf)}_s{s}"]
legacy = f"{gen}_{emb}_N{N}_T{T}_J{Jf}_s{s}"
if legacy not in out:
out.append(legacy)
return out
def _candidate_names(base: str, ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
target = f"{base}_{run_args_hash(ea, gk)}.html"
legacy = f"{base}_{_legacy_hash(ea)}.html"
no_hash = f"{base}.html"
# Preserve order: target first so we short-circuit on already-backfilled.
def _candidate_names(bases: List[str], ea: Dict[str, Any], gk: Dict[str, Any]) -> List[str]:
# Target = current sci-J base + new-scheme hash.
if not bases:
return []
target_base = bases[0]
target = f"{target_base}_{run_args_hash(ea, gk)}.html"
out = [target]
for x in (legacy, no_hash):
if x not in out:
out.append(x)
# Fall back to every (base, hash) combination we might find on disk.
hashes = [run_args_hash(ea, gk), _legacy_hash(ea)]
for b in bases:
for h in hashes:
x = f"{b}_{h}.html"
if x not in out:
out.append(x)
no_hash = f"{b}.html"
if no_hash not in out:
out.append(no_hash)
return out
@ -125,13 +140,13 @@ def main() -> int:
params = r.get("parameters") or {}
ea = params.get("embed_args") or {}
gk = params.get("generator_kwargs") or {}
base = _base_stem(params)
if not base:
bases = _base_stems(params)
if not bases:
continue
target = f"{base}_{run_args_hash(ea, gk)}.html"
target = f"{bases[0]}_{run_args_hash(ea, gk)}.html"
if target in seen_targets:
continue # later duplicate — the stale-marking logic will handle it
for candidate in _candidate_names(base, ea, gk):
for candidate in _candidate_names(bases, ea, gk):
if (figs_dir / candidate).exists():
if candidate == target:
# Already at target; just ensure metrics.json carries gk.