"""
web1 — "Scientific instrument / research notebook"
A FastAPI UI for kicking off the embedding-flow Prefect deployment and
viewing the resulting HTML animations.
Design: restrained, typography-driven, two-column notebook layout. No CSS
framework; hand-written styles.
"""
from __future__ import annotations
import hashlib
import importlib.util
import json
import os
import re
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from app.web.plotly_parse import parse_plotly_run
import httpx
from fastapi import FastAPI, Form, HTTPException, Request
from fastapi.responses import HTMLResponse, JSONResponse, Response
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from sklearn.datasets import (
make_blobs,
make_classification,
make_gaussian_quantiles,
make_s_curve,
make_swiss_roll,
)
# ---------------------------------------------------------------------------
# Paths / constants
# ---------------------------------------------------------------------------
BASE_DIR = Path(__file__).resolve().parent  # directory containing this module
PROJECT_ROOT = BASE_DIR.parent.parent  # /home/mm/work/dr-sandbox
FIGS_DIR = PROJECT_ROOT / "figs"  # where flow output HTML files are looked up
FIGS_DIR.mkdir(parents=True, exist_ok=True)  # ensure it exists at import time
# Prefect server API base URL; override via PREFECT_API_URL env var.
PREFECT_API = os.environ.get("PREFECT_API_URL", "http://localhost:4200/api")
# Presumably "<flow-name>/<deployment-name>" as registered with Prefect —
# confirm against the deployment definition in flows/embedding_flow.py.
DEPLOYMENT_NAME = "embedding-flow/embedding-flow"
# ---------------------------------------------------------------------------
# Dataset catalogue
# ---------------------------------------------------------------------------
# Metadata for the /data.json endpoint consumed by the dataset picker, and
# for server-side lookup when the picker posts its selection back. kwargs
# must carry n_features=3 for generators that aren't already 3-D, since
# they'll be forwarded verbatim to the Prefect flow's generator_kwargs.
DATASET_PREVIEW_N = 5000  # points generated per preview dataset
DATASET_PREVIEW_SEED = 0  # fixed seed so previews are reproducible per process
# key -> {name, path, kwargs, description, kind}. "kind" separates continuous
# labels from categorical ones — presumably drives colour-mapping in the
# picker UI; confirm in the front-end templates.
DATASET_META: Dict[str, Dict[str, Any]] = {
    "s_curve": {
        "name": "S-Curve",
        "path": "sklearn.datasets.make_s_curve",
        "kwargs": {},
        "description": (
            "A 2-D manifold warped into R³. Continuous label encodes position "
            "along the curve — a good test of whether a reducer unrolls the "
            "sheet without tearing."
        ),
        "kind": "continuous",
    },
    "swiss_roll": {
        "name": "Swiss Roll",
        "path": "sklearn.datasets.make_swiss_roll",
        "kwargs": {},
        "description": (
            "A rolled-up plane. The canonical hard case for linear methods: "
            "PCA collapses the spiral, non-linear methods should recover the "
            "unroll."
        ),
        "kind": "continuous",
    },
    "swiss_roll_hole": {
        "name": "Swiss Roll (hole)",
        "path": "sklearn.datasets.make_swiss_roll",
        "kwargs": {"hole": True},
        "description": (
            "Swiss roll with a rectangular hole punched through. Same manifold, "
            "non-trivial topology — a faithful unroll should preserve the hole "
            "rather than smearing it closed."
        ),
        "kind": "continuous",
    },
    "blobs": {
        "name": "Gaussian Blobs",
        "path": "sklearn.datasets.make_blobs",
        "kwargs": {"n_features": 3, "centers": 5, "cluster_std": 1.0},
        "description": (
            "Five isotropic Gaussian clusters in R³. Discrete class labels. "
            "Tests whether a reducer preserves cluster separation when "
            "projected to 2-D."
        ),
        "kind": "categorical",
    },
    "gaussian_quantiles": {
        "name": "Gaussian Quantiles",
        "path": "sklearn.datasets.make_gaussian_quantiles",
        "kwargs": {"n_features": 3, "n_classes": 4},
        "description": (
            "Concentric Gaussian shells in R³; class = which shell. Classes "
            "are linearly inseparable by construction — PCA collapses them, "
            "kernel and manifold methods have a chance."
        ),
        "kind": "categorical",
    },
    "classification": {
        "name": "Hypercube Clusters",
        "path": "sklearn.datasets.make_classification",
        "kwargs": {
            "n_features": 3,
            "n_informative": 3,
            "n_redundant": 0,
            "n_repeated": 0,
            "n_classes": 4,
            "n_clusters_per_class": 2,
            "class_sep": 1.5,
        },
        "description": (
            "Four classes, two sub-clusters each, placed at hypercube vertices "
            "with informative noise. A denser discrete test than blobs — "
            "within-class bimodality stresses cluster-preserving reducers."
        ),
        "kind": "categorical",
    },
}
@lru_cache(maxsize=1)
def _dataset_previews() -> Dict[str, Dict[str, Any]]:
    """Return the dataset catalogue with generated preview points attached.

    Each entry is the DATASET_META record plus "points" (list of [x, y, z])
    and "labels" (continuous values or class ids) for DATASET_PREVIEW_N
    samples at the fixed DATASET_PREVIEW_SEED. Cached: generated once per
    process since both size and seed are constants.
    """
    n, seed = DATASET_PREVIEW_N, DATASET_PREVIEW_SEED
    # One zero-arg generator per catalogue key; keeps the noise/shape
    # parameters for each preview next to its name.
    generators = {
        "s_curve": lambda: make_s_curve(
            n_samples=n, noise=0.03, random_state=seed
        ),
        "swiss_roll": lambda: make_swiss_roll(
            n_samples=n, noise=0.15, random_state=seed
        ),
        "swiss_roll_hole": lambda: make_swiss_roll(
            n_samples=n, noise=0.15, hole=True, random_state=seed
        ),
        "blobs": lambda: make_blobs(
            n_samples=n, n_features=3, centers=5, cluster_std=1.0, random_state=seed
        ),
        "gaussian_quantiles": lambda: make_gaussian_quantiles(
            n_samples=n, n_features=3, n_classes=4, random_state=seed
        ),
        "classification": lambda: make_classification(
            n_samples=n,
            n_features=3,
            n_informative=3,
            n_redundant=0,
            n_repeated=0,
            n_classes=4,
            n_clusters_per_class=2,
            class_sep=1.5,
            random_state=seed,
        ),
    }
    previews: Dict[str, Dict[str, Any]] = {}
    for key, meta in DATASET_META.items():
        points, labels = generators[key]()
        previews[key] = {
            **meta,
            "points": points.tolist(),
            "labels": labels.tolist(),
        }
    return previews
# ---------------------------------------------------------------------------
# Reducer catalogue
# ---------------------------------------------------------------------------
# Each field tuple: (name, kind, default, choices_or_none, help_or_none)
# kinds: "int", "float", "str", "bool", "str_or_float", "int_or_null"
# "pkg" is probed by available_reducers() so reducers whose backing package
# isn't installed are hidden. "key" vs "advanced" presumably split the form
# into primary and collapsed parameter groups — confirm in _reducer_form.html.
REDUCERS: Dict[str, Dict[str, Any]] = {
    "sklearn.decomposition.PCA": {
        "pkg": "sklearn",
        "label": "PCA",
        "blurb": "Principal component analysis. Linear, fast, deterministic.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
        ],
        "advanced": [
            ("svd_solver", "str", "auto", ["auto", "full", "arpack", "randomized"], None),
            ("random_state", "int", 42, None, None),
            ("whiten", "bool", False, None, None),
        ],
    },
    "sklearn.decomposition.FactorAnalysis": {
        "pkg": "sklearn",
        "label": "FactorAnalysis",
        "blurb": "Gaussian latent-factor model with per-feature noise.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("tol", "float", 0.01, None, None),
            ("max_iter", "int", 1000, None, None),
            ("rotation", "str", "", ["", "varimax", "quartimax"], "Empty = None."),
        ],
    },
    "sklearn.decomposition.KernelPCA": {
        "pkg": "sklearn",
        "label": "KernelPCA",
        "blurb": "Non-linear PCA via the kernel trick. Deterministic; kernel choice shapes the output.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("kernel", "str", "rbf", ["linear", "poly", "rbf", "sigmoid", "cosine"], None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("gamma", "str_or_float", "", None, "Empty = 1/n_features."),
            ("degree", "int", 3, None, None),
            ("coef0", "float", 1.0, None, None),
            ("alpha", "float", 1.0, None, None),
        ],
    },
    "sklearn.manifold.Isomap": {
        "pkg": "sklearn",
        "label": "Isomap",
        "blurb": "Geodesic-distance manifold learning via shortest paths on a k-NN graph.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("n_neighbors", "int", 5, None, None),
        ],
        "advanced": [
            ("metric", "str", "minkowski", None, None),
            ("p", "int", 2, None, "Minkowski power (1 = manhattan, 2 = euclidean)."),
            ("path_method", "str", "auto", ["auto", "FW", "D"], "Floyd-Warshall / Dijkstra / auto."),
            ("neighbors_algorithm", "str", "auto", ["auto", "ball_tree", "kd_tree", "brute"], None),
        ],
    },
    "sklearn.manifold.MDS": {
        "pkg": "sklearn",
        "label": "MDS",
        "blurb": "Multidimensional scaling. Preserves pairwise distances; O(n²) memory.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("n_init", "int", 4, None, None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("max_iter", "int", 300, None, None),
            # NOTE(review): "metric_mds" is not an sklearn MDS kwarg (sklearn's
            # boolean is named "metric", and its distance choice "dissimilarity")
            # — presumably remapped by the flow before constructing MDS; confirm.
            ("metric_mds", "bool", True, None, "Metric (True) vs non-metric MDS."),
            ("metric", "str", "euclidean", None, None),
            ("eps", "float", 1e-6, None, "Convergence tolerance."),
        ],
    },
    "sklearn.manifold.SpectralEmbedding": {
        "pkg": "sklearn",
        "label": "SpectralEmbedding",
        "blurb": "Laplacian eigenmaps on an affinity graph. What UMAP uses for initialisation.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("affinity", "str", "nearest_neighbors", ["nearest_neighbors", "rbf"], None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("n_neighbors", "int_or_null", "", None, "For affinity=nearest_neighbors. Empty = n/10."),
            ("gamma", "str_or_float", "", None, "For affinity=rbf. Empty = 1/n_features."),
        ],
    },
    "sklearn.manifold.TSNE": {
        "pkg": "sklearn",
        "label": "t-SNE",
        "blurb": "Stochastic neighbour embedding. Local structure preserved.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("perplexity", "float", 30.0, None, None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("learning_rate", "str_or_float", "auto", None, "'auto' or a float."),
            # NOTE(review): "n_iter" was renamed "max_iter" in scikit-learn 1.5
            # (alias removed in 1.7) — confirm against the pinned sklearn version.
            ("n_iter", "int", 1000, None, None),
            ("metric", "str", "euclidean", None, None),
            ("early_exaggeration", "float", 12.0, None, None),
            ("init", "str", "pca", ["pca", "random"], None),
        ],
    },
    "umap.UMAP": {
        "pkg": "umap",
        "label": "UMAP",
        "blurb": "Uniform manifold approximation. Preserves local + some global structure.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("n_neighbors", "int", 15, None, None),
            ("min_dist", "float", 0.1, None, None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("metric", "str", "euclidean", None, None),
            ("n_epochs", "int_or_null", "", None, "Empty = None (auto)."),
            ("spread", "float", 1.0, None, None),
            ("init", "str", "spectral", ["spectral", "random"], None),
        ],
    },
    "pacmap.PaCMAP": {
        "pkg": "pacmap",
        "label": "PaCMAP",
        "blurb": "Pairwise-controlled manifold approximation. Balanced local/global.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("n_neighbors", "int", 10, None, None),
            ("MN_ratio", "float", 0.5, None, None),
            ("FP_ratio", "float", 2.0, None, None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("lr", "float", 1.0, None, None),
            ("num_iters", "int", 450, None, None),
            ("apply_pca", "bool", True, None, None),
        ],
    },
    "pacmap.LocalMAP": {
        "pkg": "pacmap",
        "label": "LocalMAP",
        "blurb": "PaCMAP variant with a low-distance threshold; sharper local structure.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("n_neighbors", "int", 10, None, None),
            ("MN_ratio", "float", 0.5, None, None),
            ("FP_ratio", "float", 2.0, None, None),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [
            ("lr", "float", 1.0, None, None),
            ("num_iters", "int", 450, None, None),
            ("apply_pca", "bool", True, None, None),
            ("low_dist_thres", "float", 10.0, None, None),
        ],
    },
    "trimap.TRIMAP": {
        "pkg": "trimap",
        "label": "TriMap",
        "blurb": "Triplet-based dimensionality reduction. Emphasises global structure.",
        "key": [
            # TriMap's output-dimension parameter is "n_dims", not "n_components";
            # build_embed_args() forces it to 2 just like n_components.
            ("n_dims", "int", 2, None, "Locked."),
            ("n_inliers", "int", 10, None, None),
            ("n_outliers", "int", 5, None, None),
            ("n_random", "int", 5, None, None),
        ],
        "advanced": [
            ("lr", "float", 0.1, None, None),
            ("n_iters", "int", 400, None, None),
            ("weight_adj", "float", 500.0, None, None),
        ],
    },
    "sklearn.random_projection.GaussianRandomProjection": {
        "pkg": "sklearn",
        "label": "GaussianRandomProjection",
        "blurb": "Johnson-Lindenstrauss baseline. Cheap, distance-preserving in expectation, structure-agnostic.",
        "key": [
            ("n_components", "int", 2, None, "Locked."),
            ("random_state", "int", 42, None, None),
        ],
        "advanced": [],
    },
}
def available_reducers() -> List[Tuple[str, Dict[str, Any]]]:
    """Return (key, spec) pairs for reducers whose backing package imports.

    Probes each spec's "pkg" with importlib.util.find_spec so optional
    dependencies (umap, pacmap, trimap) only appear when installed.
    Preserves REDUCERS insertion order.
    """
    return [
        (reducer_key, spec)
        for reducer_key, spec in REDUCERS.items()
        if importlib.util.find_spec(spec["pkg"]) is not None
    ]
# ---------------------------------------------------------------------------
# Parameter coercion
# ---------------------------------------------------------------------------
def _coerce(kind: str, raw: Optional[str], default: Any) -> Any:
if raw is None:
return default
s = raw.strip() if isinstance(raw, str) else raw
if kind == "int":
if s == "" or s is None:
return default
return int(s)
if kind == "float":
if s == "" or s is None:
return default
return float(s)
if kind == "bool":
# Checkbox: "on" / absent
return bool(s) and s not in ("0", "false", "False", "")
if kind == "str":
if s == "":
return None if default in (None, "") else default if default else ""
return s
if kind == "str_or_float":
if s == "":
return default
try:
return float(s)
except (ValueError, TypeError):
return s
if kind == "int_or_null":
if s == "":
return None
return int(s)
return s
def build_embed_args(reducer_key: str, form: Dict[str, str]) -> Dict[str, Any]:
    """Collect typed embedder kwargs for *reducer_key* from a posted form.

    Form inputs are namespaced as "embed__<param>". Each value is coerced
    per the field kind declared in REDUCERS; None values (and empty strings
    whose catalogue default is empty) are dropped so they aren't forwarded
    to the flow. n_components / n_dims are always forced to 2 because the
    flow asserts 2-D output.

    Raises KeyError if *reducer_key* is not in REDUCERS.
    """
    spec = REDUCERS[reducer_key]
    out: Dict[str, Any] = {}
    all_fields = list(spec["key"]) + list(spec["advanced"])
    for (name, kind, default, _choices, _help) in all_fields:
        field = f"embed__{name}"
        if kind == "bool":
            # Checkboxes post nothing when unchecked, so absence means False.
            # Route the posted value through _coerce (rather than treating
            # mere presence as True) so explicit falsy spellings such as
            # "0"/"false" — e.g. from a hidden-input fallback — coerce to
            # False, consistent with _coerce's bool semantics.
            value = _coerce("bool", form.get(field, ""), default)
        else:
            value = _coerce(kind, form.get(field), default)
        # Null-stripping: drop empty rotations etc.
        if value is None:
            continue
        if isinstance(value, str) and value == "" and default in (None, ""):
            continue
        out[name] = value
    # Always force n_components / n_dims to 2 (flow assertion)
    if "n_components" in out:
        out["n_components"] = 2
    if "n_dims" in out:
        out["n_dims"] = 2
    return out
# ---------------------------------------------------------------------------
# Output-path synthesis (mirrors flows/embedding_flow.py lines ~162–168)
# ---------------------------------------------------------------------------
def embed_args_hash(embed_args: Optional[Dict[str, Any]]) -> str:
    """Return an 8-hex-char digest of an embed_args dict.

    The dict is serialised to JSON with sorted keys (None counts as {}),
    so semantically-equal dicts hash identically regardless of insertion
    order. Filenames embed this digest so runs that differ only in
    embed_args get distinct output files.
    """
    canonical = json.dumps(embed_args or {}, sort_keys=True, default=str)
    return hashlib.sha1(canonical.encode()).hexdigest()[:8]
def synthesize_output_paths(
    generator_path: str,
    embedder: str,
    num_points: int,
    num_timesteps: int,
    jitter_scale: float,
    seed: int,
    embed_args: Optional[Dict[str, Any]] = None,
) -> Tuple[str, str]:
    """Predict the flow's (reference, embedding) output filenames.

    Mirrors the stem-building logic in flows/embedding_flow.py so the UI
    can locate outputs without running the flow. Only the final dotted
    component of *generator_path* / *embedder* appears in the stem. When
    *embed_args* is given, its 8-hex digest is appended to keep runs with
    different embedder kwargs from colliding.
    """
    gen_name = generator_path.split(".")[-1]
    emb_name = embedder.split(".")[-1]
    suffix = f"N{num_points}_T{num_timesteps}_J{jitter_scale}_s{seed}"
    ref_file = f"{gen_name}_Reference_{suffix}.html"
    emb_stem = f"{gen_name}_{emb_name}_{suffix}"
    if embed_args is None:
        emb_file = f"{emb_stem}.html"
    else:
        emb_file = f"{emb_stem}_{embed_args_hash(embed_args)}.html"
    return ref_file, emb_file
def _resolve_emb_file(synthesized: str) -> str:
"""Disk-backed fallback: prefer the synthesized (hashed) name; if that
doesn't exist on disk but an older hash-less variant does, return that
so pre-hash runs still render in the UI."""
if (FIGS_DIR / synthesized).exists():
return synthesized
# Strip trailing _<8hex>.html to get the legacy name.
m = re.match(r"^(?P
unknown reducer
", status_code=404) return templates.TemplateResponse( request, "_reducer_form.html", {"reducer_key": name, "spec": spec}, ) @app.get("/runs", response_class=HTMLResponse) async def runs_partial(request: Request) -> HTMLResponse: async with httpx.AsyncClient(timeout=5.0) as client: runs = await PREFECT.recent_runs(client, limit=10) views = [_run_view(r) for r in runs] _mark_stale_views(views) return templates.TemplateResponse( request, "_runs.html", {"runs": views} ) @app.post("/submit", response_class=HTMLResponse) async def submit(request: Request) -> HTMLResponse: form = await request.form() data: Dict[str, str] = {k: str(v) for k, v in form.items()} reducer = data.get("reducer") or "" if reducer not in REDUCERS: return HTMLResponse( f"{candidate}). change a param or delete "
f"the fig first.{run.get('error')[:500]}