some minor upgrades to prefect syntax

This commit is contained in:
Michael Pilosov 2026-04-21 18:02:39 -06:00
commit 708157c1ef
9 changed files with 5875 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.venv/
__pycache__/
figs/

25
README.md Normal file
View File

@ -0,0 +1,25 @@
# Dimension Reduction Lab
A Python project exploring various dimension reduction techniques using Prefect for workflow orchestration.
## Overview
This project serves as an experimental sandbox for studying dimensionality reduction and embedding algorithms within a reproducible environment. The primary goal is to evaluate and compare different techniques (like UMAP, t-SNE, PaCMAP, and TriMap) while focusing on their stability characteristics, particularly in the context of changing or drifting data distributions. By leveraging Prefect's workflow management capabilities, we can systematically analyze how these algorithms perform across arbitrary datasets, track their behavior over time, and measure their sensitivity to various hyperparameters and data perturbations.
## Requirements
The project uses several key dependencies; see `requirements-frozen.txt` for the full pinned list.
## Package Management
This project uses [uv](https://github.com/astral-sh/uv) as its package manager — a fast Python package installer and resolver written in Rust. The `requirements-frozen.txt` file was generated using uv to ensure reproducible dependencies.
To update dependencies:
```bash
uv pip compile pyproject.toml (--all-extras) -o requirements-frozen.txt
```
Modify `--all-extras` to include either an individual optional dependency group or all of them; see the [pyproject.toml](pyproject.toml) file for more information.
This project uses Prefect for workflow orchestration because of its lightweight approach to running experiments from a UI and its compatibility with single-node deployments.

3
clean.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Delete every generated artifact, keeping only source files: any regular file
# that is NOT *.py, *.html, *.sh, *.toml, .gitignore, *.md, or *.pyc is removed.
# NOTE(review): this recurses from the current directory, so it will also delete
# non-matching files inside .venv/ and any untracked data — run with care.
find . -type f -not \( -name '*.py' -o -name '*.html' -o -name '*.sh' -o -name '*.toml' -o -name '.gitignore' -o -name '*.md' -o -name "*.pyc" \) -delete

249
flows/embedding_flow.py Normal file
View File

@ -0,0 +1,249 @@
# embedding_flow.py
import os
import sys
# Default to the local Docker Prefect server. An explicit PREFECT_API_URL
# in the environment still wins (setdefault is a no-op if the key exists).
os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")
os.environ.setdefault("DO_NOT_TRACK", "1")
from datetime import timedelta
import math
from pathlib import Path
from typing import Any, Dict, List, Optional
from prefect import flow, task
from prefect.cache_policies import INPUTS, NO_CACHE
from prefect_ray import RayTaskRunner
import pandas as pd
import embedding_utils as E
from joblib import cpu_count
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=1))
def generate_initial_frame_task(
    generator_path: str, generator_kwargs: Dict[str, Any], id_column: str = "id"
) -> pd.DataFrame:
    """
    Generate the initial data frame using a specified data generator.

    Parameters:
    - generator_path: str
        The full module path to the data generator function (e.g., 'sklearn.datasets.make_s_curve').
    - generator_kwargs: Dict[str, Any]
        Keyword arguments to pass to the data generator function.
    - id_column: str
        Column name to use as a unique identifier.

    Returns:
    - df: pd.DataFrame
        DataFrame with one 'feature_<j>' column per generated feature, a unique
        integer ID column, and a 'time' column initialized to 0.
    """
    generator_func = E.dynamic_import(generator_path)
    # Generators follow the sklearn convention of returning (data, labels);
    # labels are not used by the embedding experiments.
    data, _labels = generator_func(**generator_kwargs)
    # Build one column per feature instead of hard-coding three, so generators
    # of any dimensionality work. Column order (features..., id, time) is
    # preserved — downstream jittering assumes id/time are the last two columns.
    df = pd.DataFrame({f"feature_{j}": data[:, j] for j in range(data.shape[1])})
    df[id_column] = range(data.shape[0])
    df[id_column] = df[id_column].astype(int)
    df["time"] = 0
    return df
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=12))
def generate_snapshots_task(
    initial_df: pd.DataFrame, num_snapshots: int, jitter_scale: float, seed: int = 42
) -> List[pd.DataFrame]:
    """Produce `num_snapshots` jittered successors of `initial_df` via the shared utility."""
    snapshots = E.generate_jittered_snapshots(
        initial_df, num_snapshots, jitter_scale, seed
    )
    return snapshots
@task(
    cache_policy=INPUTS,
    cache_expiration=timedelta(days=1),
    task_run_name="embed-{time_idx}",
)
def create_embedding(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedder: str,
    embed_args: Dict[str, Any],
    time_idx: str | int,
    id_column: str = "id",
) -> pd.DataFrame:
    """Embed a single snapshot into 2-D using the dynamically-imported `embedder`."""
    embedded = E.create_embedding_dataframe(
        snapshot=snapshot,
        embed_columns=embed_columns,
        embedding_algorithm_str=embedder,
        embedding_kwargs=embed_args,
        time_idx=time_idx,
        id_column=id_column,
    )
    return embedded
@task
def collect_data_task(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """Concatenate per-snapshot frames into one Plotly-ready DataFrame."""
    combined = E.collect_and_prepare_for_plotly(
        embedded_dfs,
        sort_time=sort_time,
        id_column=id_column,
    )
    return combined
@task(
    task_run_name="plot-{output_path}",
    retries=3,
    cache_policy=NO_CACHE,
)
def plot_and_save_task(
    combined_df: pd.DataFrame,
    title: str,
    output_path: str,
    frame_duration: int = 500,
    transition_duration: int = 500,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    samples: int = 25_000,
):
    """Render the animated embedding figure and write it to `output_path` as HTML.

    Returns the output path so downstream tasks/futures can report it.
    """
    figure = E.plot_embedding_over_time(
        combined_df,
        title=title,
        frame_duration=int(frame_duration),
        transition_duration=int(transition_duration),
        fixed_axes=fixed_axes,
        equal_aspect=equal_aspect,
        samples=samples,
    )
    # Ensure the target directory exists before writing the HTML file.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    figure.write_html(output_path)
    return output_path
# Defaults used by `embedding_flow` when the caller does not override them.
_DEFAULT_GENERATOR_KWARGS: Dict[str, Any] = {"random_state": 0}
# NOTE(review): the feature order (0, 2, 1) appears deliberate — it controls
# which features map to the reference plot's x/y axes. Confirm before changing.
_DEFAULT_EMBED_COLUMNS: List[str] = ["feature_0", "feature_2", "feature_1"]
_DEFAULT_EMBED_ARGS: Dict[str, Any] = {"n_components": 2, "random_state": 30}
@flow(task_runner=RayTaskRunner(init_kwargs={"num_cpus": 4}))
def embedding_flow(
    num_points: int = 5000,
    num_snapshots: int = 48,
    jitter_scale: float = 0.01,
    seed: int = 42,
    generator_path: str = "sklearn.datasets.make_s_curve",
    generator_kwargs: Optional[Dict[str, Any]] = None,
    embed_columns: Optional[List[str]] = None,
    embedder: str = "sklearn.decomposition.FactorAnalysis",
    embed_args: Optional[Dict[str, Any]] = None,
    output_dir: str = "figs",
    id_column: str = "id",
    frame_duration: int = 1200,
    transition_duration: int = 2400,
    reference_speedup: float = 10.0,
    samples: int = 10_000,
):
    """
    End-to-end experiment: generate a dataset, jitter it over `num_snapshots`
    time steps, embed every snapshot into 2-D, and write two animated HTML
    figures into `output_dir` — a raw-data "reference" animation and the
    embedded animation.

    Parameters:
    - num_points: number of points produced by the data generator.
    - num_snapshots: how many jittered time steps to create.
    - jitter_scale: std-dev of the per-step Gaussian jitter.
    - seed: RNG seed for snapshot generation.
    - generator_path: dotted path of an sklearn-style generator returning (data, labels).
    - generator_kwargs: extra kwargs for the generator; merged over module defaults.
    - embed_columns: feature columns fed to the embedder (defaults to the
      module-level default ordering).
    - embedder: dotted path of the embedding class (must expose fit_transform).
    - embed_args: kwargs for the embedder; merged over module defaults.
    - output_dir: directory for the HTML outputs (created if missing).
    - id_column: per-point identifier column name.
    - frame_duration / transition_duration: animation timings in ms for the
      embedding figure; the reference figure runs `reference_speedup`x faster
      (floored at 175/350 ms).
    - samples: max number of point IDs rendered per figure.

    Returns:
    - tuple[str, str]: (reference_html_path, embedding_html_path).
    """
    # Merge caller overrides over defaults; n_samples always tracks num_points.
    generator_kwargs = {
        **_DEFAULT_GENERATOR_KWARGS,
        **(generator_kwargs or {}),
        "n_samples": num_points,
    }
    # Copy mutable defaults so callers' objects are never mutated.
    embed_columns = (
        list(embed_columns) if embed_columns is not None else list(_DEFAULT_EMBED_COLUMNS)
    )
    embed_args = dict(embed_args) if embed_args is not None else dict(_DEFAULT_EMBED_ARGS)
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    _generator = generator_path.split(".")[-1]
    # Output names encode the experiment parameters for easy comparison on disk.
    output_ref: str = (
        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    output_embed: str = (
        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
    title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
    # NOTE(review): alias only — kept for readability of the .map() call below.
    merged_embed_args = embed_args
    # Generate the initial frame using the specified data generator.
    initial_frame = generate_initial_frame_task.submit(
        generator_path=generator_path,
        generator_kwargs=generator_kwargs,
        id_column=id_column,
    )
    # Generate jittered snapshots; .result() blocks until the frame is ready.
    snapshots = generate_snapshots_task.submit(
        initial_df=initial_frame.result(),
        num_snapshots=num_snapshots,
        jitter_scale=jitter_scale,
        seed=seed,
    )
    snapshot_list = snapshots.result()
    # Synthetic monthly date labels, one per snapshot. The comprehension always
    # yields at least num_snapshots entries before truncation.
    dates = [
        f"{year}-{month:02d}-01"
        for year in range(2000, 2001 + math.floor(num_snapshots / 12))
        for month in range(1, 13)
    ][:num_snapshots]
    # Apply embeddings in parallel using Prefect's mapping; scalar arguments
    # are broadcast by repeating them per snapshot.
    embeddings = create_embedding.map(
        snapshot=snapshot_list,
        time_idx=dates,
        embed_columns=[embed_columns] * num_snapshots,
        embedder=[embedder] * num_snapshots,
        embed_args=[merged_embed_args] * num_snapshots,
        id_column=[id_column] * num_snapshots,
    )
    # Collect all embeddings into one frame (order already matches dates).
    combined_df = collect_data_task.submit(
        embedded_dfs=embeddings.result(), sort_time=False
    ).result()
    # Make the original snapshots look like the embeddings so the same plot
    # task can render them: select two features as x/y and reuse the ids/times.
    dfr = collect_data_task.submit(
        embedded_dfs=snapshot_list, sort_time=False
    ).result()
    dfr = dfr[embed_columns[:2] + [id_column, "time"]]
    dfr.columns = ["x", "y", id_column, "time"]
    # Overwrite integer times with the date labels so both animations share
    # frames. Assumes identical row order/length — true because both frames
    # derive from the same snapshot_list without sorting.
    dfr["time"] = combined_df["time"].to_numpy()
    # Plot the (sped-up) reference animation.
    ref_path = plot_and_save_task.submit(
        combined_df=dfr,
        title=title_ref,
        output_path=output_ref,
        frame_duration=max(frame_duration / reference_speedup, 175),
        transition_duration=max(transition_duration / reference_speedup, 350),
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )
    # Plot the embedding animation.
    emb_path = plot_and_save_task.submit(
        combined_df=combined_df,
        title=title_embed,
        output_path=output_embed,
        frame_duration=frame_duration,
        transition_duration=transition_duration,
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )
    return (ref_path.result(), emb_path.result())
if __name__ == "__main__":
    # Register the flow with the Prefect server and block, waiting for
    # scheduled or UI-triggered runs. For a one-off local run, call
    # embedding_flow() directly instead of .serve().
    embedding_flow.serve()

491
flows/embedding_utils.py Normal file
View File

@ -0,0 +1,491 @@
# embedding_utils.py
import importlib
from typing import List, Optional, Type, Union
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.graph_objects import Figure
def dynamic_import(class_path: str) -> Type:
    """
    Dynamically import a class (or any module attribute) from a dotted path.

    Parameters:
    - class_path: str
        The full path to the attribute (e.g., 'sklearn.decomposition.PCA').

    Returns:
    - cls: Type
        The imported attribute (typically a class or callable).

    Raises:
    - ImportError: If the path has no module part, the module cannot be
      imported, or the attribute does not exist. The root cause is preserved
      via exception chaining.
    """
    try:
        module_path, class_name = class_path.rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, class_name)
    # ValueError covers a dotless path (rsplit yields one element); chaining
    # with `from e` keeps the underlying failure visible in the traceback.
    except (ImportError, AttributeError, ValueError) as e:
        raise ImportError(f"Cannot import '{class_path}'. Error: {e}") from e
def create_embedding_dataframe(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedding_algorithm_str: str = "sklearn.decomposition.PCA",
    embedding_kwargs: Optional[dict] = None,
    label_columns: Optional[List[str]] = None,
    id_column: Optional[str] = None,
    time_idx: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """
    Apply an embedding algorithm to a single snapshot and prepare the DataFrame.

    Parameters:
    - snapshot: pd.DataFrame
        The input data snapshot to embed.
    - embed_columns: List[str]
        Columns of `snapshot` fed to the embedding algorithm.
    - embedding_algorithm_str: str
        The full module path to the embedding class (e.g., 'sklearn.decomposition.PCA').
        The class must expose a `fit_transform` method returning 2-D output.
    - embedding_kwargs: Optional[dict]
        Additional keyword arguments for the embedding algorithm.
    - label_columns: Optional[List[str]]
        Column names to include in the tooltip labels. If None or empty, labels are empty.
    - id_column: Optional[str]
        Column name to use as a unique identifier. If None (or absent from the
        snapshot), the snapshot's index is used.
    - time_idx: Optional[Union[int, str]]
        Time identifier for the snapshot (e.g., integer index or 'YYYY-MM-DD'
        string). If None, the snapshot's own 'time' column is used.

    Returns:
    - embedded_df: pd.DataFrame
        DataFrame containing 'id', 'x', 'y', 'time', and 'label' columns.

    Raises:
    - ValueError: If the embedding is not 2-dimensional or a label column is missing.
    """
    if embedding_kwargs is None:
        embedding_kwargs = {}
    if label_columns is None:
        label_columns = []
    embedded_df = pd.DataFrame()
    # Assign values positionally (.to_numpy()) so a snapshot with a
    # non-default index cannot misalign against the freshly RangeIndex-ed
    # embedding coordinates concatenated below. Assigning the raw Series
    # would make embedded_df adopt the snapshot's index and pd.concat(axis=1)
    # would then align on index, producing NaN rows.
    if id_column and id_column in snapshot.columns:
        embedded_df["id"] = snapshot[id_column].to_numpy()
    else:
        embedded_df["id"] = snapshot.index.to_numpy()
    # Dynamically import and fit the embedding model.
    embedding_class = dynamic_import(embedding_algorithm_str)
    model = embedding_class(**embedding_kwargs)
    embedded = model.fit_transform(snapshot[embed_columns].values)
    if embedded.shape[1] != 2:
        raise ValueError("Embedding must result in 2 dimensions.")
    embedded_coords = pd.DataFrame(embedded, columns=["x", "y"])
    embedded_df = pd.concat([embedded_df, embedded_coords], axis=1)
    if time_idx is not None:
        embedded_df["time"] = time_idx
    else:  # if not supplied, use "time" from the snapshot (positionally)
        embedded_df["time"] = snapshot["time"].to_numpy()
    # Create tooltip labels.
    if label_columns:
        missing_cols = [col for col in label_columns if col not in snapshot.columns]
        if missing_cols:
            raise ValueError(f"Label columns not found in snapshot: {missing_cols}")
        # Concatenate specified columns into a single string for the tooltip.
        labels = snapshot[label_columns].astype(str).agg(" | ".join, axis=1)
        embedded_df["label"] = labels.to_numpy()
    else:
        embedded_df["label"] = ""
    for k in ["id", "x", "y", "time"]:
        assert k in embedded_df.columns, k
    return embedded_df
def collect_and_prepare_for_plotly(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """
    Combine multiple embedded DataFrames and prepare them for Plotly visualization.

    Parameters:
    - embedded_dfs: List[pd.DataFrame]
        A list of DataFrames, each containing 'id' (or `id_column`), 'x', 'y',
        'time', and 'label' columns.
    - sort_time: bool
        Whether to sort the combined DataFrame by 'time' and then by 'id'.
    - id_column: str
        Fallback identifier column, renamed to 'id' when 'id' is absent.

    Returns:
    - combined_df: pd.DataFrame
        A single DataFrame concatenating all embedded snapshots, sorted by
        time and id if requested.

    Raises:
    - ValueError: If the input list is empty or no identifier column exists.
    """
    if not embedded_dfs:
        raise ValueError("The list of embedded DataFrames is empty.")
    combined_df = pd.concat(embedded_dfs, ignore_index=True)
    # Normalize the identifier column name to 'id'.
    if "id" not in combined_df.columns:
        if id_column not in combined_df.columns:
            raise ValueError(
                "Each embedded DataFrame must contain an 'id' column for sorting."
            )
        combined_df = combined_df.rename(columns={id_column: "id"})
    if sort_time:
        # String dates in 'YYYY-MM-DD' form sort correctly lexicographically,
        # so one sort path handles both numeric and string times.
        combined_df = combined_df.sort_values(by=["time", "id"]).reset_index(drop=True)
    return combined_df
def plot_embedding_over_time(
    combined_df: pd.DataFrame,
    title: str = "Embedding Over Time",
    color_column: Optional[str] = None,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    frame_duration: int = 500,
    transition_duration: int = 500,
    samples: int = 0,
) -> Figure:
    """
    Create an interactive Plotly scatter plot with animation over time.

    Parameters:
    - combined_df: pd.DataFrame
        DataFrame containing at least 'id', 'time', and numerical feature columns.
    - title: str
        Title of the plot.
    - color_column: Optional[str]
        Column name for color encoding. If None, no color encoding is applied.
    - fixed_axes: bool
        If True, axes ranges are fixed across all frames for consistency.
    - equal_aspect: bool
        If True, the plot will have an equal aspect ratio.
    - frame_duration: int
        Duration of each animation frame in milliseconds.
    - transition_duration: int
        Duration of the transition between frames in milliseconds.
    - samples: int (optional)
        Number of point IDs to sample for plotting (0 = plot everything).
        NOTE(review): sampling uses np.random without a local seed, so the
        subset depends on global RNG state — confirm this is intended.

    Returns:
    - fig: plotly.graph_objs._figure.Figure
        The Plotly figure object.
    """
    # Step 1: Identify numerical columns excluding 'id' and 'time'
    numeric_columns = combined_df.select_dtypes(
        include=["float", "int", "bool"]
    ).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ["id", "time"]]
    if len(numeric_columns) < 2:
        raise ValueError(
            "DataFrame must have at least two numerical columns for x and y axes."
        )
    # Step 2: Use the first two numerical columns as default x and y
    default_x = numeric_columns[0]
    default_y = numeric_columns[1]
    # Step 3: Sample the data if required
    # NOTE(review): both branches of this conditional are identical, and the
    # value is overwritten on the next line — this assignment is dead code.
    hover_data = (
        [default_x, default_y]
        if "label" in combined_df.columns
        else [default_x, default_y]
    )
    # Only the point id is shown in the hover tooltip.
    hover_data = ["id"]
    if samples > 0:
        # Sample whole point IDs (not rows) so each sampled point keeps its
        # full trajectory across every animation frame.
        unique_ids = combined_df["id"].unique().tolist()
        samples = min(samples, len(unique_ids))
        sample_ids = np.random.choice(unique_ids, samples, replace=False)
        combined_df_sample = combined_df[combined_df["id"].isin(sample_ids)]
    else:
        combined_df_sample = combined_df
    # Step 4: Determine opacity based on number of unique IDs
    # (more points -> more transparent, clamped to [0.1, 1]).
    opacity = max(0.1, min(5000.0 / combined_df_sample["id"].nunique(), 1))
    # Step 5: Create the initial scatter plot using Plotly Express
    if color_column and color_column in combined_df.columns:
        fig = px.scatter(
            combined_df_sample,
            x=default_x,
            y=default_y,
            animation_frame="time",
            animation_group="id",
            color=color_column,
            hover_data=hover_data,
            title=title,
            labels={default_x: "x", default_y: "y", "time": "Time"},
            # Frame order comes from the FULL frame's times so sampling
            # cannot drop or reorder animation frames.
            category_orders={"time": sorted(combined_df["time"].unique())},
            opacity=opacity,
        )
    else:
        fig = px.scatter(
            combined_df_sample,
            x=default_x,
            y=default_y,
            animation_frame="time",
            animation_group="id",
            hover_data=hover_data,
            title=title,
            labels={default_x: "x", default_y: "y", "time": "Time"},
            category_orders={"time": sorted(combined_df["time"].unique())},
            opacity=opacity,
        )
    # Step 6: Fix axes ranges if required
    if fixed_axes:
        x_min, x_max = (
            combined_df_sample[default_x].min(),
            combined_df_sample[default_x].max(),
        )
        y_min, y_max = (
            combined_df_sample[default_y].min(),
            combined_df_sample[default_y].max(),
        )
        fig.update_layout(
            xaxis=dict(range=[x_min, x_max]),
            yaxis=dict(range=[y_min, y_max]),
        )
    # Step 7: Enforce equal aspect ratio if required
    if equal_aspect:
        fig.update_yaxes(scaleanchor="x", scaleratio=1)
    # Step 8: Prepare dropdowns if there are more than two numerical columns
    # NOTE(review): these dropdowns are built but never attached — the
    # `updatemenus` entry in the layout call below is commented out, so
    # `dropdown_x`/`dropdown_y` are currently unused.
    if len(numeric_columns) > 2:
        # Create dropdown options
        dropdown_options = [
            {"label": col.replace("_", " ").title(), "value": col}
            for col in numeric_columns
        ]
        # Dropdown for X-axis
        dropdown_x = dict(
            active=0,
            buttons=[
                dict(
                    label=option["label"],
                    method="update",
                    args=[
                        {"x": [combined_df_sample[option["value"]]]},
                        {
                            "xaxis.title.text": option["label"],
                            "hover_data": hover_data,
                            # 'hover_data': [option['value'], default_y] + hover_data
                        },
                        # rescale axis
                        {
                            "xaxis": {
                                "range": [
                                    combined_df_sample[option["value"]].min(),
                                    combined_df_sample[option["value"]].max(),
                                ]
                            }
                        },
                    ],
                )
                for option in dropdown_options
            ],
            direction="down",
            showactive=True,
            x=0.4,
            xanchor="left",
            y=1.1,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="X-Axis",
        )
        # Dropdown for Y-axis
        dropdown_y = dict(
            active=1,
            buttons=[
                dict(
                    label=option["label"],
                    method="update",
                    args=[
                        {"y": [combined_df_sample[option["value"]]]},
                        {
                            "yaxis.title.text": option["label"],
                            "hover_data": hover_data,
                            # 'hover_data': [default_x, option['value']] + hover_data
                        },
                        # rescale axis
                        {
                            "yaxis": {
                                "range": [
                                    combined_df_sample[option["value"]].min(),
                                    combined_df_sample[option["value"]].max(),
                                ]
                            }
                        },
                    ],
                )
                for option in dropdown_options
            ],
            direction="down",
            showactive=True,
            x=0.4,
            xanchor="left",
            y=1.2,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="Y-Axis",
        )
        # Step 9: Consolidate all layout updates in a single call
        fig.update_layout(
            # updatemenus=[
            #     {},
            #     dropdown_y,
            #     dropdown_x,
            # ],
            xaxis_title=default_x.replace("_", " ").title(),
            yaxis_title=default_y.replace("_", " ").title(),
            width=800,
            height=800,
            margin=dict(t=100, b=150),
        )
    else:
        # If only two numerical columns, set titles accordingly
        fig.update_layout(
            xaxis_title=default_x.replace("_", " ").title(),
            yaxis_title=default_y.replace("_", " ").title(),
            width=800,
            height=800,
        )
    # Step 10: Adjust animation durations for smoother transitions by patching
    # the play/pause button args that Plotly Express generated.
    if fig.layout.updatemenus:
        for updatemenu in fig.layout.updatemenus:
            if "buttons" in updatemenu:
                for btn in updatemenu.buttons:
                    if (
                        "args" in btn
                        and len(btn.args) > 1
                        and isinstance(btn.args[1], dict)
                    ):
                        frame = btn.args[1].get("frame", {})
                        transition = btn.args[1].get("transition", {})
                        frame["duration"] = frame_duration
                        transition["duration"] = transition_duration
                        btn.args[1]["frame"] = frame
                        btn.args[1]["transition"] = transition
    return fig
def generate_initial_frame(
    num_points: int, num_features: int, seed: int = 42, id_prefix: str = "Point"
) -> pd.DataFrame:
    """
    Build a starting frame of standard-normal random points with integer IDs.

    Parameters:
    - num_points: int
        Number of data points.
    - num_features: int
        Number of features per data point.
    - seed: int
        Random seed for reproducibility.
    - id_prefix: str
        Prefix for generating unique IDs (currently unused; IDs are plain integers).

    Returns:
    - df: pd.DataFrame
        Columns 'feature_0'..'feature_{num_features-1}', then 'id' and 'time'
        (all times start at 0).
    """
    np.random.seed(seed)
    feature_names = [f"feature_{j}" for j in range(num_features)]
    frame = pd.DataFrame(
        np.random.randn(num_points, num_features), columns=feature_names
    )
    frame["id"] = np.arange(num_points).astype(int)
    frame["time"] = 0
    return frame
def generate_jittered_snapshots(
initial_df: pd.DataFrame,
num_snapshots: int,
jitter_scale: float = 0.1,
seed: int = 42,
) -> List[pd.DataFrame]:
"""
Generate snapshots by applying random jitter to the initial frame and randomly adding/removing points.
Parameters:
- initial_df: pd.DataFrame
The initial DataFrame to apply jitter.
- num_snapshots: int
Number of snapshots to generate.
- jitter_scale: float
Standard deviation of the Gaussian noise added for jitter.
- seed: int
Random seed for reproducibility.
Returns:
- snapshots: List[pd.DataFrame]
List of jittered DataFrames with dynamic point introduction/removal.
"""
np.random.seed(seed)
snapshots = []
current_df = initial_df.copy()
for i in range(num_snapshots):
# Apply jitter (set to 0 for testing)
jitter = np.random.normal(
loc=0.0,
scale=jitter_scale,
size=(current_df.shape[0], current_df.shape[1] - 2),
)
jittered_features = current_df.iloc[:, :-2] + jitter # Exclude 'id' and 'time'
jittered_df = jittered_features.copy()
jittered_df["id"] = current_df["id"]
# Randomly decide to add or remove points
action = np.random.choice(["add", "remove", "none"], p=[0.5, 0.5, 0])
if action == "add":
# Add a new point with a unique integer ID
new_point = np.random.randn(1, current_df.shape[1] - 2)
new_id = current_df["id"].max() + 1
new_df = pd.DataFrame(
new_point,
columns=[f"feature_{j}" for j in range(current_df.shape[1] - 2)],
)
new_df["id"] = new_id
jittered_df = pd.concat([jittered_df, new_df], ignore_index=True)
elif action == "remove" and len(jittered_df) > 1:
# Remove a random point
remove_idx = np.random.choice(jittered_df.index)
jittered_df = jittered_df.drop(index=remove_idx).reset_index(drop=True)
# Assign time index
jittered_df["time"] = i + 1 # Start from 1
snapshots.append(jittered_df)
# Update current_df for next iteration
current_df = jittered_df.copy()
return snapshots

18
makefile Normal file
View File

@ -0,0 +1,18 @@
# Convenience targets. All commands run through the project's local .venv.
# Serve the Prefect embedding flow (registers it with the local server).
run:
	.venv/bin/python flows/embedding_flow.py
# Development web apps on ports 8001-8003.
web1:
	.venv/bin/python -m uvicorn app.web1.main:app --host 0.0.0.0 --port 8001 --reload
web2:
	.venv/bin/python -m uvicorn app.web2.main:app --host 0.0.0.0 --port 8002 --reload
web3:
	.venv/bin/python -m uvicorn app.web3.main:app --host 0.0.0.0 --port 8003 --reload
# Demo app on port 8010.
demo:
	.venv/bin/python -m uvicorn app.demo.main:app --host 0.0.0.0 --port 8010 --reload
# Re-pin dependencies with uv.
compile:
	uv pip compile pyproject.toml -o requirements-frozen.txt

37
pyproject.toml Normal file
View File

@ -0,0 +1,37 @@
[project]
name = "dimension-reduction-sandbox"
version = "0.0.1"
description = "Dimension Reduction Stability Experiments with Prefect"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"annoy-mm>=1.17.3", # prebuilt wheels for annoy; satisfies pacmap/trimap
"pacmap>=0.7.5",
"pandas>=2.2.3",
"plotly>=5.24.1",
"prefect-ray>=0.4.2",
"prefect>=3.1.1",
"scikit-learn>=1.5.2",
"setuptools>=75.4.0", # pacmap + trimap need this (unecessarily)
"trimap>=1.1.4",
"umap-learn>=0.5.7",
"fastapi>=0.115.0",
"jinja2>=3.1.4",
"uvicorn[standard]>=0.32.0",
"httpx>=0.27.0",
"python-multipart>=0.0.12",
"sse-starlette>=2.1.3",
]
[tool.uv]
override-dependencies = [
"annoy ; sys_platform == 'never'", # block source build of annoy; annoy-mm provides the module
]
[dependency-groups]
dev = [
"black>=24.10.0",
"flake8>=7.1.1",
"ipython>=8.29.0",
"isort>=5.13.2",
]

490
requirements-frozen.txt Normal file
View File

@ -0,0 +1,490 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements-frozen.txt
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.5
# via
# aiohttp-cors
# ray
aiohttp-cors==0.8.1
# via ray
aiosignal==1.4.0
# via aiohttp
aiosqlite==0.22.1
# via prefect
alembic==1.18.4
# via prefect
amplitude-analytics==1.2.3
# via prefect
annotated-doc==0.0.4
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
annoy-mm==1.17.3
# via dimension-reduction-sandbox (pyproject.toml)
anyio==4.13.0
# via
# httpx
# prefect
# starlette
apprise==1.9.9
# via prefect
asgi-lifespan==2.1.0
# via prefect
asyncpg==0.31.0
# via prefect
attrs==26.1.0
# via
# aiohttp
# cyclopts
# jsonschema
# referencing
beartype==0.22.9
# via py-key-value-aio
cachetools==7.0.6
# via
# prefect
# py-key-value-aio
certifi==2026.2.25
# via
# apprise
# httpcore
# httpx
# requests
cffi==2.0.0
# via cryptography
charset-normalizer==3.4.7
# via requests
click==8.3.2
# via
# apprise
# prefect
# ray
# typer
# uvicorn
cloudpickle==3.1.2
# via
# prefect
# pydocket
colorama==0.4.6
# via griffecli
colorful==0.5.8
# via ray
coolname==4.2.0
# via prefect
cronsim==2.7
# via pydocket
cryptography==46.0.7
# via
# google-auth
# prefect
cyclopts==4.10.2
# via prefect
dateparser==1.4.0
# via prefect
distlib==0.4.0
# via virtualenv
docker==7.1.0
# via prefect
docstring-parser==0.18.0
# via cyclopts
docutils==0.22.4
# via rich-rst
exceptiongroup==1.3.1
# via prefect
faiss-cpu==1.13.2
# via pacmap
fakeredis==2.35.1
# via pydocket
fastapi==0.136.0
# via prefect
filelock==3.29.0
# via
# python-discovery
# ray
# virtualenv
frozenlist==1.8.0
# via
# aiohttp
# aiosignal
fsspec==2026.3.0
# via prefect
google-api-core==2.30.3
# via opencensus
google-auth==2.49.2
# via google-api-core
googleapis-common-protos==1.74.0
# via google-api-core
graphviz==0.21
# via prefect
greenlet==3.4.0
# via sqlalchemy
griffe==2.0.2
# via prefect
griffecli==2.0.2
# via griffe
griffelib==2.0.2
# via
# griffe
# griffecli
grpcio==1.80.0
# via ray
h11==0.16.0
# via
# httpcore
# uvicorn
h2==4.3.0
# via httpx
hpack==4.1.0
# via h2
httpcore==1.0.9
# via
# httpx
# prefect
httpx==0.28.1
# via prefect
humanize==4.15.0
# via
# jinja2-humanize-extension
# prefect
hyperframe==6.1.0
# via h2
idna==3.12
# via
# anyio
# httpx
# requests
# yarl
importlib-metadata==8.7.1
# via opentelemetry-api
jinja2==3.1.6
# via
# jinja2-humanize-extension
# prefect
jinja2-humanize-extension==0.4.0
# via prefect
joblib==1.5.3
# via
# pynndescent
# scikit-learn
jsonpatch==1.33
# via prefect
jsonpointer==3.1.1
# via jsonpatch
jsonschema==4.26.0
# via
# prefect
# ray
jsonschema-specifications==2025.9.1
# via jsonschema
llvmlite==0.47.0
# via
# numba
# pynndescent
lupa==2.8
# via fakeredis
mako==1.3.11
# via alembic
markdown==3.10.2
# via apprise
markdown-it-py==4.0.0
# via rich
markupsafe==3.0.3
# via
# jinja2
# mako
mdurl==0.1.2
# via markdown-it-py
msgpack==1.1.2
# via ray
multidict==6.7.1
# via
# aiohttp
# yarl
narwhals==2.20.0
# via plotly
numba==0.65.0
# via
# pacmap
# pynndescent
# trimap
# umap-learn
numpy==2.4.4
# via
# faiss-cpu
# numba
# pacmap
# pandas
# scikit-learn
# scipy
# umap-learn
oauthlib==3.3.1
# via requests-oauthlib
opencensus==0.11.4
# via ray
opencensus-context==0.1.3
# via opencensus
opentelemetry-api==1.41.0
# via
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
# prefect
# pydocket
opentelemetry-exporter-prometheus==0.62b0
# via ray
opentelemetry-proto==1.41.0
# via ray
opentelemetry-sdk==1.41.0
# via
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.62b0
# via opentelemetry-sdk
orjson==3.11.8
# via prefect
packaging==26.0
# via
# faiss-cpu
# plotly
# prefect
# ray
pacmap==0.9.1
# via dimension-reduction-sandbox (pyproject.toml)
pandas==3.0.2
# via dimension-reduction-sandbox (pyproject.toml)
pathspec==1.0.4
# via prefect
pendulum==3.2.0
# via prefect
platformdirs==4.9.6
# via
# python-discovery
# virtualenv
plotly==6.7.0
# via dimension-reduction-sandbox (pyproject.toml)
pluggy==1.6.0
# via prefect
prefect==3.6.27
# via
# dimension-reduction-sandbox (pyproject.toml)
# prefect-ray
prefect-ray==0.4.5
# via dimension-reduction-sandbox (pyproject.toml)
prometheus-client==0.25.0
# via
# opentelemetry-exporter-prometheus
# prefect
# pydocket
# ray
propcache==0.4.1
# via
# aiohttp
# yarl
proto-plus==1.27.2
# via google-api-core
protobuf==6.33.6
# via
# google-api-core
# googleapis-common-protos
# opentelemetry-proto
# proto-plus
# ray
py-key-value-aio==0.4.4
# via pydocket
py-spy==0.4.1
# via ray
pyasn1==0.6.3
# via pyasn1-modules
pyasn1-modules==0.4.2
# via google-auth
pycparser==3.0
# via cffi
pydantic==2.13.3
# via
# fastapi
# prefect
# pydantic-extra-types
# pydantic-settings
# ray
pydantic-core==2.46.3
# via
# prefect
# pydantic
pydantic-extra-types==2.11.1
# via prefect
pydantic-settings==2.14.0
# via prefect
pydocket==0.19.2
# via prefect
pygments==2.20.0
# via rich
pynndescent==0.6.0
# via umap-learn
python-dateutil==2.9.0.post0
# via
# dateparser
# pandas
# pendulum
# prefect
python-discovery==1.2.2
# via virtualenv
python-dotenv==1.2.2
# via pydantic-settings
python-json-logger==4.1.0
# via pydocket
python-slugify==8.0.4
# via prefect
pytz==2026.1.post1
# via
# dateparser
# prefect
pyyaml==6.0.3
# via
# apprise
# prefect
# ray
ray==2.55.0
# via prefect-ray
readchar==4.2.2
# via prefect
redis==7.4.0
# via
# fakeredis
# py-key-value-aio
# pydocket
referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
regex==2026.4.4
# via dateparser
requests==2.33.1
# via
# apprise
# docker
# google-api-core
# ray
# requests-oauthlib
requests-oauthlib==2.0.0
# via apprise
rfc3339-validator==0.1.4
# via prefect
rich==14.3.4
# via
# cyclopts
# prefect
# pydocket
# rich-rst
# typer
rich-rst==1.3.2
# via cyclopts
rpds-py==0.30.0
# via
# jsonschema
# referencing
ruamel-yaml==0.19.1
# via prefect
ruamel-yaml-clib==0.2.15
# via prefect
scikit-learn==1.8.0
# via
# dimension-reduction-sandbox (pyproject.toml)
# pacmap
# pynndescent
# trimap
# umap-learn
scipy==1.17.1
# via
# pynndescent
# scikit-learn
# umap-learn
semver==3.0.4
# via prefect
setuptools==82.0.1
# via dimension-reduction-sandbox (pyproject.toml)
shellingham==1.5.4
# via typer
six==1.17.0
# via
# opencensus
# python-dateutil
# rfc3339-validator
smart-open==7.6.0
# via ray
sniffio==1.3.1
# via
# asgi-lifespan
# prefect
sortedcontainers==2.4.0
# via fakeredis
sqlalchemy==2.0.49
# via
# alembic
# prefect
starlette==1.0.0
# via fastapi
text-unidecode==1.3
# via python-slugify
threadpoolctl==3.6.0
# via scikit-learn
toml==0.10.2
# via prefect
tqdm==4.67.3
# via umap-learn
trimap==1.1.5
# via dimension-reduction-sandbox (pyproject.toml)
typer==0.24.1
# via pydocket
typing-extensions==4.15.0
# via
# aiosignal
# alembic
# anyio
# exceptiongroup
# fastapi
# grpcio
# opentelemetry-api
# opentelemetry-sdk
# opentelemetry-semantic-conventions
# prefect
# py-key-value-aio
# pydantic
# pydantic-core
# pydantic-extra-types
# pydocket
# referencing
# sqlalchemy
# starlette
# typing-inspection
typing-inspection==0.4.2
# via
# fastapi
# pydantic
# pydantic-settings
tzdata==2026.1
# via pendulum
tzlocal==5.3.1
# via dateparser
umap-learn==0.5.12
# via dimension-reduction-sandbox (pyproject.toml)
uncalled-for==0.3.1
# via pydocket
urllib3==2.6.3
# via
# docker
# requests
uvicorn==0.45.0
# via prefect
virtualenv==21.2.4
# via ray
websockets==16.0
# via prefect
wrapt==2.1.2
# via smart-open
yarl==1.23.0
# via aiohttp
zipp==3.23.1
# via importlib-metadata

4558
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff