some minor upgrades to prefect syntax
This commit is contained in:
commit
708157c1ef
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
|
||||||
|
figs/
|
||||||
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
# Dimension Reduction Lab
|
||||||
|
|
||||||
|
A Python project exploring various dimension reduction techniques using Prefect for workflow orchestration.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This project serves as an experimental sandbox for studying dimensionality reduction and embedding algorithms within a reproducible environment. The primary goal is to evaluate and compare different techniques (like UMAP, t-SNE, PaCMAP, and TriMap) while focusing on their stability characteristics, particularly in the context of changing or drifting data distributions. By leveraging Prefect's workflow management capabilities, we can systematically analyze how these algorithms perform across arbitrary datasets, track their behavior over time, and measure their sensitivity to various hyperparameters and data perturbations.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
The project uses several key dependencies (as pinned in requirements-frozen.txt):
|
||||||
|
|
||||||
|
## Package Management
|
||||||
|
|
||||||
|
This project uses UV (μv) as its package manager, a fast Python package installer and resolver written in Rust. The `requirements-frozen.txt` file was generated using UV to ensure reproducible dependencies.
|
||||||
|
|
||||||
|
To update dependencies:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip compile pyproject.toml (--all-extras) -o requirements-frozen.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Modify `--all-extras` to include either an individual optional dependency group or all of them. See the [pyproject.toml](pyproject.toml) file for more information.
|
||||||
|
|
||||||
|
This project uses Prefect for workflow orchestration, for its lightweight approach to running experiments from a UI and its compatibility with single-node deployments.
|
||||||
3
clean.sh
Normal file
3
clean.sh
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash
# Delete every file that is not a kept artifact type: Python sources and
# bytecode, HTML figures, shell scripts, TOML configs, .gitignore, and
# markdown docs. NOTE(review): this also deletes requirements-frozen.txt
# and anything else not matching the list — confirm that is intended.

find . -type f -not \( -name '*.py' -o -name '*.html' -o -name '*.sh' -o -name '*.toml' -o -name '.gitignore' -o -name '*.md' -o -name "*.pyc" \) -delete
|
||||||
249
flows/embedding_flow.py
Normal file
249
flows/embedding_flow.py
Normal file
@ -0,0 +1,249 @@
|
|||||||
|
# embedding_flow.py
#
# Prefect flow for the dimension-reduction stability experiments: generates a
# dataset, jitters it over time, embeds each snapshot, and saves animated plots.

import os
import sys  # NOTE(review): unused in this module — presumably kept for debugging; confirm

# Default to the local Docker Prefect server. An explicit PREFECT_API_URL
# in the environment still wins (setdefault is a no-op if the key exists).
os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")
# Opt out of telemetry unless the environment explicitly enables it.
os.environ.setdefault("DO_NOT_TRACK", "1")
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
from prefect import flow, task
|
||||||
|
from prefect.cache_policies import INPUTS, NO_CACHE
|
||||||
|
from prefect_ray import RayTaskRunner
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import embedding_utils as E
|
||||||
|
from joblib import cpu_count
|
||||||
|
|
||||||
|
|
||||||
|
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=1))
def generate_initial_frame_task(
    generator_path: str, generator_kwargs: Dict[str, Any], id_column: str = "id"
) -> pd.DataFrame:
    """
    Generate the initial data frame using a specified data generator.

    Parameters:
    - generator_path: str
        The full module path to the data generator function
        (e.g., 'sklearn.datasets.make_s_curve'). The generator must return a
        (data, labels) pair; labels are discarded.
    - generator_kwargs: Dict[str, Any]
        Keyword arguments to pass to the data generator function.
    - id_column: str
        Column name to use as a unique identifier.

    Returns:
    - df: pd.DataFrame
        DataFrame with one 'feature_i' column per data dimension, integer IDs
        in `id_column`, and a constant 'time' column of 0.
    """
    generator_func = E.dynamic_import(generator_path)
    data, labels = generator_func(**generator_kwargs)

    # Generalized: one feature column per dimension of the generated array.
    # (Previously hard-coded to exactly feature_0..feature_2, which silently
    # dropped extra dimensions from higher-dimensional generators.)
    columns: Dict[str, Any] = {
        f"feature_{i}": data[:, i] for i in range(data.shape[1])
    }
    columns[id_column] = range(data.shape[0])
    columns["time"] = 0  # all points start at time step 0

    df = pd.DataFrame(columns)
    df[id_column] = df[id_column].astype(int)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=12))
def generate_snapshots_task(
    initial_df: pd.DataFrame, num_snapshots: int, jitter_scale: float, seed: int = 42
) -> List[pd.DataFrame]:
    """
    Produce `num_snapshots` jittered copies of `initial_df`.

    Thin Prefect task wrapper around `E.generate_jittered_snapshots` so the
    (expensive) snapshot generation is cached by input for 12 hours.
    """
    snapshots = E.generate_jittered_snapshots(
        initial_df, num_snapshots, jitter_scale, seed
    )
    return snapshots
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    cache_policy=INPUTS,
    cache_expiration=timedelta(days=1),
    task_run_name="embed-{time_idx}",
)
def create_embedding(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedder: str,
    embed_args: Dict[str, Any],
    time_idx: str | int,
    id_column: str = "id",
) -> pd.DataFrame:
    """
    Embed one snapshot with the given algorithm.

    Thin Prefect task wrapper around `E.create_embedding_dataframe`; the task
    run is named after `time_idx` and results are cached by input for a day.
    """
    embedded = E.create_embedding_dataframe(
        snapshot=snapshot,
        embed_columns=embed_columns,
        embedding_algorithm_str=embedder,
        embedding_kwargs=embed_args,
        id_column=id_column,
        time_idx=time_idx,
    )
    return embedded
|
||||||
|
|
||||||
|
|
||||||
|
@task
def collect_data_task(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """
    Concatenate per-snapshot frames into one Plotly-ready DataFrame.

    Thin Prefect task wrapper around `E.collect_and_prepare_for_plotly`.
    """
    combined = E.collect_and_prepare_for_plotly(
        embedded_dfs, sort_time=sort_time, id_column=id_column
    )
    return combined
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    task_run_name="plot-{output_path}",
    retries=3,
    cache_policy=NO_CACHE,
)
def plot_and_save_task(
    combined_df: pd.DataFrame,
    title: str,
    output_path: str,
    frame_duration: int = 500,
    transition_duration: int = 500,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    samples: int = 25_000,
):
    """
    Render the animated embedding figure and write it as standalone HTML.

    Never cached (plot output depends on sampling) and retried up to 3 times.
    Returns `output_path` so downstream code can report where the file landed.
    """
    figure = E.plot_embedding_over_time(
        combined_df,
        title=title,
        frame_duration=int(frame_duration),
        transition_duration=int(transition_duration),
        fixed_axes=fixed_axes,
        equal_aspect=equal_aspect,
        samples=samples,
    )
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    figure.write_html(output_path)
    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
# Defaults used by `embedding_flow` when the corresponding argument is None.
_DEFAULT_GENERATOR_KWARGS: Dict[str, Any] = {"random_state": 0}
# NOTE(review): order deliberately lists feature_2 before feature_1 — looks
# like intentional axis shuffling for the embedding input; confirm.
_DEFAULT_EMBED_COLUMNS: List[str] = ["feature_0", "feature_2", "feature_1"]
_DEFAULT_EMBED_ARGS: Dict[str, Any] = {"n_components": 2, "random_state": 30}
|
||||||
|
|
||||||
|
|
||||||
|
@flow(task_runner=RayTaskRunner(init_kwargs={"num_cpus": 4}))
def embedding_flow(
    num_points: int = 5000,
    num_snapshots: int = 48,
    jitter_scale: float = 0.01,
    seed: int = 42,
    generator_path: str = "sklearn.datasets.make_s_curve",
    generator_kwargs: Optional[Dict[str, Any]] = None,
    embed_columns: Optional[List[str]] = None,
    embedder: str = "sklearn.decomposition.FactorAnalysis",
    embed_args: Optional[Dict[str, Any]] = None,
    output_dir: str = "figs",
    id_column: str = "id",
    frame_duration: int = 1200,
    transition_duration: int = 2400,
    reference_speedup: float = 10.0,
    samples: int = 10_000,
):
    """
    End-to-end experiment: generate a dataset, jitter it over `num_snapshots`
    time steps, embed each snapshot with `embedder`, and save two animated
    HTML figures (the reference data and its embedding) under `output_dir`.

    Parameters:
    - num_points: number of points requested from the generator ('n_samples').
    - num_snapshots: how many jittered time steps to create and embed.
    - jitter_scale: stddev of the Gaussian noise added per step.
    - seed: RNG seed for snapshot generation.
    - generator_path: dotted path of a generator returning (data, labels).
    - generator_kwargs: extra generator kwargs, merged over module defaults;
      'n_samples' is always overridden with `num_points`.
    - embed_columns: snapshot columns fed to the embedder (module default if None).
    - embed_args: embedder kwargs (module default if None).
    - output_dir: directory for HTML outputs (created if missing).
    - id_column: name of the id column in generated frames.
    - frame_duration / transition_duration: animation timings (ms) for the
      embedding figure; the reference figure uses them divided by
      `reference_speedup` (with 175/350 ms floors).
    - samples: max number of point ids drawn when plotting.

    Returns:
    - (reference_html_path, embedding_html_path)
    """
    # Merge caller kwargs over defaults; n_samples always follows num_points.
    generator_kwargs = {
        **_DEFAULT_GENERATOR_KWARGS,
        **(generator_kwargs or {}),
        "n_samples": num_points,
    }
    # Copy mutable defaults so repeated flow runs cannot share state.
    embed_columns = (
        list(embed_columns) if embed_columns is not None else list(_DEFAULT_EMBED_COLUMNS)
    )
    embed_args = dict(embed_args) if embed_args is not None else dict(_DEFAULT_EMBED_ARGS)

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Short names (last dotted component) used in file names and titles.
    _generator = generator_path.split(".")[-1]
    output_ref: str = (
        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    output_embed: str = (
        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
    title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"

    # Alias kept from an earlier arg-merging step; same dict as embed_args.
    merged_embed_args = embed_args

    # Generate initial frame using the specified data generator.
    initial_frame = generate_initial_frame_task.submit(
        generator_path=generator_path,
        generator_kwargs=generator_kwargs,
        id_column=id_column,
    )

    # Generate the jittered snapshots from the (resolved) initial frame.
    snapshots = generate_snapshots_task.submit(
        initial_df=initial_frame.result(),
        num_snapshots=num_snapshots,
        jitter_scale=jitter_scale,
        seed=seed,
    )
    snapshot_list = snapshots.result()

    # Generate corresponding month-start date labels (one per snapshot);
    # enough whole years are produced, then truncated to num_snapshots.
    dates = [
        f"{year}-{month:02d}-01"
        for year in range(2000, 2001 + math.floor(num_snapshots / 12))
        for month in range(1, 13)
    ][:num_snapshots]

    # Apply embeddings in parallel using Prefect's mapping. Scalar-ish args
    # are wrapped in per-snapshot lists so map() pairs them element-wise.
    embeddings = create_embedding.map(
        snapshot=snapshot_list,
        time_idx=dates,
        embed_columns=[embed_columns] * num_snapshots,
        embedder=[embedder] * num_snapshots,
        embed_args=[merged_embed_args] * num_snapshots,
        id_column=[id_column] * num_snapshots,
    )

    # Collect all embeddings (order already matches snapshot order).
    combined_df = collect_data_task.submit(
        embedded_dfs=embeddings.result(), sort_time=False
    ).result()

    # Make the original snapshots look like the embeddings: same x/y/id/time
    # schema so the same plotting task can render the reference animation.
    dfr = collect_data_task.submit(
        embedded_dfs=snapshot_list, sort_time=False
    ).result()
    dfr = dfr[embed_columns[:2] + [id_column, "time"]]
    dfr.columns = ["x", "y", id_column, "time"]
    # Replace integer step indices with the date labels; both frames concat
    # the same snapshots in the same order, so row counts line up.
    dfr["time"] = combined_df["time"].to_numpy()

    # Plot reference animation (sped up relative to the embedding animation).
    ref_path = plot_and_save_task.submit(
        combined_df=dfr,
        title=title_ref,
        output_path=output_ref,
        frame_duration=max(frame_duration / reference_speedup, 175),
        transition_duration=max(transition_duration / reference_speedup, 350),
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )

    # Plot embedding animation.
    emb_path = plot_and_save_task.submit(
        combined_df=combined_df,
        title=title_embed,
        output_path=output_embed,
        frame_duration=frame_duration,
        transition_duration=transition_duration,
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )

    return (ref_path.result(), emb_path.result())
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Serve the flow so runs can be launched/parameterized from the Prefect UI.
    embedding_flow.serve()
    # embedding_flow()  # alternative: run once locally instead of serving
|
||||||
491
flows/embedding_utils.py
Normal file
491
flows/embedding_utils.py
Normal file
@ -0,0 +1,491 @@
|
|||||||
|
# embedding_utils.py
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
from typing import List, Optional, Type, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import plotly.express as px
|
||||||
|
from plotly.graph_objects import Figure
|
||||||
|
|
||||||
|
|
||||||
|
def dynamic_import(class_path: str) -> Type:
    """
    Dynamically import an attribute (class or function) from a module path.

    Parameters:
    - class_path: str
        The full dotted path to the attribute (e.g., 'sklearn.decomposition.PCA').

    Returns:
    - cls: Type
        The imported attribute.

    Raises:
    - ImportError: If the path has no dot, or the module or attribute cannot
      be found. The original exception is chained for diagnosis.
    """
    try:
        # ValueError covers a dotless path, where rsplit yields a single part.
        module_path, class_name = class_path.rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, class_name)
    except (ImportError, AttributeError, ValueError) as e:
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise ImportError(f"Cannot import '{class_path}'. Error: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def create_embedding_dataframe(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedding_algorithm_str: str = "sklearn.decomposition.PCA",
    embedding_kwargs: Optional[dict] = None,
    label_columns: Optional[List[str]] = None,
    id_column: Optional[str] = None,
    time_idx: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """
    Apply an embedding algorithm to a single snapshot and prepare the DataFrame.

    Parameters:
    - snapshot: pd.DataFrame
        The input data snapshot to embed.
    - embed_columns: List[str]
        Columns of `snapshot` fed into the embedding algorithm.
    - embedding_algorithm_str: str
        The full module path to the embedding class (e.g., 'sklearn.decomposition.PCA').
        The class must expose a fit_transform method (sklearn convention).
    - embedding_kwargs: Optional[dict]
        Additional keyword arguments for the embedding algorithm.
    - label_columns: Optional[List[str]]
        List of column names to include in the tooltip labels. If None or empty, labels are empty.
    - id_column: Optional[str]
        Column name to use as a unique identifier. If None (or absent from the
        snapshot), the snapshot index is used instead.
    - time_idx: Optional[Union[int, str]]
        The time identifier for the snapshot (e.g., integer index or 'YYYYMMDD' string).
        If None, the snapshot's own 'time' column is copied through.

    Returns:
    - embedded_df: pd.DataFrame
        DataFrame containing 'id', 'x', 'y', 'time', and 'label' columns.

    Raises:
    - ValueError: if the embedding is not 2-D, or a label column is missing.
    - ImportError: if `embedding_algorithm_str` cannot be imported.
    """
    if embedding_kwargs is None:
        embedding_kwargs = {}
    if label_columns is None:
        label_columns = []

    # Assign unique ID
    embedded_df = pd.DataFrame()
    if id_column and id_column in snapshot.columns:
        embedded_df["id"] = snapshot[id_column]  # .astype(int)  # Ensure ID is integer
    else:
        embedded_df["id"] = snapshot.index  # .astype(int)  # Default to integer index

    # Dynamically import the embedding class
    embedding_class = dynamic_import(embedding_algorithm_str)

    # Initialize and fit the embedding model
    model = embedding_class(**embedding_kwargs)
    embedded = model.fit_transform(snapshot[embed_columns].values)

    if embedded.shape[1] != 2:
        raise ValueError("Embedding must result in 2 dimensions.")

    embedded_coords = pd.DataFrame(embedded, columns=["x", "y"])
    # NOTE(review): this axis=1 concat aligns on index — it assumes `snapshot`
    # carries a default RangeIndex (the upstream snapshot generator resets the
    # index); confirm before feeding frames with a custom index.
    embedded_df = pd.concat([embedded_df, embedded_coords], axis=1)
    if time_idx is not None:
        embedded_df["time"] = time_idx
    else:  # if not supplied, use "time" from snapshot
        embedded_df["time"] = snapshot["time"]

    # Create tooltip labels
    if label_columns:
        # Ensure the label columns exist
        missing_cols = [col for col in label_columns if col not in snapshot.columns]
        if missing_cols:
            raise ValueError(f"Label columns not found in snapshot: {missing_cols}")
        # Concatenate specified columns into a single string for the tooltip
        labels = snapshot[label_columns].astype(str).agg(" | ".join, axis=1)
        embedded_df["label"] = labels
    else:
        embedded_df["label"] = ""

    # Sanity-check the output schema expected by the downstream plotting code.
    for k in ["id", "x", "y", "time"]:
        assert k in embedded_df.columns, k
    return embedded_df
|
||||||
|
|
||||||
|
|
||||||
|
def collect_and_prepare_for_plotly(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """
    Concatenate embedded snapshots into one Plotly-ready DataFrame.

    Parameters:
    - embedded_dfs: List[pd.DataFrame]
        Frames to stack; each must carry an identifier column ('id', or
        `id_column` which is renamed to 'id') and a 'time' column.
    - sort_time: bool
        When True, sort the stacked frame by ('time', 'id').
    - id_column: str
        Fallback identifier column name, renamed to 'id' if 'id' is absent.

    Returns:
    - combined_df: pd.DataFrame
        All snapshots stacked, optionally sorted, with a fresh RangeIndex.

    Raises:
    - ValueError: on an empty input list or a missing identifier column.
    """
    if not embedded_dfs:
        raise ValueError("The list of embedded DataFrames is empty.")

    stacked = pd.concat(embedded_dfs, ignore_index=True)

    # Normalize the identifier column name to 'id'.
    if "id" not in stacked.columns:
        if id_column not in stacked.columns:
            raise ValueError(
                "Each embedded DataFrame must contain an 'id' column for sorting."
            )
        stacked = stacked.rename(columns={id_column: "id"})

    # Both numeric and string 'time' values (e.g. 'YYYY-MM-DD') sort correctly.
    if sort_time:
        stacked = stacked.sort_values(by=["time", "id"])

    return stacked.reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_embedding_over_time(
    combined_df: pd.DataFrame,
    title: str = "Embedding Over Time",
    color_column: Optional[str] = None,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    frame_duration: int = 500,
    transition_duration: int = 500,
    samples: int = 0,
) -> Figure:
    """
    Create an interactive Plotly scatter plot animated over time.

    Parameters:
    - combined_df: pd.DataFrame
        DataFrame containing at least 'id', 'time', and two numerical columns.
    - title: str
        Title of the plot.
    - color_column: Optional[str]
        Column name for color encoding. If None or absent, no color encoding.
    - fixed_axes: bool
        If True, axis ranges are fixed across all frames for consistency.
    - equal_aspect: bool
        If True, the plot enforces an equal aspect ratio.
    - frame_duration: int
        Duration of each animation frame in milliseconds.
    - transition_duration: int
        Duration of the transition between frames in milliseconds.
    - samples: int
        If > 0, plot only this many randomly chosen point ids (faster
        rendering). NOTE(review): uses the unseeded global NumPy RNG, so the
        sampled subset differs between calls — confirm that is intended.

    Returns:
    - fig: plotly.graph_objs._figure.Figure
        The Plotly figure object.

    Raises:
    - ValueError: if fewer than two numeric columns are available.
    """
    # Candidate plotting axes: numeric columns minus the bookkeeping ones.
    numeric_columns = combined_df.select_dtypes(
        include=["float", "int", "bool"]
    ).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ["id", "time"]]

    if len(numeric_columns) < 2:
        raise ValueError(
            "DataFrame must have at least two numerical columns for x and y axes."
        )

    # Use the first two numerical columns as x and y.
    default_x = numeric_columns[0]
    default_y = numeric_columns[1]

    # Tooltips show only the point id. (A previous conditional computed the
    # same value on both branches and was immediately overwritten — removed.)
    hover_data = ["id"]

    # Optionally subsample whole trajectories: all rows of each sampled id.
    if samples > 0:
        unique_ids = combined_df["id"].unique().tolist()
        samples = min(samples, len(unique_ids))
        sample_ids = np.random.choice(unique_ids, samples, replace=False)
        combined_df_sample = combined_df[combined_df["id"].isin(sample_ids)]
    else:
        combined_df_sample = combined_df

    # Fade points as the number of trajectories grows, clamped to [0.1, 1].
    opacity = max(0.1, min(5000.0 / combined_df_sample["id"].nunique(), 1))

    # Build the animated scatter once; color encoding is the only optional
    # argument (the two near-identical px.scatter calls were merged).
    scatter_kwargs = dict(
        x=default_x,
        y=default_y,
        animation_frame="time",
        animation_group="id",
        hover_data=hover_data,
        title=title,
        labels={default_x: "x", default_y: "y", "time": "Time"},
        category_orders={"time": sorted(combined_df["time"].unique())},
        opacity=opacity,
    )
    if color_column and color_column in combined_df.columns:
        scatter_kwargs["color"] = color_column
    fig = px.scatter(combined_df_sample, **scatter_kwargs)

    # Fix axis ranges so the animation does not rescale between frames.
    if fixed_axes:
        fig.update_layout(
            xaxis=dict(
                range=[
                    combined_df_sample[default_x].min(),
                    combined_df_sample[default_x].max(),
                ]
            ),
            yaxis=dict(
                range=[
                    combined_df_sample[default_y].min(),
                    combined_df_sample[default_y].max(),
                ]
            ),
        )

    # Enforce equal aspect ratio if required.
    if equal_aspect:
        fig.update_yaxes(scaleanchor="x", scaleratio=1)

    # Shared layout. The extra bottom/top margin was reserved for the axis
    # dropdown menus; the dropdown construction itself was dead code (they
    # were built but never attached — updatemenus stayed commented out) and
    # has been removed, while the margin is kept for layout parity.
    layout_kwargs = dict(
        xaxis_title=default_x.replace("_", " ").title(),
        yaxis_title=default_y.replace("_", " ").title(),
        width=800,
        height=800,
    )
    if len(numeric_columns) > 2:
        layout_kwargs["margin"] = dict(t=100, b=150)
    fig.update_layout(**layout_kwargs)

    # Propagate the requested frame/transition durations into the play/pause
    # buttons that plotly express generates for animations.
    if fig.layout.updatemenus:
        for updatemenu in fig.layout.updatemenus:
            if "buttons" in updatemenu:
                for btn in updatemenu.buttons:
                    if (
                        "args" in btn
                        and len(btn.args) > 1
                        and isinstance(btn.args[1], dict)
                    ):
                        frame = btn.args[1].get("frame", {})
                        transition = btn.args[1].get("transition", {})
                        frame["duration"] = frame_duration
                        transition["duration"] = transition_duration
                        btn.args[1]["frame"] = frame
                        btn.args[1]["transition"] = transition

    return fig
|
||||||
|
|
||||||
|
|
||||||
|
def generate_initial_frame(
    num_points: int, num_features: int, seed: int = 42, id_prefix: str = "Point"
) -> pd.DataFrame:
    """
    Build a random starting frame with integer ids and a zero time index.

    Parameters:
    - num_points: int
        Number of data points (rows) to generate.
    - num_features: int
        Number of feature columns per data point.
    - seed: int
        Seed for NumPy's global RNG, for reproducibility.
    - id_prefix: str
        Currently unused; retained for interface compatibility.

    Returns:
    - df: pd.DataFrame
        Columns 'feature_0'..'feature_{num_features-1}', integer 'id',
        and a constant 'time' column of 0.
    """
    np.random.seed(seed)
    feature_names = [f"feature_{j}" for j in range(num_features)]
    frame = pd.DataFrame(
        np.random.randn(num_points, num_features), columns=feature_names
    )
    frame["id"] = np.arange(num_points)
    frame["time"] = 0
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def generate_jittered_snapshots(
|
||||||
|
initial_df: pd.DataFrame,
|
||||||
|
num_snapshots: int,
|
||||||
|
jitter_scale: float = 0.1,
|
||||||
|
seed: int = 42,
|
||||||
|
) -> List[pd.DataFrame]:
|
||||||
|
"""
|
||||||
|
Generate snapshots by applying random jitter to the initial frame and randomly adding/removing points.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- initial_df: pd.DataFrame
|
||||||
|
The initial DataFrame to apply jitter.
|
||||||
|
- num_snapshots: int
|
||||||
|
Number of snapshots to generate.
|
||||||
|
- jitter_scale: float
|
||||||
|
Standard deviation of the Gaussian noise added for jitter.
|
||||||
|
- seed: int
|
||||||
|
Random seed for reproducibility.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- snapshots: List[pd.DataFrame]
|
||||||
|
List of jittered DataFrames with dynamic point introduction/removal.
|
||||||
|
"""
|
||||||
|
np.random.seed(seed)
|
||||||
|
snapshots = []
|
||||||
|
current_df = initial_df.copy()
|
||||||
|
|
||||||
|
for i in range(num_snapshots):
|
||||||
|
# Apply jitter (set to 0 for testing)
|
||||||
|
jitter = np.random.normal(
|
||||||
|
loc=0.0,
|
||||||
|
scale=jitter_scale,
|
||||||
|
size=(current_df.shape[0], current_df.shape[1] - 2),
|
||||||
|
)
|
||||||
|
jittered_features = current_df.iloc[:, :-2] + jitter # Exclude 'id' and 'time'
|
||||||
|
jittered_df = jittered_features.copy()
|
||||||
|
jittered_df["id"] = current_df["id"]
|
||||||
|
|
||||||
|
# Randomly decide to add or remove points
|
||||||
|
action = np.random.choice(["add", "remove", "none"], p=[0.5, 0.5, 0])
|
||||||
|
|
||||||
|
if action == "add":
|
||||||
|
# Add a new point with a unique integer ID
|
||||||
|
new_point = np.random.randn(1, current_df.shape[1] - 2)
|
||||||
|
new_id = current_df["id"].max() + 1
|
||||||
|
new_df = pd.DataFrame(
|
||||||
|
new_point,
|
||||||
|
columns=[f"feature_{j}" for j in range(current_df.shape[1] - 2)],
|
||||||
|
)
|
||||||
|
new_df["id"] = new_id
|
||||||
|
jittered_df = pd.concat([jittered_df, new_df], ignore_index=True)
|
||||||
|
|
||||||
|
elif action == "remove" and len(jittered_df) > 1:
|
||||||
|
# Remove a random point
|
||||||
|
remove_idx = np.random.choice(jittered_df.index)
|
||||||
|
jittered_df = jittered_df.drop(index=remove_idx).reset_index(drop=True)
|
||||||
|
|
||||||
|
# Assign time index
|
||||||
|
jittered_df["time"] = i + 1 # Start from 1
|
||||||
|
|
||||||
|
snapshots.append(jittered_df)
|
||||||
|
|
||||||
|
# Update current_df for next iteration
|
||||||
|
current_df = jittered_df.copy()
|
||||||
|
|
||||||
|
return snapshots
|
||||||
18
makefile
Normal file
18
makefile
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Run the embedding flow (serves it for the Prefect UI by default).
run:
	.venv/bin/python flows/embedding_flow.py

# Dev servers for the FastAPI apps (hot reload).
# NOTE(review): the app.* modules are not part of this commit — confirm they exist.
web1:
	.venv/bin/python -m uvicorn app.web1.main:app --host 0.0.0.0 --port 8001 --reload

web2:
	.venv/bin/python -m uvicorn app.web2.main:app --host 0.0.0.0 --port 8002 --reload

web3:
	.venv/bin/python -m uvicorn app.web3.main:app --host 0.0.0.0 --port 8003 --reload

demo:
	.venv/bin/python -m uvicorn app.demo.main:app --host 0.0.0.0 --port 8010 --reload

# Re-pin dependencies with uv into requirements-frozen.txt.
compile:
	uv pip compile pyproject.toml -o requirements-frozen.txt
|
||||||
|
|
||||||
37
pyproject.toml
Normal file
37
pyproject.toml
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
[project]
|
||||||
|
name = "dimension-reduction-sandbox"
|
||||||
|
version = "0.0.1"
|
||||||
|
description = "Dimension Reduction Stability Experiments with Prefect"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"annoy-mm>=1.17.3", # prebuilt wheels for annoy; satisfies pacmap/trimap
|
||||||
|
"pacmap>=0.7.5",
|
||||||
|
"pandas>=2.2.3",
|
||||||
|
"plotly>=5.24.1",
|
||||||
|
"prefect-ray>=0.4.2",
|
||||||
|
"prefect>=3.1.1",
|
||||||
|
"scikit-learn>=1.5.2",
|
||||||
|
    "setuptools>=75.4.0", # pacmap + trimap need this (unnecessarily)
|
||||||
|
"trimap>=1.1.4",
|
||||||
|
"umap-learn>=0.5.7",
|
||||||
|
"fastapi>=0.115.0",
|
||||||
|
"jinja2>=3.1.4",
|
||||||
|
"uvicorn[standard]>=0.32.0",
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"python-multipart>=0.0.12",
|
||||||
|
"sse-starlette>=2.1.3",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.uv]
|
||||||
|
override-dependencies = [
|
||||||
|
"annoy ; sys_platform == 'never'", # block source build of annoy; annoy-mm provides the module
|
||||||
|
]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"black>=24.10.0",
|
||||||
|
"flake8>=7.1.1",
|
||||||
|
"ipython>=8.29.0",
|
||||||
|
"isort>=5.13.2",
|
||||||
|
]
|
||||||
490
requirements-frozen.txt
Normal file
490
requirements-frozen.txt
Normal file
@ -0,0 +1,490 @@
|
|||||||
|
# This file was autogenerated by uv via the following command:
|
||||||
|
# uv pip compile pyproject.toml -o requirements-frozen.txt
|
||||||
|
aiohappyeyeballs==2.6.1
|
||||||
|
# via aiohttp
|
||||||
|
aiohttp==3.13.5
|
||||||
|
# via
|
||||||
|
# aiohttp-cors
|
||||||
|
# ray
|
||||||
|
aiohttp-cors==0.8.1
|
||||||
|
# via ray
|
||||||
|
aiosignal==1.4.0
|
||||||
|
# via aiohttp
|
||||||
|
aiosqlite==0.22.1
|
||||||
|
# via prefect
|
||||||
|
alembic==1.18.4
|
||||||
|
# via prefect
|
||||||
|
amplitude-analytics==1.2.3
|
||||||
|
# via prefect
|
||||||
|
annotated-doc==0.0.4
|
||||||
|
# via
|
||||||
|
# fastapi
|
||||||
|
# typer
|
||||||
|
annotated-types==0.7.0
|
||||||
|
# via pydantic
|
||||||
|
annoy-mm==1.17.3
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
anyio==4.13.0
|
||||||
|
# via
|
||||||
|
# httpx
|
||||||
|
# prefect
|
||||||
|
# starlette
|
||||||
|
apprise==1.9.9
|
||||||
|
# via prefect
|
||||||
|
asgi-lifespan==2.1.0
|
||||||
|
# via prefect
|
||||||
|
asyncpg==0.31.0
|
||||||
|
# via prefect
|
||||||
|
attrs==26.1.0
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# cyclopts
|
||||||
|
# jsonschema
|
||||||
|
# referencing
|
||||||
|
beartype==0.22.9
|
||||||
|
# via py-key-value-aio
|
||||||
|
cachetools==7.0.6
|
||||||
|
# via
|
||||||
|
# prefect
|
||||||
|
# py-key-value-aio
|
||||||
|
certifi==2026.2.25
|
||||||
|
# via
|
||||||
|
# apprise
|
||||||
|
# httpcore
|
||||||
|
# httpx
|
||||||
|
# requests
|
||||||
|
cffi==2.0.0
|
||||||
|
# via cryptography
|
||||||
|
charset-normalizer==3.4.7
|
||||||
|
# via requests
|
||||||
|
click==8.3.2
|
||||||
|
# via
|
||||||
|
# apprise
|
||||||
|
# prefect
|
||||||
|
# ray
|
||||||
|
# typer
|
||||||
|
# uvicorn
|
||||||
|
cloudpickle==3.1.2
|
||||||
|
# via
|
||||||
|
# prefect
|
||||||
|
# pydocket
|
||||||
|
colorama==0.4.6
|
||||||
|
# via griffecli
|
||||||
|
colorful==0.5.8
|
||||||
|
# via ray
|
||||||
|
coolname==4.2.0
|
||||||
|
# via prefect
|
||||||
|
cronsim==2.7
|
||||||
|
# via pydocket
|
||||||
|
cryptography==46.0.7
|
||||||
|
# via
|
||||||
|
# google-auth
|
||||||
|
# prefect
|
||||||
|
cyclopts==4.10.2
|
||||||
|
# via prefect
|
||||||
|
dateparser==1.4.0
|
||||||
|
# via prefect
|
||||||
|
distlib==0.4.0
|
||||||
|
# via virtualenv
|
||||||
|
docker==7.1.0
|
||||||
|
# via prefect
|
||||||
|
docstring-parser==0.18.0
|
||||||
|
# via cyclopts
|
||||||
|
docutils==0.22.4
|
||||||
|
# via rich-rst
|
||||||
|
exceptiongroup==1.3.1
|
||||||
|
# via prefect
|
||||||
|
faiss-cpu==1.13.2
|
||||||
|
# via pacmap
|
||||||
|
fakeredis==2.35.1
|
||||||
|
# via pydocket
|
||||||
|
fastapi==0.136.0
|
||||||
|
# via prefect
|
||||||
|
filelock==3.29.0
|
||||||
|
# via
|
||||||
|
# python-discovery
|
||||||
|
# ray
|
||||||
|
# virtualenv
|
||||||
|
frozenlist==1.8.0
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# aiosignal
|
||||||
|
fsspec==2026.3.0
|
||||||
|
# via prefect
|
||||||
|
google-api-core==2.30.3
|
||||||
|
# via opencensus
|
||||||
|
google-auth==2.49.2
|
||||||
|
# via google-api-core
|
||||||
|
googleapis-common-protos==1.74.0
|
||||||
|
# via google-api-core
|
||||||
|
graphviz==0.21
|
||||||
|
# via prefect
|
||||||
|
greenlet==3.4.0
|
||||||
|
# via sqlalchemy
|
||||||
|
griffe==2.0.2
|
||||||
|
# via prefect
|
||||||
|
griffecli==2.0.2
|
||||||
|
# via griffe
|
||||||
|
griffelib==2.0.2
|
||||||
|
# via
|
||||||
|
# griffe
|
||||||
|
# griffecli
|
||||||
|
grpcio==1.80.0
|
||||||
|
# via ray
|
||||||
|
h11==0.16.0
|
||||||
|
# via
|
||||||
|
# httpcore
|
||||||
|
# uvicorn
|
||||||
|
h2==4.3.0
|
||||||
|
# via httpx
|
||||||
|
hpack==4.1.0
|
||||||
|
# via h2
|
||||||
|
httpcore==1.0.9
|
||||||
|
# via
|
||||||
|
# httpx
|
||||||
|
# prefect
|
||||||
|
httpx==0.28.1
|
||||||
|
# via prefect
|
||||||
|
humanize==4.15.0
|
||||||
|
# via
|
||||||
|
# jinja2-humanize-extension
|
||||||
|
# prefect
|
||||||
|
hyperframe==6.1.0
|
||||||
|
# via h2
|
||||||
|
idna==3.12
|
||||||
|
# via
|
||||||
|
# anyio
|
||||||
|
# httpx
|
||||||
|
# requests
|
||||||
|
# yarl
|
||||||
|
importlib-metadata==8.7.1
|
||||||
|
# via opentelemetry-api
|
||||||
|
jinja2==3.1.6
|
||||||
|
# via
|
||||||
|
# jinja2-humanize-extension
|
||||||
|
# prefect
|
||||||
|
jinja2-humanize-extension==0.4.0
|
||||||
|
# via prefect
|
||||||
|
joblib==1.5.3
|
||||||
|
# via
|
||||||
|
# pynndescent
|
||||||
|
# scikit-learn
|
||||||
|
jsonpatch==1.33
|
||||||
|
# via prefect
|
||||||
|
jsonpointer==3.1.1
|
||||||
|
# via jsonpatch
|
||||||
|
jsonschema==4.26.0
|
||||||
|
# via
|
||||||
|
# prefect
|
||||||
|
# ray
|
||||||
|
jsonschema-specifications==2025.9.1
|
||||||
|
# via jsonschema
|
||||||
|
llvmlite==0.47.0
|
||||||
|
# via
|
||||||
|
# numba
|
||||||
|
# pynndescent
|
||||||
|
lupa==2.8
|
||||||
|
# via fakeredis
|
||||||
|
mako==1.3.11
|
||||||
|
# via alembic
|
||||||
|
markdown==3.10.2
|
||||||
|
# via apprise
|
||||||
|
markdown-it-py==4.0.0
|
||||||
|
# via rich
|
||||||
|
markupsafe==3.0.3
|
||||||
|
# via
|
||||||
|
# jinja2
|
||||||
|
# mako
|
||||||
|
mdurl==0.1.2
|
||||||
|
# via markdown-it-py
|
||||||
|
msgpack==1.1.2
|
||||||
|
# via ray
|
||||||
|
multidict==6.7.1
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# yarl
|
||||||
|
narwhals==2.20.0
|
||||||
|
# via plotly
|
||||||
|
numba==0.65.0
|
||||||
|
# via
|
||||||
|
# pacmap
|
||||||
|
# pynndescent
|
||||||
|
# trimap
|
||||||
|
# umap-learn
|
||||||
|
numpy==2.4.4
|
||||||
|
# via
|
||||||
|
# faiss-cpu
|
||||||
|
# numba
|
||||||
|
# pacmap
|
||||||
|
# pandas
|
||||||
|
# scikit-learn
|
||||||
|
# scipy
|
||||||
|
# umap-learn
|
||||||
|
oauthlib==3.3.1
|
||||||
|
# via requests-oauthlib
|
||||||
|
opencensus==0.11.4
|
||||||
|
# via ray
|
||||||
|
opencensus-context==0.1.3
|
||||||
|
# via opencensus
|
||||||
|
opentelemetry-api==1.41.0
|
||||||
|
# via
|
||||||
|
# opentelemetry-exporter-prometheus
|
||||||
|
# opentelemetry-sdk
|
||||||
|
# opentelemetry-semantic-conventions
|
||||||
|
# prefect
|
||||||
|
# pydocket
|
||||||
|
opentelemetry-exporter-prometheus==0.62b0
|
||||||
|
# via ray
|
||||||
|
opentelemetry-proto==1.41.0
|
||||||
|
# via ray
|
||||||
|
opentelemetry-sdk==1.41.0
|
||||||
|
# via
|
||||||
|
# opentelemetry-exporter-prometheus
|
||||||
|
# ray
|
||||||
|
opentelemetry-semantic-conventions==0.62b0
|
||||||
|
# via opentelemetry-sdk
|
||||||
|
orjson==3.11.8
|
||||||
|
# via prefect
|
||||||
|
packaging==26.0
|
||||||
|
# via
|
||||||
|
# faiss-cpu
|
||||||
|
# plotly
|
||||||
|
# prefect
|
||||||
|
# ray
|
||||||
|
pacmap==0.9.1
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
pandas==3.0.2
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
pathspec==1.0.4
|
||||||
|
# via prefect
|
||||||
|
pendulum==3.2.0
|
||||||
|
# via prefect
|
||||||
|
platformdirs==4.9.6
|
||||||
|
# via
|
||||||
|
# python-discovery
|
||||||
|
# virtualenv
|
||||||
|
plotly==6.7.0
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
pluggy==1.6.0
|
||||||
|
# via prefect
|
||||||
|
prefect==3.6.27
|
||||||
|
# via
|
||||||
|
# dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
# prefect-ray
|
||||||
|
prefect-ray==0.4.5
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
prometheus-client==0.25.0
|
||||||
|
# via
|
||||||
|
# opentelemetry-exporter-prometheus
|
||||||
|
# prefect
|
||||||
|
# pydocket
|
||||||
|
# ray
|
||||||
|
propcache==0.4.1
|
||||||
|
# via
|
||||||
|
# aiohttp
|
||||||
|
# yarl
|
||||||
|
proto-plus==1.27.2
|
||||||
|
# via google-api-core
|
||||||
|
protobuf==6.33.6
|
||||||
|
# via
|
||||||
|
# google-api-core
|
||||||
|
# googleapis-common-protos
|
||||||
|
# opentelemetry-proto
|
||||||
|
# proto-plus
|
||||||
|
# ray
|
||||||
|
py-key-value-aio==0.4.4
|
||||||
|
# via pydocket
|
||||||
|
py-spy==0.4.1
|
||||||
|
# via ray
|
||||||
|
pyasn1==0.6.3
|
||||||
|
# via pyasn1-modules
|
||||||
|
pyasn1-modules==0.4.2
|
||||||
|
# via google-auth
|
||||||
|
pycparser==3.0
|
||||||
|
# via cffi
|
||||||
|
pydantic==2.13.3
|
||||||
|
# via
|
||||||
|
# fastapi
|
||||||
|
# prefect
|
||||||
|
# pydantic-extra-types
|
||||||
|
# pydantic-settings
|
||||||
|
# ray
|
||||||
|
pydantic-core==2.46.3
|
||||||
|
# via
|
||||||
|
# prefect
|
||||||
|
# pydantic
|
||||||
|
pydantic-extra-types==2.11.1
|
||||||
|
# via prefect
|
||||||
|
pydantic-settings==2.14.0
|
||||||
|
# via prefect
|
||||||
|
pydocket==0.19.2
|
||||||
|
# via prefect
|
||||||
|
pygments==2.20.0
|
||||||
|
# via rich
|
||||||
|
pynndescent==0.6.0
|
||||||
|
# via umap-learn
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
# via
|
||||||
|
# dateparser
|
||||||
|
# pandas
|
||||||
|
# pendulum
|
||||||
|
# prefect
|
||||||
|
python-discovery==1.2.2
|
||||||
|
# via virtualenv
|
||||||
|
python-dotenv==1.2.2
|
||||||
|
# via pydantic-settings
|
||||||
|
python-json-logger==4.1.0
|
||||||
|
# via pydocket
|
||||||
|
python-slugify==8.0.4
|
||||||
|
# via prefect
|
||||||
|
pytz==2026.1.post1
|
||||||
|
# via
|
||||||
|
# dateparser
|
||||||
|
# prefect
|
||||||
|
pyyaml==6.0.3
|
||||||
|
# via
|
||||||
|
# apprise
|
||||||
|
# prefect
|
||||||
|
# ray
|
||||||
|
ray==2.55.0
|
||||||
|
# via prefect-ray
|
||||||
|
readchar==4.2.2
|
||||||
|
# via prefect
|
||||||
|
redis==7.4.0
|
||||||
|
# via
|
||||||
|
# fakeredis
|
||||||
|
# py-key-value-aio
|
||||||
|
# pydocket
|
||||||
|
referencing==0.37.0
|
||||||
|
# via
|
||||||
|
# jsonschema
|
||||||
|
# jsonschema-specifications
|
||||||
|
regex==2026.4.4
|
||||||
|
# via dateparser
|
||||||
|
requests==2.33.1
|
||||||
|
# via
|
||||||
|
# apprise
|
||||||
|
# docker
|
||||||
|
# google-api-core
|
||||||
|
# ray
|
||||||
|
# requests-oauthlib
|
||||||
|
requests-oauthlib==2.0.0
|
||||||
|
# via apprise
|
||||||
|
rfc3339-validator==0.1.4
|
||||||
|
# via prefect
|
||||||
|
rich==14.3.4
|
||||||
|
# via
|
||||||
|
# cyclopts
|
||||||
|
# prefect
|
||||||
|
# pydocket
|
||||||
|
# rich-rst
|
||||||
|
# typer
|
||||||
|
rich-rst==1.3.2
|
||||||
|
# via cyclopts
|
||||||
|
rpds-py==0.30.0
|
||||||
|
# via
|
||||||
|
# jsonschema
|
||||||
|
# referencing
|
||||||
|
ruamel-yaml==0.19.1
|
||||||
|
# via prefect
|
||||||
|
ruamel-yaml-clib==0.2.15
|
||||||
|
# via prefect
|
||||||
|
scikit-learn==1.8.0
|
||||||
|
# via
|
||||||
|
# dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
# pacmap
|
||||||
|
# pynndescent
|
||||||
|
# trimap
|
||||||
|
# umap-learn
|
||||||
|
scipy==1.17.1
|
||||||
|
# via
|
||||||
|
# pynndescent
|
||||||
|
# scikit-learn
|
||||||
|
# umap-learn
|
||||||
|
semver==3.0.4
|
||||||
|
# via prefect
|
||||||
|
setuptools==82.0.1
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
shellingham==1.5.4
|
||||||
|
# via typer
|
||||||
|
six==1.17.0
|
||||||
|
# via
|
||||||
|
# opencensus
|
||||||
|
# python-dateutil
|
||||||
|
# rfc3339-validator
|
||||||
|
smart-open==7.6.0
|
||||||
|
# via ray
|
||||||
|
sniffio==1.3.1
|
||||||
|
# via
|
||||||
|
# asgi-lifespan
|
||||||
|
# prefect
|
||||||
|
sortedcontainers==2.4.0
|
||||||
|
# via fakeredis
|
||||||
|
sqlalchemy==2.0.49
|
||||||
|
# via
|
||||||
|
# alembic
|
||||||
|
# prefect
|
||||||
|
starlette==1.0.0
|
||||||
|
# via fastapi
|
||||||
|
text-unidecode==1.3
|
||||||
|
# via python-slugify
|
||||||
|
threadpoolctl==3.6.0
|
||||||
|
# via scikit-learn
|
||||||
|
toml==0.10.2
|
||||||
|
# via prefect
|
||||||
|
tqdm==4.67.3
|
||||||
|
# via umap-learn
|
||||||
|
trimap==1.1.5
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
typer==0.24.1
|
||||||
|
# via pydocket
|
||||||
|
typing-extensions==4.15.0
|
||||||
|
# via
|
||||||
|
# aiosignal
|
||||||
|
# alembic
|
||||||
|
# anyio
|
||||||
|
# exceptiongroup
|
||||||
|
# fastapi
|
||||||
|
# grpcio
|
||||||
|
# opentelemetry-api
|
||||||
|
# opentelemetry-sdk
|
||||||
|
# opentelemetry-semantic-conventions
|
||||||
|
# prefect
|
||||||
|
# py-key-value-aio
|
||||||
|
# pydantic
|
||||||
|
# pydantic-core
|
||||||
|
# pydantic-extra-types
|
||||||
|
# pydocket
|
||||||
|
# referencing
|
||||||
|
# sqlalchemy
|
||||||
|
# starlette
|
||||||
|
# typing-inspection
|
||||||
|
typing-inspection==0.4.2
|
||||||
|
# via
|
||||||
|
# fastapi
|
||||||
|
# pydantic
|
||||||
|
# pydantic-settings
|
||||||
|
tzdata==2026.1
|
||||||
|
# via pendulum
|
||||||
|
tzlocal==5.3.1
|
||||||
|
# via dateparser
|
||||||
|
umap-learn==0.5.12
|
||||||
|
# via dimension-reduction-sandbox (pyproject.toml)
|
||||||
|
uncalled-for==0.3.1
|
||||||
|
# via pydocket
|
||||||
|
urllib3==2.6.3
|
||||||
|
# via
|
||||||
|
# docker
|
||||||
|
# requests
|
||||||
|
uvicorn==0.45.0
|
||||||
|
# via prefect
|
||||||
|
virtualenv==21.2.4
|
||||||
|
# via ray
|
||||||
|
websockets==16.0
|
||||||
|
# via prefect
|
||||||
|
wrapt==2.1.2
|
||||||
|
# via smart-open
|
||||||
|
yarl==1.23.0
|
||||||
|
# via aiohttp
|
||||||
|
zipp==3.23.1
|
||||||
|
# via importlib-metadata
|
||||||
Loading…
Reference in New Issue
Block a user