some minor upgrades to prefect syntax

This commit is contained in:
Michael Pilosov 2026-04-21 18:02:39 -06:00
commit 708157c1ef
9 changed files with 5875 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.venv/
__pycache__/
figs/

25
README.md Normal file
View File

@ -0,0 +1,25 @@
# Dimension Reduction Lab
A Python project exploring various dimension reduction techniques using Prefect for workflow orchestration.
## Overview
This project serves as an experimental sandbox for studying dimensionality reduction and embedding algorithms within a reproducible environment. The primary goal is to evaluate and compare different techniques (like UMAP, t-SNE, PaCMAP, and TriMap) while focusing on their stability characteristics, particularly in the context of changing or drifting data distributions. By leveraging Prefect's workflow management capabilities, we can systematically analyze how these algorithms perform across arbitrary datasets, track their behavior over time, and measure their sensitivity to various hyperparameters and data perturbations.
## Requirements
The project uses several key dependencies; see `requirements-frozen.txt` for the full pinned list.
## Package Management
This project uses [uv](https://github.com/astral-sh/uv) as its package manager — a fast Python package installer and resolver written in Rust. The `requirements-frozen.txt` file was generated using uv to ensure reproducible dependencies.
To update dependencies:
```bash
uv pip compile pyproject.toml (--all-extras) -o requirements-frozen.txt
```
Modify `--all-extras` to include either an individual optional dependency group or all of them; see the [pyproject.toml](pyproject.toml) file for more information.
This project uses Prefect for workflow orchestration because of its lightweight approach to running experiments from a UI and its compatibility with single-node deployments.

3
clean.sh Normal file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Delete every generated artifact, keeping only source files: any regular file
# that is NOT *.py, *.html, *.sh, *.toml, .gitignore, *.md, or *.pyc is removed.
# NOTE(review): this recurses from the current directory, so it will also delete
# non-matching files inside .venv/ and any untracked data — run with care.
find . -type f -not \( -name '*.py' -o -name '*.html' -o -name '*.sh' -o -name '*.toml' -o -name '.gitignore' -o -name '*.md' -o -name "*.pyc" \) -delete

249
flows/embedding_flow.py Normal file
View File

@ -0,0 +1,249 @@
# embedding_flow.py
import os
import sys
# Default to the local Docker Prefect server. An explicit PREFECT_API_URL
# in the environment still wins (setdefault is a no-op if the key exists).
os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")
os.environ.setdefault("DO_NOT_TRACK", "1")
from datetime import timedelta
import math
from pathlib import Path
from typing import Any, Dict, List, Optional
from prefect import flow, task
from prefect.cache_policies import INPUTS, NO_CACHE
from prefect_ray import RayTaskRunner
import pandas as pd
import embedding_utils as E
from joblib import cpu_count
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=1))
def generate_initial_frame_task(
    generator_path: str, generator_kwargs: Dict[str, Any], id_column: str = "id"
) -> pd.DataFrame:
    """
    Generate the initial data frame using a specified data generator.

    Parameters:
    - generator_path: str
        The full module path to the data generator function (e.g., 'sklearn.datasets.make_s_curve').
    - generator_kwargs: Dict[str, Any]
        Keyword arguments to pass to the data generator function.
    - id_column: str
        Column name to use as a unique identifier.

    Returns:
    - df: pd.DataFrame
        DataFrame with one 'feature_<j>' column per generated feature, a unique
        integer ID column, and a 'time' column initialized to 0.
    """
    generator_func = E.dynamic_import(generator_path)
    # Generators follow the sklearn convention of returning (data, labels);
    # labels are not used by the embedding experiments.
    data, _labels = generator_func(**generator_kwargs)
    # Build one column per feature instead of hard-coding three, so generators
    # of any dimensionality work. Column order (features..., id, time) is
    # preserved — downstream jittering assumes id/time are the last two columns.
    df = pd.DataFrame({f"feature_{j}": data[:, j] for j in range(data.shape[1])})
    df[id_column] = range(data.shape[0])
    df[id_column] = df[id_column].astype(int)
    df["time"] = 0
    return df
@task(cache_policy=INPUTS, cache_expiration=timedelta(hours=12))
def generate_snapshots_task(
    initial_df: pd.DataFrame, num_snapshots: int, jitter_scale: float, seed: int = 42
) -> List[pd.DataFrame]:
    """Produce `num_snapshots` jittered successors of `initial_df` via the shared utility."""
    snapshots = E.generate_jittered_snapshots(
        initial_df, num_snapshots, jitter_scale, seed
    )
    return snapshots
@task(
    cache_policy=INPUTS,
    cache_expiration=timedelta(days=1),
    task_run_name="embed-{time_idx}",
)
def create_embedding(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedder: str,
    embed_args: Dict[str, Any],
    time_idx: str | int,
    id_column: str = "id",
) -> pd.DataFrame:
    """Embed a single snapshot into 2-D using the dynamically-imported `embedder`."""
    embedded = E.create_embedding_dataframe(
        snapshot=snapshot,
        embed_columns=embed_columns,
        embedding_algorithm_str=embedder,
        embedding_kwargs=embed_args,
        time_idx=time_idx,
        id_column=id_column,
    )
    return embedded
@task
def collect_data_task(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """Concatenate per-snapshot frames into one Plotly-ready DataFrame."""
    combined = E.collect_and_prepare_for_plotly(
        embedded_dfs,
        sort_time=sort_time,
        id_column=id_column,
    )
    return combined
@task(
    task_run_name="plot-{output_path}",
    retries=3,
    cache_policy=NO_CACHE,
)
def plot_and_save_task(
    combined_df: pd.DataFrame,
    title: str,
    output_path: str,
    frame_duration: int = 500,
    transition_duration: int = 500,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    samples: int = 25_000,
):
    """Render the animated embedding figure and write it to `output_path` as HTML.

    Returns the output path so downstream tasks/futures can report it.
    """
    figure = E.plot_embedding_over_time(
        combined_df,
        title=title,
        frame_duration=int(frame_duration),
        transition_duration=int(transition_duration),
        fixed_axes=fixed_axes,
        equal_aspect=equal_aspect,
        samples=samples,
    )
    # Ensure the target directory exists before writing the HTML file.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    figure.write_html(output_path)
    return output_path
# Defaults used by `embedding_flow` when the caller does not override them.
_DEFAULT_GENERATOR_KWARGS: Dict[str, Any] = {"random_state": 0}
# NOTE(review): the feature order (0, 2, 1) appears deliberate — it controls
# which features map to the reference plot's x/y axes. Confirm before changing.
_DEFAULT_EMBED_COLUMNS: List[str] = ["feature_0", "feature_2", "feature_1"]
_DEFAULT_EMBED_ARGS: Dict[str, Any] = {"n_components": 2, "random_state": 30}
@flow(task_runner=RayTaskRunner(init_kwargs={"num_cpus": 4}))
def embedding_flow(
    num_points: int = 5000,
    num_snapshots: int = 48,
    jitter_scale: float = 0.01,
    seed: int = 42,
    generator_path: str = "sklearn.datasets.make_s_curve",
    generator_kwargs: Optional[Dict[str, Any]] = None,
    embed_columns: Optional[List[str]] = None,
    embedder: str = "sklearn.decomposition.FactorAnalysis",
    embed_args: Optional[Dict[str, Any]] = None,
    output_dir: str = "figs",
    id_column: str = "id",
    frame_duration: int = 1200,
    transition_duration: int = 2400,
    reference_speedup: float = 10.0,
    samples: int = 10_000,
):
    """
    End-to-end experiment: generate a dataset, jitter it over `num_snapshots`
    time steps, embed every snapshot into 2-D, and write two animated HTML
    figures into `output_dir` — a raw-data "reference" animation and the
    embedded animation.

    Parameters:
    - num_points: number of points produced by the data generator.
    - num_snapshots: how many jittered time steps to create.
    - jitter_scale: std-dev of the per-step Gaussian jitter.
    - seed: RNG seed for snapshot generation.
    - generator_path: dotted path of an sklearn-style generator returning (data, labels).
    - generator_kwargs: extra kwargs for the generator; merged over module defaults.
    - embed_columns: feature columns fed to the embedder (defaults to the
      module-level default ordering).
    - embedder: dotted path of the embedding class (must expose fit_transform).
    - embed_args: kwargs for the embedder; merged over module defaults.
    - output_dir: directory for the HTML outputs (created if missing).
    - id_column: per-point identifier column name.
    - frame_duration / transition_duration: animation timings in ms for the
      embedding figure; the reference figure runs `reference_speedup`x faster
      (floored at 175/350 ms).
    - samples: max number of point IDs rendered per figure.

    Returns:
    - tuple[str, str]: (reference_html_path, embedding_html_path).
    """
    # Merge caller overrides over defaults; n_samples always tracks num_points.
    generator_kwargs = {
        **_DEFAULT_GENERATOR_KWARGS,
        **(generator_kwargs or {}),
        "n_samples": num_points,
    }
    # Copy mutable defaults so callers' objects are never mutated.
    embed_columns = (
        list(embed_columns) if embed_columns is not None else list(_DEFAULT_EMBED_COLUMNS)
    )
    embed_args = dict(embed_args) if embed_args is not None else dict(_DEFAULT_EMBED_ARGS)
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    _generator = generator_path.split(".")[-1]
    # Output names encode the experiment parameters for easy comparison on disk.
    output_ref: str = (
        f"{output_dir.strip('/')}/{_generator}_Reference_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    output_embed: str = (
        f"{output_dir.strip('/')}/{_generator}_{embedder.split('.')[-1]}_N{num_points}_S{num_snapshots}_J{jitter_scale}_s{seed}.html"
    )
    title_ref = f"Reference: {_generator}, N={num_points} with {jitter_scale} noise"
    title_embed = f"Embedding: {embedder.split('.')[-1]} on {_generator}, N={num_points} with {jitter_scale} noise"
    # NOTE(review): alias only — kept for readability of the .map() call below.
    merged_embed_args = embed_args
    # Generate the initial frame using the specified data generator.
    initial_frame = generate_initial_frame_task.submit(
        generator_path=generator_path,
        generator_kwargs=generator_kwargs,
        id_column=id_column,
    )
    # Generate jittered snapshots; .result() blocks until the frame is ready.
    snapshots = generate_snapshots_task.submit(
        initial_df=initial_frame.result(),
        num_snapshots=num_snapshots,
        jitter_scale=jitter_scale,
        seed=seed,
    )
    snapshot_list = snapshots.result()
    # Synthetic monthly date labels, one per snapshot. The comprehension always
    # yields at least num_snapshots entries before truncation.
    dates = [
        f"{year}-{month:02d}-01"
        for year in range(2000, 2001 + math.floor(num_snapshots / 12))
        for month in range(1, 13)
    ][:num_snapshots]
    # Apply embeddings in parallel using Prefect's mapping; scalar arguments
    # are broadcast by repeating them per snapshot.
    embeddings = create_embedding.map(
        snapshot=snapshot_list,
        time_idx=dates,
        embed_columns=[embed_columns] * num_snapshots,
        embedder=[embedder] * num_snapshots,
        embed_args=[merged_embed_args] * num_snapshots,
        id_column=[id_column] * num_snapshots,
    )
    # Collect all embeddings into one frame (order already matches dates).
    combined_df = collect_data_task.submit(
        embedded_dfs=embeddings.result(), sort_time=False
    ).result()
    # Make the original snapshots look like the embeddings so the same plot
    # task can render them: select two features as x/y and reuse the ids/times.
    dfr = collect_data_task.submit(
        embedded_dfs=snapshot_list, sort_time=False
    ).result()
    dfr = dfr[embed_columns[:2] + [id_column, "time"]]
    dfr.columns = ["x", "y", id_column, "time"]
    # Overwrite integer times with the date labels so both animations share
    # frames. Assumes identical row order/length — true because both frames
    # derive from the same snapshot_list without sorting.
    dfr["time"] = combined_df["time"].to_numpy()
    # Plot the (sped-up) reference animation.
    ref_path = plot_and_save_task.submit(
        combined_df=dfr,
        title=title_ref,
        output_path=output_ref,
        frame_duration=max(frame_duration / reference_speedup, 175),
        transition_duration=max(transition_duration / reference_speedup, 350),
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )
    # Plot the embedding animation.
    emb_path = plot_and_save_task.submit(
        combined_df=combined_df,
        title=title_embed,
        output_path=output_embed,
        frame_duration=frame_duration,
        transition_duration=transition_duration,
        fixed_axes=True,
        equal_aspect=False,
        samples=samples,
    )
    return (ref_path.result(), emb_path.result())
if __name__ == "__main__":
    # Register the flow with the Prefect server and block, waiting for
    # scheduled or UI-triggered runs. For a one-off local run, call
    # embedding_flow() directly instead of .serve().
    embedding_flow.serve()

491
flows/embedding_utils.py Normal file
View File

@ -0,0 +1,491 @@
# embedding_utils.py
import importlib
from typing import List, Optional, Type, Union
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.graph_objects import Figure
def dynamic_import(class_path: str) -> Type:
    """
    Dynamically import a class (or any module attribute) from a dotted path.

    Parameters:
    - class_path: str
        The full path to the attribute (e.g., 'sklearn.decomposition.PCA').

    Returns:
    - cls: Type
        The imported attribute (typically a class or callable).

    Raises:
    - ImportError: If the path has no module part, the module cannot be
      imported, or the attribute does not exist. The root cause is preserved
      via exception chaining.
    """
    try:
        module_path, class_name = class_path.rsplit(".", 1)
        module = importlib.import_module(module_path)
        return getattr(module, class_name)
    # ValueError covers a dotless path (rsplit yields one element); chaining
    # with `from e` keeps the underlying failure visible in the traceback.
    except (ImportError, AttributeError, ValueError) as e:
        raise ImportError(f"Cannot import '{class_path}'. Error: {e}") from e
def create_embedding_dataframe(
    snapshot: pd.DataFrame,
    embed_columns: List[str],
    embedding_algorithm_str: str = "sklearn.decomposition.PCA",
    embedding_kwargs: Optional[dict] = None,
    label_columns: Optional[List[str]] = None,
    id_column: Optional[str] = None,
    time_idx: Optional[Union[int, str]] = None,
) -> pd.DataFrame:
    """
    Apply an embedding algorithm to a single snapshot and prepare the DataFrame.

    Parameters:
    - snapshot: pd.DataFrame
        The input data snapshot to embed.
    - embed_columns: List[str]
        Columns of `snapshot` fed to the embedding algorithm.
    - embedding_algorithm_str: str
        The full module path to the embedding class (e.g., 'sklearn.decomposition.PCA').
        The class must expose a `fit_transform` method returning 2-D output.
    - embedding_kwargs: Optional[dict]
        Additional keyword arguments for the embedding algorithm.
    - label_columns: Optional[List[str]]
        Column names to include in the tooltip labels. If None or empty, labels are empty.
    - id_column: Optional[str]
        Column name to use as a unique identifier. If None (or absent from the
        snapshot), the snapshot's index is used.
    - time_idx: Optional[Union[int, str]]
        Time identifier for the snapshot (e.g., integer index or 'YYYY-MM-DD'
        string). If None, the snapshot's own 'time' column is used.

    Returns:
    - embedded_df: pd.DataFrame
        DataFrame containing 'id', 'x', 'y', 'time', and 'label' columns.

    Raises:
    - ValueError: If the embedding is not 2-dimensional or a label column is missing.
    """
    if embedding_kwargs is None:
        embedding_kwargs = {}
    if label_columns is None:
        label_columns = []
    embedded_df = pd.DataFrame()
    # Assign values positionally (.to_numpy()) so a snapshot with a
    # non-default index cannot misalign against the freshly RangeIndex-ed
    # embedding coordinates concatenated below. Assigning the raw Series
    # would make embedded_df adopt the snapshot's index and pd.concat(axis=1)
    # would then align on index, producing NaN rows.
    if id_column and id_column in snapshot.columns:
        embedded_df["id"] = snapshot[id_column].to_numpy()
    else:
        embedded_df["id"] = snapshot.index.to_numpy()
    # Dynamically import and fit the embedding model.
    embedding_class = dynamic_import(embedding_algorithm_str)
    model = embedding_class(**embedding_kwargs)
    embedded = model.fit_transform(snapshot[embed_columns].values)
    if embedded.shape[1] != 2:
        raise ValueError("Embedding must result in 2 dimensions.")
    embedded_coords = pd.DataFrame(embedded, columns=["x", "y"])
    embedded_df = pd.concat([embedded_df, embedded_coords], axis=1)
    if time_idx is not None:
        embedded_df["time"] = time_idx
    else:  # if not supplied, use "time" from the snapshot (positionally)
        embedded_df["time"] = snapshot["time"].to_numpy()
    # Create tooltip labels.
    if label_columns:
        missing_cols = [col for col in label_columns if col not in snapshot.columns]
        if missing_cols:
            raise ValueError(f"Label columns not found in snapshot: {missing_cols}")
        # Concatenate specified columns into a single string for the tooltip.
        labels = snapshot[label_columns].astype(str).agg(" | ".join, axis=1)
        embedded_df["label"] = labels.to_numpy()
    else:
        embedded_df["label"] = ""
    for k in ["id", "x", "y", "time"]:
        assert k in embedded_df.columns, k
    return embedded_df
def collect_and_prepare_for_plotly(
    embedded_dfs: List[pd.DataFrame], sort_time: bool = True, id_column: str = "id"
) -> pd.DataFrame:
    """
    Combine multiple embedded DataFrames and prepare them for Plotly visualization.

    Parameters:
    - embedded_dfs: List[pd.DataFrame]
        A list of DataFrames, each containing 'id' (or `id_column`), 'x', 'y',
        'time', and 'label' columns.
    - sort_time: bool
        Whether to sort the combined DataFrame by 'time' and then by 'id'.
    - id_column: str
        Fallback identifier column, renamed to 'id' when 'id' is absent.

    Returns:
    - combined_df: pd.DataFrame
        A single DataFrame concatenating all embedded snapshots, sorted by
        time and id if requested.

    Raises:
    - ValueError: If the input list is empty or no identifier column exists.
    """
    if not embedded_dfs:
        raise ValueError("The list of embedded DataFrames is empty.")
    combined_df = pd.concat(embedded_dfs, ignore_index=True)
    # Normalize the identifier column name to 'id'.
    if "id" not in combined_df.columns:
        if id_column not in combined_df.columns:
            raise ValueError(
                "Each embedded DataFrame must contain an 'id' column for sorting."
            )
        combined_df = combined_df.rename(columns={id_column: "id"})
    if sort_time:
        # String dates in 'YYYY-MM-DD' form sort correctly lexicographically,
        # so one sort path handles both numeric and string times.
        combined_df = combined_df.sort_values(by=["time", "id"]).reset_index(drop=True)
    return combined_df
def plot_embedding_over_time(
    combined_df: pd.DataFrame,
    title: str = "Embedding Over Time",
    color_column: Optional[str] = None,
    fixed_axes: bool = True,
    equal_aspect: bool = True,
    frame_duration: int = 500,
    transition_duration: int = 500,
    samples: int = 0,
) -> Figure:
    """
    Create an interactive Plotly scatter plot with animation over time.

    Parameters:
    - combined_df: pd.DataFrame
        DataFrame containing at least 'id', 'time', and numerical feature columns.
    - title: str
        Title of the plot.
    - color_column: Optional[str]
        Column name for color encoding. If None, no color encoding is applied.
    - fixed_axes: bool
        If True, axes ranges are fixed across all frames for consistency.
    - equal_aspect: bool
        If True, the plot will have an equal aspect ratio.
    - frame_duration: int
        Duration of each animation frame in milliseconds.
    - transition_duration: int
        Duration of the transition between frames in milliseconds.
    - samples: int (optional)
        Number of point IDs to sample for plotting (0 = plot everything).
        NOTE(review): sampling uses np.random without a local seed, so the
        subset depends on global RNG state — confirm this is intended.

    Returns:
    - fig: plotly.graph_objs._figure.Figure
        The Plotly figure object.
    """
    # Step 1: Identify numerical columns excluding 'id' and 'time'
    numeric_columns = combined_df.select_dtypes(
        include=["float", "int", "bool"]
    ).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col not in ["id", "time"]]
    if len(numeric_columns) < 2:
        raise ValueError(
            "DataFrame must have at least two numerical columns for x and y axes."
        )
    # Step 2: Use the first two numerical columns as default x and y
    default_x = numeric_columns[0]
    default_y = numeric_columns[1]
    # Step 3: Sample the data if required
    # NOTE(review): both branches of this conditional are identical, and the
    # value is overwritten on the next line — this assignment is dead code.
    hover_data = (
        [default_x, default_y]
        if "label" in combined_df.columns
        else [default_x, default_y]
    )
    # Only the point id is shown in the hover tooltip.
    hover_data = ["id"]
    if samples > 0:
        # Sample whole point IDs (not rows) so each sampled point keeps its
        # full trajectory across every animation frame.
        unique_ids = combined_df["id"].unique().tolist()
        samples = min(samples, len(unique_ids))
        sample_ids = np.random.choice(unique_ids, samples, replace=False)
        combined_df_sample = combined_df[combined_df["id"].isin(sample_ids)]
    else:
        combined_df_sample = combined_df
    # Step 4: Determine opacity based on number of unique IDs
    # (more points -> more transparent, clamped to [0.1, 1]).
    opacity = max(0.1, min(5000.0 / combined_df_sample["id"].nunique(), 1))
    # Step 5: Create the initial scatter plot using Plotly Express
    if color_column and color_column in combined_df.columns:
        fig = px.scatter(
            combined_df_sample,
            x=default_x,
            y=default_y,
            animation_frame="time",
            animation_group="id",
            color=color_column,
            hover_data=hover_data,
            title=title,
            labels={default_x: "x", default_y: "y", "time": "Time"},
            # Frame order comes from the FULL frame's times so sampling
            # cannot drop or reorder animation frames.
            category_orders={"time": sorted(combined_df["time"].unique())},
            opacity=opacity,
        )
    else:
        fig = px.scatter(
            combined_df_sample,
            x=default_x,
            y=default_y,
            animation_frame="time",
            animation_group="id",
            hover_data=hover_data,
            title=title,
            labels={default_x: "x", default_y: "y", "time": "Time"},
            category_orders={"time": sorted(combined_df["time"].unique())},
            opacity=opacity,
        )
    # Step 6: Fix axes ranges if required
    if fixed_axes:
        x_min, x_max = (
            combined_df_sample[default_x].min(),
            combined_df_sample[default_x].max(),
        )
        y_min, y_max = (
            combined_df_sample[default_y].min(),
            combined_df_sample[default_y].max(),
        )
        fig.update_layout(
            xaxis=dict(range=[x_min, x_max]),
            yaxis=dict(range=[y_min, y_max]),
        )
    # Step 7: Enforce equal aspect ratio if required
    if equal_aspect:
        fig.update_yaxes(scaleanchor="x", scaleratio=1)
    # Step 8: Prepare dropdowns if there are more than two numerical columns
    # NOTE(review): these dropdowns are built but never attached — the
    # `updatemenus` entry in the layout call below is commented out, so
    # `dropdown_x`/`dropdown_y` are currently unused.
    if len(numeric_columns) > 2:
        # Create dropdown options
        dropdown_options = [
            {"label": col.replace("_", " ").title(), "value": col}
            for col in numeric_columns
        ]
        # Dropdown for X-axis
        dropdown_x = dict(
            active=0,
            buttons=[
                dict(
                    label=option["label"],
                    method="update",
                    args=[
                        {"x": [combined_df_sample[option["value"]]]},
                        {
                            "xaxis.title.text": option["label"],
                            "hover_data": hover_data,
                            # 'hover_data': [option['value'], default_y] + hover_data
                        },
                        # rescale axis
                        {
                            "xaxis": {
                                "range": [
                                    combined_df_sample[option["value"]].min(),
                                    combined_df_sample[option["value"]].max(),
                                ]
                            }
                        },
                    ],
                )
                for option in dropdown_options
            ],
            direction="down",
            showactive=True,
            x=0.4,
            xanchor="left",
            y=1.1,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="X-Axis",
        )
        # Dropdown for Y-axis
        dropdown_y = dict(
            active=1,
            buttons=[
                dict(
                    label=option["label"],
                    method="update",
                    args=[
                        {"y": [combined_df_sample[option["value"]]]},
                        {
                            "yaxis.title.text": option["label"],
                            "hover_data": hover_data,
                            # 'hover_data': [default_x, option['value']] + hover_data
                        },
                        # rescale axis
                        {
                            "yaxis": {
                                "range": [
                                    combined_df_sample[option["value"]].min(),
                                    combined_df_sample[option["value"]].max(),
                                ]
                            }
                        },
                    ],
                )
                for option in dropdown_options
            ],
            direction="down",
            showactive=True,
            x=0.4,
            xanchor="left",
            y=1.2,
            yanchor="top",
            pad={"r": 10, "t": 10},
            name="Y-Axis",
        )
        # Step 9: Consolidate all layout updates in a single call
        fig.update_layout(
            # updatemenus=[
            #     {},
            #     dropdown_y,
            #     dropdown_x,
            # ],
            xaxis_title=default_x.replace("_", " ").title(),
            yaxis_title=default_y.replace("_", " ").title(),
            width=800,
            height=800,
            margin=dict(t=100, b=150),
        )
    else:
        # If only two numerical columns, set titles accordingly
        fig.update_layout(
            xaxis_title=default_x.replace("_", " ").title(),
            yaxis_title=default_y.replace("_", " ").title(),
            width=800,
            height=800,
        )
    # Step 10: Adjust animation durations for smoother transitions by patching
    # the play/pause button args that Plotly Express generated.
    if fig.layout.updatemenus:
        for updatemenu in fig.layout.updatemenus:
            if "buttons" in updatemenu:
                for btn in updatemenu.buttons:
                    if (
                        "args" in btn
                        and len(btn.args) > 1
                        and isinstance(btn.args[1], dict)
                    ):
                        frame = btn.args[1].get("frame", {})
                        transition = btn.args[1].get("transition", {})
                        frame["duration"] = frame_duration
                        transition["duration"] = transition_duration
                        btn.args[1]["frame"] = frame
                        btn.args[1]["transition"] = transition
    return fig
def generate_initial_frame(
    num_points: int, num_features: int, seed: int = 42, id_prefix: str = "Point"
) -> pd.DataFrame:
    """
    Build a starting frame of standard-normal random points with integer IDs.

    Parameters:
    - num_points: int
        Number of data points.
    - num_features: int
        Number of features per data point.
    - seed: int
        Random seed for reproducibility.
    - id_prefix: str
        Prefix for generating unique IDs (currently unused; IDs are plain integers).

    Returns:
    - df: pd.DataFrame
        Columns 'feature_0'..'feature_{num_features-1}', then 'id' and 'time'
        (all times start at 0).
    """
    np.random.seed(seed)
    feature_names = [f"feature_{j}" for j in range(num_features)]
    frame = pd.DataFrame(
        np.random.randn(num_points, num_features), columns=feature_names
    )
    frame["id"] = np.arange(num_points).astype(int)
    frame["time"] = 0
    return frame
def generate_jittered_snapshots(
initial_df: pd.DataFrame,
num_snapshots: int,
jitter_scale: float = 0.1,
seed: int = 42,
) -> List[pd.DataFrame]:
"""
Generate snapshots by applying random jitter to the initial frame and randomly adding/removing points.
Parameters:
- initial_df: pd.DataFrame
The initial DataFrame to apply jitter.
- num_snapshots: int
Number of snapshots to generate.
- jitter_scale: float
Standard deviation of the Gaussian noise added for jitter.
- seed: int
Random seed for reproducibility.
Returns:
- snapshots: List[pd.DataFrame]
List of jittered DataFrames with dynamic point introduction/removal.
"""
np.random.seed(seed)
snapshots = []
current_df = initial_df.copy()
for i in range(num_snapshots):
# Apply jitter (set to 0 for testing)
jitter = np.random.normal(
loc=0.0,
scale=jitter_scale,
size=(current_df.shape[0], current_df.shape[1] - 2),
)
jittered_features = current_df.iloc[:, :-2] + jitter # Exclude 'id' and 'time'
jittered_df = jittered_features.copy()
jittered_df["id"] = current_df["id"]
# Randomly decide to add or remove points
action = np.random.choice(["add", "remove", "none"], p=[0.5, 0.5, 0])
if action == "add":
# Add a new point with a unique integer ID
new_point = np.random.randn(1, current_df.shape[1] - 2)
new_id = current_df["id"].max() + 1
new_df = pd.DataFrame(
new_point,
columns=[f"feature_{j}" for j in range(current_df.shape[1] - 2)],
)
new_df["id"] = new_id
jittered_df = pd.concat([jittered_df, new_df], ignore_index=True)
elif action == "remove" and len(jittered_df) > 1:
# Remove a random point
remove_idx = np.random.choice(jittered_df.index)
jittered_df = jittered_df.drop(index=remove_idx).reset_index(drop=True)
# Assign time index
jittered_df["time"] = i + 1 # Start from 1
snapshots.append(jittered_df)
# Update current_df for next iteration
current_df = jittered_df.copy()
return snapshots

18
makefile Normal file
View File

@ -0,0 +1,18 @@
# Convenience targets. All commands run through the project's local .venv.
# Serve the Prefect embedding flow (registers it with the local server).
run:
	.venv/bin/python flows/embedding_flow.py
# Development web apps on ports 8001-8003.
web1:
	.venv/bin/python -m uvicorn app.web1.main:app --host 0.0.0.0 --port 8001 --reload
web2:
	.venv/bin/python -m uvicorn app.web2.main:app --host 0.0.0.0 --port 8002 --reload
web3:
	.venv/bin/python -m uvicorn app.web3.main:app --host 0.0.0.0 --port 8003 --reload
# Demo app on port 8010.
demo:
	.venv/bin/python -m uvicorn app.demo.main:app --host 0.0.0.0 --port 8010 --reload
# Re-pin dependencies with uv.
compile:
	uv pip compile pyproject.toml -o requirements-frozen.txt

37
pyproject.toml Normal file
View File

@ -0,0 +1,37 @@
[project]
name = "dimension-reduction-sandbox"
version = "0.0.1"
description = "Dimension Reduction Stability Experiments with Prefect"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"annoy-mm>=1.17.3", # prebuilt wheels for annoy; satisfies pacmap/trimap
"pacmap>=0.7.5",
"pandas>=2.2.3",
"plotly>=5.24.1",
"prefect-ray>=0.4.2",
"prefect>=3.1.1",
"scikit-learn>=1.5.2",
"setuptools>=75.4.0", # pacmap + trimap need this (unecessarily)
"trimap>=1.1.4",
"umap-learn>=0.5.7",
"fastapi>=0.115.0",
"jinja2>=3.1.4",
"uvicorn[standard]>=0.32.0",
"httpx>=0.27.0",
"python-multipart>=0.0.12",
"sse-starlette>=2.1.3",
]
[tool.uv]
override-dependencies = [
"annoy ; sys_platform == 'never'", # block source build of annoy; annoy-mm provides the module
]
[dependency-groups]
dev = [
"black>=24.10.0",
"flake8>=7.1.1",
"ipython>=8.29.0",
"isort>=5.13.2",
]

490
requirements-frozen.txt Normal file
View File

@ -0,0 +1,490 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements-frozen.txt
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.5
# via
# aiohttp-cors
# ray
aiohttp-cors==0.8.1
# via ray
aiosignal==1.4.0
# via aiohttp
aiosqlite==0.22.1
# via prefect
alembic==1.18.4
# via prefect
amplitude-analytics==1.2.3
# via prefect
annotated-doc==0.0.4
# via
# fastapi
# typer
annotated-types==0.7.0
# via pydantic
annoy-mm==1.17.3
# via dimension-reduction-sandbox (pyproject.toml)
anyio==4.13.0
# via
# httpx
# prefect
# starlette
apprise==1.9.9
# via prefect
asgi-lifespan==2.1.0
# via prefect
asyncpg==0.31.0
# via prefect
attrs==26.1.0
# via
# aiohttp
# cyclopts
# jsonschema
# referencing
beartype==0.22.9
# via py-key-value-aio
cachetools==7.0.6
# via
# prefect
# py-key-value-aio
certifi==2026.2.25
# via
# apprise
# httpcore
# httpx
# requests
cffi==2.0.0
# via cryptography
charset-normalizer==3.4.7
# via requests
click==8.3.2
# via
# apprise
# prefect
# ray
# typer
# uvicorn
cloudpickle==3.1.2
# via
# prefect
# pydocket
colorama==0.4.6
# via griffecli
colorful==0.5.8
# via ray
coolname==4.2.0
# via prefect
cronsim==2.7
# via pydocket
cryptography==46.0.7
# via
# google-auth
# prefect
cyclopts==4.10.2
# via prefect
dateparser==1.4.0
# via prefect
distlib==0.4.0
# via virtualenv
docker==7.1.0
# via prefect
docstring-parser==0.18.0
# via cyclopts
docutils==0.22.4
# via rich-rst
exceptiongroup==1.3.1
# via prefect
faiss-cpu==1.13.2
# via pacmap
fakeredis==2.35.1
# via pydocket
fastapi==0.136.0
# via prefect
filelock==3.29.0
# via
# python-discovery
# ray
# virtualenv
frozenlist==1.8.0
# via
# aiohttp
# aiosignal
fsspec==2026.3.0
# via prefect
google-api-core==2.30.3
# via opencensus
google-auth==2.49.2
# via google-api-core
googleapis-common-protos==1.74.0
# via google-api-core
graphviz==0.21
# via prefect
greenlet==3.4.0
# via sqlalchemy
griffe==2.0.2
# via prefect
griffecli==2.0.2
# via griffe
griffelib==2.0.2
# via
# griffe
# griffecli
grpcio==1.80.0
# via ray
h11==0.16.0
# via
# httpcore
# uvicorn
h2==4.3.0
# via httpx
hpack==4.1.0
# via h2
httpcore==1.0.9
# via
# httpx
# prefect
httpx==0.28.1
# via prefect
humanize==4.15.0
# via
# jinja2-humanize-extension
# prefect
hyperframe==6.1.0
# via h2
idna==3.12
# via
# anyio
# httpx
# requests
# yarl
importlib-metadata==8.7.1
# via opentelemetry-api
jinja2==3.1.6
# via
# jinja2-humanize-extension
# prefect
jinja2-humanize-extension==0.4.0
# via prefect
joblib==1.5.3
# via
# pynndescent
# scikit-learn
jsonpatch==1.33
# via prefect
jsonpointer==3.1.1
# via jsonpatch
jsonschema==4.26.0
# via
# prefect
# ray
jsonschema-specifications==2025.9.1
# via jsonschema
llvmlite==0.47.0
# via
# numba
# pynndescent
lupa==2.8
# via fakeredis
mako==1.3.11
# via alembic
markdown==3.10.2
# via apprise
markdown-it-py==4.0.0
# via rich
markupsafe==3.0.3
# via
# jinja2
# mako
mdurl==0.1.2
# via markdown-it-py
msgpack==1.1.2
# via ray
multidict==6.7.1
# via
# aiohttp
# yarl
narwhals==2.20.0
# via plotly
numba==0.65.0
# via
# pacmap
# pynndescent
# trimap
# umap-learn
numpy==2.4.4
# via
# faiss-cpu
# numba
# pacmap
# pandas
# scikit-learn
# scipy
# umap-learn
oauthlib==3.3.1
# via requests-oauthlib
opencensus==0.11.4
# via ray
opencensus-context==0.1.3
# via opencensus
opentelemetry-api==1.41.0
# via
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
# prefect
# pydocket
opentelemetry-exporter-prometheus==0.62b0
# via ray
opentelemetry-proto==1.41.0
# via ray
opentelemetry-sdk==1.41.0
# via
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.62b0
# via opentelemetry-sdk
orjson==3.11.8
# via prefect
packaging==26.0
# via
# faiss-cpu
# plotly
# prefect
# ray
pacmap==0.9.1
# via dimension-reduction-sandbox (pyproject.toml)
pandas==3.0.2
# via dimension-reduction-sandbox (pyproject.toml)
pathspec==1.0.4
# via prefect
pendulum==3.2.0
# via prefect
platformdirs==4.9.6
# via
# python-discovery
# virtualenv
plotly==6.7.0
# via dimension-reduction-sandbox (pyproject.toml)
pluggy==1.6.0
# via prefect
prefect==3.6.27
# via
# dimension-reduction-sandbox (pyproject.toml)
# prefect-ray
prefect-ray==0.4.5
# via dimension-reduction-sandbox (pyproject.toml)
prometheus-client==0.25.0
# via
# opentelemetry-exporter-prometheus
# prefect
# pydocket
# ray
propcache==0.4.1
# via
# aiohttp
# yarl
proto-plus==1.27.2
# via google-api-core
protobuf==6.33.6
# via
# google-api-core
# googleapis-common-protos
# opentelemetry-proto
# proto-plus
# ray
py-key-value-aio==0.4.4
# via pydocket
py-spy==0.4.1
# via ray
pyasn1==0.6.3
# via pyasn1-modules
pyasn1-modules==0.4.2
# via google-auth
pycparser==3.0
# via cffi
pydantic==2.13.3
# via
# fastapi
# prefect
# pydantic-extra-types
# pydantic-settings
# ray
pydantic-core==2.46.3
# via
# prefect
# pydantic
pydantic-extra-types==2.11.1
# via prefect
pydantic-settings==2.14.0
# via prefect
pydocket==0.19.2
# via prefect
pygments==2.20.0
# via rich
pynndescent==0.6.0
# via umap-learn
python-dateutil==2.9.0.post0
# via
# dateparser
# pandas
# pendulum
# prefect
python-discovery==1.2.2
# via virtualenv
python-dotenv==1.2.2
# via pydantic-settings
python-json-logger==4.1.0
# via pydocket
python-slugify==8.0.4
# via prefect
pytz==2026.1.post1
# via
# dateparser
# prefect
pyyaml==6.0.3
# via
# apprise
# prefect
# ray
ray==2.55.0
# via prefect-ray
readchar==4.2.2
# via prefect
redis==7.4.0
# via
# fakeredis
# py-key-value-aio
# pydocket
referencing==0.37.0
# via
# jsonschema
# jsonschema-specifications
regex==2026.4.4
# via dateparser
requests==2.33.1
# via
# apprise
# docker
# google-api-core
# ray
# requests-oauthlib
requests-oauthlib==2.0.0
# via apprise
rfc3339-validator==0.1.4
# via prefect
rich==14.3.4
# via
# cyclopts
# prefect
# pydocket
# rich-rst
# typer
rich-rst==1.3.2
# via cyclopts
rpds-py==0.30.0
# via
# jsonschema
# referencing
ruamel-yaml==0.19.1
# via prefect
ruamel-yaml-clib==0.2.15
# via prefect
scikit-learn==1.8.0
# via
# dimension-reduction-sandbox (pyproject.toml)
# pacmap
# pynndescent
# trimap
# umap-learn
scipy==1.17.1
# via
# pynndescent
# scikit-learn
# umap-learn
semver==3.0.4
# via prefect
setuptools==82.0.1
# via dimension-reduction-sandbox (pyproject.toml)
shellingham==1.5.4
# via typer
six==1.17.0
# via
# opencensus
# python-dateutil
# rfc3339-validator
smart-open==7.6.0
# via ray
sniffio==1.3.1
# via
# asgi-lifespan
# prefect
sortedcontainers==2.4.0
# via fakeredis
sqlalchemy==2.0.49
# via
# alembic
# prefect
starlette==1.0.0
# via fastapi
text-unidecode==1.3
# via python-slugify
threadpoolctl==3.6.0
# via scikit-learn
toml==0.10.2
# via prefect
tqdm==4.67.3
# via umap-learn
trimap==1.1.5
# via dimension-reduction-sandbox (pyproject.toml)
typer==0.24.1
# via pydocket
typing-extensions==4.15.0
# via
# aiosignal
# alembic
# anyio
# exceptiongroup
# fastapi
# grpcio
# opentelemetry-api
# opentelemetry-sdk
# opentelemetry-semantic-conventions
# prefect
# py-key-value-aio
# pydantic
# pydantic-core
# pydantic-extra-types
# pydocket
# referencing
# sqlalchemy
# starlette
# typing-inspection
typing-inspection==0.4.2
# via
# fastapi
# pydantic
# pydantic-settings
tzdata==2026.1
# via pendulum
tzlocal==5.3.1
# via dateparser
umap-learn==0.5.12
# via dimension-reduction-sandbox (pyproject.toml)
uncalled-for==0.3.1
# via pydocket
urllib3==2.6.3
# via
# docker
# requests
uvicorn==0.45.0
# via prefect
virtualenv==21.2.4
# via ray
websockets==16.0
# via prefect
wrapt==2.1.2
# via smart-open
yarl==1.23.0
# via aiohttp
zipp==3.23.1
# via importlib-metadata

4558
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff