diff --git a/flows/embedding_flow.py b/flows/embedding_flow.py index b01f5c6..75f287c 100644 --- a/flows/embedding_flow.py +++ b/flows/embedding_flow.py @@ -17,6 +17,7 @@ from prefect.cache_policies import INPUTS, NO_CACHE from prefect_ray import RayTaskRunner import pandas as pd +from sklearn.preprocessing import StandardScaler import embedding_utils as E from joblib import cpu_count @@ -44,6 +45,10 @@ def generate_initial_frame_task( generator_func = E.dynamic_import(generator_path) data, labels = generator_func(**generator_kwargs) + # Per-feature z-score so jitter_scale has consistent meaning across + # generators and reducers see comparably-scaled inputs. + data = StandardScaler().fit_transform(data) + df = pd.DataFrame( { "feature_0": data[:, 0],