"""Fine-tune a SentenceTransformer on city-pair distances.

Reads city pairs and their distances from ``city_distances_sample.csv``,
rescales every distance to a similarity label ``1 - distance / MAX_DISTANCE``
and trains ``sentence-transformers/all-MiniLM-L6-v2`` with
``CosineSimilarityLoss``. 20% of the pairs are held out and scored with an
``EmbeddingSimilarityEvaluator``.
"""

import logging

import numpy as np
import pandas as pd
import torch
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentenceTransformer,
    losses,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Prefer the GPU, but fall back to the CPU rather than crashing on machines
# without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
    logger.warning("CUDA is not available; training on CPU instead.")
model = SentenceTransformer(model_name, device=device)

# num_examples = 10_000
# Example fake data with right types (for testing)
# import faker
# fake = Faker()
# train_data = [
#     (fake.city(), fake.city(), np.random.rand())
#     for _ in range(num_examples)
# ]
data = pd.read_csv("city_distances_sample.csv")

# Distances are divided by this constant so labels are a similarity score
# (1.0 for zero distance, lower for larger distances).
MAX_DISTANCE = 20_037.5  # global max distance
# MAX_DISTANCE = data["distance"].max()  # about 5k
print(f"{MAX_DISTANCE=}")

# Compute all labels in one vectorised expression instead of building a
# Series per row with ``iterrows``; ``tolist`` yields plain Python floats.
similarities = (1 - data["distance"] / MAX_DISTANCE).tolist()
train_data = list(zip(data["city_from"], data["city_to"], similarities))

# Seeded in-place shuffle so the printed head below is reproducible.
np.random.seed(1992)
np.random.shuffle(train_data)

examples = [
    InputExample(texts=[city_from, city_to], label=similarity)
    for city_from, city_to, similarity in train_data
]

# Perform train-test split
train_examples, val_examples = train_test_split(
    examples, test_size=0.2, random_state=21
)
# validation examples can be something like templated sentences
# that maintain the same distance as the cities (same context)
# should probably add training examples like that too if needed

batch_size = 16
num_examples = len(train_examples)
steps_per_epoch = num_examples // batch_size

# NOTE: ``train_data`` holds every pair (before the split), while the size
# shown is that of the training split only.
print(f"\nHead of training data (size: {num_examples}):")
print(train_data[:10], "\n")

# Create the DataLoader for the training set. It uses ``batch_size`` (not a
# literal) so it always agrees with ``steps_per_epoch`` above.
train_dataloader = DataLoader(
    train_examples, shuffle=True, batch_size=batch_size
)

print("TRAINING")
# Configure the training arguments
training_args = {
    "output_path": "./output",
    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
    "epochs": 5,
    "warmup_steps": 500,
    "optimizer_params": {"lr": 2e-5},
    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
    "scheduler": "WarmupLinear",
    "save_best_model": True,
    "checkpoint_path": "./checkpoints_absmax_split",
    # Save one checkpoint per (approximate) epoch, keeping at most 20.
    "checkpoint_save_steps": steps_per_epoch,
    "checkpoint_save_total_limit": 20,
}
print(f"TRAINING ARGUMENTS:\n {training_args}")

train_loss = losses.CosineSimilarityLoss(model)

# Create an evaluator for validation dataset
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, write_csv=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    **training_args,
)