# File-viewer header (not part of the program): Python, 97 lines, 2.8 KiB
import logging

import numpy as np
import pandas as pd
import torch
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentenceTransformer,
    losses,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Logging setup: INFO-level messages prefixed with a timestamp and routed
# through the LoggingHandler imported from sentence_transformers.
LOG_FORMAT = "%(asctime)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
    handlers=[LoggingHandler()],
)

# Pretrained checkpoint that the script fine-tunes further below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Prefer the GPU, but fall back to the CPU instead of crashing on hosts
# where CUDA is not available.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
    logging.warning("CUDA is not available; falling back to CPU.")

model = SentenceTransformer(model_name, device=device)

# Example fake data with the right types (for testing without the CSV):
# from faker import Faker
# num_examples = 10_000
# fake = Faker()
# train_data = [
#     (fake.city(), fake.city(), np.random.rand())
#     for _ in range(num_examples)
# ]

# Expected columns (all read below): "city_from", "city_to", "distance".
data = pd.read_csv("city_distances_sample.csv")

# NOTE(review): presumably half of Earth's circumference in km -- confirm the
# unit matches the "distance" column.
MAX_DISTANCE = 20_037.5  # global max distance
# MAX_DISTANCE = data["distance"].max()  # about 5k

print(f"{MAX_DISTANCE=}")

# Turn each distance into a similarity label: 1 at distance 0, falling
# linearly to 0 at MAX_DISTANCE. The label is computed on the whole column
# and the columns are zipped, which avoids building one Series per row the
# way DataFrame.iterrows() does; .tolist() yields plain Python scalars.
similarity = 1 - data["distance"] / MAX_DISTANCE
train_data = list(
    zip(
        data["city_from"].tolist(),
        data["city_to"].tolist(),
        similarity.tolist(),
    )
)

# Shuffle the (city_from, city_to, label) tuples in place; the fixed seed
# keeps the order reproducible between runs.
np.random.seed(1992)
np.random.shuffle(train_data)

# Wrap every tuple in an InputExample: the two city names are the text pair
# and the third element is the label.
examples = [
    InputExample(texts=[city_from, city_to], label=label)
    for city_from, city_to, label in train_data
]

# Perform train-test split: 20% of the examples are held out for validation.
train_examples, val_examples = train_test_split(
    examples, test_size=0.2, random_state=21
)
# validation examples can be something like templated sentences
# that maintain the same distance as the cities (same context)
# should probably add training examples like that too if needed

batch_size = 16
num_examples = len(train_examples)

# The size reported is that of the training split; the rows printed are the
# first ten tuples of the shuffled, pre-split train_data list.
print(f"\nHead of training data (size: {num_examples}):")
print(train_data[:10], "\n")

# Create the DataLoader for the training set (validation goes through the
# evaluator instead). batch_size is passed by name so the loader cannot get
# out of sync with the value defined above.
train_dataloader = DataLoader(
    train_examples, shuffle=True, batch_size=batch_size
)

# Number of batches per epoch as reported by the DataLoader. Unlike
# num_examples // batch_size this also counts a final partial batch, so
# settings derived from it line up with real epoch boundaries.
steps_per_epoch = len(train_dataloader)

print("TRAINING")

# Keyword arguments forwarded to model.fit(), assembled from two themed
# groups. Key order is kept stable so the printed summary does not change.
schedule_args = {
    "epochs": 5,
    "warmup_steps": 500,
    "optimizer_params": {"lr": 2e-5},
    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
    "scheduler": "WarmupLinear",
}
checkpoint_args = {
    "checkpoint_path": "./checkpoints_absmax_split",
    "checkpoint_save_steps": steps_per_epoch,
    "checkpoint_save_total_limit": 20,
}
training_args = {
    "output_path": "./output",
    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
    **schedule_args,
    "save_best_model": True,
    **checkpoint_args,
}
print(f"TRAINING ARGUMENTS:\n {training_args}")

# Evaluator built from the held-out validation examples; write_csv=True is
# passed through to the evaluator.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples,
    write_csv=True,
)

# Loss object constructed around the model, paired with the training loader
# as the single training objective.
train_loss = losses.CosineSimilarityLoss(model)
train_objectives = [(train_dataloader, train_loss)]

# Run training with the arguments assembled in training_args.
model.fit(
    train_objectives=train_objectives,
    evaluator=evaluator,
    **training_args,
)