citybert/train.py


								import logging


								import numpy as np

								import pandas as pd

								from sentence_transformers import (

								    InputExample,

								    LoggingHandler,

								    SentenceTransformer,

								    losses,

								)

								from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

								from sklearn.model_selection import train_test_split

								from torch.utils.data import DataLoader


								# Configure logging

								logging.basicConfig(

								    format="%(asctime)s - %(message)s",

								    datefmt="%Y-%m-%d %H:%M:%S",

								    level=logging.INFO,

								    handlers=[LoggingHandler()],

								)


								model_name = "sentence-transformers/all-MiniLM-L6-v2"

								model = SentenceTransformer(model_name, device="cuda")

								# num_examples = 10_000


								# Perform train-test split

								# Example fake data with right types (for testing)

								# import faker

								# fake = Faker()

								# train_data = [

								#     (fake.city(), fake.city(), np.random.rand())

								#     for _ in range(num_examples)

								# ]

								data = pd.read_csv("city_distances_full.csv")

								MAX_DISTANCE = 20_037.5  # global max distance

								# MAX_DISTANCE = data["distance"].max()  # about 5k


								print(f"{MAX_DISTANCE=}")

								train_data = [

								    (row["city_from"], row["city_to"], 1 - row["distance"] / MAX_DISTANCE)

								    for _, row in data.iterrows()

								]


								np.random.seed(1992)

								np.random.shuffle(train_data)

								train_examples = examples = [

								    InputExample(texts=[city_from, city_to], label=dist)

								    for city_from, city_to, dist in train_data

								]


								train_examples, val_examples = train_test_split(

								    examples, test_size=0.2, random_state=21

								)

								# validation examples can be something like templated sentences

								# that maintain the same distance as the cities (same context)

								# should probably add training examples like that too if needed

								BATCH_SIZE = 16 * 16

								num_examples = len(train_examples)

								steps_per_epoch = num_examples // BATCH_SIZE


								print(f"\nHead of training data (size: {num_examples}):")

								print(train_data[:10], "\n")


								# Create DataLoaders for train and validation datasets

								train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)


								print("TRAINING")

								# Configure the training arguments

								training_args = {

								    "output_path": "./output",

								    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch

								    "epochs": 10,

								    "warmup_steps": 500,

								    "optimizer_params": {"lr": 2e-5},

								    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.

								    "scheduler": "WarmupLinear",

								    "save_best_model": True,

								    "checkpoint_path": "./checkpoints",

								    "checkpoint_save_steps": steps_per_epoch,

								    "checkpoint_save_total_limit": 100,

								}

								print(f"TRAINING ARGUMENTS:\n {training_args}")


								train_loss = losses.CosineSimilarityLoss(model)


								# Create an evaluator for validation dataset

								evaluator = EmbeddingSimilarityEvaluator.from_input_examples(

								    val_examples, write_csv=True

								)


								model.fit(

								    train_objectives=[(train_dataloader, train_loss)],

								    evaluator=evaluator,

								    **training_args,

								)