initial commit, working code

mm 2023-05-04 10:03:15 +00:00
commit b14a33c984
6 changed files with 358 additions and 0 deletions

4
.gitignore vendored Normal file

@@ -0,0 +1,4 @@
checkpoints*
plots*
*.csv
output/

17
Makefile Normal file

@@ -0,0 +1,17 @@
city_distances.csv: lint generate_data.py
	bash -c 'time python generate_data.py'

lint:
	isort --profile=black .
	black .
	flake8 --max-line-length=88 .

train: lint train.py
	bash -c 'time python train.py'

eval: lint eval.py
	bash -c 'time python eval.py'

clean:
	rm -rf output/
	rm -rf checkpoints/

45
debug_distance.py Normal file

@@ -0,0 +1,45 @@
import geonamescache
from geopy.distance import geodesic

gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
us_cities = {k: c for k, c in cities.items() if c.get("countrycode") == "US"}

print(gc.search_cities("Jamaica"), "\n")
print(gc.search_cities("Manhattan"), "\n")
print("lengths:", len(cities), len(us_cities))


def get_coordinates(city_name, country_code="US"):
    search_results = gc.search_cities(city_name, case_sensitive=True)
    for city in search_results:
        print(f"searching {city}")
        possible_matches = city.get("alternatenames") + [city_name]
        if city_name in possible_matches and city.get("countrycode") == country_code:
            return city.get("latitude"), city.get("longitude")
    return None


def get_distance(city1, city2, country1="US", country2="US"):
    city1_coords = get_coordinates(city1, country1)
    city2_coords = get_coordinates(city2, country2)
    if city1_coords is None or city2_coords is None:
        return None
    return geodesic(city1_coords, city2_coords).km


MAX_DISTANCE = 20_037.5
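# MAX_DISTANCE is roughly half of Earth's circumference (~40,075 km / 2), i.e.
# the largest possible geodesic distance between two points. It is unused in
# this debug script, but it matches the normalization constant used in train.py.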

city1 = "New York"
city2 = "Jamaica"
country1 = "US"
country2 = "US"

distance = get_distance(city1, city2, country1, country2)
if distance is not None:
    print(f"Distance between {city1} and {city2} is {distance:.2f} km.")
else:
    print("One or both city names were not found.")

78
eval.py Normal file

@@ -0,0 +1,78 @@
import glob
import logging
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sentence_transformers import LoggingHandler, SentenceTransformer

# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# from sklearn.model_selection import train_test_split

if not os.path.exists("./plots"):
    os.mkdir("./plots")

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)


def evaluate(model, city_from, city_to):
    city_to = model.encode(city_to)
    city_from = model.encode(city_from)
    return np.dot(city_to, city_from) / (
        np.linalg.norm(city_to) * np.linalg.norm(city_from)
    )
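# evaluate() returns the cosine similarity between the two city-name embeddings:
# the dot product divided by the product of the vector norms, so the value lies
# in [-1, 1], with 1 meaning the embeddings point in the same direction.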

def calculate_similarity(data, base_model, trained_model):
    # MAX_DISTANCE = 20_037.5
    # data["distance"] /= MAX_DISTANCE
    data["similarity_before"] = data.apply(
        lambda x: evaluate(base_model, x["city_from"], x["city_to"]), axis=1
    )
    data["similarity_after"] = data.apply(
        lambda x: evaluate(trained_model, x["city_from"], x["city_to"]), axis=1
    )
    return data


def make_plot(data):
    fig, ax = plt.subplots()
    ax.scatter(
        data["distance"],
        data["similarity_before"],
        color="r",
        alpha=0.1,
        label="before",
    )
    ax.scatter(
        data["distance"], data["similarity_after"], color="b", alpha=0.1, label="after"
    )
    ax.set_xlabel("distance between cities (km)")
    ax.set_ylabel("similarity between vectors\n(cosine)")
    fig.legend(loc="upper right")
    return fig


if __name__ == "__main__":
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    base_model = SentenceTransformer(model_name, device="cuda")

    data = pd.read_csv("city_distances_sample.csv")
    # data_sample = data.sample(1_000)

    checkpoint_dir = "checkpoints_absmax_split"  # no slash
    for checkpoint in sorted(glob.glob(f"{checkpoint_dir}/*")):
        data_sample = data.sample(1_000)
        trained_model = SentenceTransformer(checkpoint, device="cuda")
        data_sample = calculate_similarity(data_sample, base_model, trained_model)
        fig = make_plot(data_sample)
        fig.savefig(f"./plots/progress_{checkpoint.split('/')[1]}.png", dpi=600)

118
generate_data.py Normal file

@@ -0,0 +1,118 @@
import concurrent.futures
import csv
import itertools
from concurrent.futures import as_completed
from functools import lru_cache

import geonamescache
import numpy as np
from geopy.distance import geodesic

MAX_DISTANCE = 20_037.5

gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
us_cities = {
    k: c
    for k, c in cities.items()
    if (c.get("countrycode") == "US")  # & (c.get("population", 0) > 5e4)
}


@lru_cache(maxsize=None)
def get_coordinates(city_name, country_code="US"):
    """
    Get the coordinates of a city.

    Parameters
    ----------
    city_name : str
        The name of the city.
    country_code : str, optional
        The country code of the city, by default 'US'.

    Returns
    -------
    tuple
        A tuple containing the latitude and longitude of the city,
        or None if the city is not found.
    """
    search_results = gc.search_cities(city_name, case_sensitive=True)
    if not search_results:
        return None
    populations = [city.get("population") for city in search_results]
    city = search_results[np.argmax(populations)]
    return city.get("latitude"), city.get("longitude")


def get_distance(city1, city2, country1="US", country2="US"):
    """
    Get the distance between two cities in kilometers.

    Parameters
    ----------
    city1 : str
        The name of the first city.
    city2 : str
        The name of the second city.
    country1 : str, optional
        The country code of the first city, by default 'US'.
    country2 : str, optional
        The country code of the second city, by default 'US'.

    Returns
    -------
    float
        The distance between the two cities in kilometers,
        or None if one or both city names were not found.
    """
    city1_coords = get_coordinates(city1, country1)
    city2_coords = get_coordinates(city2, country2)
    if city1_coords is None or city2_coords is None:
        return None
    return geodesic(city1_coords, city2_coords).km


def calculate_distance(pair):
    city1, city2 = pair
    distance = get_distance(city1["name"], city2["name"])
    return city1["name"], city2["name"], distance


def main():
    cities = list(us_cities.values())
    print(f"Num cities: {len(cities)}")
    city_combinations = list(itertools.combinations(cities, 2))
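    # The number of pairs grows as n * (n - 1) / 2, so even a few thousand
    # cities already yield millions of rows; the distances are therefore
    # computed in a process pool and written to the CSV as futures complete.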
    with open("city_distances_full.csv", "w", newline="") as csvfile:
        fieldnames = ["city_from", "city_to", "distance"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        try:
            executor = concurrent.futures.ProcessPoolExecutor(max_workers=8)
            # results = executor.map(calculate_distance, city_combinations)
            futures = {
                executor.submit(calculate_distance, pair): pair
                for pair in city_combinations
            }
            for future in as_completed(futures):
                city_from, city_to, distance = future.result()
                if distance is not None:
                    writer.writerow(
                        {
                            "city_from": city_from,
                            "city_to": city_to,
                            "distance": distance,
                        }
                    )
        except KeyboardInterrupt:
            print("Interrupted. Terminating processes...")
            executor.shutdown(wait=False)
            raise SystemExit("Execution terminated by user.")


if __name__ == "__main__":
    main()

96
train.py Normal file

@@ -0,0 +1,96 @@
import logging

import numpy as np
import pandas as pd
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentenceTransformer,
    losses,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device="cuda")

# num_examples = 10_000
# Perform train-test split
# Example fake data with right types (for testing)
# import faker
# fake = Faker()
# train_data = [
#     (fake.city(), fake.city(), np.random.rand())
#     for _ in range(num_examples)
# ]
data = pd.read_csv("city_distances_sample.csv")

MAX_DISTANCE = 20_037.5  # global max distance
# MAX_DISTANCE = data["distance"].max()  # about 5k
print(f"{MAX_DISTANCE=}")

train_data = [
    (row["city_from"], row["city_to"], 1 - row["distance"] / MAX_DISTANCE)
    for _, row in data.iterrows()
]
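# The scaling above maps distance to a similarity target in [0, 1]:
# identical locations -> 1.0, antipodal points -> 0.0. As an illustrative
# example, a pair roughly 4,000 km apart (e.g. New York and Los Angeles)
# gets a label of about 1 - 4000 / 20037.5 ≈ 0.8.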

np.random.seed(1992)
np.random.shuffle(train_data)

train_examples = examples = [
    InputExample(texts=[city_from, city_to], label=dist)
    for city_from, city_to, dist in train_data
]
train_examples, val_examples = train_test_split(
    examples, test_size=0.2, random_state=21
)
# validation examples can be something like templated sentences
# that maintain the same distance as the cities (same context)
# should probably add training examples like that too if needed

batch_size = 16
num_examples = len(train_examples)
steps_per_epoch = num_examples // batch_size
print(f"\nHead of training data (size: {num_examples}):")
print(train_data[:10], "\n")

# Create DataLoaders for train and validation datasets
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

print("TRAINING")
# Configure the training arguments
training_args = {
    "output_path": "./output",
    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
    "epochs": 5,
    "warmup_steps": 500,
    "optimizer_params": {"lr": 2e-5},
    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
    "scheduler": "WarmupLinear",
    "save_best_model": True,
    "checkpoint_path": "./checkpoints_absmax_split",
    "checkpoint_save_steps": steps_per_epoch,
    "checkpoint_save_total_limit": 20,
}
print(f"TRAINING ARGUMENTS:\n {training_args}")

train_loss = losses.CosineSimilarityLoss(model)

# Create an evaluator for validation dataset
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, write_csv=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    **training_args,
)