From b14a33c984270093f4ccfdc0c8d6383609860e00 Mon Sep 17 00:00:00 2001
From: mm
Date: Thu, 4 May 2023 10:03:15 +0000
Subject: [PATCH] initial commit, working code

---
 .gitignore        |   4 ++
 Makefile          |  17 +++++++
 debug_distance.py |  45 ++++++++++++++++++
 eval.py           |  78 ++++++++++++++++++++++++++++++
 generate_data.py  | 118 ++++++++++++++++++++++++++++++++++++++++++++++
 train.py          |  96 +++++++++++++++++++++++++++++++++++++
 6 files changed, 358 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 debug_distance.py
 create mode 100644 eval.py
 create mode 100644 generate_data.py
 create mode 100644 train.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fb1d41c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+checkpoints*
+plots*
+*.csv
+output/
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8f5e786
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+city_distances.csv: lint generate_data.py
+	bash -c 'time python generate_data.py'
+
+lint:
+	isort --profile=black .
+	black .
+	flake8 --max-line-length=88 .
+
+train: lint train.py
+	bash -c 'time python train.py'
+
+eval: lint eval.py
+	bash -c 'time python eval.py'
+
+clean:
+	rm -rf output/
+	rm -rf checkpoints/
\ No newline at end of file
diff --git a/debug_distance.py b/debug_distance.py
new file mode 100644
index 0000000..d0123c6
--- /dev/null
+++ b/debug_distance.py
@@ -0,0 +1,45 @@
+import geonamescache
+from geopy.distance import geodesic
+
+gc = geonamescache.GeonamesCache()
+cities = gc.get_cities()
+us_cities = {k: c for k, c in cities.items() if c.get("countrycode") == "US"}
+
+print(gc.search_cities("Jamaica"), "\n")
+print(gc.search_cities("Manhattan"), "\n")
+print("lengths:", len(cities), len(us_cities))
+
+
+def get_coordinates(city_name, country_code="US"):
+    search_results = gc.search_cities(city_name, case_sensitive=True)
+    for city in search_results:
+        print(f"searching {city}")
+        possible_matches = city.get("alternatenames") + [city_name]
+        if city_name in possible_matches and city.get("countrycode") == country_code:
+            return city.get("latitude"), city.get("longitude")
+    return None
+
+
+def get_distance(city1, city2, country1="US", country2="US"):
+    city1_coords = get_coordinates(city1, country1)
+    city2_coords = get_coordinates(city2, country2)
+
+    if city1_coords is None or city2_coords is None:
+        return None
+
+    return geodesic(city1_coords, city2_coords).km
+
+
+MAX_DISTANCE = 20_037.5
+
+city1 = "New York"
+city2 = "Jamaica"
+country1 = "US"
+country2 = "US"
+
+distance = get_distance(city1, city2, country1, country2)
+
+if distance is not None:
+    print(f"Distance between {city1} and {city2} is {distance:.2f} km.")
+else:
+    print("One or both city names were not found.")
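Note on debug_distance.py: it exists because name lookup is ambiguous ("Jamaica" is both a country and a neighborhood in Queens, New York), so the script prints the raw search results before trusting a distance. The geodesic call it relies on can be sanity-checked on its own; a minimal sketch, using illustrative coordinates rather than geonamescache output:

    # Standalone check of the geodesic call used throughout this patch.
    # Coordinates are illustrative approximations, not geonamescache values.
    from geopy.distance import geodesic

    new_york = (40.7128, -74.0060)  # (latitude, longitude)
    los_angeles = (34.0522, -118.2437)

    # geodesic() measures along the WGS-84 ellipsoid; .km gives kilometers.
    print(f"{geodesic(new_york, los_angeles).km:.0f} km")  # roughly 3,900 km
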
diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..7874dae
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,78 @@
+import glob
+import logging
+import os
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from sentence_transformers import LoggingHandler, SentenceTransformer
+
+# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+# from sklearn.model_selection import train_test_split
+
+if not os.path.exists("./plots"):
+    os.mkdir("./plots")
+
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+    handlers=[LoggingHandler()],
+)
+
+
+def evaluate(model, city_from, city_to):
+    city_to = model.encode(city_to)
+    city_from = model.encode(city_from)
+    return np.dot(city_to, city_from) / (
+        np.linalg.norm(city_to) * np.linalg.norm(city_from)
+    )
+
+
+def calculate_similarity(data, base_model, trained_model):
+    # MAX_DISTANCE = 20_037.5
+    # data["distance"] /= MAX_DISTANCE
+    data["similarity_before"] = data.apply(
+        lambda x: evaluate(base_model, x["city_from"], x["city_to"]), axis=1
+    )
+
+    data["similarity_after"] = data.apply(
+        lambda x: evaluate(trained_model, x["city_from"], x["city_to"]), axis=1
+    )
+    return data
+
+
+def make_plot(data):
+    fig, ax = plt.subplots()
+
+    ax.scatter(
+        data["distance"],
+        data["similarity_before"],
+        color="r",
+        alpha=0.1,
+        label="before",
+    )
+    ax.scatter(
+        data["distance"], data["similarity_after"], color="b", alpha=0.1, label="after"
+    )
+    ax.set_xlabel("distance between cities (km)")
+    ax.set_ylabel("similarity between vectors\n(cosine)")
+    fig.legend(loc="upper right")
+    return fig
+
+
+if __name__ == "__main__":
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    base_model = SentenceTransformer(model_name, device="cuda")
+
+    data = pd.read_csv("city_distances_sample.csv")
+    # data_sample = data.sample(1_000)
+    checkpoint_dir = "checkpoints_absmax_split"  # no slash
+    for checkpoint in sorted(glob.glob(f"{checkpoint_dir}/*")):
+        data_sample = data.sample(1_000)
+        trained_model = SentenceTransformer(checkpoint, device="cuda")
+
+        data_sample = calculate_similarity(data_sample, base_model, trained_model)
+        fig = make_plot(data_sample)
+        fig.savefig(f"./plots/progress_{os.path.basename(checkpoint)}.png", dpi=600)
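eval.py computes the cosine score by hand with numpy. For reference, sentence-transformers ships a helper that produces the same value; a minimal equivalent sketch using the same base model:

    # Sketch: the score evaluate() computes, via the library's util helper.
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    a = model.encode("New York")
    b = model.encode("Chicago")
    print(float(util.cos_sim(a, b)))  # same value as the manual dot/norm form
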
+ """ + city1_coords = get_coordinates(city1, country1) + city2_coords = get_coordinates(city2, country2) + + if city1_coords is None or city2_coords is None: + return None + + return geodesic(city1_coords, city2_coords).km + + +def calculate_distance(pair): + city1, city2 = pair + distance = get_distance(city1["name"], city2["name"]) + return city1["name"], city2["name"], distance + + +def main(): + cities = list(us_cities.values()) + print(f"Num cities: {len(cities)}") + city_combinations = list(itertools.combinations(cities, 2)) + + with open("city_distances_full.csv", "w", newline="") as csvfile: + fieldnames = ["city_from", "city_to", "distance"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + try: + executor = concurrent.futures.ProcessPoolExecutor(max_workers=8) + # results = executor.map(calculate_distance, city_combinations) + futures = { + executor.submit(calculate_distance, pair): pair + for pair in city_combinations + } + for future in as_completed(futures): + city_from, city_to, distance = future.result() + if distance is not None: + writer.writerow( + { + "city_from": city_from, + "city_to": city_to, + "distance": distance, + } + ) + except KeyboardInterrupt: + print("Interrupted. Terminating processes...") + executor.shutdown(wait=False) + raise SystemExit("Execution terminated by user.") + + +if __name__ == "__main__": + main() diff --git a/train.py b/train.py new file mode 100644 index 0000000..94b3eac --- /dev/null +++ b/train.py @@ -0,0 +1,96 @@ +import logging + +import numpy as np +import pandas as pd +from sentence_transformers import ( + InputExample, + LoggingHandler, + SentenceTransformer, + losses, +) +from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader + +# Configure logging +logging.basicConfig( + format="%(asctime)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + handlers=[LoggingHandler()], +) + +model_name = "sentence-transformers/all-MiniLM-L6-v2" +model = SentenceTransformer(model_name, device="cuda") +# num_examples = 10_000 + +# Perform train-test split +# Example fake data with right types (for testing) +# import faker +# fake = Faker() +# train_data = [ +# (fake.city(), fake.city(), np.random.rand()) +# for _ in range(num_examples) +# ] +data = pd.read_csv("city_distances_sample.csv") +MAX_DISTANCE = 20_037.5 # global max distance +# MAX_DISTANCE = data["distance"].max() # about 5k + +print(f"{MAX_DISTANCE=}") +train_data = [ + (row["city_from"], row["city_to"], 1 - row["distance"] / MAX_DISTANCE) + for _, row in data.iterrows() +] + +np.random.seed(1992) +np.random.shuffle(train_data) +train_examples = examples = [ + InputExample(texts=[city_from, city_to], label=dist) + for city_from, city_to, dist in train_data +] + +train_examples, val_examples = train_test_split( + examples, test_size=0.2, random_state=21 +) +# validation examples can be something like templated sentences +# that maintain the same distance as the cities (same context) +# should probably add training examples like that too if needed +batch_size = 16 +num_examples = len(train_examples) +steps_per_epoch = num_examples // batch_size + +print(f"\nHead of training data (size: {num_examples}):") +print(train_data[:10], "\n") + +# Create DataLoaders for train and validation datasets +train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) + +print("TRAINING") +# Configure the training arguments 
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..94b3eac
--- /dev/null
+++ b/train.py
@@ -0,0 +1,96 @@
+import logging
+
+import numpy as np
+import pandas as pd
+from sentence_transformers import (
+    InputExample,
+    LoggingHandler,
+    SentenceTransformer,
+    losses,
+)
+from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader
+
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
+    handlers=[LoggingHandler()],
+)
+
+model_name = "sentence-transformers/all-MiniLM-L6-v2"
+model = SentenceTransformer(model_name, device="cuda")
+# num_examples = 10_000
+
+# Perform train-test split
+# Example fake data with right types (for testing)
+# import faker
+# fake = Faker()
+# train_data = [
+#     (fake.city(), fake.city(), np.random.rand())
+#     for _ in range(num_examples)
+# ]
+data = pd.read_csv("city_distances_sample.csv")
+MAX_DISTANCE = 20_037.5  # global max distance
+# MAX_DISTANCE = data["distance"].max()  # about 5k
+
+print(f"{MAX_DISTANCE=}")
+train_data = [
+    (row["city_from"], row["city_to"], 1 - row["distance"] / MAX_DISTANCE)
+    for _, row in data.iterrows()
+]
+
+np.random.seed(1992)
+np.random.shuffle(train_data)
+examples = [
+    InputExample(texts=[city_from, city_to], label=dist)
+    for city_from, city_to, dist in train_data
+]
+
+train_examples, val_examples = train_test_split(
+    examples, test_size=0.2, random_state=21
+)
+# validation examples can be something like templated sentences
+# that maintain the same distance as the cities (same context)
+# should probably add training examples like that too if needed
+batch_size = 16
+num_examples = len(train_examples)
+steps_per_epoch = num_examples // batch_size
+
+print(f"\nHead of training data (size: {num_examples}):")
+print(train_data[:10], "\n")
+
+# Create a DataLoader for the training dataset
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
+
+print("TRAINING")
+# Configure the training arguments
+training_args = {
+    "output_path": "./output",
+    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
+    "epochs": 5,
+    "warmup_steps": 500,
+    "optimizer_params": {"lr": 2e-5},
+    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
+    "scheduler": "WarmupLinear",
+    "save_best_model": True,
+    "checkpoint_path": "./checkpoints_absmax_split",
+    "checkpoint_save_steps": steps_per_epoch,
+    "checkpoint_save_total_limit": 20,
+}
+print(f"TRAINING ARGUMENTS:\n {training_args}")
+
+train_loss = losses.CosineSimilarityLoss(model)
+
+# Create an evaluator for validation dataset
+evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
+    val_examples, write_csv=True
+)
+
+model.fit(
+    train_objectives=[(train_dataloader, train_loss)],
+    evaluator=evaluator,
+    **training_args,
+)
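Once training finishes, each checkpoint written under ./checkpoints_absmax_split loads like any other SentenceTransformer, which is what eval.py's glob loop relies on. A minimal usage sketch; the step-numbered subdirectory name is an assumption, substitute one that actually exists:

    # Sketch: load one saved checkpoint and score a city pair.
    # "5000" is a hypothetical step number; pick a directory that exists.
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("./checkpoints_absmax_split/5000")
    a, b = model.encode("New York"), model.encode("Boston")
    score = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(f"cosine similarity: {score:.3f}")  # higher means predicted closer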