initial commit, working code

mm 2023-05-04 10:03:15 +00:00
commit b14a33c984
6 changed files with 358 additions and 0 deletions

4
.gitignore vendored Normal file

@@ -0,0 +1,4 @@
checkpoints*
plots*
*.csv
output/

17
Makefile Normal file

@@ -0,0 +1,17 @@
city_distances.csv: lint generate_data.py
	bash -c 'time python generate_data.py'

lint:
	isort --profile=black .
	black .
	flake8 --max-line-length=88 .

train: lint train.py
	bash -c 'time python train.py'

eval: lint eval.py
	bash -c 'time python eval.py'

clean:
	rm -rf output/
	rm -rf checkpoints/

45
debug_distance.py Normal file

@@ -0,0 +1,45 @@
import geonamescache
from geopy.distance import geodesic

gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
us_cities = {k: c for k, c in cities.items() if c.get("countrycode") == "US"}

print(gc.search_cities("Jamaica"), "\n")
print(gc.search_cities("Manhattan"), "\n")
print("lengths:", len(cities), len(us_cities))


def get_coordinates(city_name, country_code="US"):
    search_results = gc.search_cities(city_name, case_sensitive=True)
    for city in search_results:
        print(f"searching {city}")
        possible_matches = city.get("alternatenames") + [city_name]
        if city_name in possible_matches and city.get("countrycode") == country_code:
            return city.get("latitude"), city.get("longitude")
    return None


def get_distance(city1, city2, country1="US", country2="US"):
    city1_coords = get_coordinates(city1, country1)
    city2_coords = get_coordinates(city2, country2)
    if city1_coords is None or city2_coords is None:
        return None
    return geodesic(city1_coords, city2_coords).km


MAX_DISTANCE = 20_037.5
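# MAX_DISTANCE is roughly half of Earth's circumference (~40,075 km / 2), i.e.
# the largest possible geodesic distance between two points. It is unused in
# this debug script, but it matches the normalization constant used in train.py.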

city1 = "New York"
city2 = "Jamaica"
country1 = "US"
country2 = "US"

distance = get_distance(city1, city2, country1, country2)
if distance is not None:
    print(f"Distance between {city1} and {city2} is {distance:.2f} km.")
else:
    print("One or both city names were not found.")

78
eval.py Normal file

@@ -0,0 +1,78 @@
import glob
import logging
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sentence_transformers import LoggingHandler, SentenceTransformer

# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# from sklearn.model_selection import train_test_split

if not os.path.exists("./plots"):
    os.mkdir("./plots")

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)


def evaluate(model, city_from, city_to):
    city_to = model.encode(city_to)
    city_from = model.encode(city_from)
    return np.dot(city_to, city_from) / (
        np.linalg.norm(city_to) * np.linalg.norm(city_from)
    )
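# evaluate() returns the cosine similarity between the two city-name embeddings:
# the dot product divided by the product of the vector norms, so the value lies
# in [-1, 1], with 1 meaning the embeddings point in the same direction.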

def calculate_similarity(data, base_model, trained_model):
    # MAX_DISTANCE = 20_037.5
    # data["distance"] /= MAX_DISTANCE
    data["similarity_before"] = data.apply(
        lambda x: evaluate(base_model, x["city_from"], x["city_to"]), axis=1
    )
    data["similarity_after"] = data.apply(
        lambda x: evaluate(trained_model, x["city_from"], x["city_to"]), axis=1
    )
    return data


def make_plot(data):
    fig, ax = plt.subplots()
    ax.scatter(
        data["distance"],
        data["similarity_before"],
        color="r",
        alpha=0.1,
        label="before",
    )
    ax.scatter(
        data["distance"], data["similarity_after"], color="b", alpha=0.1, label="after"
    )
    ax.set_xlabel("distance between cities (km)")
    ax.set_ylabel("similarity between vectors\n(cosine)")
    fig.legend(loc="upper right")
    return fig


if __name__ == "__main__":
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    base_model = SentenceTransformer(model_name, device="cuda")

    data = pd.read_csv("city_distances_sample.csv")
    # data_sample = data.sample(1_000)

    checkpoint_dir = "checkpoints_absmax_split"  # no slash
    for checkpoint in sorted(glob.glob(f"{checkpoint_dir}/*")):
        data_sample = data.sample(1_000)
        trained_model = SentenceTransformer(checkpoint, device="cuda")
        data_sample = calculate_similarity(data_sample, base_model, trained_model)
        fig = make_plot(data_sample)
        fig.savefig(f"./plots/progress_{checkpoint.split('/')[1]}.png", dpi=600)

118
generate_data.py Normal file

@@ -0,0 +1,118 @@
import concurrent.futures
import csv
import itertools
from concurrent.futures import as_completed
from functools import lru_cache

import geonamescache
import numpy as np
from geopy.distance import geodesic

MAX_DISTANCE = 20_037.5

gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
us_cities = {
    k: c
    for k, c in cities.items()
    if (c.get("countrycode") == "US")  # & (c.get("population", 0) > 5e4)
}


@lru_cache(maxsize=None)
def get_coordinates(city_name, country_code="US"):
    """
    Get the coordinates of a city.

    Parameters
    ----------
    city_name : str
        The name of the city.
    country_code : str, optional
        The country code of the city, by default 'US'.

    Returns
    -------
    tuple
        A tuple containing the latitude and longitude of the city,
        or None if the city is not found.
    """
    search_results = gc.search_cities(city_name, case_sensitive=True)
    if not search_results:
        return None
    populations = [city.get("population") for city in search_results]
    city = search_results[np.argmax(populations)]
    return city.get("latitude"), city.get("longitude")


def get_distance(city1, city2, country1="US", country2="US"):
    """
    Get the distance between two cities in kilometers.

    Parameters
    ----------
    city1 : str
        The name of the first city.
    city2 : str
        The name of the second city.
    country1 : str, optional
        The country code of the first city, by default 'US'.
    country2 : str, optional
        The country code of the second city, by default 'US'.

    Returns
    -------
    float
        The distance between the two cities in kilometers,
        or None if one or both city names were not found.
    """
    city1_coords = get_coordinates(city1, country1)
    city2_coords = get_coordinates(city2, country2)
    if city1_coords is None or city2_coords is None:
        return None
    return geodesic(city1_coords, city2_coords).km


def calculate_distance(pair):
    city1, city2 = pair
    distance = get_distance(city1["name"], city2["name"])
    return city1["name"], city2["name"], distance


def main():
    cities = list(us_cities.values())
    print(f"Num cities: {len(cities)}")
    city_combinations = list(itertools.combinations(cities, 2))
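    # The number of pairs grows as n * (n - 1) / 2, so even a few thousand
    # cities already yield millions of rows; the distances are therefore
    # computed in a process pool and written to the CSV as futures complete.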
    with open("city_distances_full.csv", "w", newline="") as csvfile:
        fieldnames = ["city_from", "city_to", "distance"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        try:
            executor = concurrent.futures.ProcessPoolExecutor(max_workers=8)
            # results = executor.map(calculate_distance, city_combinations)
            futures = {
                executor.submit(calculate_distance, pair): pair
                for pair in city_combinations
            }
            for future in as_completed(futures):
                city_from, city_to, distance = future.result()
                if distance is not None:
                    writer.writerow(
                        {
                            "city_from": city_from,
                            "city_to": city_to,
                            "distance": distance,
                        }
                    )
        except KeyboardInterrupt:
            print("Interrupted. Terminating processes...")
            executor.shutdown(wait=False)
            raise SystemExit("Execution terminated by user.")


if __name__ == "__main__":
    main()

96
train.py Normal file

@@ -0,0 +1,96 @@
import logging

import numpy as np
import pandas as pd
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentenceTransformer,
    losses,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name, device="cuda")

# num_examples = 10_000
# Perform train-test split
# Example fake data with right types (for testing)
# import faker
# fake = Faker()
# train_data = [
#     (fake.city(), fake.city(), np.random.rand())
#     for _ in range(num_examples)
# ]
data = pd.read_csv("city_distances_sample.csv")

MAX_DISTANCE = 20_037.5  # global max distance
# MAX_DISTANCE = data["distance"].max()  # about 5k
print(f"{MAX_DISTANCE=}")

train_data = [
    (row["city_from"], row["city_to"], 1 - row["distance"] / MAX_DISTANCE)
    for _, row in data.iterrows()
]
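# The scaling above maps distance to a similarity target in [0, 1]:
# identical locations -> 1.0, antipodal points -> 0.0. As an illustrative
# example, a pair roughly 4,000 km apart (e.g. New York and Los Angeles)
# gets a label of about 1 - 4000 / 20037.5 ≈ 0.8.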

np.random.seed(1992)
np.random.shuffle(train_data)

train_examples = examples = [
    InputExample(texts=[city_from, city_to], label=dist)
    for city_from, city_to, dist in train_data
]
train_examples, val_examples = train_test_split(
    examples, test_size=0.2, random_state=21
)
# validation examples can be something like templated sentences
# that maintain the same distance as the cities (same context)
# should probably add training examples like that too if needed

batch_size = 16
num_examples = len(train_examples)
steps_per_epoch = num_examples // batch_size
print(f"\nHead of training data (size: {num_examples}):")
print(train_data[:10], "\n")

# Create DataLoaders for train and validation datasets
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

print("TRAINING")
# Configure the training arguments
training_args = {
    "output_path": "./output",
    # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
    "epochs": 5,
    "warmup_steps": 500,
    "optimizer_params": {"lr": 2e-5},
    # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
    "scheduler": "WarmupLinear",
    "save_best_model": True,
    "checkpoint_path": "./checkpoints_absmax_split",
    "checkpoint_save_steps": steps_per_epoch,
    "checkpoint_save_total_limit": 20,
}
print(f"TRAINING ARGUMENTS:\n {training_args}")

train_loss = losses.CosineSimilarityLoss(model)

# Create an evaluator for validation dataset
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, write_csv=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    **training_args,
)