Compare commits
2 Commits: b14a33c984 ... fab8952d59

| Author | SHA1 | Date |
|---|---|---|
|  | fab8952d59 |  |
|  | c7b294c557 |  |
README.md (new file, +16 lines)
```diff
@@ -0,0 +1,16 @@
+# city-transformers
+
+Generates a dataset of cities (US only for now) and their geodesic distances.
+Uses that dataset to fine-tune a neural net to understand that cities closer to one another are more similar.
+Distances become `labels` through the formula `1 - distance/MAX_DISTANCE`, where `MAX_DISTANCE = 20_037.5  # km` represents half of the Earth's circumference.
+
+There are other factors that can make cities that are "close together" on the globe "far apart" in reality, due to political borders.
+Factors like this are not considered in this model; it only considers geography.
+
+However, for use-cases that involve different measures of distance (perhaps just time zones, or something that considers the reality of travel), the general principles proven here should be applicable (pick a metric, generate data, train).
+
+Particularly useful additions to the dataset here:
+- airports: they (more or less) have unique codes, and this semantic understanding would be helpful for search engines.
+- aliases for cities: the dataset used for city data (lat/lon) contains a pretty exhaustive list of aliases for the cities. It would be good to generate examples of these with a distance of 0 and train the model on this knowledge.
+
+see `Makefile` for instructions.
```
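The labeling recipe described in this README is simple enough to sketch end to end. The snippet below is a hypothetical illustration, not code from this repository: the city list, the `geopy` dependency, and the column names are all assumptions.

```python
# Hypothetical sketch of the README's data recipe; geopy and the column
# names below are assumptions, not confirmed by this repository.
from itertools import combinations

import pandas as pd
from geopy.distance import geodesic

MAX_DISTANCE = 20_037.5  # km, half of Earth's circumference (max geodesic distance)

cities = {  # illustrative sample; the real dataset covers US cities
    "New York": (40.7128, -74.0060),
    "Los Angeles": (34.0522, -118.2437),
    "Chicago": (41.8781, -87.6298),
}
aliases = {"New York": ["NYC"]}  # alias pairs get distance 0 -> label 1.0

rows = [
    (a, b, geodesic(pa, pb).km)
    for (a, pa), (b, pb) in combinations(cities.items(), 2)
]
rows += [(city, alias, 0.0) for city, names in aliases.items() for alias in names]

df = pd.DataFrame(rows, columns=["city_a", "city_b", "distance"])
df["label"] = 1 - df["distance"] / MAX_DISTANCE  # 1.0 = same place, 0.0 = antipodal
df.to_csv("city_distances_sample.csv", index=False)
```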
train.py (+3 −3)
```diff
@@ -32,7 +32,7 @@ model = SentenceTransformer(model_name, device="cuda")
 # (fake.city(), fake.city(), np.random.rand())
 # for _ in range(num_examples)
 # ]
-data = pd.read_csv("city_distances_sample.csv")
+data = pd.read_csv("city_distances_full.csv")
 MAX_DISTANCE = 20_037.5  # global max distance
 # MAX_DISTANCE = data["distance"].max()  # about 5k

```
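The commented-out alternative in this hunk is worth a note: normalizing by the global maximum (20,037.5 km) keeps US-only labels compressed near the top of the [0, 1] range, whereas normalizing by the dataset maximum (about 5,000 km) would spread them across the whole range. A quick comparison, assuming a `distance` column as in the snippet:

```python
# Compare the two normalizations visible in the hunk above.
import pandas as pd

data = pd.read_csv("city_distances_full.csv")  # assumed to have a "distance" column

global_labels = 1 - data["distance"] / 20_037.5               # global max (used)
local_labels = 1 - data["distance"] / data["distance"].max()  # dataset max (commented out)

# With US-only cities (max ~5,000 km), global_labels stay above ~0.75,
# while local_labels span the full [0, 1] range.
print(global_labels.min(), local_labels.min())
```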
```diff
@@ -70,7 +70,7 @@ print("TRAINING")
 training_args = {
     "output_path": "./output",
     # "evaluation_steps": steps_per_epoch,  # already evaluates at the end of each epoch
-    "epochs": 5,
+    "epochs": 20,
     "warmup_steps": 500,
     "optimizer_params": {"lr": 2e-5},
     # "weight_decay": 0,  # not sure if this helps but works fine without setting it.
@@ -78,7 +78,7 @@ training_args = {
     "save_best_model": True,
     "checkpoint_path": "./checkpoints_absmax_split",
     "checkpoint_save_steps": steps_per_epoch,
-    "checkpoint_save_total_limit": 20,
+    "checkpoint_save_total_limit": 100,
 }
 print(f"TRAINING ARGUMENTS:\n {training_args}")

```
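The keys in `training_args` match keyword arguments of `SentenceTransformer.fit()` from sentence-transformers, so the dict is presumably unpacked into that call. Below is a minimal sketch of that wiring; `model_name`, the batch size, the loss, and the CSV column names are assumptions, not shown in the diff.

```python
# Sketch of how training_args could feed SentenceTransformer.fit();
# model_name, the loss, batch size, and CSV columns are assumptions.
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model_name = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder
model = SentenceTransformer(model_name, device="cuda")

MAX_DISTANCE = 20_037.5  # global max distance
data = pd.read_csv("city_distances_full.csv")

examples = [
    InputExample(texts=[row.city_a, row.city_b],
                 label=1 - row.distance / MAX_DISTANCE)
    for row in data.itertuples()
]
train_dataloader = DataLoader(examples, shuffle=True, batch_size=64)
train_loss = losses.CosineSimilarityLoss(model)
steps_per_epoch = len(train_dataloader)

training_args = {
    "output_path": "./output",
    "epochs": 20,
    "warmup_steps": 500,
    "optimizer_params": {"lr": 2e-5},
    "save_best_model": True,
    "checkpoint_path": "./checkpoints_absmax_split",
    "checkpoint_save_steps": steps_per_epoch,
    "checkpoint_save_total_limit": 100,
}
model.fit(train_objectives=[(train_dataloader, train_loss)], **training_args)
```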