diff --git a/Makefile b/Makefile
index cb5d8a0..e1d361f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ all: install data train eval
 
 city_distances_full.csv: check generate_data.py
 	@echo "Generating distance data..."
-	@bash -c 'time python generate_data.py -w 8 -c US -s 10000'
+	@bash -c 'time python generate_data.py --country US --workers 8 --chunk-size 8000'
 
 data: city_distances_full.csv
diff --git a/generate_data.py b/generate_data.py
index f580a6e..2fdb3da 100644
--- a/generate_data.py
+++ b/generate_data.py
@@ -8,6 +8,7 @@ from functools import lru_cache
 import geonamescache
 import numpy as np
 from geopy.distance import geodesic
+from tqdm import tqdm
 
 MAX_DISTANCE = 20_037.5
 
@@ -115,6 +116,7 @@ def main():
     cities = list(us_cities.values())
     print(f"Num cities: {len(cities)}")
     city_combinations = list(itertools.combinations(cities, 2))
+    # np.random.shuffle(city_combinations)  # will this help or hurt caching? 1.03it/s
     chunk_size = args.chunk_size
     num_chunks = len(city_combinations) // chunk_size + 1
     output_file = args.output_file
@@ -126,8 +128,13 @@ def main():
 
     try:
         executor = concurrent.futures.ProcessPoolExecutor(max_workers=args.workers)
-        for i in range(num_chunks):
-            print(f"Processing chunk {i}...")
+        for i in tqdm(
+            range(num_chunks),
+            total=num_chunks,
+            desc="Processing chunks",
+            ncols=100,
+            bar_format="{l_bar}{bar}{r_bar}",
+        ):
             chunk = city_combinations[(i * chunk_size) : (i + 1) * chunk_size]
             futures = {
                 executor.submit(calculate_distance, pair): pair for pair in chunk