Michael Pilosov, PhD
10 months ago
6 changed files with 178 additions and 40 deletions
@ -0,0 +1,82 @@ |
"""Random hyperparameter search over training configurations.

Builds the Cartesian product of the hyperparameter grids below, samples at
most NUM_JOBS combinations, and launches one training run per combination
via ``python newmain.py fit ...`` as a subprocess.
"""

import subprocess
import sys
from random import sample

# The imports below are only needed by the commented-out Lightning jobs-plugin
# path further down; kept here (commented) so that path is easy to restore.
# import numpy as np
# from lightning_sdk import Machine, Studio

# Maximum number of jobs to launch out of the full parameter grid.
NUM_JOBS = 100

# --- Alternative: run on Lightning via the jobs plugin ----------------------
# reference to the current studio
# if you run outside of Lightning, you can pass the Studio name
# studio = Studio()
# studio.install_plugin("jobs")
# job_plugin = studio.installed_plugins["jobs"]
# ----------------------------------------------------------------------------

# Hyperparameter grids. Earlier experiments swept alpha / lr continuously:
# alpha_values = list(np.round(np.linspace(2, 4, 21), 4))
# learning_rate_values = list(np.round(np.logspace(-5, -3, 21), 5))
# learning_rate_values = [5e-4]
learning_rate_values = [1e-2]
alpha_values = [0, 1, 2]
widths = [2**k for k in range(4, 15)]  # 16, 32, ..., 16384
batch_size_values = [256]
max_epochs_values = [100]
seeds = list(range(21, 1992))

# All possible combinations of hyperparameters, as
# (alpha, learning_rate, batch_size, max_epochs, seed, width) tuples.
all_params = [
    (alpha, lr, bs, me, s, w)
    for alpha in alpha_values
    for lr in learning_rate_values
    for bs in batch_size_values
    for me in max_epochs_values
    for s in seeds
    for w in widths
]


def build_command(params):
    """Return the shell command string for one training run.

    Parameters
    ----------
    params : tuple
        ``(alpha, lr, batch_size, max_epochs, seed, width)`` for this run.

    Returns
    -------
    str
        A single-line ``python newmain.py fit ...`` command.
    """
    a, lr, bs, me, s, w = params
    # NOTE(review): --trainer.fast_dev_run 1 runs a single batch and
    # --print_config makes the CLI print the merged config and exit; both
    # look like debugging leftovers that prevent a real training run — confirm.
    return " ".join(
        [
            "python newmain.py fit",
            f"--seed_everything {s}",
            f"--data.batch_size {bs}",
            "--data.train_size 0",
            "--data.val_size 100000",
            f"--model.alpha {a}",
            f"--model.width {w}",
            "--trainer.fast_dev_run 1",
            "--trainer.min_epochs 10",
            f"--trainer.max_epochs {me}",
            "--trainer.check_val_every_n_epoch 1",
            "--trainer.callbacks callbacks.SaveImageCallback",
            "--trainer.callbacks.init_args.final_dir out",
            "--trainer.callbacks.init_args.save_interval 0",
            "--optimizer torch.optim.Adam",
            f"--optimizer.init_args.lr {lr}",
            "--lr_scheduler lightning.pytorch.cli.ReduceLROnPlateau",
            "--lr_scheduler.init_args.patience 5",
            "--lr_scheduler.init_args.cooldown 10",
            "--lr_scheduler.init_args.factor 0.05",
            "--lr_scheduler.init_args.verbose true",
            "--print_config",
        ]
    )


def main():
    """Sample up to NUM_JOBS parameter combinations and run them sequentially."""
    # Random search with a job-count limit.
    search_params = sample(all_params, min(NUM_JOBS, len(all_params)))

    for params in search_params:
        cmd = build_command(params)
        # Cloud alternative (jobs plugin) — see commented setup above:
        # job_name = f"color2_{params[2]}_{params[0]}_{params[1]:2.2e}"
        # job_plugin.run(cmd, machine=Machine.T4, name=job_name)
        print(f"Running {params}: {cmd}")
        try:
            # Run the command and wait for it to complete.
            subprocess.run(cmd, shell=True, check=True)
        except subprocess.CalledProcessError as err:
            # One failed job should not abort the remaining sweep.
            print(f"Job failed with exit code {err.returncode}; continuing")
        except KeyboardInterrupt:
            print("Interrupted by user")
            sys.exit(1)


if __name__ == "__main__":
    main()
Loading…
Reference in new issue