lfads-torch error with ray

error with ray

Open rosskempner opened this issue 5 months ago • 2 comments

Hi,

I am running the second part of the tutorial, the 2_run_pbt.py script.

Here is my script:


import os
import shutil
from datetime import datetime
from pathlib import Path

from ray import tune
from ray.tune import CLIReporter
from ray.tune.search.basic_variant import BasicVariantGenerator

from lfads_torch.extensions.tune import (
    BinaryTournamentPBT,
    HyperParam,
    ImprovementRatioStopper,
)
from lfads_torch.run_model import run_model

# ---------- OPTIONS ----------
# Used below as the `logger.wandb_logger.project` override.
PROJECT_STR = "pbt"
# Used below to select both the `datamodule` and the `model` config.
DATASET_STR = "rouse_multisession_PCR"
# Date stamp (YYMMDD) used to tag this run.
RUN_TAG = datetime.now().strftime("%y%m%d")
# Directory the script is launched from; passed to `tune.run` as both `name`
# and `local_dir`. NOTE: this is a plain `str`, not a `pathlib.Path`.
RUN_DIR = os.getcwd()
# Hyperparameters handed to the PBT scheduler as `hyperparam_mutations`,
# keyed by dotted config path.
# NOTE(review): the two leading positional arguments look like (lower, upper)
# search bounds -- confirm against the `HyperParam` definition.
HYPERPARAM_SPACE = {
    "model.lr_init": HyperParam(
        1e-4, 1e-3, explore_wt=0.3, enforce_limits=True, init=1e-3
    ),
    "model.dropout_rate": HyperParam(
        0.0, 0.6, explore_wt=0.3, enforce_limits=True, sample_fn="uniform"
    ),
    "model.train_aug_stack.transforms.0.cd_rate": HyperParam(
        0.01, 0.99, explore_wt=0.3, enforce_limits=True, init=0.5, sample_fn="uniform"
    ),
    "model.kl_co_scale": HyperParam(1e-5, 1e-3, explore_wt=0.8),
    "model.kl_ic_scale": HyperParam(1e-5, 1e-3, explore_wt=0.8),
    "model.l2_gen_scale": HyperParam(1e-5, 1e-0, explore_wt=0.8),
    "model.l2_con_scale": HyperParam(1e-4, 1e-0, explore_wt=0.8),
}
# ------------------------------
# ------------------------------


# Function to keep dropout and CD rates in-bounds
def clip_config_rates(config):
    """Return a new dict with every "rate" entry of ``config`` capped at 0.99.

    Any key containing the substring ``"_rate"`` has its value replaced by
    ``min(value, 0.99)``; every other entry is copied through unchanged.
    Key order follows ``config`` and the input mapping is not modified.
    """
    clipped = {}
    for key, value in config.items():
        if "_rate" not in key:
            clipped[key] = value
        else:
            clipped[key] = min(value, 0.99)
    return clipped


# Initial search space: one `tune.sample_from` wrapper around each
# hyperparameter's `init` attribute, keyed by the same dotted config path.
init_space = dict(
    (hp_name, tune.sample_from(hp_spec.init))
    for hp_name, hp_spec in HYPERPARAM_SPACE.items()
)
# Config overrides that every trial must receive: they select the datamodule
# and model configs and fill in the W&B project name and tags.
mandatory_overrides = dict(
    [
        ("datamodule", DATASET_STR),
        ("model", DATASET_STR),
        ("logger.wandb_logger.project", PROJECT_STR),
        ("logger.wandb_logger.tags.1", DATASET_STR),
        ("logger.wandb_logger.tags.2", RUN_TAG),
    ]
)

# Copy this script into the run directory

# Run the hyperparameter search
# Metric that is minimized (`mode="min"`); also given to the stopper and shown
# by the progress reporter.
metric = "valid/recon_smth"
# Number of trials: passed as `num_samples` and to the stopper.
num_trials = 20
# Shared by the stopper and the PBT scheduler so both use the same schedule.
perturbation_interval = 15
burn_in_period = 50 + 15
analysis = tune.run(
    # Bind the fixed keyword arguments of `run_model` for every trial.
    tune.with_parameters(
        run_model,
        # NOTE(review): relative path. The `missing_cfg_file` TypeError in the
        # reported traceback suggests a config-not-found exception (presumably
        # Hydra's MissingConfigException) is raised inside the trials --
        # confirm this path resolves from wherever `run_model` looks it up.
        config_path="../../../configs/pbt.yaml",
        do_posterior_sample=False,
    ),
    metric=metric,
    mode="min",
    # RUN_DIR is used for both the experiment name and the results directory.
    name=RUN_DIR,
    stop=ImprovementRatioStopper(
        num_trials=num_trials,
        perturbation_interval=perturbation_interval,
        burn_in_period=burn_in_period,
        metric=metric,
        patience=4,
        min_improvement_ratio=5e-4,
    ),
    # Fixed overrides plus the per-hyperparameter initial samplers.
    config={**mandatory_overrides, **init_space},
    # Each trial requests 3 CPUs and half a GPU.
    resources_per_trial=dict(cpu=3, gpu=0.5),
    num_samples=num_trials,
    local_dir=RUN_DIR,
    # Fixed random state for the variant generator.
    search_alg=BasicVariantGenerator(random_state=0),
    scheduler=BinaryTournamentPBT(
        perturbation_interval=perturbation_interval,
        burn_in_period=burn_in_period,
        hyperparam_mutations=HYPERPARAM_SPACE,
    ),
    keep_checkpoints_num=1,
    verbose=1,
    progress_reporter=CLIReporter(
        metric_columns=[metric, "cur_epoch"],
        sort_by_metric=True,
    ),
    # Name each trial directory after the trial's string representation.
    trial_dirname_creator=lambda trial: str(trial),
)
# Copy the best model to a new folder so it is easy to identify.
# RUN_DIR may be a plain string (e.g. the result of `os.getcwd()`), and
# `str / str` raises TypeError, so coerce it to a Path before joining.
best_model_dir = Path(RUN_DIR) / "best_model"
shutil.copytree(analysis.best_logdir, best_model_dir)
# Switch working directory to this folder (usually handled by tune)
os.chdir(best_model_dir)
# Load the best model and run posterior sampling (skip training).
# The checkpoint folder keeps its original name inside the copied directory.
best_ckpt_dir = best_model_dir / Path(analysis.best_checkpoint._local_path).name
run_model(
    overrides=mandatory_overrides,
    checkpoint_dir=best_ckpt_dir,
    config_path="../../../configs/pbt.yaml",
    do_train=False,
)

And here is my error message:


== Status ==
Current time: 2024-09-05 16:47:51 (running for 00:00:22.52)
Memory usage on this node: 52.9/1007.7 GiB 
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 0/128 CPUs, 0/2 GPUs, 0.0/767.03 GiB heap, 0.0/186.26 GiB objects (0.0/1.0 accelerator_type:RTX)
Result logdir: /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs
Number of trials: 20/20 (19 ERROR, 1 PENDING)
+-----------------------+----------+-----------------------+----------------------+---------------------+---------------------+----------------------+----------------------+-----------------+------------------------+
| Trial name            | status   | loc                   |   model.dropout_rate |   model.kl_co_scale |   model.kl_ic_scale |   model.l2_con_scale |   model.l2_gen_scale |   model.lr_init |   ....train_aug_stack. |
|                       |          |                       |                      |                     |                     |                      |                      |                 |   transforms.0.cd_rate |
|-----------------------+----------+-----------------------+----------------------+---------------------+---------------------+----------------------+----------------------+-----------------+------------------------|
| run_model_0f720_00019 | PENDING  |                       |            0.183342  |         1.96506e-05 |         2.02864e-05 |          0.000313861 |          0.125441    |           0.001 |                    0.5 |
| run_model_0f720_00000 | ERROR    | 10.81.105.145:2546365 |            0.0998097 |         0.000145566 |         0.000562019 |          0.362005    |          0.260521    |           0.001 |                    0.5 |
| run_model_0f720_00001 | ERROR    | 10.81.105.145:2546872 |            0.428983  |         0.000678712 |         1.55435e-05 |          0.0184235   |          0.00489324  |           0.001 |                    0.5 |
| run_model_0f720_00002 | ERROR    | 10.81.105.145:2546875 |            0.103402  |         0.000216868 |         1.79223e-05 |          0.857996    |          0.0610244   |           0.001 |                    0.5 |
| run_model_0f720_00003 | ERROR    | 10.81.105.145:2546879 |            0.444794  |         0.000201164 |         0.000190564 |          0.177565    |          0.0100834   |           0.001 |                    0.5 |
| run_model_0f720_00004 | ERROR    | 10.81.105.145:2547180 |            0.373833  |         0.000137225 |         0.000604685 |          0.00033449  |          0.149631    |           0.001 |                    0.5 |
| run_model_0f720_00005 | ERROR    | 10.81.105.145:2547927 |            0.0448103 |         7.36163e-05 |         0.000395537 |          0.170101    |          0.00522518  |           0.001 |                    0.5 |
| run_model_0f720_00006 | ERROR    | 10.81.105.145:2547933 |            0.241651  |         1.58007e-05 |         2.34636e-05 |          0.581212    |          1.47931e-05 |           0.001 |                    0.5 |
| run_model_0f720_00007 | ERROR    | 10.81.105.145:2548169 |            0.412236  |         6.3678e-05  |         5.93763e-05 |          0.699554    |          0.398774    |           0.001 |                    0.5 |
| run_model_0f720_00008 | ERROR    | 10.81.105.145:2548173 |            0.117632  |         0.000613384 |         8.97283e-05 |          0.0157385   |          0.000186606 |           0.001 |                    0.5 |
| run_model_0f720_00009 | ERROR    | 10.81.105.145:2548890 |            0.122786  |         0.000559937 |         3.01349e-05 |          0.000128701 |          0.0237087   |           0.001 |                    0.5 |
| run_model_0f720_00010 | ERROR    | 10.81.105.145:2548893 |            0.395912  |         0.000156902 |         2.0803e-05  |          0.255673    |          0.269453    |           0.001 |                    0.5 |
| run_model_0f720_00011 | ERROR    | 10.81.105.145:2549739 |            0.324184  |         3.9222e-05  |         8.08373e-05 |          0.239034    |          0.000984247 |           0.001 |                    0.5 |
| run_model_0f720_00012 | ERROR    | 10.81.105.145:2549778 |            0.310792  |         3.46667e-05 |         0.00051653  |          0.005516    |          0.000158445 |           0.001 |                    0.5 |
| run_model_0f720_00013 | ERROR    | 10.81.105.145:2549785 |            0.19944   |         9.65847e-05 |         2.86928e-05 |          0.000929231 |          0.0169142   |           0.001 |                    0.5 |
| run_model_0f720_00014 | ERROR    | 10.81.105.145:2549795 |            0.400014  |         0.000282342 |         2.32923e-05 |          0.0177753   |          0.000776878 |           0.001 |                    0.5 |
| run_model_0f720_00015 | ERROR    | 10.81.105.145:2550830 |            0.412812  |         0.00033784  |         0.000113766 |          0.000269373 |          9.77298e-05 |           0.001 |                    0.5 |
| run_model_0f720_00016 | ERROR    | 10.81.105.145:2550831 |            0.339418  |         0.000107645 |         0.000920339 |          0.213078    |          0.00158166  |           0.001 |                    0.5 |
| run_model_0f720_00017 | ERROR    | 10.81.105.145:2550839 |            0.341545  |         7.79686e-05 |         2.2213e-05  |          0.000881869 |          0.00873363  |           0.001 |                    0.5 |
| run_model_0f720_00018 | ERROR    | 10.81.105.145:2550840 |            0.324184  |         0.000384731 |         3.67654e-05 |          0.00945664  |          0.00138483  |           0.001 |                    0.5 |
+-----------------------+----------+-----------------------+----------------------+---------------------+---------------------+----------------------+----------------------+-----------------+------------------------+
Number of errored trials: 19
+-----------------------+--------------+-------------------------------------------------------------------------------------------------------------------+
| Trial name            |   # failures | error file                                                                                                        |
|-----------------------+--------------+-------------------------------------------------------------------------------------------------------------------|
| run_model_0f720_00000 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00000/error.txt |
| run_model_0f720_00001 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00001/error.txt |
| run_model_0f720_00002 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00002/error.txt |
| run_model_0f720_00003 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00003/error.txt |
| run_model_0f720_00004 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00004/error.txt |
| run_model_0f720_00005 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00005/error.txt |
| run_model_0f720_00006 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00006/error.txt |
| run_model_0f720_00007 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00007/error.txt |
| run_model_0f720_00008 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00008/error.txt |
| run_model_0f720_00009 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00009/error.txt |
| run_model_0f720_00010 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00010/error.txt |
| run_model_0f720_00011 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00011/error.txt |
| run_model_0f720_00012 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00012/error.txt |
| run_model_0f720_00013 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00013/error.txt |
| run_model_0f720_00014 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00014/error.txt |
| run_model_0f720_00015 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00015/error.txt |
| run_model_0f720_00016 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00016/error.txt |
| run_model_0f720_00017 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00017/error.txt |
| run_model_0f720_00018 |            1 | /home/xx/Documents/xx_lab_projects/lfads_xx/tutorials/multisession/runs/run_model_0f720_00018/error.txt |
+-----------------------+--------------+-------------------------------------------------------------------------------------------------------------------+

(pid=2551608) WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
(pid=2551608) I0000 00:00:1725569274.455947 2551608 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
(pid=2551608) I0000 00:00:1725569274.490916 2551608 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
(pid=2551608) I0000 00:00:1725569274.671377 2551608 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
2024-09-05 16:47:55,277 ERROR serialization.py:371 -- Failed to unpickle serialized exception
Traceback (most recent call last):
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 46, in from_ray_exception
    return pickle.loads(ray_exception.serialized_exception)
TypeError: __init__() missing 1 required positional argument: 'missing_cfg_file'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/serialization.py", line 369, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/serialization.py", line 275, in _deserialize_object
    return RayError.from_bytes(obj)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 40, in from_bytes
    return RayError.from_ray_exception(ray_exception)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 49, in from_ray_exception
    raise RuntimeError(msg) from e
RuntimeError: Failed to unpickle serialized exception
2024-09-05 16:47:55,277 ERROR trial_runner.py:993 -- Trial run_model_0f720_00019: Error processing event.
ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/tune/execution/ray_trial_executor.py", line 1050, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/worker.py", line 2291, in get
    raise value
ray.exceptions.RaySystemError: System error: Failed to unpickle serialized exception
traceback: Traceback (most recent call last):
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 46, in from_ray_exception
    return pickle.loads(ray_exception.serialized_exception)
TypeError: __init__() missing 1 required positional argument: 'missing_cfg_file'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/serialization.py", line 369, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/_private/serialization.py", line 275, in _deserialize_object
    return RayError.from_bytes(obj)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 40, in from_bytes
    return RayError.from_ray_exception(ray_exception)
  File "/home/xx/anaconda3/envs/lfads-torch/lib/python3.9/site-packages/ray/exceptions.py", line 49, in from_ray_exception
    raise RuntimeError(msg) from e
RuntimeError: Failed to unpickle serialized exception

Sep 05 '24 21:09 rosskempner

lfads-torch lfads-torch copied to clipboard

error with ray

lfads-torch
lfads-torch copied to clipboard