
Variable Horizon in seals/CartPole

lcotetur opened this issue 2 years ago · 0 comments

from imitation.algorithms.adversarial.airl import AIRL
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

import gym
import seals

learners_rewards_after_training = []
learners_rewards_before_training = []
venv = DummyVecEnv([lambda: gym.make("seals/CartPole-v0")] * 8)
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
)
reward_net = BasicShapedRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)
# `rollouts` (expert demonstrations) are collected as in the example notebook;
# they are not defined in this snippet.
airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

for i in range(10):
    learner_rewards_before_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_before_training.append(learner_rewards_before_training)

    airl_trainer.train(20000)  # Note: set to 300000 for better results

    learner_rewards_after_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_after_training.append(learner_rewards_after_training)
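
For reference, `rollouts` is not defined in the snippet above; in the linked notebook it is produced roughly as below. This is a minimal sketch: the expert training budget and episode count are placeholders, and newer imitation versions may additionally require an `rng` argument to `rollout.rollout`.

from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

# Train (or load) an expert policy on the same task.
expert = PPO(env=venv, policy=MlpPolicy, batch_size=64, ent_coef=0.0,
             learning_rate=0.0003, n_epochs=10)
expert.learn(100_000)  # placeholder training budget

# Collect expert demonstrations; RolloutInfoWrapper records full trajectories.
rollout_venv = DummyVecEnv(
    [lambda: RolloutInfoWrapper(gym.make("seals/CartPole-v0"))] * 8
)
rollouts = rollout.rollout(
    expert,
    rollout_venv,
    rollout.make_sample_until(min_timesteps=None, min_episodes=60),
)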
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\944136942.py in <module>
     41 
     42 
---> 43     airl_trainer.train(20000)  # Note: set to 300000 for better results
     44     learner_rewards_after_training, _ = evaluate_policy(
     45         learner, venv, 100, return_episode_rewards=True

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train(self, total_timesteps, callback)
    416         )
    417         for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
--> 418             self.train_gen(self.gen_train_timesteps)
    419             for _ in range(self.n_disc_updates_per_round):
    420                 with networks.training(self.reward_train):

c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train_gen(self, total_timesteps, learn_kwargs)
    385 
    386         gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
--> 387         self._check_fixed_horizon(ep_lens)
    388         gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
    389         self._gen_replay_buffer.store(gen_samples)

c:\users\stephane\documents\imitation\src\imitation\algorithms\base.py in _check_fixed_horizon(self, horizons)
     89         if len(horizons) > 1:
     90             raise ValueError(
---> 91                 f"Episodes of different length detected: {horizons}. "
     92                 "Variable horizon environments are discouraged -- "
     93                 "termination conditions leak information about reward. See"

ValueError: Episodes of different length detected: {548, 500}. Variable horizon environments are discouraged -- termination conditions leak information about reward. See https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html for more information. If you are SURE you want to run imitation on a variable horizon task, then please pass in the flag: `allow_variable_horizon=True`.
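
One way to narrow this down is to check the episode lengths the environment produces outside of AIRL. A minimal sketch using the same `evaluate_policy` call as above (the episode count of 20 is arbitrary):

# evaluate_policy with return_episode_rewards=True returns per-episode
# rewards and lengths; a fixed-horizon env should yield a single length.
_, episode_lengths = evaluate_policy(
    learner, venv, 20, return_episode_rewards=True
)
print(set(episode_lengths))  # e.g. a single value such as {500}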

When running the demo from https://github.com/HumanCompatibleAI/imitation/blob/master/examples/4_train_airl.ipynb with a for loop around the training steps, it produces episodes of different horizons.
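
If variable horizons really are acceptable for the task, the error message suggests the opt-in escape hatch below (a sketch; the documentation warns that termination conditions can leak information about the reward):

airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    allow_variable_horizon=True,  # flag named in the error message
)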

lcotetur · Jul 28 '22 22:07