Variable Horizon in seals/CartPole
from imitation.algorithms.adversarial.airl import AIRL
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import MlpPolicy
import gym
import seals  # registers the seals/* environments

learners_rewards_after_training = []
learners_rewards_before_training = []

venv = DummyVecEnv([lambda: gym.make("seals/CartPole-v0")] * 8)
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
)
reward_net = BasicShapedRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)
# `rollouts` are the expert demonstrations generated earlier in the notebook (not shown here).
airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

for i in range(10):
    learner_rewards_before_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_before_training.append(learner_rewards_before_training)
    airl_trainer.train(20000)  # Note: set to 300000 for better results
    learner_rewards_after_training, _ = evaluate_policy(
        learner, venv, 100, return_episode_rewards=True
    )
    learners_rewards_after_training.append(learner_rewards_after_training)
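For completeness, `rollouts` above are the expert demonstrations produced earlier in the notebook. A rough sketch of how they are obtained is below; the expert hyperparameters are abbreviated, and newer imitation releases also require an `rng` argument to `rollout.rollout`, so treat this as an outline rather than an exact reproduction.

from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper

# Train an expert on the same environment (hyperparameters abbreviated here).
expert = PPO(env=venv, policy=MlpPolicy)
expert.learn(100_000)

# RolloutInfoWrapper records full trajectories, including terminal observations.
rollout_venv = DummyVecEnv(
    [lambda: RolloutInfoWrapper(gym.make("seals/CartPole-v0"))] * 8
)
rollouts = rollout.rollout(
    expert,
    rollout_venv,
    rollout.make_sample_until(min_timesteps=None, min_episodes=60),
    # Newer imitation versions also expect an `rng` keyword argument here.
)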
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\944136942.py in <module>
41
42
---> 43 airl_trainer.train(20000) # Note: set to 300000 for better results
44 learner_rewards_after_training, _ = evaluate_policy(
45 learner, venv, 100, return_episode_rewards=True
c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train(self, total_timesteps, callback)
416 )
417 for r in tqdm.tqdm(range(0, n_rounds), desc="round"):
--> 418 self.train_gen(self.gen_train_timesteps)
419 for _ in range(self.n_disc_updates_per_round):
420 with networks.training(self.reward_train):
c:\users\stephane\documents\imitation\src\imitation\algorithms\adversarial\common.py in train_gen(self, total_timesteps, learn_kwargs)
385
386 gen_trajs, ep_lens = self.venv_buffering.pop_trajectories()
--> 387 self._check_fixed_horizon(ep_lens)
388 gen_samples = rollout.flatten_trajectories_with_rew(gen_trajs)
389 self._gen_replay_buffer.store(gen_samples)
c:\users\stephane\documents\imitation\src\imitation\algorithms\base.py in _check_fixed_horizon(self, horizons)
89 if len(horizons) > 1:
90 raise ValueError(
---> 91 f"Episodes of different length detected: {horizons}. "
92 "Variable horizon environments are discouraged -- "
93 "termination conditions leak information about reward. See"
ValueError: Episodes of different length detected: {548, 500}. Variable horizon environments are discouraged -- termination conditions leak information about reward. See https://imitation.readthedocs.io/en/latest/guide/variable_horizon.html for more information. If you are SURE you want to run imitation on a variable horizon task, then please pass in the flag: `allow_variable_horizon=True`.
When running the demo from https://github.com/HumanCompatibleAI/imitation/blob/master/examples/4_train_airl.ipynb with a for loop around the training steps, it produces episodes of different horizons, even though the seals environments are meant to have a fixed horizon.
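The error message itself suggests the check can be bypassed by passing `allow_variable_horizon=True`. Presumably that would look roughly like the snippet below when constructing the trainer, though it only silences the check and does not explain why a 548-step episode appears in seals/CartPole in the first place.

# Workaround suggested by the error message (silences the fixed-horizon check,
# does not address why episodes of length 548 appear):
airl_trainer = AIRL(
    demonstrations=rollouts,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=2048,
    n_disc_updates_per_round=4,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    allow_variable_horizon=True,
)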