TQC: ep_len_mean and ep_rew_mean do not match real values

🐛 Bug


I am currently using TQC (sb3-contrib version 2.3.0 / SB3 version 2.3.2) with a custom environment on gymnasium (version 0.28.1) and Isaac Sim as a simulator. I have found that the values SB3's TQC logs (ep_len_mean and ep_rew_mean) do not match what my environment calculates. I have checked my environment several times, and I can't find where the issue might come from.

You can find attached a screenshot showing the discrepancy. Screenshot from 2024-05-07 10-51-16 Screenshot from 2024-05-07 10-51-25

To Reproduce

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import os
from tasks.basetask import Basetask
from tasks.single_debris_rigid import SingleDebrisRigid
from tasks.single_debris_rigid_admitance import SingleDebrisRigidAdmitance

def task_selector(config):
    """Instantiate the task named by ``config["task"]["name"]``.

    Args:
        config: Mapping whose ``["task"]["name"]`` entry selects the task
            class; the whole mapping is forwarded to the task constructor.

    Returns:
        A ``SingleDebrisRigidAdmitance`` for ``"single_debris_rigid_admitance"``,
        otherwise a ``SingleDebrisRigid``.
    """
    task_name = config["task"]["name"]
    if task_name == "single_debris_rigid":
        return SingleDebrisRigid(config)
    elif task_name == "single_debris_rigid_admitance":
        return SingleDebrisRigidAdmitance(config)
    else:
        # The original fallback ``return`` was placed after another ``return``
        # and could never run, so unknown names silently produced ``None``.
        # NOTE(review): falling back to the rigid task looks like the intent
        # (an ``else:`` line appears to have been lost) -- confirm, or raise
        # ``ValueError`` here if unknown task names should be rejected.
        return SingleDebrisRigid(config)

class Glovebox(gym.Env):
    """Gymnasium environment that delegates observations, rewards and
    termination checks to a task object chosen by ``task_selector``.

    The environment itself tracks the step counter, the running episode
    reward and the time limit (``cfg_env["env"]["max_episode_length"]``).

    NOTE(review): several lines of the original listing were lost (the body
    of ``close``, the body of the frame-skip loop in ``step``, and part of
    ``__init__``). Those spots are marked below and must be restored from
    the real source before this class is usable.
    """

    # Gymnasium documents ``render_modes`` as a list of supported modes;
    # a bare string makes substring checks such as ``"h" in ...`` succeed.
    metadata = {"render_modes": ["human"]}

    def __init__(self, render_mode=None, cfg_env=None, cfg_task=None,  world=None) -> None:
        """Create the action/observation spaces and select the task.

        Args:
            render_mode: Stored unchanged on ``self.render_mode``.
            cfg_env: Mapping with an ``"env"`` section; this constructor reads
                ``num_actions``, ``num_observations`` and
                ``max_episode_length`` from it.
            cfg_task: Configuration forwarded to ``task_selector``.
            world: Simulator world handle.
        """
        self.render_mode = render_mode
        self.step_after_reset = 1
        self.current_step = 0
        self.episode_reward = 0
        # NOTE(review): the original line read ``self.episode_reward = 0 = world``,
        # i.e. two assignments were fused. The attribute name that received
        # ``world`` is not recoverable from the listing; ``self.world`` is a guess.
        self.world = world
        self.cfg = cfg_env
        self.prev_time = 0
        self.task = task_selector(cfg_task)
        self._num_actions = self.cfg["env"]["num_actions"]
        # number of observations depends on the number of debris on this version
        self.reward_range = (-float("inf"), float("inf"))

        self.action_space = spaces.Box(
            low=-1.0,
            high=1.0,
            shape=(self._num_actions,),
            dtype=np.float32,
        )
        # Maps the configured observation preset name to the length of the
        # observation vector.
        observation_map = {
            "ik_control": 15,
            "Small": 19,
            "Medium": 27,
            "Large": 44,
            "Xlarge": 41,
        }
        self.task.num_observations = self.cfg["env"]["num_observations"]
        self.observation_space = spaces.Box(
            low=-float("inf"),
            high=float("inf"),
            shape=(int(observation_map[self.cfg["env"]["num_observations"]]),),
            dtype=np.float32,
        )
        self._max_episode_length = self.cfg["env"]["max_episode_length"]

    # set up the main component of the environment such as the glovebox and the robot
    def set_up_env(self):
        """Announce that the environment has been added (prints only)."""
        print("--- Glovebox Environment added --")

    def include_glovebox(self, world):
        """Reference the glovebox USD file into the stage at ``/World``.

        Args:
            world: Unused by the visible code.
        """
        # include all the glovebox environment with the robot included
        # Only one robot is managed
        # Imported lazily so the module can be loaded without the simulator.
        import omni.isaac.core.utils.stage as stage_utils
        glovebox_usd_path = self.cfg["env"]["glovebox_usd_path"]
        stage_utils.add_reference_to_stage(usd_path=glovebox_usd_path, prim_path="/World")

    def get_dt(self):
        """Return ``self._dt``.

        NOTE(review): ``_dt`` is never assigned in the visible code, so this
        raises ``AttributeError`` unless it is set elsewhere.
        """
        return self._dt

    def get_observations(self):
        """Return the current observation as produced by the task."""
        return self.task.get_observations()

    def get_info(self):
        """Return ``0``.

        NOTE(review): Gymnasium ``info`` values are dicts; ``step`` and
        ``reset`` build their own ``{}`` and do not call this method.
        """
        return 0

    def seed(self, seed=None):
        """Re-seed the environment's random generator.

        The original signature took no ``seed`` argument yet read ``seed``
        before assigning it, which raised ``UnboundLocalError`` on every call.

        Args:
            seed: Optional integer seed; ``None`` draws a fresh one.

        Returns:
            A one-element list holding the seed actually used.
        """
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def close(self, seed=None, options=None):
        """Release environment resources.

        NOTE(review): the body of this method was missing from the original
        listing; it is a no-op here.
        """

    def compute_reward(self):
        """Return the reward computed by the task's ``calculate_metrics``."""
        return self.task.calculate_metrics()

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` is seeded as Gymnasium expects.
            options: Unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        # The original ignored ``seed`` entirely.
        super().reset(seed=seed)
        self.current_step = 0
        self.episode_reward = 0
        obs = self.get_observations()
        info = {}
        return obs, info

    def step(self, action):
        """Advance the episode by one environment step.

        Reaching ``max_episode_length`` is reported as ``truncated`` only.
        The original set ``terminated`` and ``truncated`` together, which
        tells the caller that a time-limit cutoff is a real terminal state.

        Args:
            action: Action chosen by the agent. NOTE(review): nothing in the
                visible code consumes it; the consuming call was presumably
                in the lost loop body.

        Returns:
            ``(observations, rewards, terminated, truncated, info)``.
        """
        self.current_step += 1
        for _ in range(self.cfg["env"]["skipframe"]):
            # NOTE(review): the loop body was missing from the original
            # listing. The per-frame simulation/action call must be restored
            # here; as written the loop does nothing.
            pass
        observations = self.get_observations()
        info = {}

        truncated = self.current_step >= self._max_episode_length
        if truncated:
            print("Number of Timestep : " + str(self.current_step))
            print("max episode length: " + str(self._max_episode_length))
            # Time limit reached: not a task-defined terminal state. As in the
            # original (short-circuit ``or``), the task is not queried here.
            terminated = False
        else:
            terminated = self.task.is_done(False)

        rewards = self.compute_reward()
        self.episode_reward = self.episode_reward + rewards
        # Report and clear the running total whenever the episode ends,
        # whether by task termination or by the time limit.
        if terminated or truncated:
            print("Total episode reward: " + str(self.episode_reward))
            self.episode_reward = 0
        return observations, rewards, terminated, truncated, info

System Info

GPU: RTX 4090 Cuda 12.2

Package Version Location

absl-py 2.1.0 aiobotocore 1.2.0 aiodns 2.0.0 aiofiles 0.4.0 aiohttp 3.8.3 aioitertools 0.7.1 aiosignal 1.3.1 antlr4-python3-runtime 4.9.3 anyio 3.7.1 appdirs 1.4.4 asteval 0.9.21 astunparse 1.6.3 async-timeout 4.0.2 attrs 20.1.0 azure-core 1.28.0 azure-identity 1.13.0 azure-storage-blob 12.17.0 boto3 1.26.63 botocore 1.20.49 cchardet 2.1.6 certifi 2023.5.7 cffi 1.15.1 charset-normalizer 2.1.1 click 8.1.3 cloudpickle 3.0.0 cmake construct 2.10.68 contourpy 1.2.1 coverage 6.1.2 cryptography 41.0.6 cycler 0.11.0 docker-pycreds 0.4.0 exceptiongroup 1.1.2 Farama-Notifications 0.0.4 fastapi 0.92.0 filelock 3.12.2 flatbuffers 24.3.25 fonttools 4.51.0 frozenlist 1.3.3 fsspec 2024.3.1 gast 0.5.4 gitdb 4.0.11 GitPython 3.1.43 Glovebox 0.0.1 /home/btabia/git/residual_soft_push/envs google-pasta 0.2.0 grpcio 1.62.1 gunicorn 20.1.0 gymnasium 0.28.1 h11 0.14.0 h5py 3.10.0 httptools 0.4.0 hydra-core 1.3.2 idna 3.4 idna-ssl 1.1.0 imageio 2.22.2 isodate 0.6.1 jax-jumpy 1.0.0 Jinja2 3.1.2 jmespath 0.10.0 jsonschema 3.2.0 keras 3.2.0 kiwisolver 1.4.4 libclang 18.1.1 lit 18.1.2 llvmlite 0.40.0 lxml 4.9.3 Markdown 3.6 markdown-it-py 3.0.0 MarkupSafe 2.1.3 matplotlib 3.7.1 mdurl 0.1.2 ml-dtypes 0.3.2 mpmath 1.3.0 msal 1.23.0 msal-extensions 1.0.0 multidict 6.0.4 namex 0.0.7 nest-asyncio 1.5.6 networkx 3.1 numba 0.57.0 numpy 1.23.5 numpy-quaternion 2022.4.3 nvidia-cublas-cu11 nvidia-cublas-cu12 nvidia-cuda-cupti-cu11 11.7.101 nvidia-cuda-cupti-cu12 12.1.105 nvidia-cuda-nvrtc-cu11 11.7.99 nvidia-cuda-nvrtc-cu12 12.1.105 nvidia-cuda-runtime-cu11 11.7.99 nvidia-cuda-runtime-cu12 12.1.105 nvidia-cudnn-cu11 nvidia-cudnn-cu12 nvidia-cufft-cu11 nvidia-cufft-cu12 nvidia-curand-cu11 nvidia-curand-cu12 nvidia-cusolver-cu11 nvidia-cusolver-cu12 nvidia-cusparse-cu11 nvidia-cusparse-cu12 nvidia-lula-no-cuda 0.9.1 nvidia-nccl-cu11 2.14.3 nvidia-nccl-cu12 2.20.5 nvidia-nvjitlink-cu12 12.4.127 nvidia-nvtx-cu11 11.7.91 nvidia-nvtx-cu12 12.1.105 nvidia-srl-base 0.9.0 nvidia-srl-math 0.8.0 nvidia-srl-usd 
0.13.0 nvidia-srl-usd-to-urdf 0.5.0 nvsmi 0.4.2 oauthlib 3.2.2 omegaconf 2.3.0 opt-einsum 3.3.0 optree 0.11.0 osqp 0.6.2.post8 packaging 23.0 pandas 2.2.1 pathtools 0.1.2 Pillow 9.2.0 Pint 0.20.1 pip 21.2.1+nv1 plotly 5.3.1 portalocker 2.7.0 protobuf 4.25.3 psutil 5.7.2 pycares 3.1.1 pycparser 2.21 pydantic 1.9.2 Pygments 2.17.2 pyparsing 3.0.9 pyperclip 1.8.0 pypng 0.20220715.0 pyrsistent 0.19.3 python-dateutil 2.8.2 python-multipart 0.0.6 pytz 2022.7.1 PyYAML 6.0.1 qdldl 0.1.5.post3 qrcode 7.4.2 requests 2.31.0 requests-oauthlib 1.3.1 rich 13.7.1 s3transfer 0.6.1 sb3_contrib 2.3.0 scipy 1.10.1 selenium 4.14.0 sentry-sdk 1.14.0 setproctitle 1.3.3 setuptools 68.0.0 setuptools-scm 8.0.4 six 1.16.0 smmap 5.0.1 sniffio 1.3.0 stable_baselines3 2.3.2 starlette 0.25.0 sympy 1.12 tensorboard 2.16.2 tensorboard-data-server 0.7.2 tensorflow-io-gcs-filesystem 0.36.0 termcolor 2.4.0 toml 0.10.1 tomli 2.0.1 torch 2.0.1 torchvision 0.15.2+cu118 tornado 6.2 triton 2.0.0 typing_extensions 4.11.0 tzdata 2024.1 urllib3 1.26.16 uvicorn 0.21.1 wandb 0.16.6 watchdog 0.10.4 webbot 0.34 websockets 10.3 Werkzeug 3.0.2 wheel 0.43.0 wrapt 1.10.10 yarl 1.8.2


