stable-baselines3
stable-baselines3 copied to clipboard
Stuck when training with DDPG
❓ Question
Hello. When I use DDPG to train the third-party gym environment TORCS (a racing environment), the program stalls for a while shortly after training starts, and then the car keeps turning to the left. There is no problem when using stable_baselines; this happens only when using stable_baselines3.
# Total number of timesteps passed to model.learn().
TIMESTEPS = int(1e7) + 1

if __name__ == "__main__":
    # Build the TORCS gym environment.
    environment = gym.make('Torcs-v0')

    # Gaussian exploration noise, sized to the action dimensionality
    # (DDPG is deterministic, so exploration comes from this noise).
    action_dim = environment.action_space.shape[-1]
    exploration_noise = NormalActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.1 * np.ones(action_dim),
    )

    # Off-policy DDPG agent with the default MLP policy network.
    agent = DDPG(
        policy="MlpPolicy",
        env=environment,
        action_noise=exploration_noise,
        verbose=1,
        tensorboard_log="./ddpg_tensorboard/",
    )
    agent.learn(total_timesteps=TIMESTEPS)
Checklist
- [X] I have checked that there is no similar issue in the repo
- [X] I have read the documentation
- [X] If code there is, it is minimal and working
- [X] If code there is, it is formatted using the markdown code blocks for both code and stack traces.
Hello, is the simulation asynchronous?
Please fill in the custom gym env template completely. If you want a full video series on SB3 and car racing (with open-source code), you can take a look at: https://www.youtube.com/watch?v=ngK33h00iBE&list=PL42jkf1t1F7dFXE7f0VTeFLhW0ZEQ4XJV
Edited version: https://www.youtube.com/watch?v=jXVUig0muFI
It is not asynchronous. It is from gym_torcs; I did not modify it.
class TorcsEnv(gym.Env):
    """Gym environment wrapping the TORCS racing simulator (gym_torcs).

    The environment talks to a locally running TORCS process through a
    snakeoil UDP client (``self.client``); TORCS itself is launched,
    relaunched and killed via ``os.system`` shell commands.
    """

    # Stall detection in step() only kicks in after this many steps.
    terminal_judge_start = 500  # Speed limit is applied after this step
    # [km/h], episode terminates if car is running slower than this limit
    termination_limit_progress = 5
    # Target speed used by the automatic throttle controller in step().
    default_speed = 50

    # True until the first reset() has been performed.
    initial_reset = True

    def __init__(self):
        """Launch TORCS and define the action/observation spaces.

        NOTE(review): ``vision``/``throttle``/``gear_change`` are hard-coded
        to False here, so only the steering-only, sensor-based configuration
        is actually exercised.
        """
        self.vision = False
        self.throttle = False
        self.gear_change = False
        self.initial_run = True

        # launch torcs: kill any previous instance, start a fresh one, and
        # let autostart.sh drive it past the menus.
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nodamage -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

        # Dead code kept from gym_torcs (client is opened in reset() instead):
        """
        # Modify here if you use multiple tracks in the environment
        self.client = snakeoil3.Client(p=3101, vision=self.vision) # Open new UDP in vtorcs
        self.client.MAX_STEPS = np.inf
        client = self.client
        client.get_servers_input() # Get the initial input from torcs
        obs = client.S.d # Get the current full-observation from torcs
        """

        # Steering only, or steering + throttle, each in [-1, 1].
        if self.throttle is False:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
        else:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))

        if self.vision is False:
            # Bounds mirror the channel layout built in make_observaton():
            # 5 focus + 3 speeds + 36 opponents + 1 rpm + 19 track + 4 wheelSpinVel.
            high_list = [1. for _ in range(5)]  # focus
            high_list += [np.inf for _ in range(3)]  # speedX, Y, Z
            high_list += [1. for _ in range(36)]  # opponents
            high_list += [np.inf]  # rpm
            high_list += [1. for _ in range(19)]  # track
            high_list += [np.inf for _ in range(4)]  # wheelSpinVel
            high = np.array(high_list)
            low_list = [0. for _ in range(5)]
            low_list += [-np.inf for _ in range(3)]
            low_list += [0. for _ in range(36)]
            low_list += [-np.inf]
            low_list += [0. for _ in range(19)]
            low_list += [-np.inf for _ in range(4)]
            low = np.array(low_list)
            self.observation_space = spaces.Box(low=low, high=high)
        else:
            # TODO: vision obs space
            high = np.array(
                [1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255])
            low = np.array(
                [0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0])
            self.observation_space = spaces.Box(low=low, high=high)

    def step(self, u):
        """Advance the simulation by one server tick.

        :param u: raw agent action vector; mapped by agent_to_torcs()
            (with the flags in __init__, only u[0] = steering is used).
        :return: (observation, reward, done, info) per the classic gym API.

        NOTE(review): assumes reset() was called first — ``self.time_step``
        and ``self.client`` are only created there.
        """
        # convert thisAction to the actual torcs action dict
        client = self.client

        this_action = self.agent_to_torcs(u)

        # Apply Action — note action_torcs ALIASES client.R.d; writes to
        # either name mutate the same request dict sent to the server.
        action_torcs = client.R.d

        # Steering
        action_torcs['steer'] = this_action['steer']  # in [-1, 1]

        # Simple Automatic Throttle Control by Snakeoil
        if self.throttle is False:
            target_speed = self.default_speed
            # Nudge accel up/down toward the (steer-reduced) target speed;
            # the adjustment persists in client.R.d across steps.
            if client.S.d['speedX'] < target_speed - (client.R.d['steer']*50):
                client.R.d['accel'] += .01
            else:
                client.R.d['accel'] -= .01

            if client.R.d['accel'] > 0.2:
                client.R.d['accel'] = 0.2

            # Extra kick at very low speed (launch assist).
            if client.S.d['speedX'] < 10:
                client.R.d['accel'] += 1/(client.S.d['speedX']+.1)

            # Traction Control System: cut throttle when rear wheels
            # (indices 2, 3) spin much faster than front wheels (0, 1).
            if ((client.S.d['wheelSpinVel'][2]+client.S.d['wheelSpinVel'][3]) -
                    (client.S.d['wheelSpinVel'][0]+client.S.d['wheelSpinVel'][1]) > 5):
                action_torcs['accel'] -= .2
        else:
            action_torcs['accel'] = this_action['accel']

        # Automatic Gear Change by Snakeoil
        if self.gear_change is True:
            action_torcs['gear'] = this_action['gear']
        else:
            # Gear is pinned to 1; speed-based shifting below is disabled.
            action_torcs['gear'] = 1
            """
            if client.S.d['speedX'] > 50:
                action_torcs['gear'] = 2
            if client.S.d['speedX'] > 80:
                action_torcs['gear'] = 3
            if client.S.d['speedX'] > 110:
                action_torcs['gear'] = 4
            if client.S.d['speedX'] > 140:
                action_torcs['gear'] = 5
            if client.S.d['speedX'] > 170:
                action_torcs['gear'] = 6
            """

        # Save the previous full-obs from torcs for the reward calculation
        # (deepcopy needed because client.S.d is mutated in place below).
        obs_pre = copy.deepcopy(client.S.d)

        # One-Step Dynamics Update #################################
        # Apply the Agent's action into torcs
        client.respond_to_server()

        # Get the response of TORCS
        client.get_servers_input()

        # Get the current full-observation from torcs
        obs = client.S.d

        # Make an observation from a raw observation vector from TORCS
        self.observation = self.make_observaton(obs)

        # Reward setting Here #######################################
        # direction-dependent positive reward: forward speed projected
        # onto the track direction.
        track = np.array(obs['track'])
        sp = np.array(obs['speedX'])
        progress = sp*np.cos(obs['angle'])
        reward = progress

        # collision detection: any new damage overrides the reward.
        if obs['damage'] - obs_pre['damage'] > 0:
            reward = -1

        # Termination judgement #########################
        # NOTE(review): episode_terminate mirrors client.R.d['meta'] but is
        # unused below — the return value relies on 'meta' alone.
        episode_terminate = False
        if track.min() < 0:  # Episode is terminated if the car is out of track
            reward = - 1
            episode_terminate = True
            client.R.d['meta'] = True

        if self.terminal_judge_start < self.time_step:  # Episode terminates if the progress of agent is small
            if progress < self.termination_limit_progress:
                episode_terminate = True
                client.R.d['meta'] = True

        if np.cos(obs['angle']) < 0:  # Episode is terminated if the agent runs backward
            episode_terminate = True
            client.R.d['meta'] = True

        if client.R.d['meta'] is True:  # Send a reset signal
            self.initial_run = False
            client.respond_to_server()

        self.time_step += 1

        return self.get_obs(), reward, client.R.d['meta'], {}

    def reset(self, relaunch=False):
        """Start a new episode and return the first observation.

        Opens a fresh UDP client to TORCS on port 3101; when ``relaunch``
        is True the whole TORCS process is restarted first.
        """
        self.time_step = 0

        # On every reset after the first, tell the server the previous
        # episode is over before reconnecting.
        if self.initial_reset is not True:
            self.client.R.d['meta'] = True
            self.client.respond_to_server()

            ## TENTATIVE. Restarting TORCS every episode suffers the memory leak bug!
            if relaunch is True:
                self.reset_torcs()
                print("### TORCS is RELAUNCHED ###")

        # Modify here if you use multiple tracks in the environment
        # Open new UDP in vtorcs
        self.client = Client(p=3101, vision=self.vision)
        self.client.MAX_STEPS = np.inf

        client = self.client
        client.get_servers_input()  # Get the initial input from torcs

        obs = client.S.d  # Get the current full-observation from torcs
        self.observation = self.make_observaton(obs)

        self.last_u = None

        self.initial_reset = False
        return self.get_obs()

    def end(self):
        """Shut down the TORCS process."""
        os.system('pkill torcs')

    def get_obs(self):
        """Return the most recently processed observation."""
        return self.observation

    def reset_torcs(self):
        """Kill and relaunch the TORCS process (same sequence as __init__)."""
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nodamage -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

    def agent_to_torcs(self, u):
        """Map the raw agent action vector to a TORCS action dict.

        u[0] is always steering; u[1] (throttle) and u[2] (gear) are read
        only when the corresponding flags are enabled.
        """
        torcs_action = {'steer': u[0]}

        if self.throttle is True:  # throttle action is enabled
            torcs_action.update({'accel': u[1]})

        if self.gear_change is True:  # gear change action is enabled
            torcs_action.update({'gear': u[2]})

        return torcs_action

    def obs_vision_to_image_rgb(self, obs_image_vec):
        """Regroup a flat 64x64x3 pixel vector into a (4096, 3) uint8 array."""
        image_vec = obs_image_vec
        rgb = []
        temp = []

        # convert size 64x64x3 = 12288 to 64x64=4096 2-D list
        # with rgb values grouped together.
        # Format similar to the observation in openai gym
        # (stop value 12286 still yields last i = 12285, covering index 12287)
        for i in range(0, 12286, 3):
            temp.append(image_vec[i])
            temp.append(image_vec[i+1])
            temp.append(image_vec[i+2])
            rgb.append(temp)
            temp = []
        return np.array(rgb, dtype=np.uint8)

    def make_observaton(self, raw_obs):
        """Build the agent-facing observation from the raw TORCS dict.

        Range sensors are rescaled by /200 and speeds by /default_speed.
        Returns a flat float32 vector when vision is off; otherwise a
        namedtuple that additionally carries the RGB image.
        """
        if self.vision is False:
            # NOTE(review): `names` is unused in this branch.
            names = ['focus',
                     'speedX', 'speedY', 'speedZ',
                     'opponents',
                     'rpm',
                     'track',
                     'wheelSpinVel']
            focus = np.array(raw_obs['focus'], dtype=np.float32)/200.  # (5, )
            speedX = np.array(raw_obs['speedX'],
                              dtype=np.float32)/self.default_speed
            speedY = np.array(raw_obs['speedY'],
                              dtype=np.float32)/self.default_speed
            speedZ = np.array(raw_obs['speedZ'],
                              dtype=np.float32)/self.default_speed
            opponents = np.array(
                raw_obs['opponents'], dtype=np.float32)/200.  # (36, )
            rpm = np.array(raw_obs['rpm'], dtype=np.float32)
            track = np.array(raw_obs['track'], dtype=np.float32)/200.  # (19, )
            wheelSpinVel = np.array(
                raw_obs['wheelSpinVel'], dtype=np.float32)  # (4, )

            observation = np.hstack(
                (focus, speedX, speedY, speedZ, opponents, rpm, track, wheelSpinVel))
            return observation
        else:
            # TODO: vision obs space
            names = ['focus',
                     'speedX', 'speedY', 'speedZ',
                     'opponents',
                     'rpm',
                     'track',
                     'wheelSpinVel',
                     'img']
            # NOTE(review): 'Observaion' typo is the runtime typename — left as-is.
            Observation = col.namedtuple('Observaion', names)

            # Get RGB from observation (names[8] == 'img')
            image_rgb = self.obs_vision_to_image_rgb(raw_obs[names[8]])

            return Observation(focus=np.array(raw_obs['focus'], dtype=np.float32)/200.,
                               speedX=np.array(
                                   raw_obs['speedX'], dtype=np.float32)/self.default_speed,
                               speedY=np.array(
                                   raw_obs['speedY'], dtype=np.float32)/self.default_speed,
                               speedZ=np.array(
                                   raw_obs['speedZ'], dtype=np.float32)/self.default_speed,
                               opponents=np.array(
                                   raw_obs['opponents'], dtype=np.float32)/200.,
                               rpm=np.array(raw_obs['rpm'], dtype=np.float32),
                               track=np.array(
                                   raw_obs['track'], dtype=np.float32)/200.,
                               wheelSpinVel=np.array(
                                   raw_obs['wheelSpinVel'], dtype=np.float32),
                               img=image_rgb)