stable-baselines3
stable-baselines3 copied to clipboard
Stuck when training with DDPG
❓ Question
Hello. When I use DDPG to train the third-party gym environment TORCS (a racing environment), the program stalls for a while shortly after training starts, and then the car keeps turning to the left. There is no problem when using stable_baselines; this happens only when using stable_baselines3.
# Total number of timesteps passed to model.learn().
TIMESTEPS = int(1e7) + 1

if __name__ == "__main__":
    # Build the TORCS gym environment.
    environment = gym.make('Torcs-v0')

    # Gaussian exploration noise, sized to the action dimensionality
    # (DDPG is deterministic, so exploration comes from this noise).
    action_dim = environment.action_space.shape[-1]
    exploration_noise = NormalActionNoise(
        mean=np.zeros(action_dim),
        sigma=0.1 * np.ones(action_dim),
    )

    # Off-policy DDPG agent with the default MLP policy network.
    agent = DDPG(
        policy="MlpPolicy",
        env=environment,
        action_noise=exploration_noise,
        verbose=1,
        tensorboard_log="./ddpg_tensorboard/",
    )
    agent.learn(total_timesteps=TIMESTEPS)
Checklist
- [X] I have checked that there is no similar issue in the repo
- [X] I have read the documentation
- [X] If code there is, it is minimal and working
- [X] If code there is, it is formatted using the markdown code blocks for both code and stack traces.
Hello, is the simulation asynchronous?
Please fill in the custom gym env template completely. If you want a full video series on SB3 and car racing (with open-source code), you can take a look at: https://www.youtube.com/watch?v=ngK33h00iBE&list=PL42jkf1t1F7dFXE7f0VTeFLhW0ZEQ4XJV
Edited version: https://www.youtube.com/watch?v=jXVUig0muFI
It is not asynchronous. It is from gym_torcs; I did not modify it.
class TorcsEnv(gym.Env):
    """Gym environment wrapping the TORCS racing simulator (gym_torcs).

    The environment talks to a locally running TORCS process through a
    snakeoil UDP client (``self.client``); TORCS itself is launched,
    relaunched and killed via ``os.system`` shell commands.
    """

    # Stall detection in step() only kicks in after this many steps.
    terminal_judge_start = 500  # Speed limit is applied after this step
    # [km/h], episode terminates if car is running slower than this limit
    termination_limit_progress = 5
    # Target speed used by the automatic throttle controller in step().
    default_speed = 50

    # True until the first reset() has been performed.
    initial_reset = True

    def __init__(self):
        """Launch TORCS and define the action/observation spaces.

        NOTE(review): ``vision``/``throttle``/``gear_change`` are hard-coded
        to False here, so only the steering-only, sensor-based configuration
        is actually exercised.
        """
        self.vision = False
        self.throttle = False
        self.gear_change = False
        self.initial_run = True

        # launch torcs: kill any previous instance, start a fresh one, and
        # let autostart.sh drive it past the menus.
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nodamage -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

        # Dead code kept from gym_torcs (client is opened in reset() instead):
        """
        # Modify here if you use multiple tracks in the environment
        self.client = snakeoil3.Client(p=3101, vision=self.vision) # Open new UDP in vtorcs
        self.client.MAX_STEPS = np.inf
        client = self.client
        client.get_servers_input() # Get the initial input from torcs
        obs = client.S.d # Get the current full-observation from torcs
        """

        # Steering only, or steering + throttle, each in [-1, 1].
        if self.throttle is False:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
        else:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))

        if self.vision is False:
            # Bounds mirror the channel layout built in make_observaton():
            # 5 focus + 3 speeds + 36 opponents + 1 rpm + 19 track + 4 wheelSpinVel.
            high_list = [1. for _ in range(5)]  # focus
            high_list += [np.inf for _ in range(3)]  # speedX, Y, Z
            high_list += [1. for _ in range(36)]  # opponents
            high_list += [np.inf]  # rpm
            high_list += [1. for _ in range(19)]  # track
            high_list += [np.inf for _ in range(4)]  # wheelSpinVel
            high = np.array(high_list)
            low_list = [0. for _ in range(5)]
            low_list += [-np.inf for _ in range(3)]
            low_list += [0. for _ in range(36)]
            low_list += [-np.inf]
            low_list += [0. for _ in range(19)]
            low_list += [-np.inf for _ in range(4)]
            low = np.array(low_list)
            self.observation_space = spaces.Box(low=low, high=high)
        else:
            # TODO: vision obs space
            high = np.array(
                [1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255])
            low = np.array(
                [0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0])
            self.observation_space = spaces.Box(low=low, high=high)

    def step(self, u):
        """Advance the simulation by one server tick.

        :param u: raw agent action vector; mapped by agent_to_torcs()
            (with the flags in __init__, only u[0] = steering is used).
        :return: (observation, reward, done, info) per the classic gym API.

        NOTE(review): assumes reset() was called first — ``self.time_step``
        and ``self.client`` are only created there.
        """
        # convert thisAction to the actual torcs action dict
        client = self.client

        this_action = self.agent_to_torcs(u)

        # Apply Action — note action_torcs ALIASES client.R.d; writes to
        # either name mutate the same request dict sent to the server.
        action_torcs = client.R.d

        # Steering
        action_torcs['steer'] = this_action['steer']  # in [-1, 1]

        # Simple Automatic Throttle Control by Snakeoil
        if self.throttle is False:
            target_speed = self.default_speed
            # Nudge accel up/down toward the (steer-reduced) target speed;
            # the adjustment persists in client.R.d across steps.
            if client.S.d['speedX'] < target_speed - (client.R.d['steer']*50):
                client.R.d['accel'] += .01
            else:
                client.R.d['accel'] -= .01

            if client.R.d['accel'] > 0.2:
                client.R.d['accel'] = 0.2

            # Extra kick at very low speed (launch assist).
            if client.S.d['speedX'] < 10:
                client.R.d['accel'] += 1/(client.S.d['speedX']+.1)

            # Traction Control System: cut throttle when rear wheels
            # (indices 2, 3) spin much faster than front wheels (0, 1).
            if ((client.S.d['wheelSpinVel'][2]+client.S.d['wheelSpinVel'][3]) -
                    (client.S.d['wheelSpinVel'][0]+client.S.d['wheelSpinVel'][1]) > 5):
                action_torcs['accel'] -= .2
        else:
            action_torcs['accel'] = this_action['accel']

        # Automatic Gear Change by Snakeoil
        if self.gear_change is True:
            action_torcs['gear'] = this_action['gear']
        else:
            # Gear is pinned to 1; speed-based shifting below is disabled.
            action_torcs['gear'] = 1
            """
            if client.S.d['speedX'] > 50:
                action_torcs['gear'] = 2
            if client.S.d['speedX'] > 80:
                action_torcs['gear'] = 3
            if client.S.d['speedX'] > 110:
                action_torcs['gear'] = 4
            if client.S.d['speedX'] > 140:
                action_torcs['gear'] = 5
            if client.S.d['speedX'] > 170:
                action_torcs['gear'] = 6
            """

        # Save the previous full-obs from torcs for the reward calculation
        # (deepcopy needed because client.S.d is mutated in place below).
        obs_pre = copy.deepcopy(client.S.d)

        # One-Step Dynamics Update #################################
        # Apply the Agent's action into torcs
        client.respond_to_server()

        # Get the response of TORCS
        client.get_servers_input()

        # Get the current full-observation from torcs
        obs = client.S.d

        # Make an observation from a raw observation vector from TORCS
        self.observation = self.make_observaton(obs)

        # Reward setting Here #######################################
        # direction-dependent positive reward: forward speed projected
        # onto the track direction.
        track = np.array(obs['track'])
        sp = np.array(obs['speedX'])
        progress = sp*np.cos(obs['angle'])
        reward = progress

        # collision detection: any new damage overrides the reward.
        if obs['damage'] - obs_pre['damage'] > 0:
            reward = -1

        # Termination judgement #########################
        # NOTE(review): episode_terminate mirrors client.R.d['meta'] but is
        # unused below — the return value relies on 'meta' alone.
        episode_terminate = False
        if track.min() < 0:  # Episode is terminated if the car is out of track
            reward = - 1
            episode_terminate = True
            client.R.d['meta'] = True

        if self.terminal_judge_start < self.time_step:  # Episode terminates if the progress of agent is small
            if progress < self.termination_limit_progress:
                episode_terminate = True
                client.R.d['meta'] = True

        if np.cos(obs['angle']) < 0:  # Episode is terminated if the agent runs backward
            episode_terminate = True
            client.R.d['meta'] = True

        if client.R.d['meta'] is True:  # Send a reset signal
            self.initial_run = False
            client.respond_to_server()

        self.time_step += 1

        return self.get_obs(), reward, client.R.d['meta'], {}

    def reset(self, relaunch=False):
        """Start a new episode and return the first observation.

        Opens a fresh UDP client to TORCS on port 3101; when ``relaunch``
        is True the whole TORCS process is restarted first.
        """
        self.time_step = 0

        # On every reset after the first, tell the server the previous
        # episode is over before reconnecting.
        if self.initial_reset is not True:
            self.client.R.d['meta'] = True
            self.client.respond_to_server()

            ## TENTATIVE. Restarting TORCS every episode suffers the memory leak bug!
            if relaunch is True:
                self.reset_torcs()
                print("### TORCS is RELAUNCHED ###")

        # Modify here if you use multiple tracks in the environment
        # Open new UDP in vtorcs
        self.client = Client(p=3101, vision=self.vision)
        self.client.MAX_STEPS = np.inf

        client = self.client
        client.get_servers_input()  # Get the initial input from torcs

        obs = client.S.d  # Get the current full-observation from torcs
        self.observation = self.make_observaton(obs)

        self.last_u = None

        self.initial_reset = False
        return self.get_obs()

    def end(self):
        """Shut down the TORCS process."""
        os.system('pkill torcs')

    def get_obs(self):
        """Return the most recently processed observation."""
        return self.observation

    def reset_torcs(self):
        """Kill and relaunch the TORCS process (same sequence as __init__)."""
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nodamage -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

    def agent_to_torcs(self, u):
        """Map the raw agent action vector to a TORCS action dict.

        u[0] is always steering; u[1] (throttle) and u[2] (gear) are read
        only when the corresponding flags are enabled.
        """
        torcs_action = {'steer': u[0]}

        if self.throttle is True:  # throttle action is enabled
            torcs_action.update({'accel': u[1]})

        if self.gear_change is True:  # gear change action is enabled
            torcs_action.update({'gear': u[2]})

        return torcs_action

    def obs_vision_to_image_rgb(self, obs_image_vec):
        """Regroup a flat 64x64x3 pixel vector into a (4096, 3) uint8 array."""
        image_vec = obs_image_vec
        rgb = []
        temp = []

        # convert size 64x64x3 = 12288 to 64x64=4096 2-D list
        # with rgb values grouped together.
        # Format similar to the observation in openai gym
        # (stop value 12286 still yields last i = 12285, covering index 12287)
        for i in range(0, 12286, 3):
            temp.append(image_vec[i])
            temp.append(image_vec[i+1])
            temp.append(image_vec[i+2])
            rgb.append(temp)
            temp = []
        return np.array(rgb, dtype=np.uint8)

    def make_observaton(self, raw_obs):
        """Build the agent-facing observation from the raw TORCS dict.

        Range sensors are rescaled by /200 and speeds by /default_speed.
        Returns a flat float32 vector when vision is off; otherwise a
        namedtuple that additionally carries the RGB image.
        """
        if self.vision is False:
            # NOTE(review): `names` is unused in this branch.
            names = ['focus',
                     'speedX', 'speedY', 'speedZ',
                     'opponents',
                     'rpm',
                     'track',
                     'wheelSpinVel']
            focus = np.array(raw_obs['focus'], dtype=np.float32)/200.  # (5, )
            speedX = np.array(raw_obs['speedX'],
                              dtype=np.float32)/self.default_speed
            speedY = np.array(raw_obs['speedY'],
                              dtype=np.float32)/self.default_speed
            speedZ = np.array(raw_obs['speedZ'],
                              dtype=np.float32)/self.default_speed
            opponents = np.array(
                raw_obs['opponents'], dtype=np.float32)/200.  # (36, )
            rpm = np.array(raw_obs['rpm'], dtype=np.float32)
            track = np.array(raw_obs['track'], dtype=np.float32)/200.  # (19, )
            wheelSpinVel = np.array(
                raw_obs['wheelSpinVel'], dtype=np.float32)  # (4, )

            observation = np.hstack(
                (focus, speedX, speedY, speedZ, opponents, rpm, track, wheelSpinVel))
            return observation
        else:
            # TODO: vision obs space
            names = ['focus',
                     'speedX', 'speedY', 'speedZ',
                     'opponents',
                     'rpm',
                     'track',
                     'wheelSpinVel',
                     'img']
            # NOTE(review): 'Observaion' typo is the runtime typename — left as-is.
            Observation = col.namedtuple('Observaion', names)

            # Get RGB from observation (names[8] == 'img')
            image_rgb = self.obs_vision_to_image_rgb(raw_obs[names[8]])

            return Observation(focus=np.array(raw_obs['focus'], dtype=np.float32)/200.,
                               speedX=np.array(
                                   raw_obs['speedX'], dtype=np.float32)/self.default_speed,
                               speedY=np.array(
                                   raw_obs['speedY'], dtype=np.float32)/self.default_speed,
                               speedZ=np.array(
                                   raw_obs['speedZ'], dtype=np.float32)/self.default_speed,
                               opponents=np.array(
                                   raw_obs['opponents'], dtype=np.float32)/200.,
                               rpm=np.array(raw_obs['rpm'], dtype=np.float32),
                               track=np.array(
                                   raw_obs['track'], dtype=np.float32)/200.,
                               wheelSpinVel=np.array(
                                   raw_obs['wheelSpinVel'], dtype=np.float32),
                               img=image_rgb)