stable-baselines
"Getting Mean Reward in CustomCallBack" Unsupported operand type(s) for /: 'str' and 'int'
Describe the bug
In the CustomCallback, computing the mean reward raises a NumPy error:
TypeError: unsupported operand type(s) for /: 'str' and 'int'
The values are:
x, y = ts2xy(load_results(self.log_dir), 'timesteps')
x: [1467]
y: ['56.9170050.0']
self.log_dir    level_0  level_1             r     l           t
0                  40.0     1774  56.9170050.0  1467  102.009927
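The TypeError itself is easy to reproduce in isolation: once the reward column contains a string, NumPy reduces over the values and then fails when dividing by the element count. A minimal sketch, assuming the corrupted reward is parsed as a plain Python string in an object array:

import numpy as np

# y as printed above: an object array holding a string instead of a float
y = np.array(['56.9170050.0'], dtype=object)
np.mean(y[-100:])  # TypeError: unsupported operand type(s) for /: 'str' and 'int'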
Error Reproduction:
- Install Anaconda
- Create an environment for Stable Baselines
- Install TensorFlow 1.14.0
- Install Stable Baselines
- Install Gym Retro
- Run the code below:
Code
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 8 16:13:37 2020
@author: MasterTrader
"""
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import gym
import retro
from stable_baselines import PPO2, A2C, ACKTR
# from ppo2 import PPO2
import numpy as np
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines.common import make_vec_env
import cv2
from stable_baselines.common.callbacks import CheckpointCallback
# from stable_baselines.common.callbacks import CheckpointCallback
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.base_class import BaseRLModel, Union, VecEnv
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.bench import Monitor
from stable_baselines.common.noise import AdaptiveParamNoiseSpec
import os
class Discretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(Discretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        # actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
        #            ['DOWN', 'B'], ['B']]
        actions = [['LEFT'], ['RIGHT'], ['B']]
        self._actions = []
        """
        What we do in this loop:
        For each action in actions:
        - Create an array of 12 False values (12 = number of buttons)
        - For each button in the action (for instance ['LEFT']), set that button's index to True
        At the end we have a list where each entry is an action and each element that is True
        marks a button being pressed.
        """
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a):  # pylint: disable=W0221
        return self._actions[a].copy()


class PreprocessFrame(gym.ObservationWrapper):
    """
    Here we do the preprocessing part:
    - Set frame to gray
    - Resize the frame to 96x96x1
    """
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        self.width = 96
        self.height = 96
        self.observation_space = gym.spaces.Box(low=0, high=255,
                                                shape=(self.height, self.width, 1), dtype=np.uint8)

    def observation(self, frame):
        # Set frame to gray
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        # Resize the frame to 96x96x1
        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
        frame = frame[:, :, None]
        return frame
class CustomCallback(BaseCallback):
    """
    A custom callback that derives from ``BaseCallback``.

    :param verbose: (int) Verbosity level 0: no output 1: info 2: debug
    """
    def __init__(self, model, check_freq: int, log_dir: str, verbose=1):
        super(CustomCallback, self).__init__(verbose)
        # Those variables will be accessible in the callback
        # (they are defined in the base class)
        # The RL model
        self.log_dir = log_dir
        self.check_freq = check_freq
        self.model = None  # type: BaseRLModel
        self.best_mean_reward = -np.inf
        # An alias for self.model.get_env(), the environment used for training
        # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of times the callback was called
        # self.n_calls = 0  # type: int
        # self.num_timesteps = 0  # type: int
        # local and global variables
        # self.locals = None  # type: Dict[str, Any]
        # self.globals = None  # type: Dict[str, Any]
        # The logger object, used to report things in the terminal
        # self.logger = None  # type: logger.Logger
        # Sometimes, for event callback, it is useful
        # to have access to the parent object
        # self.parent = None  # type: Optional[BaseCallback]

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        # print("on training start")
        self.training_env.reset()

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interactions
        using the current policy.
        This event is triggered before collecting new samples.
        """
        # print("on rollout start")
        pass

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.
        For child callbacks (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        mean_reward = 0
        if self.n_calls % self.check_freq == 0:
            # if self.num_timesteps == self.check_freq - (.80 * self.check_freq):
            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            print("x: ", x)
            print("y: ", y)
            print("self.log_dir", load_results(self.log_dir))
            if len(x) > 0:
                print("x: ", x)
                print("y: ", y)
                mean_reward = np.mean(y[-100:])
                # mean_reward = y
                # print("on step ")
                # self.training_env.render(mode='human')
                # print("Num timesteps: {}".format(self.num_timesteps))
                # print('\r' + "Num timesteps: {}".format(self.num_timesteps), ' Mean reward: ', mean_reward, end='')
        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        # print("on rollout end")
        # self.training_env.close()
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        # print("on training end")
        pass
# @profile
def main():
    # Suppress warnings!
    try:
        from tensorflow.python.util import module_wrapper as deprecation
    except ImportError:
        from tensorflow.python.util import deprecation_wrapper as deprecation
    deprecation._PER_MODULE_WARNING_LIMIT = 0
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Create log dir
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    print("Setting Environment")
    n_cpu = 16
    env = SubprocVecEnv([lambda: Monitor(PreprocessFrame(Discretizer(retro.make(game='Airstriker-Genesis'))), log_dir)
                         for i in range(n_cpu)])
    # env = DummyVecEnv([lambda: Monitor(PreprocessFrame(Discretizer(retro.make(game='Airstriker-Genesis'))), log_dir)])

    print("Setting up Model")
    # Add some param noise for exploration
    # param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
    # Because we use parameter noise, we should use a MlpPolicy with layer normalization
    # MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
    # model = ACKTR(MlpPolicy, env, verbose=1)
    # model = ACKTR(MlpLstmPolicy, env, verbose=1)
    model = PPO2(CnnPolicy, env, verbose=1)
    # model = ACKTR(CnnLnLstmPolicy, env, verbose=1)  # 387
    # model = ACKTR(CnnPolicy, env, verbose=1)
    # model = ACKTR(CnnLstmPolicy, env, verbose=1)  # 387
    print("Setting up Model done:")

    # from stable_baselines.common.callbacks import CheckpointCallback
    # Save a checkpoint every 1000 steps
    # checkpoint_callback = CheckpointCallback(save_freq=100, save_path='./logs/',
    #                                          name_prefix='ppo_savepoint')

    print("Learning time!")
    print("model", model)
    print("Setting up callbacks")
    total_timesteps = 1000000
    # model.learn(total_timesteps=5000, callback=checkpoint_callback)
    # callback = CustomCallback(model=model, training_env=env, check_freq=1000, log_dir=log_dir)
    callback = CustomCallback(model=model, check_freq=100, log_dir=log_dir)
    model.learn(total_timesteps=total_timesteps, callback=callback)
    print("done learning.")
    print("Saving model.")
    model.save("PPO2")
    print("done")

    # del model  # remove to demonstrate saving and loading
    # print("loading model")
    # model = ACKTR.load("ACKTR")
    # model = PPO2.load("PPO2")
    # print("loading done")

    print("Rendering Time!")
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        # print(action)
        obs, reward, dones, info = env.step(action)
        env.render(mode='human')


if __name__ == "__main__":
    main()
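The corrupted reward string '56.9170050.0' looks like two rows merged together, which is what you would expect if all 16 SubprocVecEnv workers append to the same tmp/monitor.csv concurrently. A minimal sketch of one possible workaround, assuming that diagnosis: give each worker its own monitor file. load_results(log_dir) still aggregates every *.monitor.csv in the directory, so ts2xy keeps working.

def make_env(rank, log_dir):
    # Each worker gets its own file, e.g. tmp/0.monitor.csv, tmp/1.monitor.csv, ...
    def _init():
        env = retro.make(game='Airstriker-Genesis')
        env = PreprocessFrame(Discretizer(env))
        return Monitor(env, os.path.join(log_dir, str(rank)))
    return _init

env = SubprocVecEnv([make_env(i, log_dir) for i in range(n_cpu)])

As an extra guard inside the callback, the reward values can be coerced to floats before averaging, e.g. mean_reward = pd.to_numeric(pd.Series(y[-100:]), errors='coerce').mean() (requires import pandas as pd), so a single bad row becomes NaN instead of crashing training.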
Logs
-------------------------------------
| approxkl | 0.0021870576 |
| clipfrac | 0.03112793 |
| ep_len_mean | 1.53e+03 |
| ep_reward_mean | 25 |
| explained_variance | 0.559 |
| fps | 496 |
| n_updates | 24 |
| policy_entropy | 1.0432243 |
| policy_loss | -0.006628548 |
| serial_timesteps | 3072 |
| time_elapsed | 96.9 |
| total_timesteps | 49152 |
| value_loss | 0.0033252328 |
-------------------------------------
x: [1467]
y: ['56.9170050.0']
self.log_dir level_0 level_1 r l t
0 40.0 1774 56.9170050.0 1467 102.009927
x: [1467]
y: ['56.9170050.0']
Traceback (most recent call last):
File "ppo2retro.py", line 302, in <module>
main()
File "ppo2retro.py", line 272, in main
model.learn(total_timesteps=total_timesteps, callback=callback)
File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 336, in learn
rollout = self.runner.run(callback)
File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\common\runners.py", line 48, in run
return self._run()
File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 488, in _run
if self.callback.on_step() is False:
File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\common\callbacks.py", line 89, in on_step
return self._on_step()
File "ppo2retro.py", line 175, in _on_step
mean_reward = np.mean(y[-100:])
File "<__array_function__ internals>", line 6, in mean
File "C:\anaconda\envs\StableBaselines\lib\site-packages\numpy\core\fromnumeric.py", line 3335, in mean
out=out, **kwargs)
File "C:\anaconda\envs\StableBaselines\lib\site-packages\numpy\core\_methods.py", line 163, in _mean
ret = ret / rcount
TypeError: unsupported operand type(s) for /: 'str' and 'int'
(StableBaselines) E:\ML\reinforcementlearning\BASELINES>
System Info
Describe the characteristics of your environment:
- How the library was installed (pip, docker, source, ...): pip; TensorFlow installed via Anaconda Navigator
- GPU models and configuration: CPU only (Ryzen 2700X)
- Python version: 3.7.6, from Anaconda Navigator (1.9.12)
- TensorFlow version: 1.14.0
- Versions of any other relevant libraries: see attached conda list (codnalist.txt)
Hey, have you found a solution for this? I have a similar issue. Thanks :)
Sadly no... I changed it back to the default code.