
"Getting Mean Reward in CustomCallBack" Unsupported operand type(s) for /: 'str' and 'int'

Open · toksis opened this issue 4 years ago • 2 comments

Describe the bug: In the CustomCallback, computing the mean reward raises a NumPy error:

TypeError: unsupported operand type(s) for /: 'str' and 'int'

The values are:

 x, y = ts2xy(load_results(self.log_dir), 'timesteps')

x:  [1467]
y:  ['56.9170050.0']
self.log_dir    level_0  level_1             r     l           t
0     40.0     1774  56.9170050.0  1467  102.009927
x:  [1467]
y:  ['56.9170050.0']
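
The reward column comes back as text rather than a number (note the value '56.9170050.0', which is not a valid float), so NumPy ends up "summing" the strings and then trying to divide a string by the element count. A minimal sketch of the same failure outside stable-baselines, where the dtype=object array stands in for what ts2xy returns when a monitor CSV row cannot be parsed as a number:

import numpy as np

# rewards read back as strings, as in the printout above
y = np.array(['56.9170050.0'], dtype=object)
np.mean(y[-100:])
# TypeError: unsupported operand type(s) for /: 'str' and 'int'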

Error Reproduction:

  • Install Anaconda
    
Create a conda environment for Stable Baselines
    
  • Install TensorFlow 1.14.0
    
  • Install Stable Baselines
    
  • Install Gym Retro
    
  • Run this code:
    

Code

# -*- coding: utf-8 -*-
"""
Created on Sun Mar  8 16:13:37 2020

@author: MasterTrader
"""

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

import gym
import retro
from stable_baselines import PPO2, A2C, ACKTR
# from ppo2 import PPO2
import numpy as np
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines.common import make_vec_env
import cv2

from stable_baselines.common.callbacks import CheckpointCallback
# from stable_baselines.common.callbacks import CheckpointCallback
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.base_class import BaseRLModel, Union, VecEnv
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.bench import Monitor
from stable_baselines.common.noise import AdaptiveParamNoiseSpec
import os

class Discretizer(gym.ActionWrapper):
    """
    Wrap a gym-retro environment and make it use discrete
    actions for the Sonic game.
    """
    def __init__(self, env):
        super(Discretizer, self).__init__(env)
        buttons = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
        # actions = [['LEFT'], ['RIGHT'], ['LEFT', 'DOWN'], ['RIGHT', 'DOWN'], ['DOWN'],
        #            ['DOWN', 'B'], ['B']]
        
        actions = [['LEFT'], ['RIGHT'], ['B']]
                   
        
        self._actions = []

        """
        What we do in this loop:
        For each action in actions
            - Create an array of 12 False (12 = nb of buttons)
            For each button in action: (for instance ['LEFT']) we need to make that left button index = True
                - Then the button index = LEFT = True
            In fact at the end we will have an array where each array is an action and each elements True of this array
            are the buttons clicked.
        """
        for action in actions:
            arr = np.array([False] * 12)
            for button in action:
                arr[buttons.index(button)] = True
            self._actions.append(arr)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, a): # pylint: disable=W0221
        return self._actions[a].copy()




class PreprocessFrame(gym.ObservationWrapper):
    """
    Here we do the preprocessing part:
    - Set frame to gray
    - Resize the frame to 96x96x1
    """
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        self.width = 96
        self.height = 96
        self.observation_space = gym.spaces.Box(low=0, high=255,
            shape=(self.height, self.width, 1), dtype=np.uint8)

    def observation(self, frame):
        # Set frame to gray
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

        # Resize the frame to 96x96x1
        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
        frame = frame[:, :, None]

        return frame


class CustomCallback(BaseCallback):
    """
    A custom callback that derives from ``BaseCallback``.

    :param verbose: (int) Verbosity level: 0 = no output, 1 = info, 2 = debug
    """
    def __init__(self, model, check_freq: int, log_dir: str, verbose=1):
        super(CustomCallback, self).__init__(verbose)
        # Those variables will be accessible in the callback
        # (they are defined in the base class)
        self.log_dir = log_dir
        self.check_freq = check_freq
        # The RL model: left as None here; BaseCallback.init_callback() fills in
        # self.model when learn() starts, so the `model` argument above is unused.
        self.model = None  # type: BaseRLModel
        self.best_mean_reward = -np.inf
        # An alias for self.model.get_env(), the environment used for training
        # self.training_env = None  # type: Union[gym.Env, VecEnv, None]
        # Number of time the callback was called
        # self.n_calls = 0  # type: int
        # self.num_timesteps = 0  # type: int
        # local and global variables
        # self.locals = None  # type: Dfromict[str, Any]
        # self.globals = None  # type: Dict[str, Any]
        # The logger object, used to report things in the terminal
        # self.logger = None  # type: logger.Logger
        # # Sometimes, for event callback, it is useful
        # # to have access to the parent object
        # self.parent = None  # type: Optional[BaseCallback]
        

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        # print("on training start")
        self.training_env.reset()
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        # print("on rollout start")
       
        pass

    def _on_step(self) -> bool:
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """
        mean_reward = 0
        victor = False
        if self.n_calls % self.check_freq == 0:
        # if self.num_timesteps == self.check_freq-(.80*(self.check_freq)):
              # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            print("x: ", x)
            print("y: ", y)
            print("self.log_dir",load_results(self.log_dir))    
            
            if len(x) > 0:
                print("x: ", x)
                print("y: ", y)
                
                mean_reward = np.mean(y[-100:])
                
                  # mean_reward = y
        
        
                
        
        # # print("on step ")
        # # self.training_env.render(mode='human')
        # print("Num timesteps: {}".format(self.num_timesteps))
        
        # print('\r' + "Numtimesteps: {}".format(self.num_timesteps),' Mean reward: ',mean_reward, end='')
        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        # print("on rollout end ")
        # self.training_env.close()
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        # print("on training end ")
        
        pass
# @profile
def main():
        # Suppress warnings!
    try:
        from tensorflow.python.util import module_wrapper as deprecation
    except ImportError:
        from tensorflow.python.util import deprecation_wrapper as deprecation
    deprecation._PER_MODULE_WARNING_LIMIT = 0
    
    import tensorflow as tf
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
    

    
    
    # Create log dir
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)
    
        
    
    
    print("Setting Environment")
    
    
    
    
    
    n_cpu = 16
    env = SubprocVecEnv([lambda:Monitor(PreprocessFrame(Discretizer(retro.make(game='Airstriker-Genesis'))),log_dir) for i in range(n_cpu)]) 
    # env = DummyVecEnv([lambda:Monitor(PreprocessFrame(Discretizer(retro.make(game='Airstriker-Genesis'))),log_dir)])
    
    print("Setting up Model")


    # Add some param noise for exploration
    # param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
    # Because we use parameter noise, we should use a MlpPolicy with layer normalization



    # MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnLnLstmPolicy, CnnPolicy, CnnLstmPolicy
    # model = ACKTR(MlpPolicy,env,verbose=1)
    # model = ACKTR(MlpLstmPolicy,env,verbose=1)
    model = PPO2(CnnPolicy,env,verbose=1)
    # model = ACKTR(CnnLnLstmPolicy,env,verbose=1) #387
    # model = ACKTR(CnnPolicy,env,verbose=1)
    # model = ACKTR(CnnLstmPolicy,env,verbose=1) #387
    
    print("Setting up Model done:")
    
    
    
    # from stable_baselines.common.callbacks import CheckpointCallback
    # Save a checkpoint every 1000 steps
    # checkpoint_callback = CheckpointCallback(save_freq=100, save_path='./logs/',
    #                                          name_prefix='ppo_savepoint')
    print("Learning time!")
    print("model",model)
    print("Setting up callbacks")
    
    
    
    
    total_timesteps = 1000000
    # model.learn(total_timesteps=5000, callback=checkpoint_callback)
    # callback = CustomCallback(model = model,training_env = env, check_freq=1000,log_dir=log_dir)
    callback = CustomCallback(model = model,check_freq=100,log_dir=log_dir)
    model.learn(total_timesteps=total_timesteps, callback=callback)
    
    print("done learning.")
    
    
    print("Saving model.")
    model.save("PPO2")
    print("done")
    # del model # remove to demonstrate saving and loading
    
    
    # print("loading model")
    # model = ACKTR.load("ACKTR")
    # model = PPO2.load("PPO2")
    # print("loading done")
    
    
    
    print("Rendering Time!")
    obs = env.reset()
    
    while True:
        action,_states = model.predict(obs)
        # print(action)
        obs, reward,dones,info = env.step(action)
       
        env.render(mode='human')


if __name__ == "__main__":
    main()    
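
A note on one possible cause (an assumption, not confirmed in this thread): all 16 subprocess environments above wrap their env with Monitor(..., log_dir), so every worker ends up writing to the same monitor CSV in tmp/, and concurrent writes can interleave rows; '56.9170050.0' looks like two reward values fused together. A sketch of giving each worker its own monitor file instead, assuming the standard stable-baselines Monitor/load_results behavior (load_results still aggregates every *.monitor.csv in the folder):

# Sketch only: one Monitor file per subprocess,
# e.g. tmp/0.monitor.csv, tmp/1.monitor.csv, ...
def make_env(rank, log_dir):
    def _init():
        env = retro.make(game='Airstriker-Genesis')
        env = Discretizer(env)
        env = PreprocessFrame(env)
        return Monitor(env, os.path.join(log_dir, str(rank)))
    return _init

env = SubprocVecEnv([make_env(i, log_dir) for i in range(n_cpu)])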

Logs

-------------------------------------
| approxkl           | 0.0021870576 |
| clipfrac           | 0.03112793   |
| ep_len_mean        | 1.53e+03     |
| ep_reward_mean     | 25           |
| explained_variance | 0.559        |
| fps                | 496          |
| n_updates          | 24           |
| policy_entropy     | 1.0432243    |
| policy_loss        | -0.006628548 |
| serial_timesteps   | 3072         |
| time_elapsed       | 96.9         |
| total_timesteps    | 49152        |
| value_loss         | 0.0033252328 |
-------------------------------------
x:  [1467]
y:  ['56.9170050.0']
self.log_dir    level_0  level_1             r     l           t
0     40.0     1774  56.9170050.0  1467  102.009927
x:  [1467]
y:  ['56.9170050.0']
Traceback (most recent call last):
  File "ppo2retro.py", line 302, in <module>
    main()
  File "ppo2retro.py", line 272, in main
    model.learn(total_timesteps=total_timesteps, callback=callback)
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 336, in learn
    rollout = self.runner.run(callback)
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\common\runners.py", line 48, in run
    return self._run()
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 488, in _run
    if self.callback.on_step() is False:
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\stable_baselines\common\callbacks.py", line 89, in on_step
    return self._on_step()
  File "ppo2retro.py", line 175, in _on_step
    mean_reward = np.mean(y[-100:])
  File "<__array_function__ internals>", line 6, in mean
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\numpy\core\fromnumeric.py", line 3335, in mean
    out=out, **kwargs)
  File "C:\anaconda\envs\StableBaselines\lib\site-packages\numpy\core\_methods.py", line 163, in _mean
    ret = ret / rcount
TypeError: unsupported operand type(s) for /: 'str' and 'int'

(StableBaselines) E:\ML\reinforcementlearning\BASELINES>
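
Independent of the root cause, the callback could guard against non-numeric rows before averaging. A hedged sketch of such a check inside CustomCallback._on_step() (it uses pandas, which stable-baselines already depends on; this is a workaround idea, not the library's documented fix):

import pandas as pd

# inside _on_step(), after: x, y = ts2xy(load_results(self.log_dir), 'timesteps')
if len(x) > 0:
    # coerce the last 100 rewards to float; rows that fail to parse become NaN and are dropped
    rewards = pd.to_numeric(pd.Series(y[-100:]), errors='coerce').dropna()
    if len(rewards) > 0:
        mean_reward = rewards.mean()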

System Info: Describe the characteristics of your environment:

  • Describe how the library was installed (pip, docker, source, ...): pip. TensorFlow was installed via Anaconda Navigator.
  • GPU models and configuration: CPU only (AMD Ryzen 2700X)
  • Python version: 3.7.6 from Anaconda Navigator (1.9.12)
  • Tensorflow version: 1.14.0
  • Versions of any other relevant libraries: see the attached conda list (codnalist.txt)

toksis · Mar 11 '20 09:03

Hey, have you found a solution for this? I have a similar issue. Thanks :)

mohammad200h · Sep 07 '20 01:09

Sadly, no. I changed it back to the default code.

toksis · Sep 07 '20 07:09