Trace doesn't work with Python 3.13
Trace seems to hit an error in the bundle decorator when running on Python 3.13. Downgrading to 3.12 makes the error go away.
Traceback (most recent call last):
File "/Users/ryanrong/Documents/GitHub/cs224n_llm_agent/pong_LLM_agent.py", line 269, in
@Ryan-Rong-24 can you paste an example code snippet so I can reproduce the error? Thanks.
Can you also let me know what you get as _ldict when the error happens?
It seems the main cause is that, starting from Python 3.13 (see here), f_locals is no longer a plain dict.
The f_locals attribute on a frame object is an instance of a “frame-locals proxy”. The proxy object exposes a write-through view of the underlying locals dictionary for the frame. This ensures that the variables exposed by f_locals are always up to date with the live local variables in the frame itself.
Our previous code assumes it to be a dict (f_locals is used to resolve the nonlocal namespace for bundle).
For now, the quickest workaround is to use Python <3.13. We will look into this further for a patch.
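To illustrate the change independently of Trace, here is a minimal sketch of the behavior difference; the function and variable names below are only illustrative:

```python
import sys

def inspect_frame_locals():
    x = 1  # a local variable, so the frame has something in f_locals
    frame = sys._getframe()
    # Python <= 3.12: frame.f_locals is a plain dict snapshot of the locals.
    # Python >= 3.13: it is a write-through "frame-locals proxy" (PEP 667),
    # so code that assumes a real dict can break.
    print(type(frame.f_locals))
    # Materializing an actual dict works on both versions and is one way
    # such code can be made version-agnostic:
    snapshot = dict(frame.f_locals)
    return snapshot

inspect_frame_locals()
```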
Here is our code for using Trace to play Pong. I'm not sure I can show what _ldict is without diving into the Trace source... but hopefully this helps! In the meantime I'll develop on Python 3.12.
# Imports assumed from the rest of pong_LLM_agent.py (they were not part of the
# original paste); PongTracedEnv and rollout are helpers defined elsewhere in the script.
import contextlib
import datetime
import io
import logging
import os
from pathlib import Path

from autogen import config_list_from_json
from opto import trace
from opto.optimizers import OptoPrime


def optimize_policy(
    env_name="ALE/Pong-v5",
    horizon=2000,
    memory_size=5,
    n_optimization_steps=10,
    verbose=False,
    model="gpt-4o-mini"
):
    @trace.bundle(trainable=True)
    def policy(obs):
        '''
        A policy that moves the paddle towards the ball to deflect the ball.
        If the paddle is below the ball, move up; otherwise, move down.
        Make a prediction on the ball's moving direction and velocity to adjust the paddle action.

        Args:
            obs (dict): A dictionary with keys "ball_pos" and "paddle_pos" and values the
                corresponding [x, y, w, h] coordinates, width and height of the ball and
                agent paddle in the game screen of (210, 160).

        Output:
            action (int): The action to take among 0 (NOOP), 1 (FIRE), 2 (DOWN), 3 (UP).
        '''
        ball_pos = obs["ball_pos"]
        paddle_pos = obs["paddle_pos"]
        action = 0  # NOOP
        if ball_pos and paddle_pos:
            ball_y = ball_pos[1]
            paddle_y = paddle_pos[1]
            if paddle_y + 10 < ball_y:  # Paddle is below the ball, move up
                action = 3
            elif paddle_y > ball_y + 10:  # Paddle is above the ball, move down
                action = 2
        return action

    # Get the config file path from the environment variable
    config_path = os.getenv("OAI_CONFIG_LIST")
    config_list = config_list_from_json(config_path)
    config_list = [config for config in config_list if config["model"] == model]

    optimizer = OptoPrime(policy.parameters(), config_list=config_list, memory_size=memory_size)
    env = PongTracedEnv(env_name=env_name)
    try:
        rewards = []
        logger.info("Optimization Starts")
        for i in range(n_optimization_steps):
            env.init()
            traj, error = rollout(env, horizon, policy)

            if error is None:
                feedback = f"Episode ends after {traj['steps']} steps with total score: {sum(traj['rewards']):.1f}"
                if sum(traj['rewards']) > 0:
                    feedback += "\nGood job! You're scoring points against the opponent."
                elif sum(traj['rewards']) <= 0:
                    feedback += "\nTry to improve paddle positioning to prevent opponent scoring."
                target = traj['observations'][-1]
                rewards.append(sum(traj['rewards']))
            else:
                feedback = error.exception_node.create_feedback()
                target = error.exception_node

            logger.info(f"Iteration: {i}, Feedback: {feedback}, target: {target}, Parameter: {policy.parameters()}")

            instruction = "In Pong, you control the right paddle and compete against the computer on the left. "
            instruction += "The goal is to keep deflecting the ball away from your goal and into your opponent's goal to maximize your score and win the game. "
            instruction += "You score one point when the opponent misses the ball or hits it out of bounds. "
            instruction += "The policy should move the right paddle up or down or NOOP to hit the ball. "
            optimizer.objective = instruction + optimizer.default_objective

            optimizer.zero_feedback()
            optimizer.backward(target, feedback, visualize=True)
            logger.info(optimizer.problem_instance(optimizer.summarize()))

            # Capture the optimizer's stdout so the LLM response ends up in the log.
            stdout_buffer = io.StringIO()
            with contextlib.redirect_stdout(stdout_buffer):
                optimizer.step(verbose=verbose)
            llm_output = stdout_buffer.getvalue()
            if llm_output:
                logger.info(f"LLM response:\n {llm_output}")

            logger.info(f"Iteration: {i}, Feedback: {feedback}, Parameter: {policy.parameters()}")
    finally:
        if env is not None:
            env.close()

    logger.info(f"Final Average Reward: {sum(rewards) / len(rewards)}")
    return rewards
if __name__ == "__main__":
    # Set up logging
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(console_handler)

    # Set up file logging
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"pong_ai_{timestamp}.log"
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    logger.info("Starting Pong AI training...")
    rewards = optimize_policy(
        env_name="ALE/Pong-v5",
        horizon=800,
        n_optimization_steps=5,
        memory_size=5,
        verbose='output',
        model="gpt-4o-mini"
    )
    logger.info("Training completed.")