Directly get success rate from `run_maniskill2_eval_single_episode`
Currently, `simplerenv/tools/calc_metrics_evaluation_videos.py` requires keeping all the scene videos in order to compute the sim-variant average success.
Maybe we could get the success rate directly from `run_maniskill2_eval_single_episode` (in `simplerenv/simpler_env/evaluation/maniskill2_evaluator.py`), log it to files, and count from there.
If the authors think this is doable, I can open a PR for it.
The difficult part is sorting the results by name; making it generalize would take time.
`simplerenv/simpler_env/main_inference.py` already has `print(" " * 10, "Average success", np.mean(success_arr))`, which we can make full use of.
I believe this could speed up the evaluation by roughly 10%; right now the Google Robot evaluation takes me 10-16 hours.
Reusing the same environment could save another 20-30%.
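As a rough sketch of what I mean (the helper names and the JSONL format below are only an illustration, not the existing API), the evaluator could append one record per episode and a small aggregator could compute the averages afterwards:

import json
from collections import defaultdict

def log_episode_result(log_path, env_name, scene_name, success):
    # Hypothetical helper: call once per episode from
    # run_maniskill2_eval_single_episode (not part of the current API).
    record = {"env": env_name, "scene": scene_name, "success": bool(success)}
    with open(log_path, "a") as f:
        f.write(json.dumps(record) + "\n")

def summarize(log_path):
    # Aggregate the JSONL records into a success rate per (env, scene) pair.
    totals = defaultdict(lambda: [0, 0])  # (env, scene) -> [n_success, n_episodes]
    with open(log_path) as f:
        for line in f:
            rec = json.loads(line)
            key = (rec["env"], rec["scene"])
            totals[key][0] += int(rec["success"])
            totals[key][1] += 1
    return {key: n_success / n_total for key, (n_success, n_total) in totals.items()}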
Alternatively, maybe just grep the log file:
import sys
from collections import deque

def calculate_success_rates(log_content):
    stats = {}
    lines = log_content.strip().split('\n')
    i = 0
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i + 1].strip()
        # Check if lines match the expected pattern
        if ':' in line1 and 'Average success' in line2:
            try:
                # Extract operation name (after first ':')
                op_name = line1.split(':', 1)[1].strip()
                # Extract success rate (between 'Average success' and '<<<')
                rate_str = line2.split('Average success')[1].split('<<<')[0].strip()
                rate = float(rate_str)
                # Store the rate
                if op_name not in stats:
                    stats[op_name] = []
                stats[op_name].append(rate)
                # Skip the next line since we've processed it as the success rate
                i += 1
            except (ValueError, IndexError):
                # Ignore pairs that don't parse correctly (e.g., non-float rate)
                pass
        i += 1  # Move to the next potential operation line
    # Calculate averages
    averages = {}
    for op, rates in stats.items():
        averages[op] = sum(rates) / len(rates) if rates else 0.0  # Avoid division by zero
    return averages

def grep():
    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        return 1
    file_path = sys.argv[1]
    pattern = " Average success"  # Leading whitespace followed by "Average success"
    log_file = "grep.log"  # Output log file
    try:
        # Use a deque to keep track of recent lines
        buffer = deque(maxlen=5)  # Keep last 5 lines to handle both cases
        # Open the output file
        with open(log_file, 'w') as out_file:
            # Open the input file
            with open(file_path, 'r') as in_file:
                line_number = 0
                for line in in_file:
                    line_number += 1
                    buffer.append((line_number, line.rstrip()))
                    # Check if the current line matches the pattern
                    if pattern in line:
                        # Check if the buffer contains "Saving video to"
                        has_saving_video = any("Saving video to" in item[1] for item in buffer)
                        if has_saving_video:
                            # Case 1: Has "Saving video to" -- print the oldest buffered line
                            if len(buffer) >= 5:
                                first_line = buffer[0]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        else:
                            # Case 2: No "Saving video to" -- print the second buffered line
                            if len(buffer) >= 4:
                                first_line = buffer[1]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        # Print current matching line in both cases
                        out_file.write(f"{line_number}: {line.rstrip()} <<< MATCH\n")
                        out_file.write("\n")  # Add an empty line for better readability
        print(f"Results written to {log_file}")
        return 0
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return 1
    except Exception as e:
        print(f"Error processing file: {e}")
        return 1

if __name__ == "__main__":
    grep()
    with open('grep.log', 'r') as f:
        content = f.read()
    # Calculate results
    results = calculate_success_rates(content)
    # Print results
    if results:
        print("Average Success Rates by Drawer Operation:")
        for operation, avg_rate in results.items():
            print(f"- {operation}: {avg_rate:.2f}")  # Format to 2 decimal places
    else:
        print("No valid drawer operation data found in grep.log.")
But it cannot distinguish visual matching from variant aggregation. That should be possible, though, since `simplerenv/simpler_env/main_inference.py` has enough information about each scene and env.
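As one simple workaround, the per-task log filenames already contain either visual_matching or variant_agg (they are named after the task scripts), so the parsed averages could at least be split by filename. A sketch under that naming assumption:

def split_by_eval_type(results_by_log):
    # results_by_log: dict mapping a log filename to its average success rate.
    # Bucket by the evaluation type embedded in the filename (assumed convention).
    buckets = {"visual_matching": {}, "variant_agg": {}, "other": {}}
    for name, rate in results_by_log.items():
        if "visual_matching" in name:
            buckets["visual_matching"][name] = rate
        elif "variant_agg" in name:
            buckets["variant_agg"][name] = rate
        else:
            buckets["other"][name] = rate
    return buckets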
Anyway, saving videos shouldn't take long compared to evaluating the models, so you can write your own script to handle it.
Below is a Python script that uses multiple GPUs to evaluate multiple checkpoints in parallel:
import os
import subprocess
import time
import argparse
from pathlib import Path
from multiprocessing import Process
from datetime import datetime

action_ensemble_temp = -0.8

tasks = [
    "pick_coke_can_visual_matching.sh",
    "pick_coke_can_variant_agg.sh",
    "move_near_variant_agg.sh",
    "move_near_visual_matching.sh",
    "drawer_visual_matching.sh",
    "drawer_variant_agg.sh",
    # "bridge.sh",
    # "put_in_drawer_variant_agg.sh",
    # "put_in_drawer_visual_matching.sh",
]

def run_evaluation(ckpt_path, device):
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logging_dir = f"results/{Path(ckpt_path).name}{action_ensemble_temp}"
    os.makedirs(logging_dir, exist_ok=True)
    for task in tasks:
        print(f"🚀 running {task} on GPU {device} for {Path(ckpt_path).name} ...")
        task_log_file = os.path.join(logging_dir, f"{Path(ckpt_path).name}--{task}.log")
        with open(task_log_file, "w") as fout, open(task_log_file + ".err", "w") as ferr:
            cmd = [
                "bash",
                f"scripts/{task}",
                ckpt_path,
                model_name,
                str(action_ensemble_temp),
                logging_dir,
                str(device),
            ]
            subprocess.run(cmd, stdout=fout, stderr=ferr)
        print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🚀 all tasks DONE for {Path(ckpt_path).name} on GPU {device}! Calculating metrics...")
    with open(f"{logging_dir}/total.metrics", "a") as f:
        subprocess.run(
            ["python", "tools/calc_metrics_evaluation_videos.py", "--log-dir-root", logging_dir],
            stdout=f,
        )
    print("🚀 Calculate metrics... DONE")
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def launch_eval_in_process(ckpt_path, device):
    p = Process(target=run_evaluation, args=(ckpt_path, device))
    p.start()
    return p

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run evaluation on multiple checkpoints.")
    parser.add_argument("ckpts", nargs="+", help="Paths to checkpoint files")
    parser.add_argument("--model-name", type=str, required=True, help="Model name passed through to the task scripts")
    parser.add_argument("--devices", type=int, nargs="+", help="GPU device IDs to use (default: 0, 1, 2, ...)")
    parser.add_argument("--action-ensemble-temp", type=float, default=-0.8, help="Action ensemble temperature (default: -0.8)")
    args = parser.parse_args()
    # Update global variables based on args
    model_name = args.model_name
    action_ensemble_temp = args.action_ensemble_temp
    # Assign default devices if not specified
    if args.devices is None:
        args.devices = list(range(len(args.ckpts)))
    # Ensure we have enough devices for the checkpoints
    if len(args.devices) < len(args.ckpts):
        print(f"Warning: {len(args.ckpts)} checkpoints but only {len(args.devices)} devices specified.")
        print(f"Will only evaluate the first {len(args.devices)} checkpoints.")
        args.ckpts = args.ckpts[:len(args.devices)]
    # Launch processes
    processes = []
    for i, (ckpt, device) in enumerate(zip(args.ckpts, args.devices)):
        print(f"Starting evaluation for checkpoint {i+1} on GPU {device}")
        p = launch_eval_in_process(ckpt, device)
        processes.append(p)
    # Wait for all evaluations to finish
    for p in processes:
        p.join()
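Assuming it is saved as run_multi_ckpt_eval.py (the name is arbitrary) and run from the SimplerEnv repo root so that scripts/ and tools/ resolve, an invocation would look like:

python run_multi_ckpt_eval.py /path/to/ckpt_a /path/to/ckpt_b --model-name <model_name> --devices 0 1 --action-ensemble-temp -0.8

Each checkpoint then gets its own results/<ckpt_name><temp> directory with per-task logs and an appended total.metrics file.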
And a script to collect all the results:
import os
import subprocess
from pathlib import Path
from multiprocessing import Process
from datetime import datetime

def run_evaluation_on_dir(log_dir):
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🚀 Running metric calculation for: {log_dir}")
    with open(f"{log_dir}/total.metrics", "a") as f:
        subprocess.run(
            ["python", "tools/calc_metrics_evaluation_videos.py", "--log-dir-root", str(log_dir)],
            stdout=f,
        )
    print(f"✅ DONE: {log_dir}")
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def launch_eval_in_process(log_dir):
    p = Process(target=run_evaluation_on_dir, args=(log_dir,))
    p.start()
    return p

if __name__ == "__main__":
    results_root = Path("results")
    all_subdirs = [d for d in results_root.iterdir() if d.is_dir()]
    print(f"Found {len(all_subdirs)} result directories.")
    processes = []
    for log_dir in all_subdirs:
        print(f"Starting evaluation for {log_dir}")
        p = launch_eval_in_process(log_dir)
        processes.append(p)
And a script to grep the bridge results and split each results.csv into bridge.csv and frac.csv:
import sys
from collections import deque
import os
import pandas as pd
import argparse
import glob

def calculate_success_rates(log_content):
    stats = {}
    lines = log_content.strip().split('\n')
    i = 0
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i + 1].strip()
        # Check if lines match the expected pattern
        if ':' in line1 and 'Average success' in line2:
            try:
                # Extract operation name (after first ':')
                op_name = line1.split(':', 1)[1].strip()
                # Extract success rate (between 'Average success' and '<<<')
                rate_str = line2.split('Average success')[1].split('<<<')[0].strip()
                rate = float(rate_str)
                # Store the rate
                if op_name not in stats:
                    stats[op_name] = []
                stats[op_name].append(rate)
                # Skip the next line since we've processed it as the success rate
                i += 1
            except (ValueError, IndexError):
                # Ignore pairs that don't parse correctly (e.g., non-float rate)
                pass
        i += 1  # Move to the next potential operation line
    # Calculate averages
    averages = {}
    for op, rates in stats.items():
        averages[op] = sum(rates) / len(rates) if rates else 0.0  # Avoid division by zero
    return averages

def grep():
    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        return 1
    file_path = sys.argv[1]
    pattern = " Average success"  # Leading whitespace followed by "Average success"
    log_file = "grep.log"  # Output log file
    try:
        # Use a deque to keep track of recent lines
        buffer = deque(maxlen=5)  # Keep last 5 lines to handle both cases
        # Open the output file
        with open(log_file, 'w') as out_file:
            # Open the input file
            with open(file_path, 'r') as in_file:
                line_number = 0
                for line in in_file:
                    line_number += 1
                    buffer.append((line_number, line.rstrip()))
                    # Check if the current line matches the pattern
                    if pattern in line:
                        # Check if the buffer contains "Saving video to"
                        has_saving_video = any("Saving video to" in item[1] for item in buffer)
                        if has_saving_video:
                            # Case 1: Has "Saving video to" -- print the oldest buffered line
                            if len(buffer) >= 5:
                                first_line = buffer[0]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        else:
                            # Case 2: No "Saving video to" -- print the second buffered line
                            if len(buffer) >= 4:
                                first_line = buffer[1]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        # Print current matching line in both cases
                        out_file.write(f"{line_number}: {line.rstrip()} <<< MATCH\n")
                        out_file.write("\n")  # Add an empty line for better readability
        print(f"Results written to {log_file}")
        return 0
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return 1
    except Exception as e:
        print(f"Error processing file: {e}")
        return 1

def process_csv_file(input_file, bridge_file_name="bridge.csv", frac_file_name="frac.csv"):
    """
    Reads a CSV file from the specified path and processes it to create two outputs:
    1. Columns 16 onwards, excluding the last 3 columns (bridge.csv)
    2. Selected columns from the first 16 columns, in a specific order (frac.csv):
       - First three columns: df_frac[0, 9, 10]
       - Last three columns: df_frac[1, 8, 11]

    Args:
        input_file (str): Path to the input CSV file
        bridge_file_name (str): Name of the output CSV for columns 16+ (minus last 3)
        frac_file_name (str): Name of the output CSV for selected columns in specific order

    Returns:
        tuple: (df_frac_combined, df_bridge) The processed DataFrames written to output files
    """
    try:
        # Check if input file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file not found at {input_file}")
            return None, None
        # Get the directory path from the input file
        input_dir = os.path.dirname(input_file)
        # Create the output file paths
        bridge_file = os.path.join(input_dir, bridge_file_name)
        frac_file = os.path.join(input_dir, frac_file_name)
        # Read the CSV file
        df = pd.read_csv(input_file)
        print(f"Successfully read CSV file from {input_file}")
        print(f"Original shape: {df.shape} (rows, columns)")
        # Make sure we have enough columns
        if df.shape[1] <= 16:
            print("Warning: The DataFrame has 16 or fewer columns. Cannot split as requested.")
            return None, None
        # Get the first 16 columns
        df_frac = df.iloc[:, :16]
        # Select columns in the specified order
        # First three columns: [0, 9, 10], Last three columns: [1, 8, 11]
        df_frac_combined = df_frac.iloc[:, [0, 9, 10, 1, 8, 11]]
        print(f"Combined frac data shape: {df_frac_combined.shape}")
        # Get bridge data (columns 16 onwards, excluding last 3)
        df_bridge = df.iloc[:, 16:]
        if df_bridge.shape[1] > 3:
            df_bridge = df_bridge.iloc[:, :-3]  # Exclude the last 3 columns
        print(f"Bridge data shape: {df_bridge.shape}")
        # Write the processed DataFrames to the new CSV files
        df_frac_combined.to_csv(frac_file, index=False)
        print(f"Successfully created combined frac CSV file at {frac_file}")
        df_bridge.to_csv(bridge_file, index=False)
        print(f"Successfully created bridge CSV file at {bridge_file}")
        # Display the processed data previews
        print("\nCombined frac data preview:")
        print(df_frac_combined.head())
        print("\nBridge data preview:")
        print(df_bridge.head())
        return df_frac_combined, df_bridge
    except Exception as e:
        print(f"Error processing CSV file: {str(e)}")
        return None, None

if __name__ == "__main__":
    # grep()
    # with open('grep.log', 'r') as f:
    #     content = f.read()
    # results = calculate_success_rates(content)
    # # Print results
    # if results:
    #     print("Average Success Rates")
    #     for operation, avg_rate in results.items():
    #         print(f"- {operation}: {avg_rate:.2f}")  # Format to 2 decimal places
    # else:
    #     print("No valid data found in grep.log.")
    parser = argparse.ArgumentParser(description='Process CSV files by removing specific columns.')
    parser.add_argument('--prefix', type=str, default='./results/',
                        help='Path prefix containing directories to process')
    args = parser.parse_args()
    # Find all directories under the prefix
    directories = [d for d in glob.glob(os.path.join(args.prefix, '*')) if os.path.isdir(d)]
    for directory in directories:
        # Construct the path to the results.csv file in each directory
        input_file = os.path.join(directory, 'results.csv')
        # Check if the file exists before processing
        if os.path.exists(input_file):
            print(f"Processing: {input_file}")
            process_csv_file(input_file=input_file)
        else:
            print(f"Warning: No results.csv found in {directory}")