
Directly get success rate from `run_maniskill2_eval_single_episode`

Open · LukeLIN-web opened this issue 8 months ago · 5 comments

Currently, simplerenv/tools/calc_metrics_evaluation_videos.py requires keeping the results for all scenes before it can compute the sim variant aggregation average success.

Maybe we could get the success rate directly from run_maniskill2_eval_single_episode, log it to files, and count from there.

The relevant code is in simplerenv/simpler_env/evaluation/maniskill2_evaluator.py. If the author thinks this is doable, I can open a PR for it. The difficult part is sorting the results by name; making it generalize takes time.

simplerenv/simpler_env/main_inference.py already has print(" " * 10, "Average success", np.mean(success_arr)); we can make full use of that.
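
As a sketch, the evaluator could append each episode's outcome to a JSONL file at the point where it already knows the result (the write_episode_result helper and its fields are hypothetical, not an existing SimplerEnv API):

import json

def write_episode_result(log_path, env_name, episode_id, success):
    # One JSON record per episode; a later pass can group records by env_name
    # and average the success flags without re-parsing stdout or videos.
    record = {"env": env_name, "episode": episode_id, "success": bool(success)}
    with open(log_path, "a") as f:
        f.write(json.dumps(record) + "\n")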

I believe it could speed up evaluation by about 10%. Right now the Google Robot evaluation takes me 10-16 hours.

Reusing the same environment could save another 20-30%.

LukeLIN-web avatar Apr 18 '25 01:04 LukeLIN-web

Maybe grep the log file

import sys
from collections import deque


def calculate_success_rates(log_content):
    """Average the success rates recorded in grep.log.

    Expects pairs of lines of the form
        '<line_no>: <env/scene name>'
        '<line_no>: ... Average success <rate> <<< MATCH'
    and returns a dict mapping each name to its mean success rate.
    """
    stats = {}
    lines = log_content.strip().split('\n')
    i = 0
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i+1].strip()

        # Check if lines match the expected pattern
        if ':' in line1 and 'Average success' in line2:
            try:
                # Extract operation name (after first ':')  
                op_name = line1.split(':', 1)[1].strip()
                # Extract success rate (between 'Average success' and '<<<')
                rate_str = line2.split('Average success')[1].split('<<<')[0].strip()
                rate = float(rate_str)

                # Store the rate
                if op_name not in stats:
                    stats[op_name] = []
                stats[op_name].append(rate)

                # Skip the next line since we've processed it as the success rate
                i += 1
            except (ValueError, IndexError):
                # Ignore pairs that don't parse correctly (e.g., non-float rate)
                pass

        i += 1 # Move to the next potential operation line

    # Calculate averages
    averages = {}
    for op, rates in stats.items():
        averages[op] = sum(rates) / len(rates) if rates else 0.0 # Avoid division by zero
    return averages

def grep():
    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        return 1
    
    file_path = sys.argv[1]
    pattern = "  Average success"  # Two spaces followed by "Average success"
    log_file = "grep.log"  # Output log file
    
    try:
        # Use a deque to keep track of recent lines
        buffer = deque(maxlen=5)  # Keep last 5 lines to handle both cases
        
        # Open the output file
        with open(log_file, 'w') as out_file:
            # Open the input file
            with open(file_path, 'r') as in_file:
                line_number = 0
                
                for line in in_file:
                    line_number += 1
                    buffer.append((line_number, line.rstrip()))
                    
                    # Check if the current line matches the pattern
                    if pattern in line:
                        # Check if the buffer contains "Saving video to"
                        has_saving_video = any("Saving video to" in item[1] for item in buffer)
                        
                        if has_saving_video:
                            # Case 1: the log contains a "Saving video to" line, so the
                            # env/scene name is the oldest buffered line (buffer[0])
                            if len(buffer) >= 5:
                                first_line = buffer[0]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        else:
                            # Case 2: no "Saving video to" line, so the name line sits
                            # one position later in the buffer (buffer[1])
                            if len(buffer) >= 4:
                                first_line = buffer[1]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        
                        # Print current matching line in both cases
                        out_file.write(f"{line_number}: {line.rstrip()} <<< MATCH\n")
                        out_file.write("\n")  # Add an empty line for better readability
        
        print(f"Results written to {log_file}")
        return 0
                    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return 1
    except Exception as e:
        print(f"Error processing file: {e}")
        return 1

if __name__ == "__main__":
    grep()
    with open('grep.log', 'r') as f:
        content = f.read()

    # Calculate results
    results = calculate_success_rates(content)

    # Print results
    if results:
        print("Average Success Rates by Drawer Operation:")
        for operation, avg_rate in results.items():
            print(f"- {operation}: {avg_rate:.2f}") # Format to 2 decimal places
    else:
        print("No valid drawer operation data found in grep.log.")

But it cannot distinguish visual matching from variant aggregation. It should be possible, though, because simplerenv/simpler_env/main_inference.py has enough information about each scene and env.
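
For example, main_inference.py could print one machine-parseable line per run; env_name and scene_name below are placeholders for whatever identifiers are actually in scope there, and numpy is already imported in that file:

# Hypothetical tag line: the EVAL_RESULT prefix makes it trivially grep-able and
# carries enough context to tell visual matching from variant aggregation.
print(f"EVAL_RESULT env={env_name} scene={scene_name} success={np.mean(success_arr):.4f}")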

LukeLIN-web avatar Apr 18 '25 02:04 LukeLIN-web

Anyway, saving videos shouldn't take long compared to evaluating the models, so you can write your own script to handle it.

xuanlinli17 avatar Apr 18 '25 03:04 xuanlinli17

I wrote a Python script that uses multiple GPUs to evaluate multiple checkpoints in parallel:

import os
import subprocess
import argparse
from pathlib import Path
from multiprocessing import Process
from datetime import datetime

# Overridden from the command line in __main__; child processes inherit
# these module globals when forked on Linux.
model_name = None
action_ensemble_temp = -0.8

tasks = [
    "pick_coke_can_visual_matching.sh",
    "pick_coke_can_variant_agg.sh",
    "move_near_variant_agg.sh",
    "move_near_visual_matching.sh",
    "drawer_visual_matching.sh",
    "drawer_variant_agg.sh",
    # "bridge.sh",
    # "put_in_drawer_variant_agg.sh",
    # "put_in_drawer_visual_matching.sh",
]

def run_evaluation(ckpt_path, device):
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logging_dir = f"results/{Path(ckpt_path).name}{action_ensemble_temp}"
    os.makedirs(logging_dir, exist_ok=True)
    
    for task in tasks:
        print(f"🚀 running {task} on GPU {device} for {Path(ckpt_path).name} ...")
        task_log_file = os.path.join(logging_dir, f"{Path(ckpt_path).name}--{task}.log")
        
        with open(task_log_file, "w") as fout, open(task_log_file + ".err", "w") as ferr:
            cmd = [
                "bash",
                f"scripts/{task}",
                ckpt_path,
                model_name,
                str(action_ensemble_temp),
                logging_dir,
                str(device)
            ]
            subprocess.run(cmd, stdout=fout, stderr=ferr)
        print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    
    print(f"🚀 all tasks DONE for {Path(ckpt_path).name} on GPU {device}! Calculating metrics...")
    with open(f"{logging_dir}/total.metrics", "a") as f:
        subprocess.run(
            ["python", "tools/calc_metrics_evaluation_videos.py", "--log-dir-root", logging_dir],
            stdout=f
        )
    print(f"🚀 Calculate metrics... DONE")
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def launch_eval_in_process(ckpt_path, device):
    p = Process(target=run_evaluation, args=(ckpt_path, device))
    p.start()
    return p

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run evaluation on multiple checkpoints.")
    parser.add_argument("ckpts", nargs="+", help="Paths to checkpoint files")
    parser.add_argument("--devices", type=int, nargs="+", help="GPU device IDs to use (default: 0, 1, 2, ...)")
    parser.add_argument("--action-ensemble-temp", type=float, default=-0.8, help="Action ensemble temperature (default: -0.8)")
    
    args = parser.parse_args()
    
    # Update global variables based on args
    model_name = args.model_name
    action_ensemble_temp = args.action_ensemble_temp
    
    # Assign default devices if not specified
    if args.devices is None:
        args.devices = list(range(len(args.ckpts)))
    
    # Ensure we have enough devices for the checkpoints
    if len(args.devices) < len(args.ckpts):
        print(f"Warning: {len(args.ckpts)} checkpoints but only {len(args.devices)} devices specified.")
        print(f"Will only evaluate the first {len(args.devices)} checkpoints.")
        args.ckpts = args.ckpts[:len(args.devices)]
    
    # Launch processes
    processes = []
    for i, (ckpt, device) in enumerate(zip(args.ckpts, args.devices)):
        print(f"Starting evaluation for checkpoint {i+1} on GPU {device}")
        p = launch_eval_in_process(ckpt, device)
        processes.append(p)

    # Wait for every checkpoint's evaluation to finish
    for p in processes:
        p.join()

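Usage looks something like this (the script file name and --model-name value are only examples):

python eval_multi_ckpt.py /path/to/ckpt_a /path/to/ckpt_b --devices 0 1 --model-name rt1

Each checkpoint gets its own GPU and its own results/<ckpt_name><temp> directory, so parallel runs never collide.
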
LukeLIN-web avatar Apr 23 '25 13:04 LukeLIN-web

I wrote a script to collect all the results:

import subprocess
from pathlib import Path
from multiprocessing import Process
from datetime import datetime

def run_evaluation_on_dir(log_dir):
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🚀 Running metric calculation for: {log_dir}")

    with open(f"{log_dir}/total.metrics", "a") as f:
        subprocess.run(
            ["python", "tools/calc_metrics_evaluation_videos.py", "--log-dir-root", str(log_dir)],
            stdout=f
        )

    print(f"✅ DONE: {log_dir}")
    print(f"current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


def launch_eval_in_process(log_dir):
    p = Process(target=run_evaluation_on_dir, args=(log_dir,))
    p.start()
    return p


if __name__ == "__main__":
    results_root = Path("results")
    all_subdirs = [d for d in results_root.iterdir() if d.is_dir()]
    
    print(f"Found {len(all_subdirs)} result directories.")

    processes = []
    for log_dir in all_subdirs:
        print(f"Starting evaluation for {log_dir}")
        p = launch_eval_in_process(log_dir)
        processes.append(p)

    # Wait for all metric calculations to finish
    for p in processes:
        p.join()

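If there are many result directories, one process per directory can oversubscribe the machine. A bounded pool is a drop-in alternative (a sketch capping concurrency at 4 jobs):

from multiprocessing import Pool

with Pool(processes=4) as pool:
    pool.map(run_evaluation_on_dir, all_subdirs)
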
LukeLIN-web avatar May 14 '25 17:05 LukeLIN-web

I also wrote a script to grep the bridge results:

import sys
from collections import deque
import os
import pandas as pd
import argparse
import glob

def calculate_success_rates(log_content):
    """Average the success rates recorded in grep.log.

    Expects pairs of lines of the form
        '<line_no>: <env/scene name>'
        '<line_no>: ... Average success <rate> <<< MATCH'
    and returns a dict mapping each name to its mean success rate.
    """
    stats = {}
    lines = log_content.strip().split('\n')
    i = 0
    while i < len(lines) - 1:
        line1 = lines[i].strip()
        line2 = lines[i+1].strip()

        # Check if lines match the expected pattern
        if ':' in line1 and 'Average success' in line2:
            try:
                # Extract operation name (after first ':')  
                op_name = line1.split(':', 1)[1].strip()
                # Extract success rate (between 'Average success' and '<<<')
                rate_str = line2.split('Average success')[1].split('<<<')[0].strip()
                rate = float(rate_str)

                # Store the rate
                if op_name not in stats:
                    stats[op_name] = []
                stats[op_name].append(rate)

                # Skip the next line since we've processed it as the success rate
                i += 1
            except (ValueError, IndexError):
                # Ignore pairs that don't parse correctly (e.g., non-float rate)
                pass

        i += 1 # Move to the next potential operation line

    # Calculate averages
    averages = {}
    for op, rates in stats.items():
        averages[op] = sum(rates) / len(rates) if rates else 0.0 # Avoid division by zero
    return averages

def grep():
    # Check command line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        return 1
    
    file_path = sys.argv[1]
    pattern = "  Average success"  # Two spaces followed by "Average success"
    log_file = "grep.log"  # Output log file
    
    try:
        # Use a deque to keep track of recent lines
        buffer = deque(maxlen=5)  # Keep last 5 lines to handle both cases
        
        # Open the output file
        with open(log_file, 'w') as out_file:
            # Open the input file
            with open(file_path, 'r') as in_file:
                line_number = 0
                
                for line in in_file:
                    line_number += 1
                    buffer.append((line_number, line.rstrip()))
                    
                    # Check if the current line matches the pattern
                    if pattern in line:
                        # Check if the buffer contains "Saving video to"
                        has_saving_video = any("Saving video to" in item[1] for item in buffer)
                        
                        if has_saving_video:
                            # Case 1: the log contains a "Saving video to" line, so the
                            # env/scene name is the oldest buffered line (buffer[0])
                            if len(buffer) >= 5:
                                first_line = buffer[0]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        else:
                            # Case 2: no "Saving video to" line, so the name line sits
                            # one position later in the buffer (buffer[1])
                            if len(buffer) >= 4:
                                first_line = buffer[1]
                                out_file.write(f"{first_line[0]}: {first_line[1]}\n")
                        
                        # Print current matching line in both cases
                        out_file.write(f"{line_number}: {line.rstrip()} <<< MATCH\n")
                        out_file.write("\n")  # Add an empty line for better readability
        
        print(f"Results written to {log_file}")
        return 0
                    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return 1
    except Exception as e:
        print(f"Error processing file: {e}")
        return 1


def process_csv_file(input_file, bridge_file_name="bridge.csv", frac_file_name="frac.csv"):
    """
    Reads a CSV file from the specified path and processes it to create two outputs:
    1. Column 16 onwards excluding last 3 columns (bridge.csv)
    2. Selected columns from first 16 columns in specific order (frac.csv):
       - First three columns: df_frac[0, 9, 10]
       - Last three columns: df_frac[1, 8, 11]
    
    Args:
        input_file (str): Path to the input CSV file
        bridge_file_name (str): Name of the output CSV for columns 16+ (minus last 3)
        frac_file_name (str): Name of the output CSV for selected columns in specific order
        
    Returns:
        tuple: (df_frac_combined, df_bridge) The processed DataFrames written to output files
    """
    try:
        # Check if input file exists
        if not os.path.exists(input_file):
            print(f"Error: Input file not found at {input_file}")
            return None, None
            
        # Get the directory path from the input file
        input_dir = os.path.dirname(input_file)
            
        # Create the output file paths
        bridge_file = os.path.join(input_dir, bridge_file_name)
        frac_file = os.path.join(input_dir, frac_file_name)
            
        # Read the CSV file
        df = pd.read_csv(input_file)
        print(f"Successfully read CSV file from {input_file}")
        print(f"Original shape: {df.shape} (rows, columns)")
            
        # Make sure we have enough columns
        if df.shape[1] <= 16:
            print("Warning: The DataFrame has 16 or fewer columns. Cannot split as requested.")
            return None, None
                
        # Get the first 16 columns
        df_frac = df.iloc[:, :16]
        
        # Select columns in the specified order
        # First three columns: [0, 9, 10], Last three columns: [1, 8, 11]
        df_frac_combined = df_frac.iloc[:, [0, 9, 10, 1, 8, 11]]
        print(f"Combined frac data shape: {df_frac_combined.shape}")
        
        # Get bridge data (columns 16 onwards, excluding last 3)
        df_bridge = df.iloc[:, 16:]
        if df_bridge.shape[1] > 3:
            df_bridge = df_bridge.iloc[:, :-3]  # Exclude the last 3 columns
        print(f"Bridge data shape: {df_bridge.shape}")
            
        # Write the processed DataFrames to the new CSV files
        df_frac_combined.to_csv(frac_file, index=False)
        print(f"Successfully created combined frac CSV file at {frac_file}")
        
        df_bridge.to_csv(bridge_file, index=False)
        print(f"Successfully created bridge CSV file at {bridge_file}")
            
        # Display the processed data previews
        print("\nCombined frac data preview:")
        print(df_frac_combined.head())
        
        print("\nBridge data preview:")
        print(df_bridge.head())
            
        return df_frac_combined, df_bridge
        
    except Exception as e:
        print(f"Error processing CSV file: {str(e)}")
        return None, None


if __name__ == "__main__":
    # grep()
    # with open('grep.log', 'r') as f:
    #     content = f.read()

    # results = calculate_success_rates(content)

    # # Print results
    # if results:
    #     print("Average Success Rates")
    #     for operation, avg_rate in results.items():
    #         print(f"- {operation}: {avg_rate:.2f}") # Format to 2 decimal places
    # else:
    #     print("No valid data found in grep.log.")
    parser = argparse.ArgumentParser(description='Process CSV files by removing specific columns.')
    parser.add_argument('--prefix', type=str, default='./results/', 
                        help='Path prefix containing directories to process')
    
    args = parser.parse_args()
    
    # Find all directories under the prefix
    directories = [d for d in glob.glob(os.path.join(args.prefix, '*')) if os.path.isdir(d)]
    
    for directory in directories:
        # Construct the path to the results.csv file in each directory
        input_file = os.path.join(directory, 'results.csv')
        
        # Check if the file exists before processing
        if os.path.exists(input_file):
            print(f"Processing: {input_file}")
            process_csv_file(input_file=input_file)
        else:
            print(f"Warning: No results.csv found in {directory}")

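Usage (assuming the file is saved as grep_bridge.py):

python grep_bridge.py --prefix ./results/

It visits every directory under the prefix and, wherever it finds a results.csv, writes bridge.csv (the bridge columns) and frac.csv (a reordered subset of the first 16 columns) next to it.
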
LukeLIN-web avatar May 14 '25 17:05 LukeLIN-web