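"""Benchmark leaderboard viewer.

Loads benchmark results from a JSON file (a path passed on the command line,
or the newest *.json under benchmark_results/ by default), prints a per-model
leaderboard, and saves a comparison chart to
benchmark_results/model_comparison.png.

Usage (the script name below is illustrative):
    python leaderboard.py                      # use the newest benchmark_results/*.json
    python leaderboard.py results.json         # positional path
    python leaderboard.py --file results.json  # equivalent flag form
"""
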
import json
import os
import argparse
import glob

import matplotlib.pyplot as plt


def get_latest_json_file(directory):
    """Return the most recently modified JSON file in the directory, or None."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


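# Expected result shape, inferred from the fields this script reads (illustrative
# only; the real files may carry additional keys and different values):
#
# {
#   "benchmarks": [
#     {
#       "results": [
#         {
#           "model": "example-model:latest",
#           "tokens_per_second": 42.0,
#           "total_duration": 12.3,
#           "min_tokens_per_second": 30.0,   # optional (newer format)
#           "max_tokens_per_second": 55.0,   # optional (newer format)
#           "min_avg_duration": 1.2,         # optional (newer format)
#           "max_avg_duration": 4.5,         # optional (newer format)
#           "test_results": {
#             "some_test": {
#               "success_rate": 100.0,
#               "passed_cases": 5,
#               "total_cases": 5,
#               "avg_duration": 2.0,         # optional (older format fallback)
#               "avg_tokens_sec": 40.0       # optional (older format fallback)
#             }
#           }
#         }
#       ]
#     }
#   ]
# }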
def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    # Handle the case where some test results might not have avg_duration or
    # avg_tokens_sec. This is for backward compatibility with older benchmark results.
    min_avg_duration = max_avg_duration = None
    min_tokens_per_second = max_tokens_per_second = None

    # First try to get these values from the model_result directly (new format)
    if 'min_avg_duration' in model_result and 'max_avg_duration' in model_result:
        min_avg_duration = model_result['min_avg_duration']
        max_avg_duration = model_result['max_avg_duration']

    if 'min_tokens_per_second' in model_result and 'max_tokens_per_second' in model_result:
        min_tokens_per_second = model_result['min_tokens_per_second']
        max_tokens_per_second = model_result['max_tokens_per_second']

    # If not available in the model_result, try to calculate from test_results (old format)
    if min_avg_duration is None or max_avg_duration is None:
        try:
            min_avg_duration = min(test.get('avg_duration', float('inf'))
                                   for test in test_results.values() if 'avg_duration' in test)
            max_avg_duration = max(test.get('avg_duration', 0)
                                   for test in test_results.values() if 'avg_duration' in test)
            # If no test has avg_duration, use total_duration as fallback
            if min_avg_duration == float('inf') or max_avg_duration == 0:
                min_avg_duration = max_avg_duration = model_result['total_duration']
        except (ValueError, KeyError):
            # If calculation fails (e.g. no test has avg_duration), use total_duration as fallback
            min_avg_duration = max_avg_duration = model_result['total_duration']

    if min_tokens_per_second is None or max_tokens_per_second is None:
        try:
            min_tokens_per_second = min(test.get('avg_tokens_sec', float('inf'))
                                        for test in test_results.values() if 'avg_tokens_sec' in test)
            max_tokens_per_second = max(test.get('avg_tokens_sec', 0)
                                        for test in test_results.values() if 'avg_tokens_sec' in test)
            # If no test has avg_tokens_sec, use tokens_per_second as fallback
            if min_tokens_per_second == float('inf') or max_tokens_per_second == 0:
                min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']
        except (ValueError, KeyError):
            # If calculation fails, use tokens_per_second as fallback
            min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'min_avg_duration': min_avg_duration,
        'max_avg_duration': max_avg_duration,
        'min_tokens_per_second': min_tokens_per_second,
        'max_tokens_per_second': max_tokens_per_second,
        'test_results': test_results
    }


def plot_model_comparison(model_stats):
    """Plot model comparison with three y-axes.

    Blue bars show the min-max tokens/sec range per model, red '+' markers show
    the overall success rate, and green ranges on a third axis show the min/max
    average duration.
    """
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec as stacked bars spanning the min-max range per model
    for i, stat in enumerate(model_stats):
        min_tokens = stat['min_tokens_per_second']
        max_tokens = stat['max_tokens_per_second']

        # Lower part (0 to min) with slightly darker blue
        ax1.bar(i, min_tokens, color='royalblue', alpha=0.4)
        # Upper part (min to max) with lighter blue
        ax1.bar(i, max_tokens - min_tokens, bottom=min_tokens, color='royalblue', alpha=0.3)

    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    # Set y-axis range for tokens per second, with 10% padding above the max value
    max_token_speed = max(stat['max_tokens_per_second'] for stat in model_stats)
    ax1.set_ylim(0, max(100, max_token_speed * 1.1))

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    #ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    # Plot min/max average duration per model as a vertical range with end markers
    min_durations = [stat['min_avg_duration'] for stat in model_stats]
    max_durations = [stat['max_avg_duration'] for stat in model_stats]
    for i, (min_d, max_d) in enumerate(zip(min_durations, max_durations)):
        ax3.plot([i, i], [min_d, max_d], 'g-', linewidth=1)  # Vertical range line
        ax3.plot(i, min_d, 'g_', markersize=10)  # Min marker
        ax3.plot(i, max_d, 'g_', markersize=10)  # Max marker

    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # X-axis labels: shorten model names by removing common suffixes,
    # then highlight strong performers in green
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')
    for i, label in enumerate(ax1.get_xticklabels()):
        # Highlight condition: success rate > 95% AND success rate / duration >= 5
        if success_rates[i] > 95 and (success_rates[i] / durations[i] >= 5):
            label.set_color('green')

    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels (disabled)
    for i in range(len(models)):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''

    plt.title('Model Performance Comparison')
    plt.tight_layout()

    # Save before showing: in blocking mode, show() only returns once the window
    # is closed, which would leave savefig with an empty canvas.
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Combine all benchmark results, keeping only the latest result for each model
    all_model_results = []
    model_names = set()

    for benchmark in benchmark_data['benchmarks']:
        for model_result in benchmark.get('results', []):
            model_name = model_result.get('model')
            if model_name and model_name not in model_names:
                all_model_results.append(model_result)
                model_names.add(model_name)
            elif model_name in model_names:
                # Replace the existing entry with the newer result
                for i, existing_model in enumerate(all_model_results):
                    if existing_model.get('model') == model_name:
                        all_model_results[i] = model_result
                        break

    # Calculate stats and sort models by success rate, then tokens/sec
    model_stats = [calculate_model_stats(model) for model in all_model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f} "
              f"({stats['min_tokens_per_second']:.2f} - {stats['max_tokens_per_second']:.2f})")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print(f"  Min/Max Avg Duration: {stats['min_avg_duration']:.2f}s / {stats['max_avg_duration']:.2f}s")
        print("  Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f"    - {test_name}: {status} {test_result['passed_cases']}/{test_result['total_cases']} "
                  f"cases ({test_result['success_rate']:.1f}%)")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use the positional filepath if provided, then --file, otherwise find the latest file
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()