import argparse
import glob
import json
import os

import matplotlib.pyplot as plt

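# Illustrative sketch of the benchmark JSON this script expects, inferred from
# the field accesses below; model names and values here are hypothetical
# placeholders, and real files may carry additional keys:
#
# {
#   "benchmarks": [
#     {
#       "results": [
#         {
#           "model": "example-model:latest",
#           "tokens_per_second": 42.0,
#           "total_duration": 12.3,
#           "test_results": {
#             "some_test": {"success_rate": 100.0}
#           }
#         }
#       ]
#     }
#   ]
# }
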
def get_latest_json_file(directory):
    """Return the most recently modified JSON file in directory, or None if there is none."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'test_results': test_results
    }


def plot_model_comparison(model_stats):
    """Plot model comparison with dual y-axes for tokens/sec and success rate."""
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars on the primary y-axis with a light, transparent blue
    bars = ax1.bar(models, token_speeds, color='royalblue', alpha=0.3)
    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # Shorten model names by removing common suffixes, then set rotated x-axis labels
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')

    # Highlight models with a success rate above 90% in green
    for label, rate in zip(ax1.get_xticklabels(), success_rates):
        if rate > 90:
            label.set_color('green')

    # Adjust layout to prevent label cutoff and leave room for the outward third axis
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels
    for i, bar in enumerate(bars):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''
    plt.title('Model Performance Comparison')

    # Save before show(): in non-interactive runs show() closes the figure,
    # which would leave savefig() with a blank canvas.
    os.makedirs('benchmark_results', exist_ok=True)
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Get the latest benchmark results
    latest_benchmark = benchmark_data['benchmarks'][-1]
    model_results = latest_benchmark['results']

    # Calculate stats and sort models by success rate, then by speed
    model_stats = [calculate_model_stats(model) for model in model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f}")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print("  Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f"    - {test_name}: {status} {test_result['success_rate']:.1f}%")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    """Parse arguments, load benchmark results, and print the leaderboard."""
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use the positional filepath if provided, then --file, otherwise find the latest file
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()