codebench/lboard.py
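"""Display a leaderboard for model benchmark results.

Reads a benchmark results JSON file (a path passed on the command line, one
given via --file, or the most recently modified *.json in benchmark_results/),
prints per-model success rates, token throughput and durations, and saves a
comparison plot to benchmark_results/model_comparison.png.

Usage:
    python lboard.py [path/to/results.json]
    python lboard.py --file path/to/results.json
"""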
import json
import os
import argparse
import glob
import matplotlib.pyplot as plt


def get_latest_json_file(directory):
    """Return the most recently modified JSON file in `directory`, or None if there are none."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'test_results': test_results
    }
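
# For reference, a minimal sketch of the input shape this script expects,
# reconstructed from the fields accessed above and in print_leaderboard().
# The model name and numbers are made-up placeholders:
#
# {
#     "benchmarks": [
#         {
#             "results": [
#                 {
#                     "model": "example-model:latest",
#                     "tokens_per_second": 42.5,
#                     "total_duration": 87.3,
#                     "test_results": {
#                         "some_test": {"success_rate": 100.0}
#                     }
#                 }
#             ]
#         }
#     ]
# }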


def plot_model_comparison(model_stats):
    """Plot model comparison with dual y-axes for tokens/sec and success rate."""
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars on primary y-axis with lighter blue and more transparency
    bars = ax1.bar(models, token_speeds, color='royalblue', alpha=0.3)
    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # Shorten model names by removing common suffixes and set them as the
    # x-tick labels with proper rotation. (Calling set_text() on existing tick
    # labels is overwritten at draw time by the fixed formatter, so pass the
    # shortened names to set_xticklabels() directly.)
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')

    # Highlight models with a success rate above 90%
    for label, rate in zip(ax1.get_xticklabels(), success_rates):
        if rate > 90:
            label.set_color('green')

    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)
    # Per-bar value labels are currently disabled; re-enable if needed:
    '''
    # Add value labels
    for i, bar in enumerate(bars):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''

    plt.title('Model Performance Comparison')
    plt.tight_layout()

    # Save before show(): with non-interactive backends show() closes the
    # figure, and saving afterwards would write an empty image
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Get the latest benchmark results
    latest_benchmark = benchmark_data['benchmarks'][-1]
    model_results = latest_benchmark['results']

    # Calculate stats and sort models
    model_stats = [calculate_model_stats(model) for model in model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f" Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f" Average Tokens/sec: {stats['tokens_per_second']:.2f}")
        print(f" Average Duration: {stats['total_duration']:.2f}s")
        print(" Test Results:")
        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f" - {test_name}: {status} {test_result['success_rate']:.1f}%")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use filepath if provided, then --file, otherwise find latest
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')

        if not json_file:
            print("No benchmark results found")
            return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)

        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()