import argparse
import glob
import json
import os

import matplotlib.pyplot as plt

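# Illustrative sketch of the benchmark JSON this script expects, inferred from
# the field accesses below; model names and values here are hypothetical
# placeholders, and real files may carry additional keys:
#
# {
#   "benchmarks": [
#     {
#       "results": [
#         {
#           "model": "example-model:latest",
#           "tokens_per_second": 42.0,
#           "total_duration": 12.3,
#           "test_results": {
#             "some_test": {"success_rate": 100.0}
#           }
#         }
#       ]
#     }
#   ]
# }
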
def get_latest_json_file(directory):
    """Return the most recently modified JSON file in directory, or None if there is none."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'test_results': test_results
    }


def plot_model_comparison(model_stats):
    """Plot model comparison with dual y-axes for tokens/sec and success rate."""
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars on the primary y-axis with a light, transparent blue
    bars = ax1.bar(models, token_speeds, color='royalblue', alpha=0.3)
    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # Shorten model names by removing common suffixes, then set rotated x-axis labels
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')

    # Highlight models with a success rate above 90% in green
    for label, rate in zip(ax1.get_xticklabels(), success_rates):
        if rate > 90:
            label.set_color('green')

    # Adjust layout to prevent label cutoff and leave room for the outward third axis
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels
    for i, bar in enumerate(bars):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''
    plt.title('Model Performance Comparison')

    # Save before show(): in non-interactive runs show() closes the figure,
    # which would leave savefig() with a blank canvas.
    os.makedirs('benchmark_results', exist_ok=True)
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Get the latest benchmark results
    latest_benchmark = benchmark_data['benchmarks'][-1]
    model_results = latest_benchmark['results']

    # Calculate stats and sort models by success rate, then by speed
    model_stats = [calculate_model_stats(model) for model in model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f}")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print("  Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f"    - {test_name}: {status} {test_result['success_rate']:.1f}%")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    """Parse arguments, load benchmark results, and print the leaderboard."""
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use the positional filepath if provided, then --file, otherwise find the latest file
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()