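"""Benchmark leaderboard viewer.

Loads benchmark results from a JSON file (a path passed on the command line,
or the newest *.json under benchmark_results/ by default), prints a per-model
leaderboard, and saves a comparison chart to
benchmark_results/model_comparison.png.

Usage (the script name below is illustrative):
    python leaderboard.py                      # use the newest benchmark_results/*.json
    python leaderboard.py results.json         # positional path
    python leaderboard.py --file results.json  # equivalent flag form
"""
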
import json
import os
import argparse
import glob

import matplotlib.pyplot as plt


def get_latest_json_file(directory):
    """Return the most recently modified JSON file in the directory, or None."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


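# Expected result shape, inferred from the fields this script reads (illustrative
# only; the real files may carry additional keys and different values):
#
# {
#   "benchmarks": [
#     {
#       "results": [
#         {
#           "model": "example-model:latest",
#           "tokens_per_second": 42.0,
#           "total_duration": 12.3,
#           "min_tokens_per_second": 30.0,   # optional (newer format)
#           "max_tokens_per_second": 55.0,   # optional (newer format)
#           "min_avg_duration": 1.2,         # optional (newer format)
#           "max_avg_duration": 4.5,         # optional (newer format)
#           "test_results": {
#             "some_test": {
#               "success_rate": 100.0,
#               "passed_cases": 5,
#               "total_cases": 5,
#               "avg_duration": 2.0,         # optional (older format fallback)
#               "avg_tokens_sec": 40.0       # optional (older format fallback)
#             }
#           }
#         }
#       ]
#     }
#   ]
# }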
def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    # Handle the case where some test results might not have avg_duration or
    # avg_tokens_sec. This is for backward compatibility with older benchmark results.
    min_avg_duration = max_avg_duration = None
    min_tokens_per_second = max_tokens_per_second = None

    # First try to get these values from the model_result directly (new format)
    if 'min_avg_duration' in model_result and 'max_avg_duration' in model_result:
        min_avg_duration = model_result['min_avg_duration']
        max_avg_duration = model_result['max_avg_duration']

    if 'min_tokens_per_second' in model_result and 'max_tokens_per_second' in model_result:
        min_tokens_per_second = model_result['min_tokens_per_second']
        max_tokens_per_second = model_result['max_tokens_per_second']

    # If not available in the model_result, try to calculate from test_results (old format)
    if min_avg_duration is None or max_avg_duration is None:
        try:
            min_avg_duration = min(test.get('avg_duration', float('inf'))
                                   for test in test_results.values() if 'avg_duration' in test)
            max_avg_duration = max(test.get('avg_duration', 0)
                                   for test in test_results.values() if 'avg_duration' in test)
            # If no test has avg_duration, use total_duration as fallback
            if min_avg_duration == float('inf') or max_avg_duration == 0:
                min_avg_duration = max_avg_duration = model_result['total_duration']
        except (ValueError, KeyError):
            # If calculation fails (e.g. no test has avg_duration), use total_duration as fallback
            min_avg_duration = max_avg_duration = model_result['total_duration']

    if min_tokens_per_second is None or max_tokens_per_second is None:
        try:
            min_tokens_per_second = min(test.get('avg_tokens_sec', float('inf'))
                                        for test in test_results.values() if 'avg_tokens_sec' in test)
            max_tokens_per_second = max(test.get('avg_tokens_sec', 0)
                                        for test in test_results.values() if 'avg_tokens_sec' in test)
            # If no test has avg_tokens_sec, use tokens_per_second as fallback
            if min_tokens_per_second == float('inf') or max_tokens_per_second == 0:
                min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']
        except (ValueError, KeyError):
            # If calculation fails, use tokens_per_second as fallback
            min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'min_avg_duration': min_avg_duration,
        'max_avg_duration': max_avg_duration,
        'min_tokens_per_second': min_tokens_per_second,
        'max_tokens_per_second': max_tokens_per_second,
        'test_results': test_results
    }


def plot_model_comparison(model_stats):
    """Plot model comparison with three y-axes.

    Blue bars show the min-max tokens/sec range per model, red '+' markers show
    the overall success rate, and green ranges on a third axis show the min/max
    average duration.
    """
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec as stacked bars spanning the min-max range per model
    for i, stat in enumerate(model_stats):
        min_tokens = stat['min_tokens_per_second']
        max_tokens = stat['max_tokens_per_second']

        # Lower part (0 to min) with slightly darker blue
        ax1.bar(i, min_tokens, color='royalblue', alpha=0.4)
        # Upper part (min to max) with lighter blue
        ax1.bar(i, max_tokens - min_tokens, bottom=min_tokens, color='royalblue', alpha=0.3)

    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    # Set y-axis range for tokens per second, with 10% padding above the max value
    max_token_speed = max(stat['max_tokens_per_second'] for stat in model_stats)
    ax1.set_ylim(0, max(100, max_token_speed * 1.1))

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    #ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    # Plot min/max average duration per model as a vertical range with end markers
    min_durations = [stat['min_avg_duration'] for stat in model_stats]
    max_durations = [stat['max_avg_duration'] for stat in model_stats]
    for i, (min_d, max_d) in enumerate(zip(min_durations, max_durations)):
        ax3.plot([i, i], [min_d, max_d], 'g-', linewidth=1)  # Vertical range line
        ax3.plot(i, min_d, 'g_', markersize=10)  # Min marker
        ax3.plot(i, max_d, 'g_', markersize=10)  # Max marker

    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # X-axis labels: shorten model names by removing common suffixes,
    # then highlight strong performers in green
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')
    for i, label in enumerate(ax1.get_xticklabels()):
        # Highlight condition: success rate > 95% AND success rate / duration >= 5
        if success_rates[i] > 95 and (success_rates[i] / durations[i] >= 5):
            label.set_color('green')

    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels (disabled)
    for i in range(len(models)):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''

    plt.title('Model Performance Comparison')
    plt.tight_layout()

    # Save before showing: in blocking mode, show() only returns once the window
    # is closed, which would leave savefig with an empty canvas.
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Combine all benchmark results, keeping only the latest result for each model
    all_model_results = []
    model_names = set()

    for benchmark in benchmark_data['benchmarks']:
        for model_result in benchmark.get('results', []):
            model_name = model_result.get('model')
            if model_name and model_name not in model_names:
                all_model_results.append(model_result)
                model_names.add(model_name)
            elif model_name in model_names:
                # Replace the existing entry with the newer result
                for i, existing_model in enumerate(all_model_results):
                    if existing_model.get('model') == model_name:
                        all_model_results[i] = model_result
                        break

    # Calculate stats and sort models by success rate, then tokens/sec
    model_stats = [calculate_model_stats(model) for model in all_model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f} "
              f"({stats['min_tokens_per_second']:.2f} - {stats['max_tokens_per_second']:.2f})")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print(f"  Min/Max Avg Duration: {stats['min_avg_duration']:.2f}s / {stats['max_avg_duration']:.2f}s")
        print("  Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f"    - {test_name}: {status} {test_result['passed_cases']}/{test_result['total_cases']} "
                  f"cases ({test_result['success_rate']:.1f}%)")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use the positional filepath if provided, then --file, otherwise find the latest file
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()