import argparse
import glob
import json
import os

import matplotlib.pyplot as plt


def get_latest_json_file(directory):
    """Return the most recently modified .json file in the directory, or None."""
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file


def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'test_results': test_results
    }


def plot_model_comparison(model_stats):
    """Plot model comparison with dual y-axes for tokens/sec and success rate."""
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars on primary y-axis with a light, transparent blue
    bars = ax1.bar(models, token_speeds, color='royalblue', alpha=0.3)
    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration, offset outward so it does not overlap the second
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # Shorten model names by removing common suffixes, then rotate the labels
    short_names = [model.replace(':latest', '').replace('-uncensored', '') for model in models]
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(short_names, rotation=45, ha='right', rotation_mode='anchor')

    # Highlight models with a success rate above 90% in green
    for i, label in enumerate(ax1.get_xticklabels()):
        if success_rates[i] > 90:
            label.set_color('green')

    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels
    for i, bar in enumerate(bars):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''

    plt.title('Model Performance Comparison')
    plt.tight_layout()

    # Save before show(): show() blocks and the canvas may be empty once the window closes
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")
    plt.show()


def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Get the latest benchmark results
    latest_benchmark = benchmark_data['benchmarks'][-1]
    model_results = latest_benchmark['results']

    # Calculate stats and sort models by success rate, then tokens/sec
    model_stats = [calculate_model_stats(model) for model in model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print("\nšŸ† Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f}")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print("  Test Results:")
        for test_name, test_result in stats['test_results'].items():
            status = 'āœ…' if test_result['success_rate'] == 100 else 'āŒ'
            print(f"    - {test_name}: {status} {test_result['success_rate']:.1f}%")

    # Generate visualization
    plot_model_comparison(sorted_stats)


def main():
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use the positional path if provided, then --file, otherwise find the latest file
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")


if __name__ == "__main__":
    main()
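
# ---------------------------------------------------------------------------
# Expected input format (illustrative sketch only). The field accesses above
# imply a JSON layout roughly like the one below; the key names come from the
# code, while the model names, test names, and values are made up.
#
# {
#   "benchmarks": [
#     {
#       "results": [
#         {
#           "model": "example-model:latest",
#           "tokens_per_second": 42.7,
#           "total_duration": 18.3,
#           "test_results": {
#             "fibonacci": {"success_rate": 100.0},
#             "binary_search": {"success_rate": 66.7}
#           }
#         }
#       ]
#     }
#   ]
# }
#
# Typical invocations (the script filename here is hypothetical):
#   python leaderboard.py                      # newest JSON in benchmark_results/
#   python leaderboard.py results.json         # positional path
#   python leaderboard.py --file results.json  # flag form
# ---------------------------------------------------------------------------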