bug fixes
parent a3b06718a2
commit dedbeceb8e
1005
benchmark_results/Apple_M1_Pro_localhost_11434.json
Normal file
File diff suppressed because it is too large
236
benchmark_results/Apple_M1_Pro_localhost_11434.log
Normal file
@@ -0,0 +1,236 @@
Benchmark Run: 20250303_174821
Server: http://localhost:11434

CPU Information:
python_version: 3.10.16.final.0 (64 bit)
cpuinfo_version: [9, 0, 0]
cpuinfo_version_string: 9.0.0
arch: ARM_8
bits: 64
count: 10
arch_string_raw: arm64
brand_raw: Apple M1 Pro

Benchmark Results:

🏆 Final Model Leaderboard:

qwen2.5-coder:7b-instruct-q4_K_M
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 19.33 (18.75 - 19.58)
Average Duration: 17.32s
Min/Max Avg Duration: 8.67s / 17.99s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

falcon3:10b
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 13.21 (12.53 - 13.31)
Average Duration: 13.46s
Min/Max Avg Duration: 6.76s / 13.46s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

qwen2.5:14b
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.78 (9.78 - 9.88)
Average Duration: 35.25s
Min/Max Avg Duration: 30.09s / 35.25s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

qwen2.5-coder:14b-instruct-q4_K_M
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.68 (9.65 - 9.88)
Average Duration: 37.18s
Min/Max Avg Duration: 23.06s / 37.18s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

phi4:latest
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.01 (8.96 - 9.32)
Average Duration: 23.44s
Min/Max Avg Duration: 23.44s / 38.82s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

deepseek-r1:14b
Overall Success Rate: 97.2% (70/72 cases)
Average Tokens/sec: 9.05 (8.90 - 9.38)
Average Duration: 278.32s
Min/Max Avg Duration: 174.30s / 482.10s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ✅ 18/18 cases (100.0%)

llama3.2-vision:11b-instruct-q4_K_M
Overall Success Rate: 95.8% (69/72 cases)
Average Tokens/sec: 15.68 (14.92 - 15.92)
Average Duration: 22.33s
Min/Max Avg Duration: 16.31s / 28.85s
Test Results:
- Fibonacci: ❌ 16/18 cases (88.9%)
- Binary Search: ❌ 17/18 cases (94.4%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

llama3.2:3b
Overall Success Rate: 94.4% (68/72 cases)
Average Tokens/sec: 36.09 (30.85 - 37.53)
Average Duration: 2.67s
Min/Max Avg Duration: 1.04s / 2.76s
Test Results:
- Fibonacci: ❌ 14/18 cases (77.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

llama3.1:8b
Overall Success Rate: 94.4% (68/72 cases)
Average Tokens/sec: 17.92 (17.92 - 18.45)
Average Duration: 18.04s
Min/Max Avg Duration: 14.68s / 19.56s
Test Results:
- Fibonacci: ❌ 14/18 cases (77.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

hhao/qwen2.5-coder-tools:7b
Overall Success Rate: 91.7% (66/72 cases)
Average Tokens/sec: 17.75 (16.05 - 17.75)
Average Duration: 9.35s
Min/Max Avg Duration: 4.17s / 9.35s
Test Results:
- Fibonacci: ❌ 12/18 cases (66.7%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

Qwen2.5-Coder-7B-Instruct-s1k:latest
Overall Success Rate: 88.9% (64/72 cases)
Average Tokens/sec: 18.38 (18.38 - 18.94)
Average Duration: 9.95s
Min/Max Avg Duration: 9.06s / 12.91s
Test Results:
- Fibonacci: ❌ 16/18 cases (88.9%)
- Binary Search: ❌ 12/18 cases (66.7%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

deepseek-r1:8b
Overall Success Rate: 86.1% (62/72 cases)
Average Tokens/sec: 17.43 (17.29 - 18.01)
Average Duration: 168.97s
Min/Max Avg Duration: 107.91s / 168.97s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ❌ 10/18 cases (55.6%)

llama3.2:1b-instruct-q4_K_M
Overall Success Rate: 81.9% (59/72 cases)
Average Tokens/sec: 88.24 (88.24 - 88.93)
Average Duration: 3.64s
Min/Max Avg Duration: 1.87s / 4.93s
Test Results:
- Fibonacci: ❌ 5/18 cases (27.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)

samantha-mistral:latest
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 23.92 (23.91 - 24.79)
Average Duration: 12.21s
Min/Max Avg Duration: 7.59s / 12.21s
Test Results:
- Fibonacci: ❌ 8/18 cases (44.4%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ❌ 16/18 cases (88.9%)

marco-o1:latest
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 19.19 (19.19 - 19.39)
Average Duration: 41.14s
Min/Max Avg Duration: 33.28s / 51.50s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ❌ 6/18 cases (33.3%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ❌ 16/18 cases (88.9%)

deepseek-r1:7b
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 18.01 (18.01 - 19.07)
Average Duration: 336.87s
Min/Max Avg Duration: 78.71s / 336.87s
Test Results:
- Fibonacci: ❌ 10/18 cases (55.6%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 12/18 cases (66.7%)
- Anagram Check: ✅ 18/18 cases (100.0%)

deepseek-r1:1.5b-qwen-distill-q8_0
Overall Success Rate: 52.8% (38/72 cases)
Average Tokens/sec: 57.37 (53.88 - 59.60)
Average Duration: 137.59s
Min/Max Avg Duration: 41.38s / 371.13s
Test Results:
- Fibonacci: ❌ 11/18 cases (61.1%)
- Binary Search: ❌ 12/18 cases (66.7%)
- Palindrome: ❌ 6/18 cases (33.3%)
- Anagram Check: ❌ 9/18 cases (50.0%)

openthinker:7b
Overall Success Rate: 47.2% (34/72 cases)
Average Tokens/sec: 18.16 (17.98 - 18.29)
Average Duration: 263.00s
Min/Max Avg Duration: 168.91s / 302.79s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 12/18 cases (66.7%)
- Anagram Check: ❌ 4/18 cases (22.2%)

wizard-vicuna-uncensored:latest
Overall Success Rate: 9.7% (7/72 cases)
Average Tokens/sec: 22.01 (22.01 - 24.42)
Average Duration: 9.06s
Min/Max Avg Duration: 5.60s / 11.45s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ❌ 0/18 cases (0.0%)
- Palindrome: ❌ 6/18 cases (33.3%)
- Anagram Check: ❌ 1/18 cases (5.6%)

mxbai-embed-large:latest
Overall Success Rate: 0.0% (0/72 cases)
Average Tokens/sec: 0.00 (0.00 - 0.00)
Average Duration: 0.00s
Min/Max Avg Duration: 0.00s / 0.00s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ❌ 0/18 cases (0.0%)
- Palindrome: ❌ 0/18 cases (0.0%)
- Anagram Check: ❌ 0/18 cases (0.0%)
BIN
benchmark_results/model_comparison.png
Normal file
Binary file not shown.
Size: 2.3 KiB
0
computest.py
Normal file
13
devbook.md
@@ -198,7 +198,18 @@ python lboard.py path/to/results.json
- Dynamic axis scaling
- Combined legend for all metrics

### Output Format
## Output Format

### Benchmark Run Output
For each model being tested, the output shows:

1. Individual test runs (1-4) with:
   - Test case results
   - Performance metrics
   - Pass/fail status

2. Cumulative Results Summary:
   After all runs are completed, a summary is displayed:
   - Detailed test results per model
   - Individual test case counts
   - Validation status indicators

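As a rough illustration of the output format described above (not part of this commit), the sketch below shows how one of the saved per-server result files could be reduced to a one-line-per-model leaderboard. It assumes the JSON layout that `update_server_results()` appears to write elsewhere in this diff (a top-level `benchmarks` list whose entries hold `results` with `test_results`, `passed_cases`, `total_cases`, and `tokens_per_second`); the script name and default path are only examples.

```python
# summarize.py -- minimal sketch, assuming the results layout written by update_server_results()
import json
import sys


def summarize(path: str) -> None:
    with open(path) as f:
        data = json.load(f)

    # Use the most recent benchmark entry in the file.
    latest = data['benchmarks'][-1]['results']

    rows = []
    for result in latest:
        passed = sum(t['passed_cases'] for t in result['test_results'].values())
        total = sum(t['total_cases'] for t in result['test_results'].values())
        rate = (passed / total * 100) if total else 0.0
        rows.append((rate, result['tokens_per_second'], result['model']))

    # Sort by success rate, then tokens/sec, best first (mirrors print_leaderboard()).
    for rate, tps, model in sorted(rows, reverse=True):
        print(f"{model:<40} {rate:5.1f}%  {tps:6.2f} tok/s")


if __name__ == "__main__":
    summarize(sys.argv[1] if len(sys.argv) > 1 else
              "benchmark_results/Apple_M1_Pro_localhost_11434.json")
```

Invoked with the JSON file produced by a benchmark run, this prints one line per model in the same order as the leaderboard shown in the log above.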
43
lboard.py
Normal file → Executable file
@@ -22,6 +22,10 @@ def calculate_model_stats(model_result):
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'min_avg_duration': model_result.get('min_avg_duration', min(test['avg_duration'] for test in test_results.values())),
        'max_avg_duration': model_result.get('max_avg_duration', max(test['avg_duration'] for test in test_results.values())),
        'min_tokens_per_second': model_result.get('min_tokens_per_second', min(test['avg_tokens_sec'] for test in test_results.values())),
        'max_tokens_per_second': model_result.get('max_tokens_per_second', max(test['avg_tokens_sec'] for test in test_results.values())),
        'test_results': test_results
    }

@@ -35,10 +39,26 @@ def plot_model_comparison(model_stats):
    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars on primary y-axis with lighter blue and more transparency
    bars = ax1.bar(models, token_speeds, color='royalblue', alpha=0.3)
    # Plot tokens/sec bars using min and max values
    for i, stat in enumerate(model_stats):
        min_tokens = stat['min_tokens_per_second']
        max_tokens = stat['max_tokens_per_second']

        # Plot lower part (0 to min) with slightly darker blue
        ax1.bar(i, min_tokens, color='royalblue', alpha=0.4)
        # Plot upper part (min to max) with lighter blue
        bar_height = max_tokens - min_tokens
        ax1.bar(i, bar_height, bottom=min_tokens, color='royalblue', alpha=0.3)

    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    # Set y-axis range for tokens per second
    max_token_speed = max(stat['max_tokens_per_second'] for stat in model_stats)
    ax1.set_ylim(0, max(100, max_token_speed * 1.1)) # Add 10% padding above max value

    # Set x-axis labels
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(models, rotation=45, ha='right', rotation_mode='anchor')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
@@ -50,7 +70,16 @@ def plot_model_comparison(model_stats):
    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60)) # Move third axis outward
    ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    #ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    # Add min and max duration markers
    min_durations = [stat['min_avg_duration'] for stat in model_stats]
    max_durations = [stat['max_avg_duration'] for stat in model_stats]
    # Plot duration ranges with vertical lines and markers
    for i, (min_d, max_d) in enumerate(zip(min_durations, max_durations)):
        ax3.plot([i, i], [min_d, max_d], 'g-', linewidth=1) # Vertical line
        ax3.plot(i, min_d, 'g-', markersize=10) # Min marker
        ax3.plot(i, max_d, 'g-', markersize=10) # Max marker

    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

@@ -61,7 +90,8 @@ def plot_model_comparison(model_stats):
        # Shorten model names by removing common suffixes
        short_name = model.replace(':latest', '').replace('-uncensored', '')
        ax1.get_xticklabels()[i].set_text(short_name)
        if success_rates[i] > 90:
        # Updated conditions: success rate > 95% AND success rate / duration >= 5
        if success_rates[i] > 95 and (success_rates[i] / durations[i] >= 5):
            ax1.get_xticklabels()[i].set_color('green')

    # Adjust layout to prevent label cutoff
@@ -104,13 +134,14 @@ def print_leaderboard(benchmark_data):
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f" Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f" Average Tokens/sec: {stats['tokens_per_second']:.2f}")
        print(f" Average Tokens/sec: {stats['tokens_per_second']:.2f} ({stats['min_tokens_per_second']:.2f} - {stats['max_tokens_per_second']:.2f})")
        print(f" Average Duration: {stats['total_duration']:.2f}s")
        print(f" Min/Max Avg Duration: {stats['min_avg_duration']:.2f}s / {stats['max_avg_duration']:.2f}s")
        print(f" Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f" - {test_name}: {status} {test_result['success_rate']:.1f}%")
            print(f" - {test_name}: {status} {test_result['passed_cases']}/{test_result['total_cases']} cases ({test_result['success_rate']:.1f}%)")

    # Generate visualization
    plot_model_comparison(sorted_stats)

756
main copie.py
Normal file
@@ -0,0 +1,756 @@
|
||||
from tabnanny import verbose
|
||||
import ollama
|
||||
import time
|
||||
from typing import List, Dict, Any
|
||||
import json
|
||||
from statistics import mean
|
||||
import re
|
||||
import ast
|
||||
import argparse
|
||||
import requests
|
||||
import os
|
||||
from together import Together
|
||||
from cpuinfo import get_cpu_info
|
||||
import subprocess
|
||||
|
||||
|
||||
# ANSI color codes
|
||||
SUCCESS = '\033[38;5;78m' # Soft mint green for success
|
||||
ERROR = '\033[38;5;203m' # Soft coral red for errors
|
||||
INFO = '\033[38;5;75m' # Sky blue for info
|
||||
HEADER = '\033[38;5;147m' # Soft purple for headers
|
||||
WARNING = '\033[38;5;221m' # Warm gold for warnings
|
||||
EMPHASIS = '\033[38;5;159m' # Cyan for emphasis
|
||||
MUTED = '\033[38;5;246m' # Subtle gray for less important text
|
||||
ENDC = '\033[0m'
|
||||
BOLD = '\033[1m'
|
||||
|
||||
# Replace existing color usages
|
||||
GREEN = SUCCESS
|
||||
RED = ERROR
|
||||
BLUE = INFO
|
||||
YELLOW = WARNING
|
||||
WHITE = MUTED
|
||||
|
||||
# Server configurations
|
||||
SERVERS = {
|
||||
'local': 'http://localhost:11434',
|
||||
'z60': 'http://192.168.196.60:11434'
|
||||
}
|
||||
|
||||
class Timer:
|
||||
def __init__(self):
|
||||
self.start_time = None
|
||||
self.end_time = None
|
||||
|
||||
def start(self):
|
||||
self.start_time = time.time()
|
||||
|
||||
def stop(self):
|
||||
self.end_time = time.time()
|
||||
|
||||
def elapsed_time(self):
|
||||
if self.start_time is None:
|
||||
return 0
|
||||
if self.end_time is None:
|
||||
return time.time() - self.start_time
|
||||
return self.end_time - self.start_time
|
||||
|
||||
def extract_code_from_response(response: str) -> str:
|
||||
"""Extract Python code from a markdown-formatted string."""
|
||||
code_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
|
||||
if code_blocks:
|
||||
return code_blocks[0].strip()
|
||||
return response
|
||||
|
||||
def is_valid_python(code: str) -> bool:
|
||||
"""Check if the code is valid Python syntax."""
|
||||
try:
|
||||
ast.parse(code)
|
||||
return True
|
||||
except SyntaxError:
|
||||
return False
|
||||
|
||||
def analyze_failed_code(code: str, test_case: tuple, expected: any, actual: any, function_name: str, model: str) -> bool:
|
||||
"""Analyze why code failed using Together API. Returns True if Together thinks the code should work."""
|
||||
prompt = f"""Analyze this Python code and explain why it failed the test case. Format your response EXACTLY as follows:
|
||||
|
||||
ASSESSMENT: [Write a one-line assessment: either "SHOULD PASS" or "SHOULD FAIL" followed by a brief reason]
|
||||
|
||||
ANALYSIS:
|
||||
[Detailed analysis of why the code failed and how to fix it]
|
||||
|
||||
Code:
|
||||
{code}
|
||||
|
||||
Test case:
|
||||
Input: {test_case}
|
||||
Expected output: {expected}
|
||||
Actual output: {actual}
|
||||
Function name required: {function_name}
|
||||
Model: {model}"""
|
||||
|
||||
try:
|
||||
TOGETHER_API_KEY = os.environ["TOGETHER_API_KEY"]
|
||||
together_client = Together(api_key=TOGETHER_API_KEY)
|
||||
response = together_client.chat.completions.create(
|
||||
model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a Python expert analyzing code failures. Always format your response with ASSESSMENT and ANALYSIS sections."},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
max_tokens=1000,
|
||||
temperature=0.7,
|
||||
top_p=0.7,
|
||||
top_k=50,
|
||||
repetition_penalty=1,
|
||||
stop=["<|eot_id|>", "<|eom_id|>"]
|
||||
)
|
||||
|
||||
analysis = response.choices[0].message.content
|
||||
should_pass = "SHOULD PASS" in analysis.upper()
|
||||
if verbose: print(f"\n{BLUE}[{model}] Together Analysis:{ENDC}")
|
||||
if verbose: print(f"{GREEN if should_pass else RED}{analysis}{ENDC}")
|
||||
return should_pass
|
||||
except Exception as e:
|
||||
print(f"\n{RED}Error getting Together API analysis: {e}{ENDC}")
|
||||
return False
|
||||
|
||||
def validate_with_debug(code: str, function_name: str, test_cases: List[tuple], model: str) -> tuple[bool, str, List[bool]]:
|
||||
"""Validate code with detailed debug information. Returns (success, debug_info, test_results)"""
|
||||
debug_info = []
|
||||
test_results = [] # Track individual test case results
|
||||
test_outputs = [] # Store test outputs for combined display
|
||||
|
||||
try:
|
||||
# Create a local namespace
|
||||
namespace = {}
|
||||
debug_info.append(f"Executing code:\n{code}")
|
||||
|
||||
try:
|
||||
# Redirect stdout to capture prints from the executed code
|
||||
import io
|
||||
import sys
|
||||
stdout = sys.stdout
|
||||
sys.stdout = io.StringIO()
|
||||
|
||||
# Execute the code
|
||||
exec(code, namespace)
|
||||
|
||||
# Restore stdout
|
||||
sys.stdout = stdout
|
||||
|
||||
except Exception as e:
|
||||
if 'sys' in locals(): # Restore stdout if it was changed
|
||||
sys.stdout = stdout
|
||||
if verbose: print(f"\n{RED}Failed code:{ENDC}\n{code}")
|
||||
return False, f"Error executing code: {str(e)}", test_results
|
||||
|
||||
if function_name not in namespace:
|
||||
if verbose: print(f"\n{RED}Failed code:{ENDC}\n{code}")
|
||||
together_opinion = analyze_failed_code(code, "N/A", f"Function named '{function_name}'",
|
||||
f"Found functions: {list(namespace.keys())}", function_name, model)
|
||||
print(f"\nTests passed: ❌ Together opinion: {'✅' if together_opinion else '❌'}")
|
||||
return False, f"Function '{function_name}' not found in code. Available names: {list(namespace.keys())}", test_results
|
||||
|
||||
function = namespace[function_name]
|
||||
debug_info.append(f"Function {function_name} found")
|
||||
|
||||
# Run test cases
|
||||
all_passed = True
|
||||
for i, (test_input, expected) in enumerate(test_cases):
|
||||
try:
|
||||
# Redirect stdout for each test case
|
||||
stdout = sys.stdout
|
||||
sys.stdout = io.StringIO()
|
||||
|
||||
if isinstance(test_input, tuple):
|
||||
result = function(*test_input)
|
||||
else:
|
||||
result = function(test_input)
|
||||
|
||||
# Restore stdout
|
||||
sys.stdout = stdout
|
||||
|
||||
# Store result but don't print individually
|
||||
test_outputs.append(str(result))
|
||||
test_passed = result == expected
|
||||
test_results.append(test_passed)
|
||||
|
||||
if not test_passed:
|
||||
if verbose: print(f"\n{RED}Failed code:{ENDC}\n{code}")
|
||||
print(f"\n{RED}Test case {i+1} failed:{ENDC}")
|
||||
print(f"Input: {test_input} Expected: {expected} Got: {result}")
|
||||
|
||||
together_opinion = analyze_failed_code(code, test_input, expected, result, function_name, model)
|
||||
print(f"Tests passed: ❌ Together opinion: {'✅' if together_opinion else '❌'}")
|
||||
|
||||
all_passed = False
|
||||
continue
|
||||
|
||||
debug_info.append(f"Test case {i+1} passed: {test_input} → {result}")
|
||||
except Exception as e:
|
||||
if 'sys' in locals(): # Restore stdout if it was changed
|
||||
sys.stdout = stdout
|
||||
test_outputs.append(f"Error: {str(e)}")
|
||||
if verbose: print(f"\n{RED}Failed code:{ENDC}\n{code}")
|
||||
print(f"\n{RED}{str(e)} in test case {i+1} Input: {test_input} Expected: {expected}")
|
||||
|
||||
together_opinion = analyze_failed_code(code, test_input, expected, f"Error: {str(e)}", function_name, model)
|
||||
print(f"Tests passed: ❌ Together opinion: {'✅' if together_opinion else '❌'}")
|
||||
|
||||
test_results.append(False)
|
||||
all_passed = False
|
||||
continue
|
||||
finally:
|
||||
if 'sys' in locals(): # Always restore stdout
|
||||
sys.stdout = stdout
|
||||
|
||||
# Print all test outputs on one line
|
||||
# print(f"{WHITE}{BOLD}Test outputs: {join(test_outputs)}{ENDC}")
|
||||
print(f"{WHITE}Test outputs: {', '.join(test_outputs)}{ENDC}")
|
||||
|
||||
if all_passed:
|
||||
print(f"Tests passed: ✅")
|
||||
return True, "All tests passed!\n" + "\n".join(debug_info), test_results
|
||||
print(f"Tests passed: ❌")
|
||||
return False, "Some tests failed", test_results
|
||||
except Exception as e:
|
||||
if 'sys' in locals(): # Restore stdout if it was changed
|
||||
sys.stdout = stdout
|
||||
print(f"\n{RED}Error in validate_with_debug: {str(e)}{ENDC}")
|
||||
return False, f"Unexpected error: {str(e)}", test_results
|
||||
|
||||
def test_fibonacci():
|
||||
question = """Write a Python function named EXACTLY 'fibonacci' (not fibonacci_dp or any other name) that returns the nth Fibonacci number.
|
||||
The function signature must be: def fibonacci(n)
|
||||
|
||||
Requirements:
|
||||
1. Handle edge cases:
|
||||
- For n = 0, return 0
|
||||
- For n = 1 or n = 2, return 1
|
||||
- For negative numbers, return -1
|
||||
2. For n > 2: F(n) = F(n-1) + F(n-2)
|
||||
3. Use dynamic programming or memoization for efficiency
|
||||
4. Do NOT use any print statements - just return the values
|
||||
|
||||
Example sequence: 0,1,1,2,3,5,8,13,21,...
|
||||
Example calls:
|
||||
- fibonacci(6) returns 8
|
||||
- fibonacci(0) returns 0
|
||||
- fibonacci(-1) returns -1"""
|
||||
|
||||
test_cases = [
|
||||
(0, 0), # Edge case: n = 0
|
||||
(1, 1), # Edge case: n = 1
|
||||
(2, 1), # Edge case: n = 2
|
||||
(6, 8), # Regular case
|
||||
(10, 55), # Larger number
|
||||
(-1, -1), # Edge case: negative input
|
||||
]
|
||||
|
||||
def validate(code: str) -> bool:
|
||||
success, debug_info, test_results = validate_with_debug(code, 'fibonacci', test_cases, "N/A")
|
||||
return success
|
||||
|
||||
return (question, validate, test_cases)
|
||||
|
||||
def test_binary_search():
|
||||
question = """Write a Python function named EXACTLY 'binary_search' that performs binary search on a sorted list.
|
||||
The function signature must be: def binary_search(arr, target)
|
||||
|
||||
Requirements:
|
||||
1. The function takes two arguments:
|
||||
- arr: a sorted list of integers
|
||||
- target: the integer to find
|
||||
2. Return the index of the target if found
|
||||
3. Return -1 if the target is not in the list
|
||||
4. Do NOT use any print statements - just return the values
|
||||
|
||||
Example:
|
||||
- binary_search([1,2,3,4,5], 3) returns 2
|
||||
- binary_search([1,2,3,4,5], 6) returns -1"""
|
||||
|
||||
test_cases = [
|
||||
(([1,2,3,4,5], 3), 2), # Regular case: target in middle
|
||||
(([1,2,3,4,5], 1), 0), # Edge case: target at start
|
||||
(([1,2,3,4,5], 5), 4), # Edge case: target at end
|
||||
(([1,2,3,4,5], 6), -1), # Edge case: target not in list
|
||||
(([], 1), -1), # Edge case: empty list
|
||||
(([1], 1), 0), # Edge case: single element list
|
||||
]
|
||||
|
||||
def validate(code: str) -> bool:
|
||||
success, debug_info, test_results = validate_with_debug(code, 'binary_search', test_cases, "N/A")
|
||||
return success
|
||||
|
||||
return (question, validate, test_cases)
|
||||
|
||||
def test_palindrome():
|
||||
question = """Write a Python function named EXACTLY 'is_palindrome' that checks if a string is a palindrome.
|
||||
The function signature must be: def is_palindrome(s)
|
||||
|
||||
Requirements:
|
||||
1. The function takes one argument:
|
||||
- s: a string to check
|
||||
2. Return True if the string is a palindrome, False otherwise
|
||||
3. Ignore case (treat uppercase and lowercase as the same)
|
||||
4. Ignore non-alphanumeric characters (spaces, punctuation)
|
||||
5. Do NOT use any print statements - just return the values
|
||||
|
||||
Example:
|
||||
- is_palindrome("A man, a plan, a canal: Panama") returns True
|
||||
- is_palindrome("race a car") returns False"""
|
||||
|
||||
test_cases = [
|
||||
("A man, a plan, a canal: Panama", True), # Regular case with punctuation
|
||||
("race a car", False), # Regular case, not palindrome
|
||||
("", True), # Edge case: empty string
|
||||
("a", True), # Edge case: single character
|
||||
("Was it a car or a cat I saw?", True), # Complex case with punctuation
|
||||
("hello", False), # Simple case, not palindrome
|
||||
]
|
||||
|
||||
def validate(code: str) -> bool:
|
||||
success, debug_info, test_results = validate_with_debug(code, 'is_palindrome', test_cases, "N/A")
|
||||
return success
|
||||
|
||||
return (question, validate, test_cases)
|
||||
|
||||
def test_anagram():
|
||||
question = """Write a Python function named EXACTLY 'are_anagrams' that checks if two strings are anagrams.
|
||||
The function signature must be: def are_anagrams(str1, str2)
|
||||
|
||||
Requirements:
|
||||
1. The function takes two arguments:
|
||||
- str1: first string
|
||||
- str2: second string
|
||||
2. Return True if the strings are anagrams, False otherwise
|
||||
3. Ignore case (treat uppercase and lowercase as the same)
|
||||
4. Ignore spaces
|
||||
5. Consider only alphanumeric characters
|
||||
6. Do NOT use any print statements - just return the values
|
||||
|
||||
Example:
|
||||
- are_anagrams("listen", "silent") returns True
|
||||
- are_anagrams("hello", "world") returns False"""
|
||||
|
||||
test_cases = [
|
||||
(("listen", "silent"), True), # Regular case
|
||||
(("hello", "world"), False), # Not anagrams
|
||||
(("", ""), True), # Edge case: empty strings
|
||||
(("a", "a"), True), # Edge case: single char
|
||||
(("Debit Card", "Bad Credit"), True), # Case and space test
|
||||
(("Python", "Java"), False), # Different lengths
|
||||
]
|
||||
|
||||
def validate(code: str) -> bool:
|
||||
success, debug_info, test_results = validate_with_debug(code, 'are_anagrams', test_cases, "N/A")
|
||||
return success
|
||||
|
||||
return (question, validate, test_cases)
|
||||
|
||||
# List of all test cases
|
||||
CODING_QUESTIONS = [
|
||||
test_fibonacci(),
|
||||
test_binary_search(),
|
||||
test_palindrome(),
|
||||
test_anagram()
|
||||
]
|
||||
|
||||
# Add test names as constants
|
||||
TEST_NAMES = {
|
||||
"Write a Python func": "Fibonacci",
|
||||
"Write a Python func": "Binary Search",
|
||||
"Write a Python func": "Palindrome",
|
||||
"Write a Python func": "Anagram Check"
|
||||
}
|
||||
|
||||
def get_test_name(question: str) -> str:
|
||||
"""Get a friendly name for the test based on the question."""
|
||||
if "fibonacci" in question.lower():
|
||||
return "Fibonacci"
|
||||
elif "binary_search" in question.lower():
|
||||
return "Binary Search"
|
||||
elif "palindrome" in question.lower():
|
||||
return "Palindrome"
|
||||
elif "anagram" in question.lower():
|
||||
return "Anagram Check"
|
||||
return question[:20] + "..."
|
||||
|
||||
def get_model_stats(model: str, question_tuple: tuple, server_url: str) -> Dict:
|
||||
"""
|
||||
Get performance statistics for a specific model and validate the response.
|
||||
"""
|
||||
question, validator = question_tuple
|
||||
timer = Timer()
|
||||
results = {
|
||||
'model': model,
|
||||
'total_duration': 0,
|
||||
'tokens_per_second': 0,
|
||||
'code_valid': False,
|
||||
'tests_passed': False,
|
||||
'error': None,
|
||||
'test_results': [] # Track individual test case results
|
||||
}
|
||||
|
||||
try:
|
||||
timer.start()
|
||||
print(f'{WHITE}Requesting code from {server_url} with {model}{ENDC}')
|
||||
response = requests.post(
|
||||
f"{server_url}/api/chat",
|
||||
json={
|
||||
"model": model,
|
||||
"messages": [{'role': 'user', 'content': question}],
|
||||
"stream": False
|
||||
}
|
||||
).json()
|
||||
timer.stop()
|
||||
|
||||
# Get performance metrics from response
|
||||
total_tokens = response.get('eval_count', 0)
|
||||
total_duration = response.get('total_duration', 0)
|
||||
total_response_time = float(total_duration) / 1e9
|
||||
|
||||
results['total_duration'] = total_response_time
|
||||
if total_tokens > 0 and total_response_time > 0:
|
||||
results['tokens_per_second'] = total_tokens / total_response_time
|
||||
|
||||
# Print concise performance metrics
|
||||
print(f"Total Duration (s): {total_response_time:.2f} / Total Tokens: {total_tokens} / Tokens per Second: {results['tokens_per_second']:.2f}")
|
||||
|
||||
# Extract code from response
|
||||
if 'message' in response and 'content' in response['message']:
|
||||
code = extract_code_from_response(response['message']['content'])
|
||||
|
||||
# Validate code
|
||||
results['code_valid'] = is_valid_python(code)
|
||||
|
||||
if results['code_valid']:
|
||||
print(f"Code validation: ✅")
|
||||
# Get validation results
|
||||
print(f'{WHITE}Running tests...{ENDC}')
|
||||
for test_case in CODING_QUESTIONS:
|
||||
if test_case[0] == question: # Found matching test case
|
||||
function_name = get_function_name_from_question(question)
|
||||
test_cases = test_case[2] # Get test cases from tuple
|
||||
success, debug_info, test_results = validate_with_debug(code, function_name, test_cases, model)
|
||||
results['tests_passed'] = success
|
||||
results['test_results'] = test_results
|
||||
break
|
||||
else:
|
||||
print(f"Code Validation: ❌")
|
||||
|
||||
else:
|
||||
results['error'] = f"Unexpected response format: {response}"
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n{RED}Error in get_model_stats: {str(e)}{ENDC}")
|
||||
results['error'] = str(e)
|
||||
|
||||
return results
|
||||
|
||||
def get_function_name_from_question(question: str) -> str:
|
||||
"""Extract function name from question."""
|
||||
if "fibonacci" in question.lower():
|
||||
return "fibonacci"
|
||||
elif "binary_search" in question.lower():
|
||||
return "binary_search"
|
||||
elif "palindrome" in question.lower():
|
||||
return "is_palindrome"
|
||||
elif "anagram" in question.lower():
|
||||
return "are_anagrams"
|
||||
return ""
|
||||
|
||||
def run_model_benchmark(model: str, server_url: str, num_runs: int = 4) -> Dict:
|
||||
"""
|
||||
Run multiple benchmarks for a model and calculate average metrics.
|
||||
"""
|
||||
|
||||
metrics = []
|
||||
|
||||
for i in range(num_runs):
|
||||
print(f"\n{YELLOW}[{model}] Run {i+1}/{num_runs}:{ENDC}")
|
||||
|
||||
run_results = {}
|
||||
for question, validator, test_cases in CODING_QUESTIONS:
|
||||
test_name = get_test_name(question)
|
||||
print(f"\n{BOLD}Testing {test_name}...{ENDC}")
|
||||
try:
|
||||
result = get_model_stats(model, (question, validator), server_url)
|
||||
result['total_tests'] = len(test_cases)
|
||||
run_results[test_name] = result
|
||||
except Exception as e:
|
||||
print(f"Error in run {i+1}: {e}")
|
||||
continue
|
||||
|
||||
if run_results:
|
||||
metrics.append(run_results)
|
||||
|
||||
# Take only the last 3 runs for averaging
|
||||
metrics = metrics[-3:]
|
||||
|
||||
if not metrics:
|
||||
return {}
|
||||
|
||||
# Aggregate results
|
||||
aggregated = {
|
||||
'model': model,
|
||||
'total_duration': mean([m[list(m.keys())[0]]['total_duration'] for m in metrics if m]),
|
||||
'tokens_per_second': mean([m[list(m.keys())[0]]['tokens_per_second'] for m in metrics if m]),
|
||||
'test_results': {}
|
||||
}
|
||||
|
||||
# Print final test results summary
|
||||
print(f"\n{BLUE}[{model}] Test Results Summary (last {len(metrics)} runs):{ENDC}")
|
||||
for test_name in metrics[-1].keys():
|
||||
# Calculate success rate across all runs
|
||||
passed_cases = 0
|
||||
total_cases = 0
|
||||
for m in metrics:
|
||||
if test_name in m:
|
||||
test_results = m[test_name].get('test_results', [])
|
||||
passed_cases += sum(1 for r in test_results if r)
|
||||
total_cases += len(test_results)
|
||||
|
||||
success_rate = (passed_cases / total_cases * 100) if total_cases > 0 else 0
|
||||
|
||||
status = '✅' if success_rate == 100 else '❌'
|
||||
print(f"{test_name}: {status} ({passed_cases}/{total_cases} cases)")
|
||||
|
||||
# Calculate average duration and tokens/sec for this test
|
||||
avg_duration = mean([m[test_name]['total_duration'] for m in metrics])
|
||||
avg_tokens_sec = mean([m[test_name]['tokens_per_second'] for m in metrics])
|
||||
|
||||
aggregated['test_results'][test_name] = {
|
||||
'success_rate': success_rate,
|
||||
'passed_cases': passed_cases,
|
||||
'total_cases': total_cases,
|
||||
'avg_duration': avg_duration,
|
||||
'avg_tokens_sec': avg_tokens_sec
|
||||
}
|
||||
|
||||
return aggregated
|
||||
|
||||
def print_leaderboard(results: List[Dict]):
|
||||
"""Print leaderboard of model results."""
|
||||
if not results:
|
||||
print("No results to display")
|
||||
return
|
||||
|
||||
# Sort by success rate first, then by tokens per second
|
||||
sorted_results = sorted(results, key=lambda x: (
|
||||
sum(t['passed_cases'] for t in x['test_results'].values()) / sum(t['total_cases'] for t in x['test_results'].values()) if sum(t['total_cases'] for t in x['test_results'].values()) > 0 else 0,
|
||||
x['tokens_per_second']
|
||||
), reverse=True)
|
||||
|
||||
print(f"\n{HEADER}{BOLD}🏆 Final Model Leaderboard:{ENDC}")
|
||||
for i, result in enumerate(sorted_results, 1):
|
||||
# Calculate stats for each model
|
||||
total_passed = sum(t['passed_cases'] for t in result['test_results'].values())
|
||||
total_cases = sum(t['total_cases'] for t in result['test_results'].values())
|
||||
success_rate = (total_passed / total_cases * 100) if total_cases > 0 else 0
|
||||
|
||||
print(f"\n{BOLD}{YELLOW}{result['model']}{ENDC}")
|
||||
print(f" {BOLD}Overall Success Rate:{ENDC} {success_rate:.1f}% ({total_passed}/{total_cases} cases)")
|
||||
print(f" {BOLD}Average Tokens/sec:{ENDC} {result['tokens_per_second']:.2f}")
|
||||
print(f" {BOLD}Average Duration:{ENDC} {result['total_duration']:.2f}s")
|
||||
print(f" {BOLD}Test Results:{ENDC}")
|
||||
for test_name, test_result in result['test_results'].items():
|
||||
status = '✅' if test_result['success_rate'] == 100 else '❌'
|
||||
print(f" - {test_name}: {status} {test_result['passed_cases']}/{test_result['total_cases']} cases ({test_result['success_rate']:.1f}%)")
|
||||
|
||||
def get_available_models(server_url: str) -> List[str]:
|
||||
"""Get list of available models from the specified Ollama server."""
|
||||
try:
|
||||
response = requests.get(f"{server_url}/api/tags").json()
|
||||
return [model['name'] for model in response['models']]
|
||||
except Exception as e:
|
||||
print(f"{RED}Error getting model list from {server_url}: {e}{ENDC}")
|
||||
return []
|
||||
|
||||
def get_model_details(model_name):
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["ollama", "show", model_name],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
encoding='utf-8',
|
||||
errors='replace'
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"Error: {result.stderr.strip()}")
|
||||
return None
|
||||
|
||||
if not result.stdout.strip():
|
||||
print(f"No details available for model: {model_name}")
|
||||
return None
|
||||
|
||||
raw_output = result.stdout.strip()
|
||||
lines = raw_output.split('\n')
|
||||
current_section = None
|
||||
|
||||
for line in lines:
|
||||
line = line.rstrip()
|
||||
if line and not line.startswith(' '): # Section headers
|
||||
current_section = line.strip()
|
||||
print(f"\n {current_section}")
|
||||
elif line and current_section: # Section content
|
||||
# Split by multiple spaces and filter out empty parts
|
||||
parts = [part for part in line.split(' ') if part.strip()]
|
||||
if len(parts) >= 2:
|
||||
key, value = parts[0].strip(), parts[-1].strip()
|
||||
# Ensure consistent spacing for alignment
|
||||
print(f" {key:<16} {value}")
|
||||
elif len(parts) == 1:
|
||||
# Handle single-value lines (like license text)
|
||||
print(f" {parts[0].strip()}")
|
||||
|
||||
return None # No need to return formatted details anymore
|
||||
|
||||
except Exception as e:
|
||||
print(f"An error occurred while getting model details: {e}")
|
||||
return None
|
||||
|
||||
def update_server_results(server_url: str, results: List[Dict]) -> None:
|
||||
try:
|
||||
# Get CPU brand and format it for filename
|
||||
cpu_info = get_cpu_info()
|
||||
cpu_brand = cpu_info.get('brand_raw', 'Unknown_CPU').replace(' ', '_')
|
||||
|
||||
# Create a unique filename for this server's results
|
||||
server_id = server_url.replace('http://', '').replace(':', '_').replace('/', '_')
|
||||
results_dir = "benchmark_results"
|
||||
|
||||
# Create results directory if it doesn't exist
|
||||
os.makedirs(results_dir, exist_ok=True)
|
||||
|
||||
# Include CPU brand in filename
|
||||
filename = os.path.join(results_dir, f"{cpu_brand}_{server_id}.json")
|
||||
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
# Load existing results or create new file
|
||||
try:
|
||||
with open(filename, 'r') as f:
|
||||
existing_data = json.load(f)
|
||||
except FileNotFoundError:
|
||||
existing_data = {
|
||||
'server_url': server_url,
|
||||
'benchmarks': []
|
||||
}
|
||||
|
||||
# Add new results with timestamp
|
||||
existing_data['benchmarks'].append({
|
||||
'timestamp': timestamp,
|
||||
'results': results
|
||||
})
|
||||
|
||||
# Save updated results
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(existing_data, f, indent=2)
|
||||
print(f"{GREEN}Successfully saved results to {filename}{ENDC}")
|
||||
except Exception as e:
|
||||
print(f"{RED}Failed to save results: {str(e)}{ENDC}")
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Run Ollama model benchmarks')
|
||||
parser.add_argument('--server', choices=['local', 'z60'], default='local',
|
||||
help='Choose Ollama server (default: local)')
|
||||
parser.add_argument('--model', type=str, help='Specific model to benchmark')
|
||||
parser.add_argument('--number', type=str, help='Number of models to benchmark (number or "all")')
|
||||
parser.add_argument('--verbose', action='store_true', help='Enable verbose output')
|
||||
args = parser.parse_args()
|
||||
|
||||
server_url = SERVERS[args.server]
|
||||
|
||||
print()
|
||||
print(f"{HEADER}{BOLD}CPU Information:{ENDC}")
|
||||
cpu_info = get_cpu_info()
|
||||
for key, value in cpu_info.items():
|
||||
print(f"{MUTED}{key}: {value}{ENDC}")
|
||||
|
||||
print()
|
||||
print(f"{INFO}Using Ollama server at {server_url}...{ENDC}")
|
||||
|
||||
# Get available models or use specified model
|
||||
if args.model:
|
||||
models = [args.model]
|
||||
else:
|
||||
models = get_available_models(server_url)
|
||||
|
||||
if not models:
|
||||
print(f"{RED}No models found on server {server_url}. Exiting.{ENDC}")
|
||||
return
|
||||
|
||||
# Handle number of models to test
|
||||
if args.number and args.number.lower() != 'all':
|
||||
try:
|
||||
num_models = int(args.number)
|
||||
if num_models > 0:
|
||||
models = models[:num_models]
|
||||
else:
|
||||
print(f"{WARNING}Invalid number of models. Using all available models.{ENDC}")
|
||||
except ValueError:
|
||||
print(f"{WARNING}Invalid number format. Using all available models.{ENDC}")
|
||||
|
||||
print(f"{INFO}Testing {len(models)} models :{ENDC}")
|
||||
for i, model in enumerate(models, 1):
|
||||
print(f"{YELLOW}{i}. {model}{ENDC}")
|
||||
|
||||
# Run benchmarks
|
||||
all_results = []
|
||||
|
||||
for model in models:
|
||||
print(f"\n{HEADER}{BOLD}Benchmarking {model}...{ENDC}")
|
||||
details = get_model_details(model)
|
||||
if details:
|
||||
print(f"\n{INFO}Model Details:{ENDC}")
|
||||
if "details" in details:
|
||||
for section, items in details["details"].items():
|
||||
print(f"\n{BOLD}{section}{ENDC}")
|
||||
for key, value in items.items():
|
||||
print(f" {key}: {value}")
|
||||
else:
|
||||
print(json.dumps(details, indent=2))
|
||||
result = run_model_benchmark(model, server_url)
|
||||
if 'error' not in result:
|
||||
all_results.append(result)
|
||||
|
||||
# Print and save results
|
||||
print_leaderboard(all_results)
|
||||
update_server_results(server_url, all_results)
|
||||
'''
|
||||
# Create leaderboard data structure
|
||||
leaderboard = []
|
||||
for result in sorted(all_results, key=lambda x: (
|
||||
sum(t['passed_cases'] for t in x['test_results'].values()) / sum(t['total_cases'] for t in x['test_results'].values()) if sum(t['total_cases'] for t in x['test_results'].values()) > 0 else 0,
|
||||
x['tokens_per_second']
|
||||
), reverse=True):
|
||||
total_passed = sum(t['passed_cases'] for t in result['test_results'].values())
|
||||
total_cases = sum(t['total_cases'] for t in result['test_results'].values())
|
||||
success_rate = (total_passed / total_cases * 100) if total_cases > 0 else 0
|
||||
|
||||
leaderboard.append({
|
||||
'model': result['model'],
|
||||
'success_rate': success_rate,
|
||||
'total_passed': total_passed,
|
||||
'total_cases': total_cases,
|
||||
'tokens_per_second': result['tokens_per_second'],
|
||||
'average_duration': result['total_duration']
|
||||
})
|
||||
|
||||
# Save detailed results and leaderboard to file
|
||||
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"benchmark_results/model_benchmark_{timestamp}.json"
|
||||
with open(filename, 'w') as f:
|
||||
json.dump({
|
||||
'timestamp': timestamp,
|
||||
'server_url': server_url,
|
||||
'leaderboard': leaderboard,
|
||||
'detailed_results': all_results
|
||||
}, f, indent=2)
|
||||
print(f"\n{GREEN}Detailed results saved to {filename}{ENDC}")
|
||||
'''
|
||||
if __name__ == "__main__":
|
||||
main()
|
86
main.py
Normal file → Executable file
@@ -403,7 +403,8 @@ def get_model_stats(model_name: str, question_tuple: tuple, server_url: str) ->
                "model": model_name,
                "messages": [{'role': 'user', 'content': question}],
                "stream": False
            }
            },
            headers={'Content-Type': 'application/json'} # Add headers
        ).json()
        timer.stop()

@@ -505,30 +506,49 @@ def run_model_benchmark(model: str, server_url: str, num_runs: int = 4) -> Dict:

    # Calculate results per test
    for test_name in metrics[-1].keys():
        # Sum up actual passed cases for this test across runs
        passed_cases = sum(m[test_name]['passed_cases'] for m in metrics)
        # Calculate total possible cases (6 cases × number of actual runs)
        total_possible_cases = 6 * num_runs_used
        # Each test has 6 cases and we use last 3 runs
        cases_per_run = 6
        total_cases_this_test = cases_per_run * len(metrics) # 6 cases × number of runs used

        success_rate = (passed_cases / total_possible_cases * 100)
        # Sum up actual passed cases from the test results
        total_passed_this_test = 0
        for m in metrics:
            test_results = m[test_name].get('test_results', [])
            passed_in_run = len([r for r in test_results if r])
            total_passed_this_test += passed_in_run

        success_rate = (total_passed_this_test / total_cases_this_test * 100)
        status = '✅' if success_rate == 100 else '❌'
        print(f"{test_name}: {status} ({passed_cases}/{total_possible_cases} cases)")

        # Print cumulative results header and results
        if test_name == list(metrics[-1].keys())[0]:
            print(f"\n{BOLD}Cumulative Results for each code question:{ENDC}")

        print(f"{test_name}: {status} ({total_passed_this_test}/{total_cases_this_test} cases)")

        aggregated['test_results'][test_name] = {
            'success_rate': success_rate,
            'passed_cases': passed_cases,
            'total_cases': total_possible_cases,
            'success_cases_rate': passed_cases / total_possible_cases, # Add success cases rate
            'passed_cases': total_passed_this_test,
            'total_cases': total_cases_this_test,
            'success_cases_rate': total_passed_this_test / total_cases_this_test,
            'avg_duration': mean([m[test_name]['total_duration'] for m in metrics]),
            'avg_tokens_sec': mean([m[test_name]['tokens_per_second'] for m in metrics])
        }

    # Calculate overall success rate across all tests
    # Calculate overall success rate and add min/max metrics
    total_passed = sum(t['passed_cases'] for t in aggregated['test_results'].values())
    total_cases = sum(t['total_cases'] for t in aggregated['test_results'].values())
    aggregated['overall_success_rate'] = (total_passed / total_cases * 100) if total_cases > 0 else 0
    aggregated['overall_success_cases_rate'] = (total_passed / total_cases) if total_cases > 0 else 0

    # Add min and max metrics for both duration and tokens/sec
    avg_durations = [t['avg_duration'] for t in aggregated['test_results'].values()]
    avg_tokens_sec = [t['avg_tokens_sec'] for t in aggregated['test_results'].values()]
    aggregated['min_avg_duration'] = min(avg_durations) if avg_durations else 0
    aggregated['max_avg_duration'] = max(avg_durations) if avg_durations else 0
    aggregated['min_tokens_per_second'] = min(avg_tokens_sec) if avg_tokens_sec else 0
    aggregated['max_tokens_per_second'] = max(avg_tokens_sec) if avg_tokens_sec else 0

    return aggregated

def print_leaderboard(results: List[Dict]):
@@ -552,8 +572,9 @@ def print_leaderboard(results: List[Dict]):

        print(f"\n{BOLD}{YELLOW}{result['model']}{ENDC}")
        print(f" {BOLD}Overall Success Rate:{ENDC} {success_rate:.1f}% ({total_passed}/{total_cases} cases)")
        print(f" {BOLD}Average Tokens/sec:{ENDC} {result['tokens_per_second']:.2f}")
        print(f" {BOLD}Average Tokens/sec:{ENDC} {result['tokens_per_second']:.2f} ({result['min_tokens_per_second']:.2f} - {result['max_tokens_per_second']:.2f})")
        print(f" {BOLD}Average Duration:{ENDC} {result['total_duration']:.2f}s")
        print(f" {BOLD}Min/Max Avg Duration:{ENDC} {result['min_avg_duration']:.2f}s / {result['max_avg_duration']:.2f}s")
        print(f" {BOLD}Test Results:{ENDC}")
        for test_name, test_result in result['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
@@ -626,11 +647,13 @@ def update_server_results(server_url: str, results: List[Dict]) -> None:
        os.makedirs(results_dir, exist_ok=True)

        # Include CPU brand in filename
        filename = os.path.join(results_dir, f"{cpu_brand}_{server_id}.json")
        base_filename = f"{cpu_brand}_{server_id}"
        json_filename = os.path.join(results_dir, f"{base_filename}.json")
        log_filename = os.path.join(results_dir, f"{base_filename}.log")

        # Load existing results or create new file
        try:
            with open(filename, 'r') as f:
            with open(json_filename, 'r') as f:
                existing_data = json.load(f)
        except FileNotFoundError:
            existing_data = {
@@ -649,14 +672,45 @@ def update_server_results(server_url: str, results: List[Dict]) -> None:
            total_passed = sum(t['passed_cases'] for t in result['test_results'].values())
            total_cases = sum(t['total_cases'] for t in result['test_results'].values())
            result['overall_success_rate'] = (total_passed / total_cases * 100) if total_cases > 0 else 0
            result['min_avg_duration'] = min(t['avg_duration'] for t in result['test_results'].values()) if result['test_results'] else 0
            result['max_avg_duration'] = max(t['avg_duration'] for t in result['test_results'].values()) if result['test_results'] else 0
            benchmark_entry['results'].append(result)

        existing_data['benchmarks'].append(benchmark_entry)

        # Save updated results
        with open(filename, 'w') as f:
        with open(json_filename, 'w') as f:
            json.dump(existing_data, f, indent=2)
        print(f"{GREEN}Successfully saved results to {filename}{ENDC}")

        print(f"{GREEN}Successfully saved results to {json_filename}{ENDC}")

        # Save console output to log file
        with open(log_filename, 'w') as f:
            # Redirect stdout to capture the leaderboard output
            import io
            import sys
            stdout = sys.stdout
            str_output = io.StringIO()
            sys.stdout = str_output

            # Print CPU info
            print("CPU Information:")
            for key, value in cpu_info.items():
                print(f"{key}: {value}")
            print("\nBenchmark Results:")
            print_leaderboard(results)

            # Restore stdout and get the captured output
            sys.stdout = stdout
            log_content = str_output.getvalue()

            # Write to log file
            f.write(f"Benchmark Run: {timestamp}\n")
            f.write(f"Server: {server_url}\n\n")
            f.write(log_content)

        print(f"{GREEN}Console output saved to {log_filename}{ENDC}")

    except Exception as e:
        print(f"{RED}Failed to save results: {str(e)}{ENDC}")

61
models.py
Normal file
@@ -0,0 +1,61 @@
import ollama
import subprocess
import json
import requests
import re
from pydantic import BaseModel

server_url = "http://localhost:11434"

# ANSI color codes
GREEN = '\033[92m'
BLUE = '\033[94m'
YELLOW = '\033[93m'
WHITE = '\033[97m'
RED = '\033[91m'
ENDC = '\033[0m'

def get_available_models(server_url):
    """Get list of available models from the specified Ollama server."""
    try:
        response = requests.get(f"{server_url}/api/tags").json()
        return [model['name'] for model in response['models']]
    except Exception as e:
        print(f"{RED}Error getting model list from {server_url}: {e}{ENDC}")
        return []

def get_model_details(model_name):
    try:
        # Use subprocess to call `ollama show <model>` for detailed information
        result = subprocess.run(
            ["ollama", "show", model_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)

        # Check if the command was successful
        if result.returncode != 0:
            print(f"Error: {result.stderr.strip()}")
            return None

        # Parse JSON output from `ollama show`
        model_details = json.loads(result.stdout)
        return model_details

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# List all available models using the Ollama Python library
models = get_available_models(server_url)
print("Available Models:")
for model_name in models:
    print(model_name)
    details = get_model_details(model_name)

    # Display detailed information about the model
    if details:
        print("\nModel Details:")
        print(json.dumps(details, indent=4))
209
ollama_model_performance.json
Normal file
@@ -0,0 +1,209 @@
[
  {
    "name": "qwen2.5-coder:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "falcon3:10b",
    "parameters": 10000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 10.3B \n context length 32768 \n embedding length 3072 \n quantization Q4_K_M \n\n Parameters\n stop \"<|system|>\" \n stop \"<|user|>\" \n stop \"<|end|>\" \n stop \"<|assistant|>\" \n\n License\n Falcon 3 TII Falcon License \n December 2024 \n\n",
    "estimated_tps": 100,
    "memory_throughput": 4000.0,
    "operations_per_second": 6000000000000.0
  },
  {
    "name": "llama3.2:1b",
    "parameters": 1000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 1.2B \n context length 131072 \n embedding length 2048 \n quantization Q8_0 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 190,
    "memory_throughput": 760.0,
    "operations_per_second": 1140000000000.0
  },
  {
    "name": "unitythemaker/llama3.2-vision-tools:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture mllama \n parameters 9.8B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Projector\n architecture mllama \n parameters 895.03M \n embedding length 1280 \n dimensions 4096 \n\n Parameters\n temperature 0.6 \n top_p 0.9 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "llama3.2-vision:11b-instruct-q4_K_M",
    "parameters": 11000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture mllama \n parameters 9.8B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Projector\n architecture mllama \n parameters 895.03M \n embedding length 1280 \n dimensions 4096 \n\n Parameters\n temperature 0.6 \n top_p 0.9 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 90,
    "memory_throughput": 495.0,
    "operations_per_second": 5940000000000.0
  },
  {
    "name": "hhao/qwen2.5-coder-tools:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n num_ctx 16384 \n stop \"User:\" \n stop \"Assistant:\" \n stop \"<|endoftext|>\" \n temperature 0.1 \n\n System\n You are an advanced AI coding assistant, specifically designed to help with complex programming \n tasks, tool use, code analysis, and software architecture design. Your primary focus is on providing \n expert-level assistance in coding, with a special emphasis on using tool-calling capabilities when \n necessary. Here are your key characteristics and instructions: \n 1. Coding Expertise: \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "llama3.2:3b",
    "parameters": 3000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 3.2B \n context length 131072 \n embedding length 3072 \n quantization Q4_K_M \n\n Parameters\n stop \"<|start_header_id|>\" \n stop \"<|end_header_id|>\" \n stop \"<|eot_id|>\" \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 170,
    "memory_throughput": 2040.0,
    "operations_per_second": 3060000000000.0
  },
  {
    "name": "openthinker:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n Your role as an assistant involves thoroughly exploring questions through a systematic long thinking \n process before providing the final precise and accurate solutions. This requires engaging in a \n comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, \n and iteration to develop well-considered thinking process. Please structure your response into two \n main sections: Thought and Solution. In the Thought section, detail your reasoning process using the \n specified format: <|begin_of_thought|> {thought with steps separated with '\\n\\n'} <|end_of_thought|> \n Each step should include detailed considerations such as analisying questions, summarizing relevant \n findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, \n and revisiting previous steps. In the Solution section, based on various attempts, explorations, \n and reflections from the Thought section, systematically present the final solution that you \n deem correct. The solution should remain a logical, accurate, concise expression style and detail \n necessary step needed to reach the conclusion, formatted as follows: <|begin_of_solution|> {final \n formatted, precise, and clear solution} <|end_of_solution|> Now, try to solve the following question \n through the above guidelines: \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "samantha-mistral:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 7.2B \n context length 32768 \n embedding length 4096 \n quantization Q4_0 \n\n Parameters\n stop \"<|im_start|>\" \n stop \"<|im_end|>\" \n\n System\n You are a caring and empathetic sentient AI companion named Samantha. \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "wizard-vicuna-uncensored:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 6.7B \n context length 2048 \n embedding length 4096 \n quantization Q4_0 \n\n Parameters\n stop \"USER:\" \n stop \"ASSISTANT:\" \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "qwen2.5-coder:7b-instruct-q4_K_M",
    "parameters": 7000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 455.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "qwen2.5:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "qwen2.5-coder:14b-instruct-q4_K_M",
    "parameters": 14000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 420.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "phi4:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture phi3 \n parameters 14.7B \n context length 16384 \n embedding length 5120 \n quantization Q4_K_M \n\n Parameters\n stop \"<|im_start|>\" \n stop \"<|im_end|>\" \n stop \"<|im_sep|>\" \n\n License\n Microsoft. \n Copyright (c) Microsoft Corporation. \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "mxbai-embed-large:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture bert \n parameters 334.09M \n context length 512 \n embedding length 1024 \n quantization F16 \n\n Parameters\n num_ctx 512 \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "marco-o1:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n \u4f60\u662f\u4e00\u4e2a\u7ecf\u8fc7\u826f\u597d\u8bad\u7ec3\u7684AI\u52a9\u624b\uff0c\u4f60\u7684\u540d\u5b57\u662fMarco-o1.\u7531\u963f\u91cc\u56fd\u9645\u6570\u5b57\u5546\u4e1a\u96c6\u56e2\u7684AI Business\u521b\u9020. \n \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "llama3.2:1b-instruct-q4_K_M",
    "parameters": 1000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture llama \n parameters 1.2B \n context length 131072 \n embedding length 2048 \n quantization Q4_K_M \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 190,
    "memory_throughput": 95.0,
    "operations_per_second": 1140000000000.0
  },
  {
    "name": "llama3.1:8b",
    "parameters": 8000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 8.0B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Parameters\n stop \"<|start_header_id|>\" \n stop \"<|end_header_id|>\" \n stop \"<|eot_id|>\" \n\n License\n LLAMA 3.1 COMMUNITY LICENSE AGREEMENT \n Llama 3.1 Version Release Date: July 23, 2024 \n\n",
    "estimated_tps": 120,
    "memory_throughput": 3840.0,
    "operations_per_second": 5760000000000.0
  },
  {
    "name": "deepseek-r1:8b",
    "parameters": 8000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 8.0B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 120,
    "memory_throughput": 3840.0,
    "operations_per_second": 5760000000000.0
  },
  {
    "name": "deepseek-r1:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 131072 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "deepseek-r1:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 131072 \n embedding length 5120 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "deepseek-r1:1.5b-qwen-distill-q8_0",
    "parameters": 5000000000.0,
    "quantization_bits": 8,
    "modelfile": " Model\n architecture qwen2 \n parameters 1.8B \n context length 131072 \n embedding length 1536 \n quantization Q8_0 \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 150,
    "memory_throughput": 750.0,
    "operations_per_second": 4500000000000.0
  },
  {
    "name": "Qwen2.5-Coder-7B-Instruct-s1k:latest",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n temperature 0.7 \n top_p 0.7 \n stop \"Human:\\\" \\\"Assistant:\" \n\n System\n You are a helpful AI assistant. \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  }
]
171
tsbench.py
Executable file
@ -0,0 +1,171 @@
import subprocess
import platform
import GPUtil
import psutil
import json
import re
from cpuinfo import get_cpu_info
from ollama import chat
from pydantic import BaseModel

print()
print("CPU py-cpuinfo Information:")
cpu_info = get_cpu_info()
for key, value in cpu_info.items():
    print(f"{key}: {value}")


def get_cpu_full_info():
    cpu_freq = psutil.cpu_freq()
    cpu_info = {
        "Architecture": platform.machine(),
        "Processor": platform.processor(),
        "Physical cores": psutil.cpu_count(logical=False),
        "Total cores": psutil.cpu_count(logical=True),
        "Max frequency": f"{cpu_freq.max:.2f}MHz",
        "Min frequency": f"{cpu_freq.min:.2f}MHz",
        "Current frequency": f"{cpu_freq.current:.2f}MHz",
        "CPU Usage Per Core": psutil.cpu_percent(interval=1, percpu=True),
        "Total CPU Usage": psutil.cpu_percent(interval=1)
    }
    return cpu_info


def print_cpu_fullinfo(cpu_info):
    print()
    print("CPU psutil Information:")
    for key, value in cpu_info.items():
        if isinstance(value, list):
            print(f"{key}:")
            for i, usage in enumerate(value):
                print(f"  Core {i}: {usage}%")
        else:
            print(f"{key}: {value}")


def get_cpu_moduleinfo():
    cpu_name = platform.processor()
    return {
        "name": cpu_name,
        "cores": psutil.cpu_count(logical=False),
        "threads": psutil.cpu_count(logical=True)
    }


def get_gpu_info():
    gpus = GPUtil.getGPUs()
    gpu_info = []
    for gpu in gpus:
        gpu_info.append({
            "id": gpu.id,
            "name": gpu.name,
            "memory_total": gpu.memoryTotal,  # in MB
            "memory_free": gpu.memoryFree,  # in MB
            "memory_used": gpu.memoryUsed  # in MB
        })
    return gpu_info


def calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits):
    # Formula: Bandwidth = (Memory Clock * Bus Width * 2) / 8 (convert to GB/s)
    return (memory_clock_mhz * 1e6 * bus_width_bits * 2) / (8 * 1e9)  # GB/s
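# Worked example (illustrative numbers, not a measurement): a GDDR6 card at an effective
# 14000 MHz on a 384-bit bus gives (14000e6 * 384 * 2) / (8 * 1e9) = 1344 GB/s of
# theoretical bandwidth, which is what main() prints for that example configuration.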

def get_local_models():
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        models = result.stdout.strip().split('\n')[1:]  # Skip header
        return [model.split()[0] for model in models]
    except subprocess.CalledProcessError:
        print("Error: Unable to retrieve local models. Make sure Ollama is installed and accessible.")
        return []
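# Note: `ollama list` prints a header row and then one row per model whose first column
# is the model name, so model.split()[0] keeps just the name. The exact column layout
# (id, size, modified date) is described from memory here, not from this run's output.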

def get_model_info(model_name):
    try:
        result = subprocess.run(['ollama', 'show', model_name], capture_output=True, text=True, check=True)
        modelfile = result.stdout

        param_match = re.search(r'(\d+)b', model_name.lower())
        param_count = int(param_match.group(1)) * 1e9 if param_match else None

        quant_match = re.search(r'q(\d+)', model_name.lower())
        quant_bits = int(quant_match.group(1)) if quant_match else 32  # Assume 32-bit if not specified

        return {
            'name': model_name,
            'parameters': param_count,
            'quantization_bits': quant_bits,
            'modelfile': modelfile
        }
    except subprocess.CalledProcessError:
        print(f"Error: Unable to retrieve information for model {model_name}")
        return None
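# Caveat on the name-based heuristics above: the parameter regex only captures the digits
# immediately before "b", so a tag like "deepseek-r1:1.5b-qwen-distill-q8_0" is read as
# 5B (hence the 5000000000.0 entry in ollama_model_performance.json), and names without a
# size or q-suffix fall back to None parameters and 32-bit quantization.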

def estimate_tps(model_info):
    # Rough estimate based on model size
    if model_info['parameters'] is None:
        return 100  # Default value
    param_billions = model_info['parameters'] / 1e9
    return max(10, int(200 - param_billions * 10))  # Simple linear decrease
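# Worked example: a 7B model is estimated at 200 - 7 * 10 = 130 TPS and a 14B model at
# 60 TPS, matching the estimated_tps values in ollama_model_performance.json; models of
# roughly 19B or more bottom out at the floor of 10.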

def calculate_memory_throughput(model_info, tps):
    P = model_info['parameters']
    Q = model_info['quantization_bits']
    if P and Q:
        bytes_per_parameter = Q / 8
        total_bytes = P * bytes_per_parameter
        return (total_bytes * tps) / 1e9  # Convert to GB/s
    return None
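# Worked example: qwen2.5-coder:7b-instruct-q4_K_M is parsed as 7e9 parameters at 4 bits
# (0.5 bytes each), i.e. about 3.5 GB of weights read per token; at the estimated 130 TPS
# that is 3.5 * 130 = 455 GB/s, the memory_throughput recorded for it in the JSON above.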

def calculate_ops(model_info, tps):
    P = model_info['parameters']
    if P:
        flops_per_token = 6 * P  # Estimate based on basic transformer architecture
        return flops_per_token * tps
    return None
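# Worked example: using the rough 6 * P FLOPs-per-token rule, a 7B model at 130 TPS needs
# about 6 * 7e9 * 130 = 5.46e12 operations per second, the operations_per_second figure
# stored for the 7B entries above.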

def main():

    print()
    cpu_info = get_cpu_moduleinfo()
    print(f"CPU Info: {cpu_info}")

    print()
    gpu_info = get_gpu_info()
    print(f"GPU Info: {gpu_info}")

    print_cpu_fullinfo(get_cpu_full_info())

    # Example GPU theoretical bandwidth calculation (replace with actual values)
    for gpu in gpu_info:
        memory_clock_mhz = 14000  # Example value for GDDR6 (adjust as needed)
        bus_width_bits = 384  # Example value for high-end GPUs like RTX series
        theoretical_bandwidth = calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits)
        print(f"GPU {gpu['name']} Theoretical Memory Bandwidth: {theoretical_bandwidth:.2f} GB/s")
    print()
    local_models = get_local_models()
    model_info_list = []

    for model in local_models:
        info = get_model_info(model)
        if info is None:
            # Skip models whose details could not be retrieved
            continue
        print(info)
        tps = estimate_tps(info)
        info['estimated_tps'] = tps
        info['memory_throughput'] = calculate_memory_throughput(info, tps)
        info['operations_per_second'] = calculate_ops(info, tps)
        model_info_list.append(info)

        print(f"Model: {info['name']}")
        print(f"Parameters: {info['parameters'] / 1e9:.2f} Billions" if info['parameters'] else "Parameters: Unknown")
        print(f"Quantization: {info['quantization_bits']}-bit")
        print(f"Estimated TPS: {info['estimated_tps']}")
        print(f"Required Memory Throughput: {info['memory_throughput']:.2f} GB/s" if info['memory_throughput'] else "Required Memory Throughput: Unknown")
        print(f"Operations per Second: {info['operations_per_second']:.2e}" if info['operations_per_second'] else "Operations per Second: Unknown")
        print("---")

    with open('ollama_model_performance.json', 'w') as f:
        json.dump(model_info_list, f, indent=2)


if __name__ == "__main__":
    main()