Leaderboard picture and bug fixes

This commit is contained in:
leduc 2025-03-15 01:35:25 +01:00
parent dedbeceb8e
commit f538ed1bd3
10 changed files with 664 additions and 991 deletions

View file

@ -998,6 +998,251 @@
"max_avg_duration": 12.908918361333333,
"min_tokens_per_second": 18.377766002186945,
"max_tokens_per_second": 18.9448229322312
},
{
"model": "phi4-mini:latest",
"total_duration": 10.860303611333332,
"tokens_per_second": 29.361579428697542,
"test_results": {
"Fibonacci": {
"success_rate": 61.111111111111114,
"passed_cases": 11,
"total_cases": 18,
"success_cases_rate": 0.6111111111111112,
"avg_duration": 10.860303611333332,
"avg_tokens_sec": 29.361579428697542
},
"Binary Search": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 10.22926025,
"avg_tokens_sec": 29.360358027471495
},
"Palindrome": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 7.7338954719999995,
"avg_tokens_sec": 29.349959100715157
},
"Anagram Check": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 9.66612725,
"avg_tokens_sec": 29.794841927435822
}
},
"overall_success_rate": 90.27777777777779,
"overall_success_cases_rate": 0.9027777777777778,
"min_avg_duration": 7.7338954719999995,
"max_avg_duration": 10.860303611333332,
"min_tokens_per_second": 29.349959100715157,
"max_tokens_per_second": 29.794841927435822
}
]
},
{
"timestamp": "20250313_051856",
"results": [
{
"model": "gemma3:12b",
"total_duration": 17.904428624666668,
"tokens_per_second": 11.206900603314153,
"test_results": {
"Fibonacci": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 17.904428624666668,
"avg_tokens_sec": 11.206900603314153
},
"Binary Search": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 14.096915041666666,
"avg_tokens_sec": 11.209157987254114
},
"Palindrome": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 9.514898375333333,
"avg_tokens_sec": 11.037508677057549
},
"Anagram Check": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 24.419397555666666,
"avg_tokens_sec": 11.87609409055045
}
},
"overall_success_rate": 100.0,
"overall_success_cases_rate": 1.0,
"min_avg_duration": 9.514898375333333,
"max_avg_duration": 24.419397555666666,
"min_tokens_per_second": 11.037508677057549,
"max_tokens_per_second": 11.87609409055045
}
]
},
{
"timestamp": "20250314_024439",
"results": [
{
"model": "SiliconBasedWorld/Qwen2.5-7B-Instruct-1M",
"total_duration": 20.47047556933333,
"tokens_per_second": 19.721316911932245,
"test_results": {
"Fibonacci": {
"success_rate": 61.111111111111114,
"passed_cases": 11,
"total_cases": 18,
"success_cases_rate": 0.6111111111111112,
"avg_duration": 20.47047556933333,
"avg_tokens_sec": 19.721316911932245
},
"Binary Search": {
"success_rate": 66.66666666666666,
"passed_cases": 12,
"total_cases": 18,
"success_cases_rate": 0.6666666666666666,
"avg_duration": 89.59582123599999,
"avg_tokens_sec": 19.522371869517652
},
"Palindrome": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 29.476939527666666,
"avg_tokens_sec": 19.835750358255293
},
"Anagram Check": {
"success_rate": 33.33333333333333,
"passed_cases": 6,
"total_cases": 18,
"success_cases_rate": 0.3333333333333333,
"avg_duration": 52.099640236333336,
"avg_tokens_sec": 19.661776969493513
}
},
"overall_success_rate": 65.27777777777779,
"overall_success_cases_rate": 0.6527777777777778,
"min_avg_duration": 20.47047556933333,
"max_avg_duration": 89.59582123599999,
"min_tokens_per_second": 19.522371869517652,
"max_tokens_per_second": 19.835750358255293
}
]
},
{
"timestamp": "20250314_110909",
"results": [
{
"model": "olmo2:13b",
"total_duration": 25.239670416666666,
"tokens_per_second": 8.973277631244137,
"test_results": {
"Fibonacci": {
"success_rate": 61.111111111111114,
"passed_cases": 11,
"total_cases": 18,
"success_cases_rate": 0.6111111111111112,
"avg_duration": 25.239670416666666,
"avg_tokens_sec": 8.973277631244137
},
"Binary Search": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 10.511362861,
"avg_tokens_sec": 8.094987124683419
},
"Palindrome": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 7.803927528,
"avg_tokens_sec": 8.07489922259982
},
"Anagram Check": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 16.829488430333335,
"avg_tokens_sec": 8.85685146687769
}
},
"overall_success_rate": 90.27777777777779,
"overall_success_cases_rate": 0.9027777777777778,
"min_avg_duration": 7.803927528,
"max_avg_duration": 25.239670416666666,
"min_tokens_per_second": 8.07489922259982,
"max_tokens_per_second": 8.973277631244137
}
]
},
{
"timestamp": "20250314_111430",
"results": [
{
"model": "olmo2:13b-1124-instruct-q4_K_M",
"total_duration": 27.796664694333334,
"tokens_per_second": 9.16360668962085,
"test_results": {
"Fibonacci": {
"success_rate": 27.77777777777778,
"passed_cases": 5,
"total_cases": 18,
"success_cases_rate": 0.2777777777777778,
"avg_duration": 27.796664694333334,
"avg_tokens_sec": 9.16360668962085
},
"Binary Search": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 21.839994722333333,
"avg_tokens_sec": 9.000336176480124
},
"Palindrome": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 10.587036805333334,
"avg_tokens_sec": 8.492606444397637
},
"Anagram Check": {
"success_rate": 100.0,
"passed_cases": 18,
"total_cases": 18,
"success_cases_rate": 1.0,
"avg_duration": 9.969617250333334,
"avg_tokens_sec": 8.499243210997909
}
},
"overall_success_rate": 81.94444444444444,
"overall_success_cases_rate": 0.8194444444444444,
"min_avg_duration": 9.969617250333334,
"max_avg_duration": 27.796664694333334,
"min_tokens_per_second": 8.492606444397637,
"max_tokens_per_second": 9.16360668962085
}
]
}

View file

@ -1,4 +1,4 @@
Benchmark Run: 20250303_174821
Benchmark Run: 20250314_111430
Server: http://localhost:11434
CPU Information:
@ -15,222 +15,13 @@ Benchmark Results:
🏆 Final Model Leaderboard:
qwen2.5-coder:7b-instruct-q4_K_M
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 19.33 (18.75 - 19.58)
Average Duration: 17.32s
Min/Max Avg Duration: 8.67s / 17.99s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
falcon3:10b
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 13.21 (12.53 - 13.31)
Average Duration: 13.46s
Min/Max Avg Duration: 6.76s / 13.46s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
qwen2.5:14b
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.78 (9.78 - 9.88)
Average Duration: 35.25s
Min/Max Avg Duration: 30.09s / 35.25s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
qwen2.5-coder:14b-instruct-q4_K_M
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.68 (9.65 - 9.88)
Average Duration: 37.18s
Min/Max Avg Duration: 23.06s / 37.18s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
phi4:latest
Overall Success Rate: 100.0% (72/72 cases)
Average Tokens/sec: 9.01 (8.96 - 9.32)
Average Duration: 23.44s
Min/Max Avg Duration: 23.44s / 38.82s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
deepseek-r1:14b
Overall Success Rate: 97.2% (70/72 cases)
Average Tokens/sec: 9.05 (8.90 - 9.38)
Average Duration: 278.32s
Min/Max Avg Duration: 174.30s / 482.10s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ✅ 18/18 cases (100.0%)
llama3.2-vision:11b-instruct-q4_K_M
Overall Success Rate: 95.8% (69/72 cases)
Average Tokens/sec: 15.68 (14.92 - 15.92)
Average Duration: 22.33s
Min/Max Avg Duration: 16.31s / 28.85s
Test Results:
- Fibonacci: ❌ 16/18 cases (88.9%)
- Binary Search: ❌ 17/18 cases (94.4%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
llama3.2:3b
Overall Success Rate: 94.4% (68/72 cases)
Average Tokens/sec: 36.09 (30.85 - 37.53)
Average Duration: 2.67s
Min/Max Avg Duration: 1.04s / 2.76s
Test Results:
- Fibonacci: ❌ 14/18 cases (77.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
llama3.1:8b
Overall Success Rate: 94.4% (68/72 cases)
Average Tokens/sec: 17.92 (17.92 - 18.45)
Average Duration: 18.04s
Min/Max Avg Duration: 14.68s / 19.56s
Test Results:
- Fibonacci: ❌ 14/18 cases (77.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
hhao/qwen2.5-coder-tools:7b
Overall Success Rate: 91.7% (66/72 cases)
Average Tokens/sec: 17.75 (16.05 - 17.75)
Average Duration: 9.35s
Min/Max Avg Duration: 4.17s / 9.35s
Test Results:
- Fibonacci: ❌ 12/18 cases (66.7%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
Qwen2.5-Coder-7B-Instruct-s1k:latest
Overall Success Rate: 88.9% (64/72 cases)
Average Tokens/sec: 18.38 (18.38 - 18.94)
Average Duration: 9.95s
Min/Max Avg Duration: 9.06s / 12.91s
Test Results:
- Fibonacci: ❌ 16/18 cases (88.9%)
- Binary Search: ❌ 12/18 cases (66.7%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
deepseek-r1:8b
Overall Success Rate: 86.1% (62/72 cases)
Average Tokens/sec: 17.43 (17.29 - 18.01)
Average Duration: 168.97s
Min/Max Avg Duration: 107.91s / 168.97s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ❌ 10/18 cases (55.6%)
llama3.2:1b-instruct-q4_K_M
olmo2:13b-1124-instruct-q4_K_M
Overall Success Rate: 81.9% (59/72 cases)
Average Tokens/sec: 88.24 (88.24 - 88.93)
Average Duration: 3.64s
Min/Max Avg Duration: 1.87s / 4.93s
Average Tokens/sec: 9.16 (8.49 - 9.16)
Average Duration: 27.80s
Min/Max Avg Duration: 9.97s / 27.80s
Test Results:
- Fibonacci: ❌ 5/18 cases (27.8%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ✅ 18/18 cases (100.0%)
samantha-mistral:latest
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 23.92 (23.91 - 24.79)
Average Duration: 12.21s
Min/Max Avg Duration: 7.59s / 12.21s
Test Results:
- Fibonacci: ❌ 8/18 cases (44.4%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 16/18 cases (88.9%)
- Anagram Check: ❌ 16/18 cases (88.9%)
marco-o1:latest
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 19.19 (19.19 - 19.39)
Average Duration: 41.14s
Min/Max Avg Duration: 33.28s / 51.50s
Test Results:
- Fibonacci: ✅ 18/18 cases (100.0%)
- Binary Search: ❌ 6/18 cases (33.3%)
- Palindrome: ✅ 18/18 cases (100.0%)
- Anagram Check: ❌ 16/18 cases (88.9%)
deepseek-r1:7b
Overall Success Rate: 80.6% (58/72 cases)
Average Tokens/sec: 18.01 (18.01 - 19.07)
Average Duration: 336.87s
Min/Max Avg Duration: 78.71s / 336.87s
Test Results:
- Fibonacci: ❌ 10/18 cases (55.6%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 12/18 cases (66.7%)
- Anagram Check: ✅ 18/18 cases (100.0%)
deepseek-r1:1.5b-qwen-distill-q8_0
Overall Success Rate: 52.8% (38/72 cases)
Average Tokens/sec: 57.37 (53.88 - 59.60)
Average Duration: 137.59s
Min/Max Avg Duration: 41.38s / 371.13s
Test Results:
- Fibonacci: ❌ 11/18 cases (61.1%)
- Binary Search: ❌ 12/18 cases (66.7%)
- Palindrome: ❌ 6/18 cases (33.3%)
- Anagram Check: ❌ 9/18 cases (50.0%)
openthinker:7b
Overall Success Rate: 47.2% (34/72 cases)
Average Tokens/sec: 18.16 (17.98 - 18.29)
Average Duration: 263.00s
Min/Max Avg Duration: 168.91s / 302.79s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ✅ 18/18 cases (100.0%)
- Palindrome: ❌ 12/18 cases (66.7%)
- Anagram Check: ❌ 4/18 cases (22.2%)
wizard-vicuna-uncensored:latest
Overall Success Rate: 9.7% (7/72 cases)
Average Tokens/sec: 22.01 (22.01 - 24.42)
Average Duration: 9.06s
Min/Max Avg Duration: 5.60s / 11.45s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ❌ 0/18 cases (0.0%)
- Palindrome: ❌ 6/18 cases (33.3%)
- Anagram Check: ❌ 1/18 cases (5.6%)
mxbai-embed-large:latest
Overall Success Rate: 0.0% (0/72 cases)
Average Tokens/sec: 0.00 (0.00 - 0.00)
Average Duration: 0.00s
Min/Max Avg Duration: 0.00s / 0.00s
Test Results:
- Fibonacci: ❌ 0/18 cases (0.0%)
- Binary Search: ❌ 0/18 cases (0.0%)
- Palindrome: ❌ 0/18 cases (0.0%)
- Anagram Check: ❌ 0/18 cases (0.0%)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 KiB

After

Width:  |  Height:  |  Size: 490 KiB

Before After
Before After