cleaning
parent 730d61cfe3
commit 8ad873926c

36 README.md
@@ -1,13 +1,8 @@

# Codebench - Ollama Model Benchmark Tool

A Python-based benchmarking tool for testing and comparing different Ollama models on coding tasks. This tool allows you to benchmark multiple Ollama models against common coding problems, measure their performance, and visualize the results.

## Components
- **Benchmarking Engine**: `main.py` - Core benchmarking functionality with integrated plotting
- **Visualization Tool**: `lboard.py` - Standalone visualization for benchmark results

## Features
@@ -36,11 +31,7 @@ cd codebench

2. Install required packages:
```bash
pip install -r requirements.txt
```
Or install the required packages manually:
```bash
pip install requests matplotlib py-cpuinfo
pip3 install -r requirements.txt
```

3. (Optional) Set up Together API for advanced code analysis:
@@ -58,7 +49,7 @@ python3 main.py
Available options:

```bash
python main.py --server [local|z60] --model [model_name] --number [count|all] --verbose --plot-only --no-plot --file [results_file]
python3 main.py --server [local|z60] --model [model_name] --number [count|all] --verbose --plot-only --no-plot --file [results_file]
```

## Arguments:
@@ -143,27 +134,6 @@ python3 main.py --no-plot

The plot will be saved as `benchmark_results/model_comparison.png` with high resolution (300 DPI).

### Option 2: Using lboard.py (Legacy)
You can still use the standalone lboard.py script:

```bash
python3 lboard.py
```
This will:

- Automatically find the latest benchmark results
- Generate a graph showing:
  - Token processing speed (blue bars)
  - Success rates (red markers)
  - Duration ranges (green vertical lines)

You can also specify a specific results file:

```bash
python3 lboard.py path/to/results.json
# or
python3 lboard.py --file path/to/results.json
```
## Visualization Features
The visualization includes:
- Model performance comparison

226 lboard.py
@@ -1,226 +0,0 @@
import json
import os
import argparse
import glob
import matplotlib.pyplot as plt

def get_latest_json_file(directory):
    json_files = glob.glob(os.path.join(directory, '*.json'))
    print(f"Found JSON files: {json_files}")
    latest_file = max(json_files, key=os.path.getmtime) if json_files else None
    return latest_file

def calculate_model_stats(model_result):
    """Calculate average stats for a model from its test results."""
    test_results = model_result['test_results']

    # Calculate overall success rate (average of all test success rates)
    success_rates = [test['success_rate'] for test in test_results.values()]
    overall_success_rate = sum(success_rates) / len(success_rates)

    # Handle the case where some test results might not have avg_duration or avg_tokens_sec
    # This is for backward compatibility with older benchmark results
    min_avg_duration = max_avg_duration = None
    min_tokens_per_second = max_tokens_per_second = None

    # First try to get these values from the model_result directly (new format)
    if 'min_avg_duration' in model_result and 'max_avg_duration' in model_result:
        min_avg_duration = model_result['min_avg_duration']
        max_avg_duration = model_result['max_avg_duration']

    if 'min_tokens_per_second' in model_result and 'max_tokens_per_second' in model_result:
        min_tokens_per_second = model_result['min_tokens_per_second']
        max_tokens_per_second = model_result['max_tokens_per_second']

    # If not available in the model_result, try to calculate from test_results (old format)
    if min_avg_duration is None or max_avg_duration is None:
        try:
            min_avg_duration = min(test.get('avg_duration', float('inf')) for test in test_results.values() if 'avg_duration' in test)
            max_avg_duration = max(test.get('avg_duration', 0) for test in test_results.values() if 'avg_duration' in test)
            # If no test has avg_duration, use total_duration as fallback
            if min_avg_duration == float('inf') or max_avg_duration == 0:
                min_avg_duration = max_avg_duration = model_result['total_duration']
        except (ValueError, KeyError):
            # If calculation fails, use total_duration as fallback
            min_avg_duration = max_avg_duration = model_result['total_duration']

    if min_tokens_per_second is None or max_tokens_per_second is None:
        try:
            min_tokens_per_second = min(test.get('avg_tokens_sec', float('inf')) for test in test_results.values() if 'avg_tokens_sec' in test)
            max_tokens_per_second = max(test.get('avg_tokens_sec', 0) for test in test_results.values() if 'avg_tokens_sec' in test)
            # If no test has avg_tokens_sec, use tokens_per_second as fallback
            if min_tokens_per_second == float('inf') or max_tokens_per_second == 0:
                min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']
        except (ValueError, KeyError):
            # If calculation fails, use tokens_per_second as fallback
            min_tokens_per_second = max_tokens_per_second = model_result['tokens_per_second']

    return {
        'model': model_result['model'],
        'overall_success_rate': overall_success_rate,
        'tokens_per_second': model_result['tokens_per_second'],
        'total_duration': model_result['total_duration'],
        'min_avg_duration': min_avg_duration,
        'max_avg_duration': max_avg_duration,
        'min_tokens_per_second': min_tokens_per_second,
        'max_tokens_per_second': max_tokens_per_second,
        'test_results': test_results
    }

def plot_model_comparison(model_stats):
    """Plot model comparison with dual y-axes for tokens/sec and success rate."""
    models = [stat['model'] for stat in model_stats]
    token_speeds = [stat['tokens_per_second'] for stat in model_stats]
    success_rates = [stat['overall_success_rate'] for stat in model_stats]
    durations = [stat['total_duration'] for stat in model_stats]

    # Create figure and primary axis
    fig, ax1 = plt.subplots(figsize=(15, 8))

    # Plot tokens/sec bars using min and max values
    for i, stat in enumerate(model_stats):
        min_tokens = stat['min_tokens_per_second']
        max_tokens = stat['max_tokens_per_second']

        # Plot lower part (0 to min) with slightly darker blue
        ax1.bar(i, min_tokens, color='royalblue', alpha=0.4)
        # Plot upper part (min to max) with lighter blue
        bar_height = max_tokens - min_tokens
        ax1.bar(i, bar_height, bottom=min_tokens, color='royalblue', alpha=0.3)

    ax1.set_ylabel('Tokens per Second', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')
    # Set y-axis range for tokens per second
    max_token_speed = max(stat['max_tokens_per_second'] for stat in model_stats)
    ax1.set_ylim(0, max(100, max_token_speed * 1.1))  # Add 10% padding above max value

    # Set x-axis labels
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(models, rotation=45, ha='right', rotation_mode='anchor')

    # Create secondary y-axis for success rate
    ax2 = ax1.twinx()
    ax2.plot(models, success_rates, 'r+', markersize=15, label='Success Rate', linestyle='None')
    ax2.set_ylabel('Success Rate (%)', color='red')
    ax2.tick_params(axis='y', labelcolor='red')
    ax2.set_ylim(0, 100)

    # Create third y-axis for duration
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))  # Move third axis outward
    #ax3.plot(models, durations, 'g_', markersize=15, label='Duration', linestyle='None')
    # Add min and max duration markers
    min_durations = [stat['min_avg_duration'] for stat in model_stats]
    max_durations = [stat['max_avg_duration'] for stat in model_stats]
    # Plot duration ranges with vertical lines and markers
    for i, (min_d, max_d) in enumerate(zip(min_durations, max_durations)):
        ax3.plot([i, i], [min_d, max_d], 'g-', linewidth=1)  # Vertical line
        ax3.plot(i, min_d, 'g-', markersize=10)  # Min marker
        ax3.plot(i, max_d, 'g-', markersize=10)  # Max marker

    ax3.set_ylabel('Duration (s)', color='green')
    ax3.tick_params(axis='y', labelcolor='green')

    # Customize x-axis labels with proper rotation
    ax1.set_xticks(range(len(models)))
    ax1.set_xticklabels(models, rotation=45, ha='right', rotation_mode='anchor')
    for i, model in enumerate(models):
        # Shorten model names by removing common suffixes
        short_name = model.replace(':latest', '').replace('-uncensored', '')
        ax1.get_xticklabels()[i].set_text(short_name)
        # Updated conditions: success rate > 95% AND success rate / duration >= 5
        if success_rates[i] > 95 and (success_rates[i] / durations[i] >= 5):
            ax1.get_xticklabels()[i].set_color('green')

    # Adjust layout to prevent label cutoff
    plt.subplots_adjust(bottom=0.25, left=0.1, right=0.85)

    '''
    # Add value labels
    for i, bar in enumerate(bars):
        ax1.text(i, token_speeds[i], f'{token_speeds[i]:.1f}',
                 ha='center', va='bottom', color='black')
        ax2.text(i, success_rates[i], f'{success_rates[i]:.1f}%',
                 ha='center', va='bottom', color='black')
        ax3.text(i, durations[i], f'{durations[i]:.1f}s',
                 ha='center', va='top', color='black')
    '''
    plt.title('Model Performance Comparison')
    plt.tight_layout()

    plt.show()
    plt.savefig('benchmark_results/model_comparison.png')
    print("\nPlot saved as 'benchmark_results/model_comparison.png'")

def print_leaderboard(benchmark_data):
    """Print leaderboard from benchmark results."""
    if not benchmark_data.get('benchmarks'):
        print("No benchmark data to display")
        return

    # Get all benchmark results and combine them
    all_model_results = []
    model_names = set()

    # Process all benchmarks, keeping only the latest result for each model
    for benchmark in benchmark_data['benchmarks']:
        for model_result in benchmark.get('results', []):
            model_name = model_result.get('model')
            if model_name and model_name not in model_names:
                all_model_results.append(model_result)
                model_names.add(model_name)
            elif model_name in model_names:
                # Replace existing model with newer version
                for i, existing_model in enumerate(all_model_results):
                    if existing_model.get('model') == model_name:
                        all_model_results[i] = model_result
                        break

    # Calculate stats and sort models
    model_stats = [calculate_model_stats(model) for model in all_model_results]
    sorted_stats = sorted(model_stats,
                          key=lambda x: (x['overall_success_rate'], x['tokens_per_second']),
                          reverse=True)

    print(f"\n🏆 Final Model Leaderboard:")
    for stats in sorted_stats:
        print(f"\n{stats['model']}")
        print(f"  Overall Success Rate: {stats['overall_success_rate']:.1f}%")
        print(f"  Average Tokens/sec: {stats['tokens_per_second']:.2f} ({stats['min_tokens_per_second']:.2f} - {stats['max_tokens_per_second']:.2f})")
        print(f"  Average Duration: {stats['total_duration']:.2f}s")
        print(f"  Min/Max Avg Duration: {stats['min_avg_duration']:.2f}s / {stats['max_avg_duration']:.2f}s")
        print(f"  Test Results:")

        for test_name, test_result in stats['test_results'].items():
            status = '✅' if test_result['success_rate'] == 100 else '❌'
            print(f"    - {test_name}: {status} {test_result['passed_cases']}/{test_result['total_cases']} cases ({test_result['success_rate']:.1f}%)")

    # Generate visualization
    plot_model_comparison(sorted_stats)

def main():
    parser = argparse.ArgumentParser(description='Display benchmark leaderboard')
    parser.add_argument('filepath', nargs='?', help='Path to benchmark results JSON file')
    parser.add_argument('--file', type=str, help='Path to benchmark results JSON file (alternative way)')
    args = parser.parse_args()

    try:
        # Use filepath if provided, then --file, otherwise find latest
        if args.filepath:
            json_file = args.filepath
        elif args.file:
            json_file = args.file
        else:
            json_file = get_latest_json_file('benchmark_results')
            if not json_file:
                print("No benchmark results found")
                return

        with open(json_file, 'r') as f:
            benchmark_data = json.load(f)
        print(f"Using benchmark file: {json_file}")
        print_leaderboard(benchmark_data)
    except Exception as e:
        print(f"Error loading benchmark data: {e}")

if __name__ == "__main__":
    main()
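For reference, `print_leaderboard` and `calculate_model_stats` above only rely on a handful of fields per model in the `benchmark_results/*.json` files. A minimal input that satisfies them looks roughly like this sketch (the field names are taken from the code; the model name, test name, and all values are illustrative placeholders, not real benchmark output):

```python
# Illustrative shape of a benchmark results file as read by lboard.py.
benchmark_data = {
    "benchmarks": [
        {
            "results": [
                {
                    "model": "example-model:7b",        # placeholder name
                    "tokens_per_second": 42.0,
                    "total_duration": 12.5,
                    "min_tokens_per_second": 35.0,      # optional (new format)
                    "max_tokens_per_second": 50.0,
                    "min_avg_duration": 3.0,            # optional (new format)
                    "max_avg_duration": 6.0,
                    "test_results": {
                        "example_test": {
                            "success_rate": 100.0,
                            "passed_cases": 5,
                            "total_cases": 5,
                            "avg_duration": 4.2,        # optional (old-format fallback)
                            "avg_tokens_sec": 40.0
                        }
                    }
                }
            ]
        }
    ]
}
```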

4 main.py
@@ -37,7 +37,7 @@ WHITE = MUTED

# Server configurations
SERVERS = {
    'local': 'http://localhost:11434',
    'z60': 'http://192.168.196.60:11434'
    'remote': 'http://192.168.196.60:11434'
}

class Timer:
@@ -972,7 +972,7 @@ def get_latest_json_file(directory):

def main():
    parser = argparse.ArgumentParser(description='Run Ollama model benchmarks')
    parser.add_argument('--server', choices=['local', 'z60'], default='local',
    parser.add_argument('--server', choices=['local', 'remote'], default='local',
                        help='Choose Ollama server (default: local)')
    parser.add_argument('--model', type=str, help='Specific model to benchmark')
    parser.add_argument('--number', type=str, help='Number of models to benchmark (number or "all")')
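The two main.py hunks above replace the `z60` endpoint name with `remote` in both the `SERVERS` map and the `--server` choices. A minimal sketch of how those pieces fit together is shown below; the `SERVERS` dict and argparse setup are taken from the diff, but the connectivity check and the `resolve_base_url` helper are assumptions for illustration, not code from main.py:

```python
import argparse
import requests  # assumed dependency; listed in the README install step

# Endpoint names mirror the SERVERS dict shown in the diff above.
SERVERS = {
    'local': 'http://localhost:11434',
    'remote': 'http://192.168.196.60:11434',
}

def resolve_base_url() -> str:
    """Map the --server choice to an Ollama base URL (sketch, not main.py itself)."""
    parser = argparse.ArgumentParser(description='Run Ollama model benchmarks')
    parser.add_argument('--server', choices=['local', 'remote'], default='local',
                        help='Choose Ollama server (default: local)')
    args, _ = parser.parse_known_args()
    return SERVERS[args.server]

if __name__ == "__main__":
    base_url = resolve_base_url()
    # Hypothetical connectivity check against the standard Ollama REST endpoint.
    print(requests.get(f"{base_url}/api/tags", timeout=5).status_code)
```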
@@ -1,209 +0,0 @@
[
  {
    "name": "qwen2.5-coder:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "falcon3:10b",
    "parameters": 10000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 10.3B \n context length 32768 \n embedding length 3072 \n quantization Q4_K_M \n\n Parameters\n stop \"<|system|>\" \n stop \"<|user|>\" \n stop \"<|end|>\" \n stop \"<|assistant|>\" \n\n License\n Falcon 3 TII Falcon License \n December 2024 \n\n",
    "estimated_tps": 100,
    "memory_throughput": 4000.0,
    "operations_per_second": 6000000000000.0
  },
  {
    "name": "llama3.2:1b",
    "parameters": 1000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 1.2B \n context length 131072 \n embedding length 2048 \n quantization Q8_0 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 190,
    "memory_throughput": 760.0,
    "operations_per_second": 1140000000000.0
  },
  {
    "name": "unitythemaker/llama3.2-vision-tools:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture mllama \n parameters 9.8B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Projector\n architecture mllama \n parameters 895.03M \n embedding length 1280 \n dimensions 4096 \n\n Parameters\n temperature 0.6 \n top_p 0.9 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "llama3.2-vision:11b-instruct-q4_K_M",
    "parameters": 11000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture mllama \n parameters 9.8B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Projector\n architecture mllama \n parameters 895.03M \n embedding length 1280 \n dimensions 4096 \n\n Parameters\n temperature 0.6 \n top_p 0.9 \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 90,
    "memory_throughput": 495.0,
    "operations_per_second": 5940000000000.0
  },
  {
    "name": "hhao/qwen2.5-coder-tools:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n num_ctx 16384 \n stop \"User:\" \n stop \"Assistant:\" \n stop \"<|endoftext|>\" \n temperature 0.1 \n\n System\n You are an advanced AI coding assistant, specifically designed to help with complex programming \n tasks, tool use, code analysis, and software architecture design. Your primary focus is on providing \n expert-level assistance in coding, with a special emphasis on using tool-calling capabilities when \n necessary. Here are your key characteristics and instructions: \n 1. Coding Expertise: \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "llama3.2:3b",
    "parameters": 3000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 3.2B \n context length 131072 \n embedding length 3072 \n quantization Q4_K_M \n\n Parameters\n stop \"<|start_header_id|>\" \n stop \"<|end_header_id|>\" \n stop \"<|eot_id|>\" \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 170,
    "memory_throughput": 2040.0,
    "operations_per_second": 3060000000000.0
  },
  {
    "name": "openthinker:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n Your role as an assistant involves thoroughly exploring questions through a systematic long thinking \n process before providing the final precise and accurate solutions. This requires engaging in a \n comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, \n and iteration to develop well-considered thinking process. Please structure your response into two \n main sections: Thought and Solution. In the Thought section, detail your reasoning process using the \n specified format: <|begin_of_thought|> {thought with steps separated with '\\n\\n'} <|end_of_thought|> \n Each step should include detailed considerations such as analisying questions, summarizing relevant \n findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, \n and revisiting previous steps. In the Solution section, based on various attempts, explorations, \n and reflections from the Thought section, systematically present the final solution that you \n deem correct. The solution should remain a logical, accurate, concise expression style and detail \n necessary step needed to reach the conclusion, formatted as follows: <|begin_of_solution|> {final \n formatted, precise, and clear solution} <|end_of_solution|> Now, try to solve the following question \n through the above guidelines: \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "samantha-mistral:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 7.2B \n context length 32768 \n embedding length 4096 \n quantization Q4_0 \n\n Parameters\n stop \"<|im_start|>\" \n stop \"<|im_end|>\" \n\n System\n You are a caring and empathetic sentient AI companion named Samantha. \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "wizard-vicuna-uncensored:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 6.7B \n context length 2048 \n embedding length 4096 \n quantization Q4_0 \n\n Parameters\n stop \"USER:\" \n stop \"ASSISTANT:\" \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "qwen2.5-coder:7b-instruct-q4_K_M",
    "parameters": 7000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 130,
    "memory_throughput": 455.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "qwen2.5:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "qwen2.5-coder:14b-instruct-q4_K_M",
    "parameters": 14000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 32768 \n embedding length 5120 \n quantization Q4_K_M \n\n System\n You are Qwen, created by Alibaba Cloud. You are a helpful assistant. \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 60,
    "memory_throughput": 420.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "phi4:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture phi3 \n parameters 14.7B \n context length 16384 \n embedding length 5120 \n quantization Q4_K_M \n\n Parameters\n stop \"<|im_start|>\" \n stop \"<|im_end|>\" \n stop \"<|im_sep|>\" \n\n License\n Microsoft. \n Copyright (c) Microsoft Corporation. \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "mxbai-embed-large:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture bert \n parameters 334.09M \n context length 512 \n embedding length 1024 \n quantization F16 \n\n Parameters\n num_ctx 512 \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "marco-o1:latest",
    "parameters": null,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n System\n \u4f60\u662f\u4e00\u4e2a\u7ecf\u8fc7\u826f\u597d\u8bad\u7ec3\u7684AI\u52a9\u624b\uff0c\u4f60\u7684\u540d\u5b57\u662fMarco-o1.\u7531\u963f\u91cc\u56fd\u9645\u6570\u5b57\u5546\u4e1a\u96c6\u56e2\u7684AI Business\u521b\u9020. \n \n\n License\n Apache License \n Version 2.0, January 2004 \n\n",
    "estimated_tps": 100,
    "memory_throughput": null,
    "operations_per_second": null
  },
  {
    "name": "llama3.2:1b-instruct-q4_K_M",
    "parameters": 1000000000.0,
    "quantization_bits": 4,
    "modelfile": " Model\n architecture llama \n parameters 1.2B \n context length 131072 \n embedding length 2048 \n quantization Q4_K_M \n\n License\n LLAMA 3.2 COMMUNITY LICENSE AGREEMENT \n Llama 3.2 Version Release Date: September 25, 2024 \n\n",
    "estimated_tps": 190,
    "memory_throughput": 95.0,
    "operations_per_second": 1140000000000.0
  },
  {
    "name": "llama3.1:8b",
    "parameters": 8000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 8.0B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Parameters\n stop \"<|start_header_id|>\" \n stop \"<|end_header_id|>\" \n stop \"<|eot_id|>\" \n\n License\n LLAMA 3.1 COMMUNITY LICENSE AGREEMENT \n Llama 3.1 Version Release Date: July 23, 2024 \n\n",
    "estimated_tps": 120,
    "memory_throughput": 3840.0,
    "operations_per_second": 5760000000000.0
  },
  {
    "name": "deepseek-r1:8b",
    "parameters": 8000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture llama \n parameters 8.0B \n context length 131072 \n embedding length 4096 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 120,
    "memory_throughput": 3840.0,
    "operations_per_second": 5760000000000.0
  },
  {
    "name": "deepseek-r1:7b",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 131072 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  },
  {
    "name": "deepseek-r1:14b",
    "parameters": 14000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 14.8B \n context length 131072 \n embedding length 5120 \n quantization Q4_K_M \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 60,
    "memory_throughput": 3360.0,
    "operations_per_second": 5040000000000.0
  },
  {
    "name": "deepseek-r1:1.5b-qwen-distill-q8_0",
    "parameters": 5000000000.0,
    "quantization_bits": 8,
    "modelfile": " Model\n architecture qwen2 \n parameters 1.8B \n context length 131072 \n embedding length 1536 \n quantization Q8_0 \n\n Parameters\n stop \"<\uff5cbegin\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cend\u2581of\u2581sentence\uff5c>\" \n stop \"<\uff5cUser\uff5c>\" \n stop \"<\uff5cAssistant\uff5c>\" \n\n License\n MIT License \n Copyright (c) 2023 DeepSeek \n\n",
    "estimated_tps": 150,
    "memory_throughput": 750.0,
    "operations_per_second": 4500000000000.0
  },
  {
    "name": "Qwen2.5-Coder-7B-Instruct-s1k:latest",
    "parameters": 7000000000.0,
    "quantization_bits": 32,
    "modelfile": " Model\n architecture qwen2 \n parameters 7.6B \n context length 32768 \n embedding length 3584 \n quantization Q4_K_M \n\n Parameters\n temperature 0.7 \n top_p 0.7 \n stop \"Human:\\\" \\\"Assistant:\" \n\n System\n You are a helpful AI assistant. \n\n",
    "estimated_tps": 130,
    "memory_throughput": 3640.0,
    "operations_per_second": 5460000000000.0
  }
]

171 tsbench.py
@@ -1,171 +0,0 @@
import subprocess
import platform
import GPUtil
import psutil
import json
import re
from cpuinfo import get_cpu_info
from ollama import chat
from pydantic import BaseModel

print()
print("CPU py-cpuinfo Information:")
cpu_info = get_cpu_info()
for key, value in cpu_info.items():
    print(f"{key}: {value}")


def get_cpu_full_info():
    cpu_freq = psutil.cpu_freq()
    cpu_info = {
        "Architecture": platform.machine(),
        "Processor": platform.processor(),
        "Physical cores": psutil.cpu_count(logical=False),
        "Total cores": psutil.cpu_count(logical=True),
        "Max frequency": f"{cpu_freq.max:.2f}Mhz",
        "Min frequency": f"{cpu_freq.min:.2f}Mhz",
        "Current frequency": f"{cpu_freq.current:.2f}Mhz",
        "CPU Usage Per Core": psutil.cpu_percent(interval=1, percpu=True),
        "Total CPU Usage": psutil.cpu_percent(interval=1)
    }
    return cpu_info

def print_cpu_fullinfo(cpu_info):
    print()
    print("CPU psutil Information:")
    for key, value in cpu_info.items():
        if isinstance(value, list):
            print(f"{key}:")
            for i, usage in enumerate(value):
                print(f"  Core {i}: {usage}%")
        else:
            print(f"{key}: {value}")


def get_cpu_moduleinfo():
    cpu_name = platform.processor()
    return {
        "name": cpu_name,
        "cores": psutil.cpu_count(logical=False),
        "threads": psutil.cpu_count(logical=True)
    }

def get_gpu_info():
    gpus = GPUtil.getGPUs()
    gpu_info = []
    for gpu in gpus:
        gpu_info.append({
            "id": gpu.id,
            "name": gpu.name,
            "memory_total": gpu.memoryTotal,  # in MB
            "memory_free": gpu.memoryFree,  # in MB
            "memory_used": gpu.memoryUsed  # in MB
        })
    return gpu_info


def calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits):
    # Formula: Bandwidth = (Memory Clock * Bus Width * 2) / 8 (convert to GB/s)
    return (memory_clock_mhz * 1e6 * bus_width_bits * 2) / (8 * 1e9)  # GB/s

def get_local_models():
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        models = result.stdout.strip().split('\n')[1:]  # Skip header
        return [model.split()[0] for model in models]
    except subprocess.CalledProcessError:
        print("Error: Unable to retrieve local models. Make sure Ollama is installed and accessible.")
        return []

def get_model_info(model_name):
    try:
        result = subprocess.run(['ollama', 'show', model_name], capture_output=True, text=True, check=True)
        modelfile = result.stdout

        param_match = re.search(r'(\d+)b', model_name.lower())
        param_count = int(param_match.group(1)) * 1e9 if param_match else None

        quant_match = re.search(r'q(\d+)', model_name.lower())
        quant_bits = int(quant_match.group(1)) if quant_match else 32  # Assume 32-bit if not specified

        return {
            'name': model_name,
            'parameters': param_count,
            'quantization_bits': quant_bits,
            'modelfile': modelfile
        }
    except subprocess.CalledProcessError:
        print(f"Error: Unable to retrieve information for model {model_name}")
        return None

def estimate_tps(model_info):
    # Rough estimate based on model size
    if model_info['parameters'] is None:
        return 100  # Default value
    param_billions = model_info['parameters'] / 1e9
    return max(10, int(200 - param_billions * 10))  # Simple linear decrease

def calculate_memory_throughput(model_info, tps):
    P = model_info['parameters']
    Q = model_info['quantization_bits']
    if P and Q:
        bytes_per_parameter = Q / 8
        total_bytes = P * bytes_per_parameter
        return (total_bytes * tps) / 1e9  # Convert to GB/s
    return None

def calculate_ops(model_info, tps):
    P = model_info['parameters']
    if P:
        flops_per_token = 6 * P  # Estimate based on basic transformer architecture
        return flops_per_token * tps
    return None

def main():

    print()
    cpu_info = get_cpu_moduleinfo()
    print(f"CPU Info: {cpu_info}")

    print()
    gpu_info = get_gpu_info()
    print(f"GPU Info: {gpu_info}")

    print_cpu_fullinfo(get_cpu_full_info())

    # Example GPU theoretical bandwidth calculation (replace with actual values)
    for gpu in gpu_info:
        memory_clock_mhz = 14000  # Example value for GDDR6 (adjust as needed)
        bus_width_bits = 384  # Example value for high-end GPUs like RTX series
        theoretical_bandwidth = calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits)
        print(f"GPU {gpu['name']} Theoretical Memory Bandwidth: {theoretical_bandwidth:.2f} GB/s")
    print()
    local_models = get_local_models()
    model_info_list = []

    for model in local_models:

        info = get_model_info(model)
        print(info)
        tps = estimate_tps(info)
        info['estimated_tps'] = tps
        info['memory_throughput'] = calculate_memory_throughput(info, tps)
        info['operations_per_second'] = calculate_ops(info, tps)
        model_info_list.append(info)

        print(f"Model: {info['name']}")
        print(f"Parameters: {info['parameters'] / 1e9:.2f} Billions")
print(f"Quantization: {info['quantization']}")
        print(f"Estimated TPS: {info['estimated_tps']}")
        print(f"Required Memory Throughput: {info['memory_throughput']:.2f} GB/s" if info['memory_throughput'] else "Required Memory Throughput: Unknown")
        print(f"Operations per Second: {info['operations_per_second']:.2e}" if info['operations_per_second'] else "Operations per Second: Unknown")
        print("---")

    with open('ollama_model_performance.json', 'w') as f:
        json.dump(model_info_list, f, indent=2)

if __name__ == "__main__":
    main()
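The per-model figures in the deleted JSON above follow directly from the estimate helpers in tsbench.py. As a quick sanity check, here is a short sketch that reproduces the qwen2.5-coder:14b entry and the example GPU bandwidth figure using the same formulas (values taken from the JSON and from main(); this snippet is illustrative and not part of the repository):

```python
# Reproduce the qwen2.5-coder:14b entry from the deleted JSON using the same
# formulas as estimate_tps, calculate_memory_throughput and calculate_ops.
P = 14_000_000_000.0   # parameters (from the "parameters" field)
Q = 32                 # quantization_bits (no q-suffix in the tag, so 32 is assumed)

tps = max(10, int(200 - (P / 1e9) * 10))          # -> 60 tokens/sec
memory_throughput = (P * (Q / 8) * tps) / 1e9     # -> 3360.0 GB/s
operations_per_second = 6 * P * tps               # -> 5.04e12 FLOP/s

# Example GPU figure from main(): 14000 MHz GDDR6, 384-bit bus.
gpu_bandwidth = (14000 * 1e6 * 384 * 2) / (8 * 1e9)  # -> 1344.0 GB/s

print(tps, memory_throughput, operations_per_second, gpu_bandwidth)
```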