#!/usr/bin/env python3
# codebench/tsbench.py
import subprocess
import platform
import GPUtil
import psutil
import json
import re
from cpuinfo import get_cpu_info

print()
print("CPU py-cpuinfo Information:")
cpu_info = get_cpu_info()
for key, value in cpu_info.items():
    print(f"{key}: {value}")

def get_cpu_full_info():
    cpu_freq = psutil.cpu_freq()
    cpu_info = {
        "Architecture": platform.machine(),
        "Processor": platform.processor(),
        "Physical cores": psutil.cpu_count(logical=False),
        "Total cores": psutil.cpu_count(logical=True),
        "Max frequency": f"{cpu_freq.max:.2f} MHz",
        "Min frequency": f"{cpu_freq.min:.2f} MHz",
        "Current frequency": f"{cpu_freq.current:.2f} MHz",
        "CPU Usage Per Core": psutil.cpu_percent(interval=1, percpu=True),
        "Total CPU Usage": psutil.cpu_percent(interval=1)
    }
    return cpu_info

def print_cpu_fullinfo(cpu_info):
    print()
    print("CPU psutil Information:")
    for key, value in cpu_info.items():
        if isinstance(value, list):
            print(f"{key}:")
            for i, usage in enumerate(value):
                print(f"  Core {i}: {usage}%")
        else:
            print(f"{key}: {value}")

def get_cpu_moduleinfo():
    cpu_name = platform.processor()
    return {
        "name": cpu_name,
        "cores": psutil.cpu_count(logical=False),
        "threads": psutil.cpu_count(logical=True)
    }

def get_gpu_info():
    gpus = GPUtil.getGPUs()
    gpu_info = []
    for gpu in gpus:
        gpu_info.append({
            "id": gpu.id,
            "name": gpu.name,
            "memory_total": gpu.memoryTotal,  # in MB
            "memory_free": gpu.memoryFree,    # in MB
            "memory_used": gpu.memoryUsed     # in MB
        })
    return gpu_info

def calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits):
    # Formula: Bandwidth = (Memory Clock * Bus Width * 2) / 8 (convert to GB/s)
    return (memory_clock_mhz * 1e6 * bus_width_bits * 2) / (8 * 1e9)  # GB/s
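
# Worked example (illustrative, not executed):
#   calculate_theoretical_gpu_bandwidth(14000, 384)
#     = (14000e6 * 384 * 2) / 8e9 = 1344.00 GB/s
# Note: the x2 assumes memory_clock_mhz is the base DDR clock. If you pass the
# effective data rate (e.g. 14 Gbps GDDR6 quoted as "14000 MHz"), the doubling
# is already included and the result overstates bandwidth by 2x.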

def get_local_models():
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        models = result.stdout.strip().split('\n')[1:]  # Skip header
        return [model.split()[0] for model in models]
    except subprocess.CalledProcessError:
        print("Error: Unable to retrieve local models. Make sure Ollama is installed and accessible.")
        return []
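
# The parser above assumes `ollama list` output shaped roughly like this
# (illustrative values, columns separated by whitespace):
#
#   NAME              ID              SIZE      MODIFIED
#   llama2:7b-q4_0    1a2b3c4d5e6f    3.8 GB    2 weeks ago
#
# so model.split()[0] picks the model name out of each non-header row.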

def get_model_info(model_name):
    try:
        result = subprocess.run(['ollama', 'show', model_name], capture_output=True, text=True, check=True)
        modelfile = result.stdout
        param_match = re.search(r'(\d+)b', model_name.lower())
        param_count = int(param_match.group(1)) * 1e9 if param_match else None
        quant_match = re.search(r'q(\d+)', model_name.lower())
        quant_bits = int(quant_match.group(1)) if quant_match else 32  # Assume 32-bit if not specified
        return {
            'name': model_name,
            'parameters': param_count,
            'quantization_bits': quant_bits,
            'modelfile': modelfile
        }
    except subprocess.CalledProcessError:
        print(f"Error: Unable to retrieve information for model {model_name}")
        return None
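
# Example of the tag-based parsing above (hypothetical tag):
#   "llama2:7b-q4_0" -> param_match "7b" => parameters = 7e9
#                       quant_match "q4" => quantization_bits = 4
# A tag without a quant suffix (e.g. "llama3:8b") falls back to 32 bits,
# which overestimates memory traffic for models that are in fact quantized.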

def estimate_tps(model_info):
    # Rough estimate based on model size
    if model_info['parameters'] is None:
        return 100  # Default value
    param_billions = model_info['parameters'] / 1e9
    return max(10, int(200 - param_billions * 10))  # Simple linear decrease
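
# Heuristic examples:
#   8B model  -> max(10, 200 - 8 * 10)  = 120 tokens/s
#   70B model -> max(10, 200 - 70 * 10) = 10 tokens/s (clamped at the floor)
# This is a crude placeholder, not a measurement.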

def calculate_memory_throughput(model_info, tps):
    P = model_info['parameters']
    Q = model_info['quantization_bits']
    if P and Q:
        bytes_per_parameter = Q / 8
        total_bytes = P * bytes_per_parameter
        return (total_bytes * tps) / 1e9  # Convert to GB/s
    return None
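
# Worked example (napkin math, assumes every weight is read once per token):
#   7e9 params at 4 bits -> 7e9 * 0.5 bytes = 3.5e9 bytes per token
#   at 120 tokens/s      -> (3.5e9 * 120) / 1e9 = 420.00 GB/s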

def calculate_ops(model_info, tps):
    P = model_info['parameters']
    if P:
        flops_per_token = 6 * P  # Estimate based on basic transformer architecture
        return flops_per_token * tps
    return None
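
# Worked example: 7e9 params -> 6 * 7e9 = 4.2e10 FLOPs per token;
# at 120 tokens/s -> 4.2e10 * 120 = 5.04e12 FLOPs/s.
# Note: 6 * P is the usual training-cost estimate (forward + backward);
# a forward-only inference pass is closer to 2 * P FLOPs per token.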

def main():
    print()
    cpu_info = get_cpu_moduleinfo()
    print(f"CPU Info: {cpu_info}")
    print()
    gpu_info = get_gpu_info()
    print(f"GPU Info: {gpu_info}")
    print_cpu_fullinfo(get_cpu_full_info())

    # Example GPU theoretical bandwidth calculation (replace with actual values)
    for gpu in gpu_info:
        memory_clock_mhz = 14000  # Example value for GDDR6 (adjust as needed)
        bus_width_bits = 384      # Example value for high-end GPUs like RTX series
        theoretical_bandwidth = calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits)
        print(f"GPU {gpu['name']} Theoretical Memory Bandwidth: {theoretical_bandwidth:.2f} GB/s")
    print()

    local_models = get_local_models()
    model_info_list = []
    for model in local_models:
        info = get_model_info(model)
        if info is None:
            continue  # Skip models whose metadata could not be retrieved
        print(info)
        tps = estimate_tps(info)
        info['estimated_tps'] = tps
        info['memory_throughput'] = calculate_memory_throughput(info, tps)
        info['operations_per_second'] = calculate_ops(info, tps)
        model_info_list.append(info)
        print(f"Model: {info['name']}")
        if info['parameters'] is not None:
            print(f"Parameters: {info['parameters'] / 1e9:.2f} billion")
        else:
            print("Parameters: Unknown")
        print(f"Quantization: {info['quantization_bits']}-bit")
        print(f"Estimated TPS: {info['estimated_tps']}")
        if info['memory_throughput'] is not None:
            print(f"Required Memory Throughput: {info['memory_throughput']:.2f} GB/s")
        else:
            print("Required Memory Throughput: Unknown")
        if info['operations_per_second'] is not None:
            print(f"Operations per Second: {info['operations_per_second']:.2e}")
        else:
            print("Operations per Second: Unknown")
        print("---")

    with open('ollama_model_performance.json', 'w') as f:
        json.dump(model_info_list, f, indent=2)


if __name__ == "__main__":
    main()