import subprocess
import platform
import GPUtil
import psutil
import json
import re
from cpuinfo import get_cpu_info
from ollama import chat
from pydantic import BaseModel

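# Third-party packages used above (pip names, assuming the usual distributions):
#   pip install psutil gputil py-cpuinfo ollama pydantic
# The `ollama` CLI itself must also be installed and on PATH for the
# subprocess calls further down.
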
print()
print("CPU py-cpuinfo Information:")
cpu_info = get_cpu_info()
for key, value in cpu_info.items():
    print(f"{key}: {value}")


def get_cpu_full_info():
    """Collect architecture, core counts, clock frequencies and usage via psutil."""
    cpu_freq = psutil.cpu_freq()
    cpu_info = {
        "Architecture": platform.machine(),
        "Processor": platform.processor(),
        "Physical cores": psutil.cpu_count(logical=False),
        "Total cores": psutil.cpu_count(logical=True),
        "Max frequency": f"{cpu_freq.max:.2f} MHz",
        "Min frequency": f"{cpu_freq.min:.2f} MHz",
        "Current frequency": f"{cpu_freq.current:.2f} MHz",
        "CPU Usage Per Core": psutil.cpu_percent(interval=1, percpu=True),
        "Total CPU Usage": psutil.cpu_percent(interval=1)
    }
    return cpu_info


def print_cpu_fullinfo(cpu_info):
    """Pretty-print the dictionary returned by get_cpu_full_info()."""
    print()
    print("CPU psutil Information:")
    for key, value in cpu_info.items():
        if isinstance(value, list):
            print(f"{key}:")
            for i, usage in enumerate(value):
                print(f"  Core {i}: {usage}%")
        else:
            print(f"{key}: {value}")


def get_cpu_moduleinfo():
    """Return the CPU name and physical/logical core counts as a small dict."""
    cpu_name = platform.processor()
    return {
        "name": cpu_name,
        "cores": psutil.cpu_count(logical=False),
        "threads": psutil.cpu_count(logical=True)
    }


def get_gpu_info():
    """List GPUs visible to GPUtil (which queries nvidia-smi, so NVIDIA only)."""
    gpus = GPUtil.getGPUs()
    gpu_info = []
    for gpu in gpus:
        gpu_info.append({
            "id": gpu.id,
            "name": gpu.name,
            "memory_total": gpu.memoryTotal,  # in MB
            "memory_free": gpu.memoryFree,    # in MB
            "memory_used": gpu.memoryUsed     # in MB
        })
    return gpu_info


def calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits):
    # Formula: Bandwidth (GB/s) = (Memory Clock [Hz] * Bus Width [bits] * 2) / 8e9
    # The factor of 2 assumes double-data-rate memory.
    return (memory_clock_mhz * 1e6 * bus_width_bits * 2) / (8 * 1e9)  # GB/s

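# Worked example (hypothetical numbers, matching the expression above): a 384-bit
# bus with memory_clock_mhz=14000 gives (14000e6 * 384 * 2) / 8e9 = 1344 GB/s.
# Note that if the clock passed in is already the effective (data-rate) figure,
# the extra factor of 2 double-counts it.
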
def get_local_models():
    """Return the names of locally installed Ollama models via `ollama list`."""
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        models = result.stdout.strip().split('\n')[1:]  # Skip header
        return [model.split()[0] for model in models]
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: Unable to retrieve local models. Make sure Ollama is installed and accessible.")
        return []

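# `ollama list` prints a table whose first column is the model tag, roughly
# (illustrative output only; exact columns may vary by Ollama version):
#   NAME        ID            SIZE    MODIFIED
#   llama3:8b   <digest>      4.7 GB  2 days ago
# so the parser above keeps only the first whitespace-separated token per row.
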
def get_model_info(model_name):
    """Query `ollama show` and infer parameter count / quantization from the model tag."""
    try:
        result = subprocess.run(['ollama', 'show', model_name], capture_output=True, text=True, check=True)
        modelfile = result.stdout

        # Parameter count, read from a "<N>b" marker in the tag (e.g. "8b" -> 8e9).
        param_match = re.search(r'(\d+)b', model_name.lower())
        param_count = int(param_match.group(1)) * 1e9 if param_match else None

        # Quantization width, read from a "q<N>" marker in the tag.
        quant_match = re.search(r'q(\d+)', model_name.lower())
        quant_bits = int(quant_match.group(1)) if quant_match else 32  # Assume 32-bit if not specified

        return {
            'name': model_name,
            'parameters': param_count,
            'quantization_bits': quant_bits,
            'modelfile': modelfile
        }
    except subprocess.CalledProcessError:
        print(f"Error: Unable to retrieve information for model {model_name}")
        return None

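# Example of the tag parsing above (hypothetical tag): "mistral:7b-instruct-q4_0"
# matches "7b" -> parameters = 7e9 and "q4" -> quantization_bits = 4, while a tag
# with no size/quant markers falls back to parameters=None and 32 bits.
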
def estimate_tps(model_info):
    """Very rough tokens-per-second guess based only on parameter count."""
    if model_info['parameters'] is None:
        return 100  # Default value when the size could not be parsed
    param_billions = model_info['parameters'] / 1e9
    return max(10, int(200 - param_billions * 10))  # Simple linear decrease, floor of 10

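# Example with the heuristic above: a 7B model gives max(10, 200 - 70) = 130 TPS,
# while a 70B model bottoms out at the floor of 10 TPS. These are placeholder
# numbers, not measurements.
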
def calculate_memory_throughput(model_info, tps):
    """Memory bandwidth (GB/s) needed to read all weights once per generated token."""
    P = model_info['parameters']
    Q = model_info['quantization_bits']
    if P and Q:
        bytes_per_parameter = Q / 8
        total_bytes = P * bytes_per_parameter
        return (total_bytes * tps) / 1e9  # Convert to GB/s
    return None

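# Worked example (hypothetical model): 7e9 parameters at 4-bit is 3.5e9 bytes of
# weights; at 130 tokens/s that is 3.5 * 130 = 455 GB/s of required read bandwidth.
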
def calculate_ops(model_info, tps):
    """FLOPs per second implied by ~6*P FLOPs per token (rough transformer estimate)."""
    P = model_info['parameters']
    if P:
        flops_per_token = 6 * P  # Estimate based on basic transformer architecture
        return flops_per_token * tps
    return None

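# Example with the same hypothetical 7B model: 6 * 7e9 = 4.2e10 FLOPs per token,
# or about 5.5e12 FLOP/s at 130 tokens/s. (2*P per token is the more common
# forward-pass-only estimate; 6*P also counts the backward pass, so treat this
# as a generous upper bound for inference.)
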
def main():
    print()
    cpu_info = get_cpu_moduleinfo()
    print(f"CPU Info: {cpu_info}")

    print()
    gpu_info = get_gpu_info()
    print(f"GPU Info: {gpu_info}")

    print_cpu_fullinfo(get_cpu_full_info())

    # Example GPU theoretical bandwidth calculation (replace with actual values)
    for gpu in gpu_info:
        memory_clock_mhz = 14000  # Example value for GDDR6 (adjust as needed)
        bus_width_bits = 384      # Example value for high-end GPUs like RTX series
        theoretical_bandwidth = calculate_theoretical_gpu_bandwidth(memory_clock_mhz, bus_width_bits)
        print(f"GPU {gpu['name']} Theoretical Memory Bandwidth: {theoretical_bandwidth:.2f} GB/s")

    print()
    local_models = get_local_models()
    model_info_list = []

    for model in local_models:
        info = get_model_info(model)
        if info is None:
            continue  # Skip models whose metadata could not be retrieved
        print(info)
        tps = estimate_tps(info)
        info['estimated_tps'] = tps
        info['memory_throughput'] = calculate_memory_throughput(info, tps)
        info['operations_per_second'] = calculate_ops(info, tps)
        model_info_list.append(info)

        print(f"Model: {info['name']}")
        print(f"Parameters: {info['parameters'] / 1e9:.2f} billion" if info['parameters'] else "Parameters: Unknown")
        print(f"Quantization: {info['quantization_bits']}-bit")
        print(f"Estimated TPS: {info['estimated_tps']}")
        print(f"Required Memory Throughput: {info['memory_throughput']:.2f} GB/s" if info['memory_throughput'] else "Required Memory Throughput: Unknown")
        print(f"Operations per Second: {info['operations_per_second']:.2e}" if info['operations_per_second'] else "Operations per Second: Unknown")
        print("---")

    with open('ollama_model_performance.json', 'w') as f:
        json.dump(model_info_list, f, indent=2)


if __name__ == "__main__":
    main()