From 2bef2a0b7c2219afc0075fb35271983eee0aa82e Mon Sep 17 00:00:00 2001 From: leduc Date: Tue, 22 Apr 2025 21:42:36 +0200 Subject: [PATCH] first commit --- README.md | 358 +++++++++++++++++++++++- allendpoints.py | 715 +++++++++++++++++++++++++++++++++++++++++++++++ example.py | 164 +++++++++++ requirements.txt | 15 + 4 files changed, 1250 insertions(+), 2 deletions(-) create mode 100644 allendpoints.py create mode 100644 example.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index 93ada67..011e75e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,357 @@ -# allendpoints +# AllEndpoints - Universal LLM Inference Tool -AllEndpoints is a powerful Python module for making inferences with various LLM providers through a unified interface. \ No newline at end of file +AllEndpoints is a powerful Python module for making inferences with various LLM providers through a unified interface. It supports multiple providers including Ollama (local), HuggingFace, Together, Google Gemini, AIQL, Groq, NVIDIA, and GitHub Copilot APIs. + +## Table of Contents + +- [Installation](#installation) +- [Environment Variables](#environment-variables) +- [Setting Up Environment Variables](#setting-up-environment-variables) +- [Linux/macOS](#linuxmacos) +- [Windows](#windows) +- [Usage](#usage) +- [Command-Line Arguments](#command-line-arguments) +- [Examples](#examples) +- [Using as a Python Module](#using-as-a-python-module) +- [Supported Providers](#supported-providers) +- [Adding New Models](#adding-new-models) +- [Troubleshooting](#troubleshooting) + +## Installation + +1. Clone the repository: + ```bash + git clone https://github.com/yourusername/allendpoints.git + cd allendpoints + ``` + +2. Install the required dependencies: + ```bash + pip install ollama requests google-generativeai huggingface_hub together groq openai colorama + ``` + +3. Install Ollama (optional, for local inference): + - [Ollama Installation Guide](https://github.com/ollama/ollama) + +## Environment Variables + +The script uses environment variables to store API keys for different providers. 
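If you are unsure which keys are already exported in your current shell, a quick check from Python can help (a minimal sketch using only the standard library; the variable names match the table below, which is what allendpoints itself reads via `os.environ`):

```python
import os

# Environment variables allendpoints looks for (one per hosted provider).
KEYS = [
    "HF_API_KEY", "TOGETHER_API_KEY", "GEMINI_API_KEY", "AIQL_API_KEY",
    "GROQ_API_KEY", "NVIDIA_API_KEY", "GITHUB_TOKEN",
]

for key in KEYS:
    # Report whether each key is visible to Python without printing its value.
    print(f"{key}: {'set' if os.environ.get(key) else 'missing'}")
```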
Here are the required environment variables for each provider: + +| Provider | Environment Variable | Description | +|-------------|---------------------|--------------------------------------------| +| HuggingFace | `HF_API_KEY` | HuggingFace API key | +| Together | `TOGETHER_API_KEY` | Together AI API key | +| Google Gemini | `GEMINI_API_KEY` | Google AI Studio API key | +| AIQL | `AIQL_API_KEY` | AIQL API key | +| Groq | `GROQ_API_KEY` | Groq API key | +| NVIDIA | `NVIDIA_API_KEY` | NVIDIA API key | +| GitHub | `GITHUB_TOKEN` | GitHub token for Copilot API access | + +### Setting Up Environment Variables + +#### Linux/macOS + +**Temporary (Current Session Only)** + +```bash +export HF_API_KEY="your_huggingface_api_key" +export TOGETHER_API_KEY="your_together_api_key" +export GEMINI_API_KEY="your_gemini_api_key" +export AIQL_API_KEY="your_aiql_api_key" +export GROQ_API_KEY="your_groq_api_key" +export NVIDIA_API_KEY="your_nvidia_api_key" +export GITHUB_TOKEN="your_github_token" +``` + +**Permanent (Add to Shell Profile)** + +Add the above export commands to your `~/.bashrc`, `~/.zshrc`, or `~/.profile` file: + +```bash +echo 'export HF_API_KEY="your_huggingface_api_key"' >> ~/.bashrc +echo 'export TOGETHER_API_KEY="your_together_api_key"' >> ~/.bashrc +# Add other API keys similarly +``` + +Then reload your shell configuration: +```bash +source ~/.bashrc # or ~/.zshrc or ~/.profile +``` + +#### Windows + +**Command Prompt (Temporary)** + +```cmd +set HF_API_KEY=your_huggingface_api_key +set TOGETHER_API_KEY=your_together_api_key +set GEMINI_API_KEY=your_gemini_api_key +set AIQL_API_KEY=your_aiql_api_key +set GROQ_API_KEY=your_groq_api_key +set NVIDIA_API_KEY=your_nvidia_api_key +set GITHUB_TOKEN=your_github_token +``` + +**PowerShell (Temporary)** + +```powershell +$env:HF_API_KEY = "your_huggingface_api_key" +$env:TOGETHER_API_KEY = "your_together_api_key" +$env:GEMINI_API_KEY = "your_gemini_api_key" +$env:AIQL_API_KEY = "your_aiql_api_key" +$env:GROQ_API_KEY = "your_groq_api_key" +$env:NVIDIA_API_KEY = "your_nvidia_api_key" +$env:GITHUB_TOKEN = "your_github_token" +``` + +**Permanent (System Environment Variables)** + +1. Right-click on "This PC" or "My Computer" and select "Properties" +2. Click on "Advanced system settings" +3. Click on "Environment Variables" +4. Under "User variables" or "System variables", click "New" +5. Enter the variable name (e.g., `HF_API_KEY`) and its value +6. Click "OK" to save + +## Usage + +### Command-Line Arguments + +``` +usage: allendpoints.py [-h] [--provider PROVIDER] [--model MODEL] [--system SYSTEM] [--list] [--debug] [-a] [prompt] + +LLM Inference Module + +positional arguments: + prompt The prompt to send to the model (default: "Why is the sky blue?") + +options: + -h, --help show this help message and exit + --provider PROVIDER The provider to use (ollama, hf, together, gemini, aiql, groq, nvidia, github) + --model MODEL The specific model to use + --system SYSTEM System content for chat models (default: "You are a helpful assistant.") + --list List available providers and models + --debug Enable debug output + -a, --all Run inference on all available providers and models +``` + +### Examples + +**List all available providers and models:** +```bash +python allendpoints.py --list +``` + +**Run inference with a specific provider and model:** +```bash +python allendpoints.py "What is the capital of France?" 
--provider ollama --model llama3.2:3b +``` + +**Run inference with a specific provider and its default model:** +```bash +python allendpoints.py "Explain quantum computing" --provider gemini +``` + +**Run inference with a custom system prompt:** +```bash +python allendpoints.py "Write a poem about AI" --provider ollama --model llama3.2:3b --system "You are a poetic assistant." +``` + +**Run inference on all available providers and models:** +```bash +python allendpoints.py "What is the meaning of life?" -a +``` + +**Run with debug output:** +```bash +python allendpoints.py "How does a nuclear reactor work?" --provider nvidia --model qwen2.5-coder-32b --debug +``` + +## Using as a Python Module + +AllEndpoints can be imported and used as a Python module in your own projects. Here's how to use it programmatically: + +### Basic Usage + +```python +# Import the necessary functions from allendpoints +from allendpoints import run_inference, check_available_apis, CONFIG + +# Run inference with a specific provider and model +# Always specify the model parameter explicitly +response = run_inference( + prompt="What is the capital of France?", + provider="ollama", + model="llama3.2:3b", + system_content="You are a helpful assistant." +) + +print(response) + +# If you want to use the default model for a provider +default_model = CONFIG["defaults"]["ollama"] +response = run_inference( + prompt="What is quantum computing?", + provider="ollama", + model=default_model +) + +print(response) +``` + +### Advanced Usage + +```python +# Import more functions for advanced usage +from allendpoints import ( + run_inference, + check_available_apis, + get_ollama_models, + InferenceHandler, + CONFIG +) + +# Get all available providers +available_providers = check_available_apis() +print(f"Available providers: {available_providers}") + +# Get all available Ollama models +ollama_models = get_ollama_models() +print(f"Available Ollama models: {ollama_models}") + +# Use a specific provider's handler directly +if "nvidia" in available_providers: + nvidia_response = InferenceHandler.nvidia( + prompt="Explain quantum computing", + model="qwen/qwen2.5-coder-32b-instruct" + ) + print(f"NVIDIA response: {nvidia_response}") + +# Access the configuration +default_models = CONFIG["defaults"] +print(f"Default models: {default_models}") +``` + +### Batch Processing Example + +```python +# Process multiple prompts with different providers +prompts = [ + "What is machine learning?", + "Explain the theory of relativity", + "How does a neural network work?" 
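    # Each prompt above is sent to every provider in the loop below;
    # edit this list to batch your own questions.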
+] + +providers = ["ollama", "gemini", "github"] + +# Process each prompt with each provider +for prompt in prompts: + for provider in providers: + try: + # Always specify the model parameter explicitly + default_model = CONFIG["defaults"][provider] + response = run_inference(prompt, provider, model=default_model) + print(f"\nPrompt: {prompt}") + print(f"Provider: {provider}") + print(f"Response: {response[:100]}...") + except Exception as e: + print(f"Error with {provider}: {str(e)}") +``` + +### Integration with main.py + +The allendpoints module is integrated with main.py for benchmarking LLM performance on coding tasks: + +```python +# In main.py +from allendpoints import check_available_apis, run_inference + +# Get available providers +available_apis = check_available_apis() + +# Run inference with a specific model +response = run_inference( + question, # The coding problem to solve + provider, # The provider to use + model_id, # The specific model to use + system_content # Optional system prompt +) +``` + +This integration allows main.py to benchmark various LLM providers and models on coding tasks using a unified interface. + +## Supported Providers + +### Ollama (Local) +- Runs locally on your machine +- Supports various open-source models +- No API key required, but needs Ollama installed + +### HuggingFace +- Provides access to HuggingFace's Inference API +- Requires `HF_API_KEY` environment variable + +### Together +- Provides access to Together AI's models +- Requires `TOGETHER_API_KEY` environment variable + +### Google Gemini +- Provides access to Google's Gemini models +- Requires `GEMINI_API_KEY` environment variable + +### AIQL +- Provides access to AIQL's models +- Requires `AIQL_API_KEY` environment variable + +### Groq +- Provides access to Groq's models +- Requires `GROQ_API_KEY` environment variable + +### NVIDIA +- Provides access to NVIDIA's models +- Requires `NVIDIA_API_KEY` environment variable + +### GitHub +- Provides access to GitHub Copilot models +- Requires `GITHUB_TOKEN` environment variable + +## Adding New Models + +To add a new model to an existing provider, edit the `CONFIG` dictionary in the script: + +```python +CONFIG = { + "models": { + "provider_name": { + "model_display_name": "actual_model_id", + # Add your new model here + "new_model_name": "new_model_id" + } + } +} +``` + +## Troubleshooting + +### API Key Issues +- Ensure your API keys are correctly set in your environment variables +- Check that the API keys have not expired +- Verify that you have the necessary permissions for the models you're trying to access + +### Ollama Issues +- Ensure Ollama is installed and running +- Check that the model you're trying to use is downloaded (`ollama list`) +- If a model is not available, pull it with `ollama pull model_name` + +### Connection Issues +- Check your internet connection +- Ensure that the API endpoints are not blocked by your network or firewall +- Some providers may have rate limits or usage quotas + +### Model Loading +- Large models may take time to load, especially on the first run +- The script preloads Ollama models to ensure fair timing measurements +- If a model consistently fails to load, try a smaller model or a different provider + +### Colored Error Messages +- Install the `colorama` package for colored error messages: `pip install colorama` diff --git a/allendpoints.py b/allendpoints.py new file mode 100644 index 0000000..573160b --- /dev/null +++ b/allendpoints.py @@ -0,0 +1,715 @@ +#!/usr/bin/env python3 +# -*- coding: 
utf-8 -*- +''' +LLM Inference Module +A simplified module for making inferences with various LLM providers + +Requirements: +pip install -r requirements.txt +''' + +import sys +import requests +import ollama +import google.generativeai as genai +from huggingface_hub import InferenceClient +from together import Together +from groq import Groq +import os +import time +from openai import OpenAI # Used for both NVIDIA and GitHub endpoints + +CONFIG = { + "api_keys": { + "HF_API_KEY": os.environ.get("HF_API_KEY"), + "TOGETHER_API_KEY": os.environ.get("TOGETHER_API_KEY"), + "GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY"), + "AIQL_API_KEY": os.environ.get("AIQL_API_KEY"), + "GROQ_API_KEY": os.environ.get("GROQ_API_KEY"), + "NVIDIA_API_KEY": os.environ.get("NVIDIA_API_KEY"), + "GITHUB_TOKEN": os.environ.get("GITHUB_TOKEN") + }, + "models": { + "aiql": { + "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct", + "Llama-3.3-70B-Chat": "meta-llama/Llama-3.3-70B-Chat" + }, + "together": { + "DeepSeek-70B": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free", + "Llama-3-3-70B-Turbo": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free" + }, + "gemini": { + "gemini-2.5-pro-preview": "gemini-2.5-pro-preview-03-25", + "gemini-2.5-flash-preview": "gemini-2.5-flash-preview-04-17", + "gemini-1.5-flash": "gemini-1.5-flash", + "gemini-1.5-pro": "gemini-1.5-pro", + "gemini-1.5-flash-002": "gemini-1.5-flash-002", + "gemini-1.5-flash-001": "gemini-1.5-flash-001", + "gemini-1.5-pro-002": "gemini-1.5-pro-002", + "gemini-1.5-pro-001": "gemini-1.5-pro-001", + "gemini-2.0-flash": "gemini-2.0-flash", + "gemini-2.0-flash-exp": "gemini-2.0-flash-exp", + "gemini-2.0-flash-thinking-exp-01-21": "gemini-2.0-flash-thinking-exp-01-21" + }, + "hf": { + "DeepSeek-R1-Distill-Qwen-32B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "Qwen2.5-Coder-32B": "Qwen/Qwen2.5-Coder-32B-Instruct" + }, + "ollama": [ + "falcon3:10b" + ], + "groq": { + "llama-3.3-70b-versatile": "llama-3.3-70b-versatile", + "deepseek-r1-distill-llama-70b": "deepseek-r1-distill-llama-70b" + }, + "nvidia": { + "qwen2.5-coder-32b": "qwen/qwen2.5-coder-32b-instruct", + "llama2-70b": "meta-llama/llama-2-70b-chat", + "mixtral-8x7b": "mistralai/mixtral-8x7b-instruct", + "yi-34b": "01-ai/yi-34b-chat" + }, + "github": { + "gpt-4o": "gpt-4o", + "gpt-4o-mini": "gpt-4o-mini", + "mistral-small": "mistral-small-2503", + "deepseek-v3": "deepseek-v3", + "phi-4": "phi-4", + "llama-3.3-70b": "llama-3.3-70b-instruct" + } + }, + "defaults": { + "ollama": "llama3.2:3b", + "hf": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "together": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free", + "gemini": "gemini-1.5-flash", + "aiql": "meta-llama/Llama-3.3-70B-Instruct", + "groq": "llama-3.3-70b-versatile", + "nvidia": "qwen/qwen2.5-coder-32b-instruct", + "github": "gpt-4o" + } +} + +class InferenceHandler: + @staticmethod + def preload_ollama_model(model: str): + """Preload an Ollama model by sending a simple query to it""" + try: + print(f"Loading Ollama model {model}...") + client = ollama.Client() + + # Send a simple query to load the model into memory + client.chat(model=model, messages=[{'role': 'user', 'content': 'hello'}]) + print(f"Model {model} loaded successfully") + return True + except Exception as e: + print(f"Failed to preload model {model}: {str(e)}") + return False + + @staticmethod + def ollama(prompt: str, model: str, system_content: str = None, preload: bool = False) -> str: + try: + client = ollama.Client() + + # Preload the model if requested + if preload: + 
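                # Warm the model into memory with a trivial query first, so the
                # actual request below does not also pay the model-load cost.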
InferenceHandler.preload_ollama_model(model) + + # If system_content is provided, use the chat API with messages + if system_content: + messages = [ + {'role': 'system', 'content': system_content}, + {'role': 'user', 'content': prompt} + ] + response = client.chat(model=model, messages=messages) + + # Add response validation + if not response or 'message' not in response or 'content' not in response['message']: + return "Error: Empty response from Ollama chat API" + + return response['message']['content'] + else: + # Use the generate API without system content + response = client.generate(model=model, prompt=prompt) + + # Add response validation + if not response or 'response' not in response: + return "Error: Empty response from Ollama generate API" + + return response['response'] + + except Exception as e: + error_msg = str(e) + if "connection" in error_msg.lower(): + return "Error: Could not connect to Ollama server" + return f"Ollama Error: {error_msg}" + + @staticmethod + def hf(prompt: str, model: str) -> str: + try: + client = InferenceClient(token=CONFIG['api_keys']['HF_API_KEY']) + response = client.text_generation(prompt, model=model) + + # Add response validation + if not response or response.isspace(): + return "Error: Empty response from HuggingFace" + + return response + + except Exception as e: + # Improve error message + error_msg = str(e) + if "Expecting value" in error_msg: + return "Error: Invalid response format from HuggingFace API" + return f"HF Error: {error_msg}" + + @staticmethod + def together(prompt: str, model: str) -> str: + try: + client = Together(api_key=CONFIG['api_keys']['TOGETHER_API_KEY']) + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=2048 # Add reasonable token limit + ) + + # Add response validation + if not response or not response.choices: + return "Error: Empty response from Together" + + return response.choices[0].message.content + + except Exception as e: + error_msg = str(e) + if "authentication" in error_msg.lower(): + return "Error: Invalid Together API key" + return f"Together Error: {error_msg}" + + @staticmethod + def gemini(prompt: str, model: str) -> str: + try: + genai.configure(api_key=CONFIG['api_keys']['GEMINI_API_KEY']) + model = genai.GenerativeModel(model) + response = model.generate_content(prompt) + + # Add response validation + if not response or not response.text: + return "Error: Empty response from Gemini" + + return response.text + + except Exception as e: + error_msg = str(e) + if "invalid" in error_msg.lower() and "model" in error_msg.lower(): + return "Error: Invalid Gemini model" + return f"Gemini Error: {error_msg}" + + @staticmethod + def aiql(prompt: str, model: str) -> str: + try: + headers = { + "Authorization": f"Bearer {CONFIG['api_keys']['AIQL_API_KEY']}", + "Content-Type": "application/json" + } + data = { + "model": model, + "messages": [{"role": "user", "content": prompt}] + } + response = requests.post( + "https://ai.aiql.com/v1/chat/completions", + headers=headers, + json=data + ) + + # Add response validation + if not response or response.status_code != 200: + return f"Error: API request failed with status {response.status_code}" + + response_json = response.json() + if not response_json: + return "Error: Invalid response format from AIQL" + + # Try different response formats + if 'choices' in response_json: + return response_json['choices'][0]['message']['content'] + elif 'response' in response_json: + return 
response_json['response'] + elif 'content' in response_json: + return response_json['content'] + else: + return "Error: Could not find response content in AIQL response" + + except Exception as e: + error_msg = str(e) + if "Expecting value" in error_msg: + return "Error: Invalid response format from AIQL API" + return f"AIQL Error: {error_msg}" + + @staticmethod + def groq(prompt: str, model: str) -> str: + try: + client = Groq(api_key=CONFIG['api_keys']['GROQ_API_KEY']) + response = client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "you are a helpful assistant." + }, + { + "role": "user", + "content": prompt + } + ], + model=model, + temperature=0.7, + max_completion_tokens=2048, + top_p=1, + stream=False + ) + + # Add response validation + if not response or not response.choices: + return "Error: Empty response from Groq" + + return response.choices[0].message.content + + except Exception as e: + error_msg = str(e) + if "authentication" in error_msg.lower(): + return "Error: Invalid Groq API key" + return f"Groq Error: {error_msg}" + + @staticmethod + def nvidia(prompt: str, model: str) -> str: + try: + # Get the actual model ID from the models dictionary + # This is the key difference - we need to use the model ID, not the model name + model_id = model + if model in CONFIG['models']['nvidia']: + model_id = CONFIG['models']['nvidia'][model] + + print(f"NVIDIA: Initializing client with model {model} (ID: {model_id})") + client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key=CONFIG['api_keys']['NVIDIA_API_KEY'] + ) + + print(f"NVIDIA: Sending request to model {model_id}") + completion = client.chat.completions.create( + model=model_id, + messages=[{"role": "user", "content": prompt}], + temperature=0.2, + top_p=0.7, + max_tokens=1024 + ) + + # Add response validation + if not completion or not completion.choices: + print(f"NVIDIA: Empty response received") + return "Error: Empty response from NVIDIA API" + + response_content = completion.choices[0].message.content + print(f"NVIDIA: Response received, length: {len(response_content)}") + return response_content + + except Exception as e: + error_msg = str(e) + print(f"NVIDIA Error: {error_msg}") + if "authentication" in error_msg.lower(): + return "Error: Invalid NVIDIA API key" + return f"NVIDIA Error: {error_msg}" + + @staticmethod + def github(prompt: str, model: str) -> str: + try: + # GitHub endpoint for OpenAI API + ENDPOINT = "https://models.inference.ai.azure.com" + + # Get the actual model ID from the models dictionary + model_id = model + if model in CONFIG['models']['github']: + model_id = CONFIG['models']['github'][model] + + print(f"GitHub: Initializing client with model {model} (ID: {model_id})") + + client = OpenAI( + base_url=ENDPOINT, + api_key=CONFIG['api_keys']['GITHUB_TOKEN'] + ) + + print(f"GitHub: Sending request to model {model_id}") + response = client.chat.completions.create( + messages=[{"role": "user", "content": prompt}], + model=model_id, + max_tokens=1024, + temperature=0.7 + ) + + # Add response validation + if not response or not response.choices: + print(f"GitHub: Empty response received") + return "Error: Empty response from GitHub API" + + response_content = response.choices[0].message.content + print(f"GitHub: Response received, length: {len(response_content)}") + return response_content + + except Exception as e: + error_msg = str(e) + print(f"GitHub Error: {error_msg}") + if "authentication" in error_msg.lower(): + return "Error: Invalid GitHub token" 
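            # Any other failure: surface the raw error message to the caller.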
+ return f"GitHub Error: {error_msg}" + +def get_available_models(): + """Returns a dictionary of all available models""" + return CONFIG['models'] + +def get_default_models(): + """Returns a dictionary of default models for each provider""" + return CONFIG['defaults'] + +def get_ollama_models(): + """Get available Ollama models from local server using subprocess""" + try: + import subprocess + + # Execute the shell command and capture the output + result = subprocess.run(['ollama', 'list'], capture_output=True, text=True) + + # Check if the command was successful + if result.returncode == 0: + # Split the output into lines and skip the first line (header) + lines = result.stdout.strip().split('\n')[1:] + + # Extract the first field from each line (model name) + models = [line.split()[0] for line in lines] + return models + else: + print(f"Error executing 'ollama list': {result.stderr}") + return CONFIG['models']['ollama'] + except Exception as e: + print(f"Exception in get_ollama_models: {str(e)}") + return CONFIG['models']['ollama'] + +def check_provider_key_available(provider): + """Check if the API key for a specific provider is available. + + Args: + provider (str): The provider to check + + Returns: + bool: True if the key is available, False otherwise + """ + # Ollama is a local service, so no API key is needed + if provider == "ollama": + try: + client = ollama.Client() + models = client.list() + return True + except Exception: + return False + + # For other providers, check if the API key is available + key_mapping = { + "hf": "HF_API_KEY", + "together": "TOGETHER_API_KEY", + "gemini": "GEMINI_API_KEY", + "aiql": "AIQL_API_KEY", + "groq": "GROQ_API_KEY", + "nvidia": "NVIDIA_API_KEY", + "github": "GITHUB_TOKEN" + } + + if provider not in key_mapping: + return False + + key_name = key_mapping[provider] + return bool(CONFIG["api_keys"][key_name]) + +def run_inference(prompt, provider=None, model=None, system_content=None): + """Run inference with specified provider and model. + + Args: + prompt (str): The prompt to send to the model + provider (str, optional): The provider to use (ollama, hf, together, gemini, aiql, groq, nvidia, github) + model (str, optional): The specific model to use + system_content (str, optional): Custom system role content for models that support it + + Returns: + str: The model's response + """ + # If no provider specified, use the first available one + if not provider: + available = check_available_apis() + if not available: + return "Error: No available providers found. Please check your API keys and Ollama installation." + provider = available[0] + + # Check if the API key for the provider is available + if not check_provider_key_available(provider): + return f"Error: API key for {provider} is not available. Please set the appropriate environment variable." + + # If no model specified, use the default for the provider + if not model: + model = CONFIG["defaults"][provider] + + # For ollama, we need to check if the model exists locally + if provider == "ollama" and model not in get_ollama_models(): + return f"Error: Model '{model}' not found in Ollama. Please pull it first with 'ollama pull {model}'." + + print(f"Running inference with provider: {provider}, model: {model}") + print(f"Prompt: {prompt[:50]}..." 
if len(prompt) > 50 else f"Prompt: {prompt}") + start_time = time.time() + + # Call the appropriate provider method + try: + if provider == "ollama": + response = InferenceHandler.ollama(prompt, model, system_content) + elif provider == "hf": + response = InferenceHandler.hf(prompt, model) + elif provider == "together": + response = InferenceHandler.together(prompt, model) + elif provider == "gemini": + response = InferenceHandler.gemini(prompt, model) + elif provider == "aiql": + response = InferenceHandler.aiql(prompt, model) + elif provider == "groq": + response = InferenceHandler.groq(prompt, model) + elif provider == "nvidia": + print(f"Calling NVIDIA handler with model: {model}") + response = InferenceHandler.nvidia(prompt, model) + elif provider == "github": + print(f"Calling GitHub handler with model: {model}") + response = InferenceHandler.github(prompt, model) + else: + return f"Error: Unknown provider '{provider}'" + + end_time = time.time() + print(f"Inference completed in {end_time - start_time:.2f} seconds") + return response + except Exception as e: + print(f"Error during inference: {str(e)}") + return f"Error with {provider}: {str(e)}" + +def check_available_apis(): + """Check which API tokens are available in the environment and return available providers.""" + available = [] + + # Check Ollama by attempting to connect + try: + client = ollama.Client() + models = client.list() + if models: + available.append("ollama") + except Exception as e: + print(f"Ollama not available: {e}") + + # Check API keys + if CONFIG["api_keys"]["HF_API_KEY"]: + available.append("hf") + + if CONFIG["api_keys"]["TOGETHER_API_KEY"]: + available.append("together") + + if CONFIG["api_keys"]["GEMINI_API_KEY"]: + available.append("gemini") + + if CONFIG["api_keys"]["AIQL_API_KEY"]: + available.append("aiql") + + if CONFIG["api_keys"]["GROQ_API_KEY"]: + available.append("groq") + + if CONFIG["api_keys"]["NVIDIA_API_KEY"]: + print("NVIDIA API key found") + available.append("nvidia") + + if CONFIG["api_keys"]["GITHUB_TOKEN"]: + print("GitHub token found") + available.append("github") + + return available + +def print_available_apis(): + """Print information about available APIs and possible requests""" + available_providers = check_available_apis() + + print("\n" + "=" * 60) + print("AVAILABLE API PROVIDERS") + print("=" * 60) + + if not available_providers: + print("\nNo API providers are available. 
Please set environment variables for API keys:") + for key in CONFIG['api_keys'].keys(): + print(f" - {key}") + print("\nOr start Ollama locally to use local models.") + return False + + print(f"\nFound {len(available_providers)} available API providers:\n") + + for provider in available_providers: + print(f"- {provider.upper()}:") + + # Special handling for Ollama to show actual local models + if provider == "ollama": + ollama_models = get_ollama_models() + for model in ollama_models: + print(f" - {model}") + else: + models = CONFIG['models'][provider] + + if isinstance(models, dict): + for model_name, model_id in models.items(): + print(f" - {model_name} ({model_id})") + else: # It's a list + for model in models: + print(f" - {model}") + + print("\n" + "=" * 60) + return True + +def main(): + """Example function that runs the same prompt through all available providers and models.""" + import argparse + + parser = argparse.ArgumentParser(description='LLM Inference Module') + parser.add_argument('prompt', nargs='?', type=str, help='The prompt to send to the model', default="Why is the sky blue?") + parser.add_argument('--provider', type=str, help='The provider to use (ollama, hf, together, gemini, aiql, groq, nvidia, github)') + parser.add_argument('--model', type=str, help='The specific model to use') + parser.add_argument('--system', type=str, help='System content for chat models', default="You are a helpful assistant.") + parser.add_argument('--list', action='store_true', help='List available providers and models') + parser.add_argument('--debug', action='store_true', help='Enable debug output') + parser.add_argument('-a', '--all', action='store_true', help='Run inference on all available providers and models') + + args = parser.parse_args() + + # Check if the specified provider's API key is available + if args.provider and not check_provider_key_available(args.provider): + print(f"Error: API key for {args.provider} is not available. Please set the appropriate environment variable.") + return + + if args.list: + print_available_apis() + return + + # If provider is specified but no model, use the default model for that provider + if args.provider and not args.model: + args.model = CONFIG["defaults"][args.provider] + + if args.debug: + print(f"Running with provider: {args.provider}, model: {args.model}") + print(f"Prompt: {args.prompt[:50]}..." 
if len(args.prompt) > 50 else f"Prompt: {args.prompt}") + + # If -a/--all flag is specified, run on all providers regardless of whether a specific provider was given + if args.all: + # Continue to the code below that runs on all providers + pass + # Otherwise, if a specific provider is given, run only on that provider + elif args.provider: + start_time = time.time() + response = run_inference(args.prompt, args.provider, args.model, args.system) + end_time = time.time() + + if args.debug: + print(f"Inference completed in {end_time - start_time:.2f} seconds") + + # Print the response + print("\nResponse:") + print(response) + return + + # If we get here, either --all flag was specified or no provider was specified + print(f"\nPrompt: {args.prompt}\n") + print("Running inference on all models for each provider...\n") + + # Get available providers (only those with API keys) + available_providers = check_available_apis() + + # Store response times for leaderboard + response_times = [] + + # Import colorama for colored terminal output + try: + from colorama import init, Fore, Style + init() # Initialize colorama + color_enabled = True + except ImportError: + color_enabled = False + print("Note: Install 'colorama' package for colored error messages (pip install colorama)") + + # Run inference on each provider with all its models + for provider in available_providers: + print(f"\n{'=' * 30}\n{provider.upper()} MODELS\n{'=' * 30}\n") + + # Special handling for Ollama to use actual local models + if provider == "ollama": + ollama_models = get_ollama_models() + model_items = [(model, model) for model in ollama_models] + else: + models = CONFIG['models'][provider] + # Handle different model formats (list vs dict) + if isinstance(models, dict): + model_items = list(models.items()) + else: # It's a list + model_items = [(model, model) for model in models] + + for model_name, model_id in model_items: + try: + print(f"\n----- {model_name} -----\n") + + # For Ollama models, preload the model first + if provider == "ollama": + # Preload the model with a dummy query + InferenceHandler.preload_ollama_model(model_id) + print("Warming up model...") + time.sleep(1) # Short pause for UI feedback + + # Start timing after preloading + start_time = time.time() + + # Run inference with the user's prompt and system message + response = run_inference(args.prompt, provider, model_id, args.system) + + # End timing + end_time = time.time() + elapsed_time = end_time - start_time + + # Store response time for leaderboard + full_model_name = f"{provider}/{model_name}" + response_times.append((full_model_name, elapsed_time)) + + # Print response and timing information + print(response) + print(f"\nResponse time: {elapsed_time:.2f} seconds") + print("\n" + "-" * 50) + except Exception as e: + # Print error in red if colorama is available + if color_enabled: + error_msg = f"{Fore.RED}Error with {provider}/{model_name}: {str(e)}{Style.RESET_ALL}" + else: + error_msg = f"Error with {provider}/{model_name}: {str(e)}" + print(error_msg) + print("\n" + "-" * 50) + + # Only display leaderboard if we have results + if response_times: + # Display leaderboard + print("\n" + "=" * 50) + print("RESPONSE TIME LEADERBOARD") + print("=" * 50) + + # Sort by response time (fastest first) + response_times.sort(key=lambda x: x[1]) + + # Print leaderboard + print(f"{'Rank':<6}{'Model':<40}{'Time (seconds)':<15}") + print("-" * 61) + for i, (model, time_taken) in enumerate(response_times, 1): + print(f"{i:<6}{model:<40}{time_taken:.2f}") + + 
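        # Closing separator to mark the end of the leaderboard output.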
print("\n" + "=" * 50) + else: + print("\nNo successful responses to display in leaderboard.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/example.py b/example.py new file mode 100644 index 0000000..fd181f8 --- /dev/null +++ b/example.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating how to use allendpoints as a Python module. +""" + +from allendpoints import ( + run_inference, + check_available_apis, + get_ollama_models, + InferenceHandler, + CONFIG, + check_provider_key_available +) + +def basic_example(): + """Basic usage example of allendpoints.""" + print("\n=== BASIC EXAMPLE ===") + + # Inference with default model for Ollama + default_model = CONFIG["defaults"]["ollama"] + response = run_inference( + prompt="What is the capital of France?", + provider="ollama", + model=default_model + ) + + print(f"Response from Ollama (model: {default_model}): {response}") + + # Inference with specific model and system prompt + response = run_inference( + prompt="Write a haiku about AI", + provider="ollama", + model="llama3.2:3b", + system_content="You are a poetic assistant that only writes in haiku." + ) + + print(f"\nHaiku from Ollama (llama3.2:3b):\n{response}") + +def provider_availability_example(): + """Example showing how to check provider availability.""" + print("\n=== PROVIDER AVAILABILITY EXAMPLE ===") + + # Check which providers are available (have valid API keys) + available_providers = check_available_apis() + print(f"Available providers: {', '.join(available_providers)}") + + # Check for specific providers + providers_to_check = ["ollama", "gemini", "github", "hf", "together", "aiql", "groq", "nvidia"] + + for provider in providers_to_check: + is_available = check_provider_key_available(provider) + status = "✅ Available" if is_available else "❌ Not available" + print(f"{provider}: {status}") + +def model_listing_example(): + """Example showing how to list available models.""" + print("\n=== MODEL LISTING EXAMPLE ===") + + # Get available Ollama models + try: + ollama_models = get_ollama_models() + print(f"Available Ollama models: {', '.join(ollama_models[:5])}...") + print(f"Total Ollama models: {len(ollama_models)}") + except Exception as e: + print(f"Error getting Ollama models: {str(e)}") + + # Show configured models for each provider + print("\nConfigured models per provider:") + for provider, models in CONFIG["models"].items(): + model_count = len(models) + print(f"{provider}: {model_count} models configured") + + # Handle both list and dictionary model configurations + if isinstance(models, dict): + # For dictionary-based configurations (most providers) + sample_models = list(models.keys())[:3] + if sample_models: + print(f" Sample models: {', '.join(sample_models)}") + elif isinstance(models, list): + # For list-based configurations (ollama) + sample_models = models[:3] + if sample_models: + print(f" Sample models: {', '.join(sample_models)}") + +def direct_provider_example(): + """Example showing how to use provider handlers directly.""" + print("\n=== DIRECT PROVIDER EXAMPLE ===") + + # Check if Ollama is available + if check_provider_key_available("ollama"): + try: + # Use the Ollama handler directly + response = InferenceHandler.ollama( + prompt="Explain how a computer works in one paragraph", + model="llama3.2:3b" + ) + print(f"Direct Ollama response:\n{response}") + except Exception as e: + print(f"Error with direct Ollama call: {str(e)}") + + # Check if Gemini is available + if 
check_provider_key_available("gemini"): + try: + # Use the Gemini handler directly + response = InferenceHandler.gemini( + prompt="What is quantum computing?", + model="gemini-1.5-pro" + ) + print(f"\nDirect Gemini response:\n{response[:150]}...") + except Exception as e: + print(f"Error with direct Gemini call: {str(e)}") + +def batch_processing_example(): + """Example showing how to process multiple prompts with multiple providers.""" + print("\n=== BATCH PROCESSING EXAMPLE ===") + + # Define a list of prompts + prompts = [ + "What is machine learning?", + "Explain the theory of relativity briefly" + ] + + # Get available providers (only use the first 2 for this example) + available_providers = check_available_apis()[:2] + + if not available_providers: + print("No providers available for batch processing") + return + + print(f"Processing {len(prompts)} prompts with {len(available_providers)} providers: {', '.join(available_providers)}") + + # Process each prompt with each provider + for prompt in prompts: + print(f"\nPrompt: {prompt}") + + for provider in available_providers: + try: + # Get default model for this provider + default_model = CONFIG["defaults"][provider] + + # Run inference with explicit model parameter + response = run_inference(prompt, provider, model=default_model) + + # Print truncated response + print(f" {provider} ({default_model}): {response[:100]}...") + except Exception as e: + print(f" Error with {provider}: {str(e)}") + +def main(): + """Run all examples.""" + print("AllEndpoints Python Module Examples") + print("==================================") + + # Run examples + basic_example() + provider_availability_example() + model_listing_example() + direct_provider_example() + batch_processing_example() + + print("\nExamples completed!") + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d81c3e5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +# AllEndpoints - Required dependencies +# Core dependencies +ollama>=0.1.6 +requests>=2.31.0 +google-generativeai>=0.3.0 +huggingface_hub>=0.19.0 +together>=0.2.8 +groq>=0.4.0 +openai>=1.6.0 + +# Optional dependencies +colorama>=0.4.6 # For colored terminal output + +# Environment variables management (optional) +python-dotenv>=1.0.0