#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
LLM Inference Module
A simple module for running inference with various LLM providers
Requirements:
pip install -r requirements.txt
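Example usage (illustrative; the flags are defined in main() below and assume the
relevant API keys are exported as environment variables):
    python allendpoints.py --list
    python allendpoints.py "Why is the sky blue?" --provider ollama --model llama3.2:3b
    python allendpoints.py -a --debug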
'''
import sys
import requests
import ollama
import google.generativeai as genai
from huggingface_hub import InferenceClient
from together import Together
from groq import Groq
import os
import time
from openai import OpenAI # Used for both NVIDIA and GitHub endpoints
CONFIG = {
"api_keys": {
"HF_API_KEY": os.environ.get("HF_API_KEY"),
"TOGETHER_API_KEY": os.environ.get("TOGETHER_API_KEY"),
"GEMINI_API_KEY": os.environ.get("GEMINI_API_KEY"),
"AIQL_API_KEY": os.environ.get("AIQL_API_KEY"),
"GROQ_API_KEY": os.environ.get("GROQ_API_KEY"),
"NVIDIA_API_KEY": os.environ.get("NVIDIA_API_KEY"),
"GITHUB_TOKEN": os.environ.get("GITHUB_TOKEN")
},
"models": {
"aiql": {
"Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct",
"Llama-3.3-70B-Chat": "meta-llama/Llama-3.3-70B-Chat"
},
"together": {
"DeepSeek-70B": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
"Llama-3-3-70B-Turbo": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
},
"gemini": {
"gemini-2.5-pro-preview": "gemini-2.5-pro-preview-03-25",
"gemini-2.5-flash-preview": "gemini-2.5-flash-preview-04-17",
"gemini-1.5-flash": "gemini-1.5-flash",
"gemini-1.5-pro": "gemini-1.5-pro",
"gemini-1.5-flash-002": "gemini-1.5-flash-002",
"gemini-1.5-flash-001": "gemini-1.5-flash-001",
"gemini-1.5-pro-002": "gemini-1.5-pro-002",
"gemini-1.5-pro-001": "gemini-1.5-pro-001",
"gemini-2.0-flash": "gemini-2.0-flash",
"gemini-2.0-flash-exp": "gemini-2.0-flash-exp",
"gemini-2.0-flash-thinking-exp-01-21": "gemini-2.0-flash-thinking-exp-01-21"
},
"hf": {
"DeepSeek-R1-Distill-Qwen-32B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
"Qwen2.5-Coder-32B": "Qwen/Qwen2.5-Coder-32B-Instruct"
},
"ollama": [
"falcon3:10b"
],
"groq": {
"llama-3.3-70b-versatile": "llama-3.3-70b-versatile",
"deepseek-r1-distill-llama-70b": "deepseek-r1-distill-llama-70b"
},
"nvidia": {
"qwen2.5-coder-32b": "qwen/qwen2.5-coder-32b-instruct",
"llama2-70b": "meta-llama/llama-2-70b-chat",
"mixtral-8x7b": "mistralai/mixtral-8x7b-instruct",
"yi-34b": "01-ai/yi-34b-chat"
},
"github": {
"gpt-4o": "gpt-4o",
"gpt-4o-mini": "gpt-4o-mini",
"mistral-small": "mistral-small-2503",
"deepseek-v3": "deepseek-v3",
"phi-4": "phi-4",
"llama-3.3-70b": "llama-3.3-70b-instruct"
}
},
"defaults": {
"ollama": "llama3.2:3b",
"hf": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
"together": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
"gemini": "gemini-1.5-flash",
"aiql": "meta-llama/Llama-3.3-70B-Instruct",
"groq": "llama-3.3-70b-versatile",
"nvidia": "qwen/qwen2.5-coder-32b-instruct",
"github": "gpt-4o"
}
}
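# Illustrative lookups (values taken from the table above): the NVIDIA and GitHub
# handlers map a friendly model name to its provider model ID via CONFIG["models"],
# and run_inference() falls back to CONFIG["defaults"][provider] when no model is given.
#   CONFIG["models"]["nvidia"]["qwen2.5-coder-32b"]  # -> "qwen/qwen2.5-coder-32b-instruct"
#   CONFIG["defaults"]["gemini"]                     # -> "gemini-1.5-flash"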
class InferenceHandler:
@staticmethod
def preload_ollama_model(model: str):
"""Preload an Ollama model by sending a simple query to it"""
try:
print(f"Loading Ollama model {model}...")
client = ollama.Client()
# Send a simple query to load the model into memory
client.chat(model=model, messages=[{'role': 'user', 'content': 'hello'}])
print(f"Model {model} loaded successfully")
return True
except Exception as e:
print(f"Failed to preload model {model}: {str(e)}")
return False
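    # Example (illustrative; assumes a local Ollama server with the model already pulled):
    #   InferenceHandler.preload_ollama_model("llama3.2:3b")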
@staticmethod
def ollama(prompt: str, model: str, system_content: str = None, preload: bool = False) -> str:
try:
client = ollama.Client()
# Preload the model if requested
if preload:
InferenceHandler.preload_ollama_model(model)
# If system_content is provided, use the chat API with messages
if system_content:
messages = [
{'role': 'system', 'content': system_content},
{'role': 'user', 'content': prompt}
]
response = client.chat(model=model, messages=messages)
                # Validate the response
if not response or 'message' not in response or 'content' not in response['message']:
return "Error: Empty response from Ollama chat API"
return response['message']['content']
else:
# Use the generate API without system content
response = client.generate(model=model, prompt=prompt)
                # Validate the response
if not response or 'response' not in response:
return "Error: Empty response from Ollama generate API"
return response['response']
except Exception as e:
error_msg = str(e)
if "connection" in error_msg.lower():
return "Error: Could not connect to Ollama server"
return f"Ollama Error: {error_msg}"
@staticmethod
def hf(prompt: str, model: str) -> str:
try:
client = InferenceClient(token=CONFIG['api_keys']['HF_API_KEY'])
response = client.text_generation(prompt, model=model)
            # Validate the response
if not response or response.isspace():
return "Error: Empty response from HuggingFace"
return response
except Exception as e:
            # Map known failure modes to a clearer error message
error_msg = str(e)
if "Expecting value" in error_msg:
return "Error: Invalid response format from HuggingFace API"
return f"HF Error: {error_msg}"
@staticmethod
def together(prompt: str, model: str) -> str:
try:
client = Together(api_key=CONFIG['api_keys']['TOGETHER_API_KEY'])
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
                max_tokens=2048  # reasonable default output token limit
)
            # Validate the response
if not response or not response.choices:
return "Error: Empty response from Together"
return response.choices[0].message.content
except Exception as e:
error_msg = str(e)
if "authentication" in error_msg.lower():
return "Error: Invalid Together API key"
return f"Together Error: {error_msg}"
@staticmethod
def gemini(prompt: str, model: str) -> str:
try:
genai.configure(api_key=CONFIG['api_keys']['GEMINI_API_KEY'])
model = genai.GenerativeModel(model)
response = model.generate_content(prompt)
            # Validate the response
if not response or not response.text:
return "Error: Empty response from Gemini"
return response.text
except Exception as e:
error_msg = str(e)
if "invalid" in error_msg.lower() and "model" in error_msg.lower():
return "Error: Invalid Gemini model"
return f"Gemini Error: {error_msg}"
@staticmethod
def aiql(prompt: str, model: str) -> str:
try:
headers = {
"Authorization": f"Bearer {CONFIG['api_keys']['AIQL_API_KEY']}",
"Content-Type": "application/json"
}
data = {
"model": model,
"messages": [{"role": "user", "content": prompt}]
}
response = requests.post(
"https://ai.aiql.com/v1/chat/completions",
headers=headers,
json=data
)
            # Validate the response
if not response or response.status_code != 200:
return f"Error: API request failed with status {response.status_code}"
response_json = response.json()
if not response_json:
return "Error: Invalid response format from AIQL"
# Try different response formats
if 'choices' in response_json:
return response_json['choices'][0]['message']['content']
elif 'response' in response_json:
return response_json['response']
elif 'content' in response_json:
return response_json['content']
else:
return "Error: Could not find response content in AIQL response"
except Exception as e:
error_msg = str(e)
if "Expecting value" in error_msg:
return "Error: Invalid response format from AIQL API"
return f"AIQL Error: {error_msg}"
@staticmethod
def groq(prompt: str, model: str) -> str:
try:
client = Groq(api_key=CONFIG['api_keys']['GROQ_API_KEY'])
response = client.chat.completions.create(
messages=[
{
"role": "system",
"content": "you are a helpful assistant."
},
{
"role": "user",
"content": prompt
}
],
model=model,
temperature=0.7,
max_completion_tokens=2048,
top_p=1,
stream=False
)
            # Validate the response
if not response or not response.choices:
return "Error: Empty response from Groq"
return response.choices[0].message.content
except Exception as e:
error_msg = str(e)
if "authentication" in error_msg.lower():
return "Error: Invalid Groq API key"
return f"Groq Error: {error_msg}"
@staticmethod
def nvidia(prompt: str, model: str) -> str:
try:
            # Resolve the friendly model name to the model ID expected by the NVIDIA API
model_id = model
if model in CONFIG['models']['nvidia']:
model_id = CONFIG['models']['nvidia'][model]
print(f"NVIDIA: Initializing client with model {model} (ID: {model_id})")
client = OpenAI(
base_url="https://integrate.api.nvidia.com/v1",
api_key=CONFIG['api_keys']['NVIDIA_API_KEY']
)
print(f"NVIDIA: Sending request to model {model_id}")
completion = client.chat.completions.create(
model=model_id,
messages=[{"role": "user", "content": prompt}],
temperature=0.2,
top_p=0.7,
max_tokens=1024
)
            # Validate the response
            if not completion or not completion.choices:
                print("NVIDIA: Empty response received")
return "Error: Empty response from NVIDIA API"
response_content = completion.choices[0].message.content
print(f"NVIDIA: Response received, length: {len(response_content)}")
return response_content
except Exception as e:
error_msg = str(e)
print(f"NVIDIA Error: {error_msg}")
if "authentication" in error_msg.lower():
return "Error: Invalid NVIDIA API key"
return f"NVIDIA Error: {error_msg}"
@staticmethod
def github(prompt: str, model: str) -> str:
try:
            # GitHub Models endpoint (OpenAI-compatible, hosted on Azure)
ENDPOINT = "https://models.inference.ai.azure.com"
# Get the actual model ID from the models dictionary
model_id = model
if model in CONFIG['models']['github']:
model_id = CONFIG['models']['github'][model]
print(f"GitHub: Initializing client with model {model} (ID: {model_id})")
client = OpenAI(
base_url=ENDPOINT,
api_key=CONFIG['api_keys']['GITHUB_TOKEN']
)
print(f"GitHub: Sending request to model {model_id}")
response = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model=model_id,
max_tokens=1024,
temperature=0.7
)
            # Validate the response
            if not response or not response.choices:
                print("GitHub: Empty response received")
return "Error: Empty response from GitHub API"
response_content = response.choices[0].message.content
print(f"GitHub: Response received, length: {len(response_content)}")
return response_content
except Exception as e:
error_msg = str(e)
print(f"GitHub Error: {error_msg}")
if "authentication" in error_msg.lower():
return "Error: Invalid GitHub token"
return f"GitHub Error: {error_msg}"
def get_available_models():
"""Returns a dictionary of all available models"""
return CONFIG['models']
def get_default_models():
"""Returns a dictionary of default models for each provider"""
return CONFIG['defaults']
def get_ollama_models():
"""Get available Ollama models from local server using subprocess"""
try:
import subprocess
# Execute the shell command and capture the output
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
# Check if the command was successful
if result.returncode == 0:
# Split the output into lines and skip the first line (header)
lines = result.stdout.strip().split('\n')[1:]
# Extract the first field from each line (model name)
models = [line.split()[0] for line in lines]
return models
else:
print(f"Error executing 'ollama list': {result.stderr}")
return CONFIG['models']['ollama']
except Exception as e:
print(f"Exception in get_ollama_models: {str(e)}")
return CONFIG['models']['ollama']
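# Illustrative 'ollama list' output parsed above (header line skipped, first column
# kept as the model name; the other columns are placeholders here):
#   NAME           ID            SIZE      MODIFIED
#   llama3.2:3b    <model id>    <size>    <modified>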
def check_provider_key_available(provider):
"""Check if the API key for a specific provider is available.
Args:
provider (str): The provider to check
Returns:
bool: True if the key is available, False otherwise
"""
# Ollama is a local service, so no API key is needed
if provider == "ollama":
try:
client = ollama.Client()
models = client.list()
return True
except Exception:
return False
# For other providers, check if the API key is available
key_mapping = {
"hf": "HF_API_KEY",
"together": "TOGETHER_API_KEY",
"gemini": "GEMINI_API_KEY",
"aiql": "AIQL_API_KEY",
"groq": "GROQ_API_KEY",
"nvidia": "NVIDIA_API_KEY",
"github": "GITHUB_TOKEN"
}
if provider not in key_mapping:
return False
key_name = key_mapping[provider]
return bool(CONFIG["api_keys"][key_name])
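# Examples (illustrative):
#   check_provider_key_available("groq")    # True only if GROQ_API_KEY is set
#   check_provider_key_available("ollama")  # True only if a local Ollama server responds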
def run_inference(prompt, provider=None, model=None, system_content=None):
"""Run inference with specified provider and model.
Args:
prompt (str): The prompt to send to the model
provider (str, optional): The provider to use (ollama, hf, together, gemini, aiql, groq, nvidia, github)
model (str, optional): The specific model to use
system_content (str, optional): Custom system role content for models that support it
Returns:
str: The model's response
"""
# If no provider specified, use the first available one
if not provider:
available = check_available_apis()
if not available:
return "Error: No available providers found. Please check your API keys and Ollama installation."
provider = available[0]
# Check if the API key for the provider is available
if not check_provider_key_available(provider):
return f"Error: API key for {provider} is not available. Please set the appropriate environment variable."
# If no model specified, use the default for the provider
if not model:
model = CONFIG["defaults"][provider]
# For ollama, we need to check if the model exists locally
if provider == "ollama" and model not in get_ollama_models():
return f"Error: Model '{model}' not found in Ollama. Please pull it first with 'ollama pull {model}'."
print(f"Running inference with provider: {provider}, model: {model}")
print(f"Prompt: {prompt[:50]}..." if len(prompt) > 50 else f"Prompt: {prompt}")
start_time = time.time()
# Call the appropriate provider method
try:
if provider == "ollama":
response = InferenceHandler.ollama(prompt, model, system_content)
elif provider == "hf":
response = InferenceHandler.hf(prompt, model)
elif provider == "together":
response = InferenceHandler.together(prompt, model)
elif provider == "gemini":
response = InferenceHandler.gemini(prompt, model)
elif provider == "aiql":
response = InferenceHandler.aiql(prompt, model)
elif provider == "groq":
response = InferenceHandler.groq(prompt, model)
elif provider == "nvidia":
print(f"Calling NVIDIA handler with model: {model}")
response = InferenceHandler.nvidia(prompt, model)
elif provider == "github":
print(f"Calling GitHub handler with model: {model}")
response = InferenceHandler.github(prompt, model)
else:
return f"Error: Unknown provider '{provider}'"
end_time = time.time()
print(f"Inference completed in {end_time - start_time:.2f} seconds")
return response
except Exception as e:
print(f"Error during inference: {str(e)}")
return f"Error with {provider}: {str(e)}"
def check_available_apis():
"""Check which API tokens are available in the environment and return available providers."""
available = []
# Check Ollama by attempting to connect
try:
client = ollama.Client()
models = client.list()
if models:
available.append("ollama")
except Exception as e:
print(f"Ollama not available: {e}")
# Check API keys
if CONFIG["api_keys"]["HF_API_KEY"]:
available.append("hf")
if CONFIG["api_keys"]["TOGETHER_API_KEY"]:
available.append("together")
if CONFIG["api_keys"]["GEMINI_API_KEY"]:
available.append("gemini")
if CONFIG["api_keys"]["AIQL_API_KEY"]:
available.append("aiql")
if CONFIG["api_keys"]["GROQ_API_KEY"]:
available.append("groq")
if CONFIG["api_keys"]["NVIDIA_API_KEY"]:
print("NVIDIA API key found")
available.append("nvidia")
if CONFIG["api_keys"]["GITHUB_TOKEN"]:
print("GitHub token found")
available.append("github")
return available
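# Example (illustrative): with only GEMINI_API_KEY set and no local Ollama server
# running, check_available_apis() returns ["gemini"].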
def print_available_apis():
"""Print information about available APIs and possible requests"""
available_providers = check_available_apis()
print("\n" + "=" * 60)
print("AVAILABLE API PROVIDERS")
print("=" * 60)
if not available_providers:
print("\nNo API providers are available. Please set environment variables for API keys:")
for key in CONFIG['api_keys'].keys():
print(f" - {key}")
print("\nOr start Ollama locally to use local models.")
return False
print(f"\nFound {len(available_providers)} available API providers:\n")
for provider in available_providers:
print(f"- {provider.upper()}:")
# Special handling for Ollama to show actual local models
if provider == "ollama":
ollama_models = get_ollama_models()
for model in ollama_models:
print(f" - {model}")
else:
models = CONFIG['models'][provider]
if isinstance(models, dict):
for model_name, model_id in models.items():
print(f" - {model_name} ({model_id})")
else: # It's a list
for model in models:
print(f" - {model}")
print("\n" + "=" * 60)
return True
def main():
"""Example function that runs the same prompt through all available providers and models."""
import argparse
parser = argparse.ArgumentParser(description='LLM Inference Module')
parser.add_argument('prompt', nargs='?', type=str, help='The prompt to send to the model', default="Why is the sky blue?")
parser.add_argument('--provider', type=str, help='The provider to use (ollama, hf, together, gemini, aiql, groq, nvidia, github)')
parser.add_argument('--model', type=str, help='The specific model to use')
parser.add_argument('--system', type=str, help='System content for chat models', default="You are a helpful assistant.")
parser.add_argument('--list', action='store_true', help='List available providers and models')
parser.add_argument('--debug', action='store_true', help='Enable debug output')
parser.add_argument('-a', '--all', action='store_true', help='Run inference on all available providers and models')
args = parser.parse_args()
# Check if the specified provider's API key is available
if args.provider and not check_provider_key_available(args.provider):
print(f"Error: API key for {args.provider} is not available. Please set the appropriate environment variable.")
return
if args.list:
print_available_apis()
return
# If provider is specified but no model, use the default model for that provider
if args.provider and not args.model:
args.model = CONFIG["defaults"][args.provider]
if args.debug:
print(f"Running with provider: {args.provider}, model: {args.model}")
print(f"Prompt: {args.prompt[:50]}..." if len(args.prompt) > 50 else f"Prompt: {args.prompt}")
# If -a/--all flag is specified, run on all providers regardless of whether a specific provider was given
if args.all:
# Continue to the code below that runs on all providers
pass
# Otherwise, if a specific provider is given, run only on that provider
elif args.provider:
start_time = time.time()
response = run_inference(args.prompt, args.provider, args.model, args.system)
end_time = time.time()
if args.debug:
print(f"Inference completed in {end_time - start_time:.2f} seconds")
# Print the response
print("\nResponse:")
print(response)
return
# If we get here, either --all flag was specified or no provider was specified
print(f"\nPrompt: {args.prompt}\n")
print("Running inference on all models for each provider...\n")
# Get available providers (only those with API keys)
available_providers = check_available_apis()
# Store response times for leaderboard
response_times = []
# Import colorama for colored terminal output
try:
from colorama import init, Fore, Style
init() # Initialize colorama
color_enabled = True
except ImportError:
color_enabled = False
print("Note: Install 'colorama' package for colored error messages (pip install colorama)")
# Run inference on each provider with all its models
for provider in available_providers:
print(f"\n{'=' * 30}\n{provider.upper()} MODELS\n{'=' * 30}\n")
# Special handling for Ollama to use actual local models
if provider == "ollama":
ollama_models = get_ollama_models()
model_items = [(model, model) for model in ollama_models]
else:
models = CONFIG['models'][provider]
# Handle different model formats (list vs dict)
if isinstance(models, dict):
model_items = list(models.items())
else: # It's a list
model_items = [(model, model) for model in models]
for model_name, model_id in model_items:
try:
print(f"\n----- {model_name} -----\n")
# For Ollama models, preload the model first
if provider == "ollama":
# Preload the model with a dummy query
InferenceHandler.preload_ollama_model(model_id)
print("Warming up model...")
time.sleep(1) # Short pause for UI feedback
# Start timing after preloading
start_time = time.time()
# Run inference with the user's prompt and system message
response = run_inference(args.prompt, provider, model_id, args.system)
# End timing
end_time = time.time()
elapsed_time = end_time - start_time
# Store response time for leaderboard
full_model_name = f"{provider}/{model_name}"
response_times.append((full_model_name, elapsed_time))
# Print response and timing information
print(response)
print(f"\nResponse time: {elapsed_time:.2f} seconds")
print("\n" + "-" * 50)
except Exception as e:
# Print error in red if colorama is available
if color_enabled:
error_msg = f"{Fore.RED}Error with {provider}/{model_name}: {str(e)}{Style.RESET_ALL}"
else:
error_msg = f"Error with {provider}/{model_name}: {str(e)}"
print(error_msg)
print("\n" + "-" * 50)
# Only display leaderboard if we have results
if response_times:
# Display leaderboard
print("\n" + "=" * 50)
print("RESPONSE TIME LEADERBOARD")
print("=" * 50)
# Sort by response time (fastest first)
response_times.sort(key=lambda x: x[1])
# Print leaderboard
print(f"{'Rank':<6}{'Model':<40}{'Time (seconds)':<15}")
print("-" * 61)
for i, (model, time_taken) in enumerate(response_times, 1):
print(f"{i:<6}{model:<40}{time_taken:.2f}")
print("\n" + "=" * 50)
else:
print("\nNo successful responses to display in leaderboard.")
if __name__ == "__main__":
main()