Introduction
Choosing the right LLM API is critical for cost, performance, and capability. With options ranging from OpenAI to Anthropic to open-source models, understanding the tradeoffs is essential.
Key Statistics:
- OpenAI: 70% market share
- Anthropic: Growing 300% annually
- Open-source: 40% cost savings potential
- Average API cost: $1-15 per 1M tokens
Comparison Matrix
**LLM API Comparison**

| Provider  | Model        | Context | Input ($/1M) | Output ($/1M) | Speed     |
|-----------|--------------|---------|--------------|---------------|-----------|
| OpenAI    | GPT-4o       | 128K    | $5.00        | $15.00        | Fast      |
| OpenAI    | GPT-4 Turbo  | 128K    | $10.00       | $30.00        | Fast      |
| OpenAI    | GPT-3.5      | 16K     | $0.50        | $1.50         | Very fast |
| Anthropic | Claude 3.5   | 200K    | $3.00        | $15.00        | Fast      |
| Anthropic | Claude 3     | 200K    | $15.00       | $75.00        | Fast      |
| Anthropic | Claude 2.1   | 200K    | $8.00        | $24.00        | Medium    |
| Meta      | Llama 3.1    | 128K    | $0.00\*      | $0.00\*       | Varies    |
| Mistral   | Mixtral      | 32K     | $0.00\*      | $0.00\*       | Varies    |

\* Self-hosted: no per-token API fee, but you pay for your own infrastructure.
OpenAI API
#!/usr/bin/env python3
"""OpenAI API integration."""
from openai import OpenAI
class OpenAIClient:
    """Thin convenience wrapper around the OpenAI Python SDK."""

    def __init__(self, api_key):
        """Build the underlying SDK client from an API key."""
        self.client = OpenAI(api_key=api_key)

    def chat(self, model='gpt-4o', messages=None, **kwargs):
        """Run a chat completion.

        Returns a dict with the reply text, the model that answered,
        and the token usage reported by the API.
        """
        result = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs,
        )
        usage = result.usage
        return {
            'content': result.choices[0].message.content,
            'model': result.model,
            'usage': {
                'prompt_tokens': usage.prompt_tokens,
                'completion_tokens': usage.completion_tokens,
                'total_tokens': usage.total_tokens,
            },
        }

    def stream_chat(self, model='gpt-4o', messages=None, **kwargs):
        """Yield reply text fragments from a streaming chat completion."""
        stream = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            **kwargs,
        )
        for part in stream:
            piece = part.choices[0].delta.content
            if piece:
                yield piece

    def embed(self, text, model='text-embedding-3-small'):
        """Return the embedding vector for a single text input."""
        result = self.client.embeddings.create(model=model, input=text)
        return result.data[0].embedding

    def get_image(self, prompt, model='dall-e-3', size='1024x1024'):
        """Generate one image for *prompt* and return its URL."""
        result = self.client.images.generate(
            model=model,
            prompt=prompt,
            size=size,
            n=1,
        )
        return result.data[0].url
Anthropic API
#!/usr/bin/env python3
"""Anthropic Claude API integration."""
import anthropic
class AnthropicClient:
    """Thin wrapper around the Anthropic Claude Messages API."""

    def __init__(self, api_key):
        """Build the underlying SDK client from an API key."""
        self.client = anthropic.Anthropic(api_key=api_key)

    def message(self, model='claude-3-5-sonnet-20241022',
                messages=None, max_tokens=1024, **kwargs):
        """Send a Messages API request.

        Returns a dict with the reply text, the model name, and the
        input/output token usage reported by the API.
        """
        response = self.client.messages.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            **kwargs
        )
        return {
            'content': response.content[0].text,
            'model': response.model,
            'usage': {
                'input_tokens': response.usage.input_tokens,
                'output_tokens': response.usage.output_tokens
            }
        }

    def stream_message(self, model='claude-3-5-sonnet-20241022',
                       messages=None, max_tokens=1024):
        """Yield reply text fragments from a streaming Messages request."""
        with self.client.messages.stream(
            model=model,
            messages=messages,
            max_tokens=max_tokens
        ) as stream:
            for text in stream.text_stream:
                yield text

    def count_tokens(self, text, model='claude-3-5-sonnet-20241022'):
        """Count the input tokens *text* would consume for *model*.

        Fix: the original called the legacy ``client.count_tokens(text)``
        (removed from current anthropic SDKs) and silently ignored the
        ``model`` parameter. Token counts are model-specific, so route
        through the Messages token-counting endpoint instead.
        """
        result = self.client.messages.count_tokens(
            model=model,
            messages=[{'role': 'user', 'content': text}],
        )
        return result.input_tokens
Open-Source Models
Self-Hosted Llama
#!/usr/bin/env python3
"""Llama self-hosted inference."""
from llama_cpp import Llama
class LlamaClient:
    """Local inference wrapper around llama-cpp-python."""

    def __init__(self, model_path, n_ctx=4096, n_gpu_layers=0):
        """Load a GGUF model from disk.

        n_gpu_layers=0 keeps inference fully on CPU.
        """
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=False
        )

    def chat(self, messages, temperature=0.7, max_tokens=1024):
        """Run a chat completion and return content plus token usage."""
        # The raw model takes a flat prompt, so render the chat turns
        # into the Llama 3 template first.
        rendered = self.format_messages(messages)
        result = self.llm(
            rendered,
            temperature=temperature,
            max_tokens=max_tokens,
            stop=['<|eot_id|>', '<|start_header_id|>']
        )
        token_stats = result['usage']
        return {
            'content': result['choices'][0]['text'],
            'model': 'llama',
            'usage': {
                'prompt_tokens': token_stats['prompt_tokens'],
                'completion_tokens': token_stats['completion_tokens'],
                'total_tokens': token_stats['total_tokens']
            }
        }

    def format_messages(self, messages):
        """Render chat messages into the Llama 3 prompt template.

        Ends with an open assistant header so the model continues as
        the assistant.
        """
        segments = ["<|begin_of_text|>"]
        for entry in messages:
            segments.append(
                f"<|start_header_id|>{entry['role']}<|end_header_id|>"
                f"\n\n{entry['content']}<|eot_id|>"
            )
        segments.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
        return "".join(segments)
Using Ollama
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Pull model
ollama pull llama3.1
ollama pull mistral
ollama pull codellama
# Run API
ollama serve
import ollama
class OllamaClient:
    """Client for a locally running Ollama server.

    Wraps the module-level ``ollama`` functions; requires ``ollama serve``
    to be running.
    """

    def __init__(self, base_url='http://localhost:11434'):
        # NOTE(review): base_url is stored but never used — the
        # module-level ollama.chat/ollama.embeddings calls below talk to
        # the library's default host. Confirm whether an
        # ollama.Client(host=base_url) instance was intended.
        self.base_url = base_url

    def chat(self, model='llama3.1', messages=None, **kwargs):
        """Run a chat completion; return the reply text and model name."""
        response = ollama.chat(
            model=model,
            messages=messages,
            **kwargs
        )
        return {
            'content': response['message']['content'],
            'model': response['model']
        }

    def embed(self, text, model='nomic-embed-text'):
        """Return the embedding vector for *text*.

        Bug fix: the original signature was
        ``def embed(self, model='nomic-embed-text', text)`` — a
        SyntaxError, since a non-default parameter cannot follow a
        defaulted one. The required ``text`` now comes first.
        """
        response = ollama.embeddings(
            model=model,
            prompt=text
        )
        return response['embedding']
Cost Comparison
#!/usr/bin/env python3
"""Cost comparison calculator."""
def compare_costs(monthly_tokens):
    """Compare estimated monthly LLM API costs across providers.

    Args:
        monthly_tokens: dict with 'prompt' and 'completion' keys giving
            monthly token volume in millions of tokens.

    Returns:
        dict mapping provider name -> estimated monthly cost in USD.

    Raises:
        KeyError: if 'prompt' or 'completion' is missing.
    """
    prompt = monthly_tokens['prompt']
    completion = monthly_tokens['completion']
    # (input $/1M tokens, output $/1M tokens). The original duplicated
    # each price in both data fields and a per-provider lambda, so the
    # table and the formula could drift apart; a single rate table keeps
    # one source of truth and a uniform schema for every provider.
    rates = {
        'OpenAI GPT-4o': (5.0, 15.0),
        'OpenAI GPT-3.5': (0.5, 1.5),
        'Anthropic Claude 3.5': (3.0, 15.0),
        # Approximate self-hosted cost (8x A100 GPU): flat ~$0.50/1M
        # tokens regardless of direction.
        'Self-hosted Llama 3.1': (0.50, 0.50),
    }
    return {
        name: prompt * in_rate + completion * out_rate
        for name, (in_rate, out_rate) in rates.items()
    }
When to Choose
| Use Case | Recommended |
|---|---|
| General chat, GPT-4 best | OpenAI GPT-4o |
| Long context needed | Anthropic Claude |
| Cost-sensitive | Open-source |
| Coding tasks | OpenAI Codex / Claude |
| On-premise required | Llama / Mistral |
| Enterprise compliance | Anthropic / OpenAI |
| Low latency | OpenAI / Anthropic |
Comments