Introduction
Choosing the right LLM API is critical for cost, performance, and capability. With options ranging from OpenAI to Anthropic to open-source models, understanding the tradeoffs is essential.
Key Statistics:
- OpenAI: 70% market share
- Anthropic: Growing 300% annually
- Open-source: 40% cost savings potential
- Average API cost: $1-15 per 1M tokens
Comparison Matrix
**LLM API Comparison**

| Provider  | Model        | Context | Input ($/1M) | Output ($/1M) | Speed     |
|-----------|--------------|---------|--------------|---------------|-----------|
| OpenAI    | GPT-4o       | 128K    | $5.00        | $15.00        | Fast      |
| OpenAI    | GPT-4 Turbo  | 128K    | $10.00       | $30.00        | Fast      |
| OpenAI    | GPT-3.5      | 16K     | $0.50        | $1.50         | Very fast |
| Anthropic | Claude 3.5   | 200K    | $3.00        | $15.00        | Fast      |
| Anthropic | Claude 3     | 200K    | $15.00       | $75.00        | Fast      |
| Anthropic | Claude 2.1   | 200K    | $8.00        | $24.00        | Medium    |
| Meta      | Llama 3.1    | 128K    | $0.00\*      | $0.00\*       | Varies    |
| Mistral   | Mixtral      | 32K     | $0.00\*      | $0.00\*       | Varies    |

\* Self-hosted: no per-token API fee, but you pay for your own infrastructure.
OpenAI API
#!/usr/bin/env python3
"""OpenAI API integration."""
from openai import OpenAI
class OpenAIClient:
    """Thin convenience wrapper around the OpenAI Python SDK."""

    def __init__(self, api_key):
        """Build the underlying SDK client from an API key."""
        self.client = OpenAI(api_key=api_key)

    def chat(self, model='gpt-4o', messages=None, **kwargs):
        """Run a chat completion.

        Returns a dict with the reply text, the model that answered,
        and the token usage reported by the API.
        """
        result = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs,
        )
        usage = result.usage
        return {
            'content': result.choices[0].message.content,
            'model': result.model,
            'usage': {
                'prompt_tokens': usage.prompt_tokens,
                'completion_tokens': usage.completion_tokens,
                'total_tokens': usage.total_tokens,
            },
        }

    def stream_chat(self, model='gpt-4o', messages=None, **kwargs):
        """Yield reply text fragments from a streaming chat completion."""
        stream = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            **kwargs,
        )
        for part in stream:
            piece = part.choices[0].delta.content
            if piece:
                yield piece

    def embed(self, text, model='text-embedding-3-small'):
        """Return the embedding vector for a single text input."""
        result = self.client.embeddings.create(model=model, input=text)
        return result.data[0].embedding

    def get_image(self, prompt, model='dall-e-3', size='1024x1024'):
        """Generate one image for *prompt* and return its URL."""
        result = self.client.images.generate(
            model=model,
            prompt=prompt,
            size=size,
            n=1,
        )
        return result.data[0].url
Anthropic API
#!/usr/bin/env python3
"""Anthropic Claude API integration."""
import anthropic
class AnthropicClient:
    """Thin wrapper around the Anthropic Claude Messages API."""

    def __init__(self, api_key):
        """Build the underlying SDK client from an API key."""
        self.client = anthropic.Anthropic(api_key=api_key)

    def message(self, model='claude-3-5-sonnet-20241022',
                messages=None, max_tokens=1024, **kwargs):
        """Send a Messages API request.

        Returns a dict with the reply text, the model name, and the
        input/output token usage reported by the API.
        """
        response = self.client.messages.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            **kwargs
        )
        return {
            'content': response.content[0].text,
            'model': response.model,
            'usage': {
                'input_tokens': response.usage.input_tokens,
                'output_tokens': response.usage.output_tokens
            }
        }

    def stream_message(self, model='claude-3-5-sonnet-20241022',
                       messages=None, max_tokens=1024):
        """Yield reply text fragments from a streaming Messages request."""
        with self.client.messages.stream(
            model=model,
            messages=messages,
            max_tokens=max_tokens
        ) as stream:
            for text in stream.text_stream:
                yield text

    def count_tokens(self, text, model='claude-3-5-sonnet-20241022'):
        """Count the input tokens *text* would consume for *model*.

        Fix: the original called the legacy ``client.count_tokens(text)``
        (removed from current anthropic SDKs) and silently ignored the
        ``model`` parameter. Token counts are model-specific, so route
        through the Messages token-counting endpoint instead.
        """
        result = self.client.messages.count_tokens(
            model=model,
            messages=[{'role': 'user', 'content': text}],
        )
        return result.input_tokens
Open-Source Models
Self-Hosted Llama
#!/usr/bin/env python3
"""Llama self-hosted inference."""
from llama_cpp import Llama
class LlamaClient:
    """Local inference wrapper around llama-cpp-python."""

    def __init__(self, model_path, n_ctx=4096, n_gpu_layers=0):
        """Load a GGUF model from disk.

        n_gpu_layers=0 keeps inference fully on CPU.
        """
        self.llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            verbose=False
        )

    def chat(self, messages, temperature=0.7, max_tokens=1024):
        """Run a chat completion and return content plus token usage."""
        # The raw model takes a flat prompt, so render the chat turns
        # into the Llama 3 template first.
        rendered = self.format_messages(messages)
        result = self.llm(
            rendered,
            temperature=temperature,
            max_tokens=max_tokens,
            stop=['<|eot_id|>', '<|start_header_id|>']
        )
        token_stats = result['usage']
        return {
            'content': result['choices'][0]['text'],
            'model': 'llama',
            'usage': {
                'prompt_tokens': token_stats['prompt_tokens'],
                'completion_tokens': token_stats['completion_tokens'],
                'total_tokens': token_stats['total_tokens']
            }
        }

    def format_messages(self, messages):
        """Render chat messages into the Llama 3 prompt template.

        Ends with an open assistant header so the model continues as
        the assistant.
        """
        segments = ["<|begin_of_text|>"]
        for entry in messages:
            segments.append(
                f"<|start_header_id|>{entry['role']}<|end_header_id|>"
                f"\n\n{entry['content']}<|eot_id|>"
            )
        segments.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
        return "".join(segments)
Using Ollama
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
# Pull model
ollama pull llama3.1
ollama pull mistral
ollama pull codellama
# Run API
ollama serve
import ollama
class OllamaClient:
    """Client for a locally running Ollama server.

    Wraps the module-level ``ollama`` functions; requires ``ollama serve``
    to be running.
    """

    def __init__(self, base_url='http://localhost:11434'):
        # NOTE(review): base_url is stored but never used — the
        # module-level ollama.chat/ollama.embeddings calls below talk to
        # the library's default host. Confirm whether an
        # ollama.Client(host=base_url) instance was intended.
        self.base_url = base_url

    def chat(self, model='llama3.1', messages=None, **kwargs):
        """Run a chat completion; return the reply text and model name."""
        response = ollama.chat(
            model=model,
            messages=messages,
            **kwargs
        )
        return {
            'content': response['message']['content'],
            'model': response['model']
        }

    def embed(self, text, model='nomic-embed-text'):
        """Return the embedding vector for *text*.

        Bug fix: the original signature was
        ``def embed(self, model='nomic-embed-text', text)`` — a
        SyntaxError, since a non-default parameter cannot follow a
        defaulted one. The required ``text`` now comes first.
        """
        response = ollama.embeddings(
            model=model,
            prompt=text
        )
        return response['embedding']
Cost Comparison
#!/usr/bin/env python3
"""Cost comparison calculator."""
def compare_costs(monthly_tokens):
    """Compare estimated monthly LLM API costs across providers.

    Args:
        monthly_tokens: dict with 'prompt' and 'completion' keys giving
            monthly token volume in millions of tokens.

    Returns:
        dict mapping provider name -> estimated monthly cost in USD.

    Raises:
        KeyError: if 'prompt' or 'completion' is missing.
    """
    prompt = monthly_tokens['prompt']
    completion = monthly_tokens['completion']
    # (input $/1M tokens, output $/1M tokens). The original duplicated
    # each price in both data fields and a per-provider lambda, so the
    # table and the formula could drift apart; a single rate table keeps
    # one source of truth and a uniform schema for every provider.
    rates = {
        'OpenAI GPT-4o': (5.0, 15.0),
        'OpenAI GPT-3.5': (0.5, 1.5),
        'Anthropic Claude 3.5': (3.0, 15.0),
        # Approximate self-hosted cost (8x A100 GPU): flat ~$0.50/1M
        # tokens regardless of direction.
        'Self-hosted Llama 3.1': (0.50, 0.50),
    }
    return {
        name: prompt * in_rate + completion * out_rate
        for name, (in_rate, out_rate) in rates.items()
    }
When to Choose
| Use Case | Recommended |
|---|---|
| General chat, GPT-4 best | OpenAI GPT-4o |
| Long context needed | Anthropic Claude |
| Cost-sensitive | Open-source |
| Coding tasks | OpenAI Codex / Claude |
| On-premise required | Llama / Mistral |
| Enterprise compliance | Anthropic / OpenAI |
| Low latency | OpenAI / Anthropic |
Comments