Files
microdao-daarion/services/llm-proxy/providers/local_provider.py
Apple 744c149300
Some checks failed
Build and Deploy Docs / build-and-deploy (push) Has been cancelled
Add automated session logging system
- Created logs/ structure (sessions, operations, incidents)
- Added session-start/log/end scripts
- Installed Git hooks for auto-logging commits/pushes
- Added shell integration for zsh
- Created CHANGELOG.md
- Documented today's session (2026-01-10)
2026-01-10 04:53:17 -08:00

110 lines
3.1 KiB
Python

import httpx
import time
from models import ChatMessage, LLMResponse, Usage, ProviderConfig
class LocalProvider:
    """Local LLM provider (Ollama/vLLM/llama.cpp)"""

    def __init__(self, config: ProviderConfig):
        """Store the config and open an async HTTP client bound to its base URL.

        Args:
            config: Provider settings; `base_url` and `timeout` are read here.
        """
        self.config = config
        self.client = httpx.AsyncClient(
            base_url=config.base_url,
            timeout=config.timeout,
        )

    async def chat(
        self,
        messages: list[ChatMessage],
        model_name: str,
        max_tokens: int | None = None,
        temperature: float = 0.7,
        top_p: float = 1.0,
        **kwargs,
    ) -> LLMResponse:
        """
        Call local LLM (Ollama format)

        Sends a non-streaming request to the Ollama `/api/chat` endpoint and
        wraps the reply in an LLMResponse. If the local server is unreachable
        (connection refused), returns a stub response instead of raising, so
        callers keep working without a running Ollama instance.

        Args:
            messages: Conversation history; each message supplies role/content.
            model_name: Model identifier passed through to Ollama as-is.
            max_tokens: Optional completion cap, mapped to Ollama's
                `num_predict` option. `0` is a valid value and is forwarded.
            temperature: Sampling temperature (Ollama `options.temperature`).
            top_p: Nucleus sampling parameter (Ollama `options.top_p`).
            **kwargs: Accepted for signature compatibility; currently ignored.

        Returns:
            LLMResponse with content, token usage (0s when Ollama omits
            counts), provider="local", and measured wall-clock latency.

        Raises:
            Exception: On HTTP error status or any other failure (original
                exception preserved as `__cause__`).

        Note: For Phase 3, this is a simple stub.
        Can be extended to support llama.cpp, vLLM, etc.
        """
        start_time = time.time()

        # Ollama chat format
        payload = {
            "model": model_name,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "stream": False,
            "options": {
                "temperature": temperature,
                "top_p": top_p,
            },
        }
        # BUGFIX: compare against None so an explicit max_tokens=0 is
        # forwarded instead of being silently dropped (0 is falsy).
        if max_tokens is not None:
            payload["options"]["num_predict"] = max_tokens

        try:
            response = await self.client.post(
                "/api/chat",
                json=payload,
            )
            response.raise_for_status()
            data = response.json()
            latency_ms = (time.time() - start_time) * 1000

            # Ollama doesn't always provide token counts; `or 0` also guards
            # against explicit JSON nulls, which .get(..., 0) would pass through
            # and break the total_tokens sum below.
            content = data.get("message", {}).get("content", "")
            prompt_tokens = data.get("prompt_eval_count") or 0
            completion_tokens = data.get("eval_count") or 0

            return LLMResponse(
                content=content,
                usage=Usage(
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=prompt_tokens + completion_tokens,
                ),
                provider="local",
                model_resolved=model_name,
                latency_ms=latency_ms,
                cached=False,
            )
        except httpx.ConnectError:
            # Local LLM not running - return stub response (deliberate
            # best-effort degradation, not an error path).
            print(f"⚠️ Local LLM not available at {self.config.base_url}, using stub")
            return LLMResponse(
                content="[STUB] Local LLM not running. Start Ollama: `ollama serve`",
                usage=Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0),
                provider="local",
                model_resolved=model_name,
                latency_ms=(time.time() - start_time) * 1000,
                cached=False,
            )
        except httpx.HTTPStatusError as e:
            # Chain the cause so the original status error survives for debugging.
            raise Exception(
                f"Local LLM API error: {e.response.status_code} - {e.response.text}"
            ) from e
        except Exception as e:
            raise Exception(f"Local provider error: {str(e)}") from e

    async def close(self):
        """Close HTTP client"""
        await self.client.aclose()