- matrix-gateway: POST /internal/matrix/presence/online endpoint - usePresenceHeartbeat hook with activity tracking - Auto away after 5 min inactivity - Offline on page close/visibility change - Integrated in MatrixChatRoom component
98 lines
3.1 KiB
Python
import httpx
|
|
import time
|
|
from models import ChatMessage, LLMResponse, Usage, ProviderConfig
|
|
|
|
class LocalProvider:
    """Local LLM provider (Ollama/vLLM/llama.cpp).

    Talks to an Ollama-compatible HTTP API (``POST /api/chat``). If the
    local server is unreachable, ``chat`` degrades to a stub response
    instead of raising, so callers keep working without a running LLM.
    """

    def __init__(self, config: ProviderConfig):
        # Persistent async client bound to the provider's base URL;
        # released via close().
        self.config = config
        self.client = httpx.AsyncClient(
            base_url=config.base_url,
            timeout=config.timeout
        )

    async def chat(
        self,
        messages: list[ChatMessage],
        model_name: str,
        max_tokens: int | None = None,
        temperature: float = 0.7,
        top_p: float = 1.0,
        **kwargs
    ) -> LLMResponse:
        """
        Call local LLM (Ollama format).

        Note: For Phase 3, this is a simple stub.
        Can be extended to support llama.cpp, vLLM, etc.

        Args:
            messages: Conversation turns forwarded as role/content dicts.
            model_name: Model identifier passed through to the server.
            max_tokens: Optional generation cap (Ollama's "num_predict").
            temperature: Sampling temperature for the "options" payload.
            top_p: Nucleus-sampling cutoff for the "options" payload.
            **kwargs: Accepted for provider-interface parity; ignored here.

        Returns:
            LLMResponse with content, token usage, and wall-clock latency.
            When the server is unreachable, a stub response with zero usage.

        Raises:
            Exception: On a non-2xx HTTP status or any other failure
                (original exception attached as __cause__).
        """
        start_time = time.time()

        # Ollama chat format
        payload = {
            "model": model_name,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "stream": False,
            "options": {
                "temperature": temperature,
                "top_p": top_p
            }
        }

        # Explicit None check: a deliberate max_tokens=0 must still be
        # forwarded (a truthiness test would silently drop it).
        if max_tokens is not None:
            payload["options"]["num_predict"] = max_tokens

        try:
            response = await self.client.post(
                "/api/chat",
                json=payload
            )
            response.raise_for_status()
            data = response.json()

            latency_ms = (time.time() - start_time) * 1000

            # Ollama doesn't always provide token counts
            content = data.get("message", {}).get("content", "")
            prompt_tokens = data.get("prompt_eval_count", 0)
            completion_tokens = data.get("eval_count", 0)

            return LLMResponse(
                content=content,
                usage=Usage(
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=prompt_tokens + completion_tokens
                ),
                provider="local",
                model_resolved=model_name,
                latency_ms=latency_ms,
                cached=False
            )

        except httpx.ConnectError:
            # Local LLM not running - return stub response
            print(f"⚠️ Local LLM not available at {self.config.base_url}, using stub")
            return LLMResponse(
                content="[STUB] Local LLM not running. Start Ollama: `ollama serve`",
                usage=Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0),
                provider="local",
                model_resolved=model_name,
                latency_ms=(time.time() - start_time) * 1000,
                cached=False
            )

        except httpx.HTTPStatusError as e:
            # Chain the cause so the upstream traceback is preserved.
            raise Exception(
                f"Local LLM API error: {e.response.status_code} - {e.response.text}"
            ) from e
        except Exception as e:
            # Catch-all boundary for this provider; chain for debuggability.
            raise Exception(f"Local provider error: {str(e)}") from e

    async def close(self):
        """Close HTTP client"""
        await self.client.aclose()
|
|
|
|
|
|
|
|
|