## Agents Added - Alateya: R&D, biotech, innovations - Clan (Spirit): Community spirit agent - Eonarch: Consciousness evolution agent ## Changes - docker-compose.node1.yml: Added tokens for all 3 new agents - gateway-bot/http_api.py: Added configs and webhook endpoints - gateway-bot/clan_prompt.txt: New prompt file - gateway-bot/eonarch_prompt.txt: New prompt file ## Fixes - Fixed ROUTER_URL from :9102 to :8000 (internal container port) - All 9 Telegram agents now working ## Documentation - Created PROJECT-MASTER-INDEX.md - single entry point - Added various status documents and scripts Tokens configured: - Helion, NUTRA, Agromatrix (existing) - Alateya, Clan, Eonarch (new) - Druid, GreenFood, DAARWIZZ (configured)
310 lines
9.1 KiB
Python
310 lines
9.1 KiB
Python
"""
|
|
Brand Intake Service
|
|
- Detects and attributes brand identity from inputs
|
|
- Stores sources and snapshots (MVP file-based)
|
|
"""
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from datetime import datetime
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# Module-wide logging setup; one logger per module, per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# File-store root and BrandMap config location, both overridable via environment.
DATA_DIR = Path(os.getenv("BRAND_INTAKE_DATA", "/data/brand-intake"))
BRAND_MAP_PATH = Path(os.getenv("BRAND_MAP_PATH", "/app/config/BrandMap.yaml"))
|
|
|
|
# FastAPI application exposing the intake and lookup endpoints defined below.
app = FastAPI(
    title="Brand Intake Service",
    description="Detects, attributes and stores brand sources",
    version="0.1.0"
)
|
|
|
|
|
|
class IntakeRequest(BaseModel):
    """Request body for POST /brand/intake.

    Which payload fields are meaningful depends on source_type
    (accepted values are validated in brand_intake()).
    """

    # One of: "url", "text", "file", "figma", "drive", "notion".
    source_type: str
    # Free-form text content; mined for brand aliases/keywords during attribution.
    text: Optional[str] = None
    # Source URL; matched against per-brand domains during attribution.
    url: Optional[str] = None
    # Original filename of an uploaded attachment (also mined for aliases/keywords).
    filename: Optional[str] = None
    # Opaque reference to the raw payload when it is stored elsewhere.
    raw_ref: Optional[str] = None
    mime_type: Optional[str] = None
    # Caller context, compared against BrandMap context_rules.
    agent_id: Optional[str] = None
    workspace_id: Optional[str] = None
    project_id: Optional[str] = None
    # Free-form labels copied onto the stored source document.
    tags: Optional[List[str]] = None
|
|
|
|
|
|
class IntakeResponse(BaseModel):
    """Response body for POST /brand/intake."""

    # Id of the persisted source document (hex uuid).
    id: str
    # Full attribution verdict: status, brand_id, confidence, candidates.
    attribution: Dict[str, Any]
    # Convenience copy of attribution["status"].
    status: str
    # Creation timestamp, ISO-8601 string with trailing "Z".
    created_at: str
|
|
|
|
|
|
class BrandMap:
    """In-memory view of the BrandMap config: brand records plus global defaults."""

    def __init__(self, data: Dict[str, Any]):
        self.data = data
        self.defaults = data.get("defaults", {})
        self.brands = data.get("brands", [])

    @property
    def min_confidence(self) -> float:
        """Score a candidate needs to be auto-attributed."""
        threshold = self.defaults.get("min_confidence", 0.72)
        return float(threshold)

    @property
    def min_confidence_context_override(self) -> float:
        """Lower score bar that applies when a context rule matched."""
        threshold = self.defaults.get("min_confidence_context_override", 0.55)
        return float(threshold)

    @property
    def weights(self) -> Dict[str, float]:
        """Per-signal scoring weights (domain/alias/keyword/etc.)."""
        return self.defaults.get("weights", {})
|
|
|
|
|
|
# Lazily-loaded singleton; populated by load_brand_map() on first use.
BRAND_MAP: Optional[BrandMap] = None
|
|
|
|
|
|
def load_brand_map() -> BrandMap:
    """(Re)load the BrandMap YAML into the module-level BRAND_MAP singleton.

    Returns:
        The freshly built BrandMap.

    Raises:
        FileNotFoundError: when BRAND_MAP_PATH does not exist.
    """
    global BRAND_MAP
    if not BRAND_MAP_PATH.exists():
        raise FileNotFoundError(f"BrandMap not found: {BRAND_MAP_PATH}")
    data = yaml.safe_load(BRAND_MAP_PATH.read_text(encoding="utf-8"))
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so BrandMap(...) does not fail with AttributeError.
    BRAND_MAP = BrandMap(data if data is not None else {})
    return BRAND_MAP
|
|
|
|
|
|
def _ensure_dirs() -> None:
    """Create the file-store layout under DATA_DIR (idempotent)."""
    for subdir in ("sources", "snapshots"):
        (DATA_DIR / subdir).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _norm(text: str) -> str:
|
|
return re.sub(r"\s+", " ", text.strip().lower())
|
|
|
|
|
|
def _match_any(text: str, patterns: List[str]) -> List[str]:
    """Return the patterns that occur in *text*, compared case- and
    whitespace-insensitively (empty/falsy patterns are skipped)."""
    if not text:
        return []
    haystack = _norm(text)
    return [pattern for pattern in patterns if pattern and _norm(pattern) in haystack]
|
|
|
|
|
|
def _domain_from_url(url: str) -> str:
|
|
try:
|
|
return re.sub(r"^www\.", "", re.split(r"/|:\/\/", url)[-1].split("/")[0])
|
|
except Exception:
|
|
return ""
|
|
|
|
|
|
def _score_brand(brand: Dict[str, Any], req: IntakeRequest, weights: Dict[str, float]) -> Tuple[float, List[str], bool]:
    """Score one brand record against the request.

    Signals, each weighted via the BrandMap "weights" mapping: a brand domain
    appearing in the URL, alias/keyword hits in the combined text/filename/url
    blob, a filename-based attachment hint, and agent_id/workspace_id context
    rules.

    Returns:
        (score capped at 1.0, human-readable reasons, whether any context
        rule matched).
    """
    score = 0.0
    reasons: List[str] = []
    has_context_match = False

    # Single searchable blob from all free-text inputs (None values dropped).
    text_blob = " ".join(filter(None, [req.text, req.filename, req.url]))

    domains = brand.get("domains", [])
    aliases = brand.get("aliases", [])
    keywords = brand.get("keywords", [])

    # Domain signal: only the first matching domain counts.
    if req.url:
        url_lower = req.url.lower()
        for d in domains:
            if d and d.lower() in url_lower:
                score += weights.get("domain_match", 0)
                reasons.append(f"domain:{d}")
                break

    alias_hits = _match_any(text_blob, aliases)
    if alias_hits:
        score += weights.get("alias_match", 0)
        reasons.append("alias")

    keyword_hits = _match_any(text_blob, keywords)
    if keyword_hits:
        score += weights.get("keyword_match", 0)
        reasons.append("keyword")

    # Attachment hint: filename present and some alias/keyword matched.
    if req.filename and (alias_hits or keyword_hits):
        score += weights.get("attachment_hint", 0)
        reasons.append("attachment_hint")

    # Context rules: each matching rule adds the context weight once.
    # (Unified from two duplicated agent_id/workspace_id branches.)
    for rule in brand.get("context_rules", []):
        rule_type = rule.get("type")
        if rule_type not in ("agent_id", "workspace_id"):
            continue
        req_value = getattr(req, rule_type)
        if req_value and _norm(rule.get("value", "")) == _norm(req_value):
            score += weights.get("context_match", 0)
            reasons.append(f"context:{rule_type}")
            has_context_match = True

    return min(score, 1.0), reasons, has_context_match
|
|
|
|
|
|
def _attribute(req: IntakeRequest) -> Dict[str, Any]:
    """Score every brand in the BrandMap against the request and decide attribution.

    Returns a dict with "status" ("attributed" / "needs_review" /
    "unattributed"), the winning "brand_id" (or None), the top "confidence",
    and all scored "candidates" sorted best-first.
    """
    if BRAND_MAP is None:
        load_brand_map()
    assert BRAND_MAP is not None

    scored: List[Dict[str, Any]] = []
    context_override = False
    for brand in BRAND_MAP.brands:
        score, reasons, has_context = _score_brand(brand, req, BRAND_MAP.weights)
        if score > 0:
            scored.append({
                "brand_id": brand.get("brand_id"),
                "score": round(score, 3),
                "reasons": reasons
            })
        # A context match lowers the attribution bar, if the score clears
        # the (smaller) override threshold.
        if has_context and score >= BRAND_MAP.min_confidence_context_override:
            context_override = True

    scored.sort(key=lambda candidate: candidate["score"], reverse=True)

    status, brand_id, confidence = "unattributed", None, 0.0
    if scored:
        best = scored[0]
        confidence = float(best["score"])
        if confidence >= BRAND_MAP.min_confidence or context_override:
            status, brand_id = "attributed", best["brand_id"]
        else:
            status = "needs_review"

    return {
        "status": status,
        "brand_id": brand_id,
        "confidence": confidence,
        "candidates": scored
    }
|
|
|
|
|
|
@app.get("/")
|
|
async def root() -> Dict[str, Any]:
|
|
_ensure_dirs()
|
|
return {
|
|
"service": "brand-intake",
|
|
"status": "running",
|
|
"brand_map": str(BRAND_MAP_PATH),
|
|
"version": "0.1.0"
|
|
}
|
|
|
|
|
|
@app.get("/health")
|
|
async def health() -> Dict[str, Any]:
|
|
return {"status": "healthy"}
|
|
|
|
|
|
@app.post("/brand/intake", response_model=IntakeResponse)
|
|
async def brand_intake(req: IntakeRequest) -> IntakeResponse:
|
|
_ensure_dirs()
|
|
if req.source_type not in {"url", "text", "file", "figma", "drive", "notion"}:
|
|
raise HTTPException(status_code=400, detail="Unsupported source_type")
|
|
|
|
attribution = _attribute(req)
|
|
source_id = uuid.uuid4().hex
|
|
created_at = datetime.utcnow().isoformat() + "Z"
|
|
|
|
source_doc = {
|
|
"id": source_id,
|
|
"created_at": created_at,
|
|
"created_by": "brand-intake",
|
|
"workspace_id": req.workspace_id,
|
|
"project_id": req.project_id,
|
|
"agent_id": req.agent_id,
|
|
"source_type": req.source_type,
|
|
"payload": {
|
|
"raw_ref": req.raw_ref or req.url or req.text or "",
|
|
"mime_type": req.mime_type,
|
|
"filename": req.filename,
|
|
"url": req.url,
|
|
"text_excerpt": (req.text or "")[:2000]
|
|
},
|
|
"attribution": attribution,
|
|
"tags": req.tags or []
|
|
}
|
|
|
|
(DATA_DIR / "sources" / f"{source_id}.json").write_text(
|
|
json.dumps(source_doc, ensure_ascii=False, indent=2),
|
|
encoding="utf-8"
|
|
)
|
|
|
|
snapshot_id = uuid.uuid4().hex
|
|
snapshot_doc = {
|
|
"id": snapshot_id,
|
|
"created_at": created_at,
|
|
"brand_id": attribution.get("brand_id") or "unattributed",
|
|
"source_id": source_id,
|
|
"quality": {
|
|
"confidence": attribution.get("confidence", 0.0),
|
|
"warnings": ["extraction_not_implemented"],
|
|
"needs_review": attribution.get("status") != "attributed"
|
|
},
|
|
"extracted": {
|
|
"palette": {},
|
|
"typography": {},
|
|
"logos": [],
|
|
"web_tokens": {},
|
|
"documents": {},
|
|
"licensing": {}
|
|
}
|
|
}
|
|
(DATA_DIR / "snapshots" / f"{snapshot_id}.json").write_text(
|
|
json.dumps(snapshot_doc, ensure_ascii=False, indent=2),
|
|
encoding="utf-8"
|
|
)
|
|
|
|
return IntakeResponse(
|
|
id=source_id,
|
|
attribution=attribution,
|
|
status=attribution.get("status", "unattributed"),
|
|
created_at=created_at
|
|
)
|
|
|
|
|
|
@app.get("/brand/sources/{source_id}")
|
|
async def get_source(source_id: str) -> Dict[str, Any]:
|
|
path = DATA_DIR / "sources" / f"{source_id}.json"
|
|
if not path.exists():
|
|
raise HTTPException(status_code=404, detail="Source not found")
|
|
return json.loads(path.read_text(encoding="utf-8"))
|
|
|
|
|
|
@app.get("/brand/brands/{brand_id}/latest")
|
|
async def latest_brand_snapshot(brand_id: str) -> Dict[str, Any]:
|
|
snapshot_dir = DATA_DIR / "snapshots"
|
|
if not snapshot_dir.exists():
|
|
raise HTTPException(status_code=404, detail="No snapshots")
|
|
candidates = []
|
|
for path in snapshot_dir.glob("*.json"):
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
if data.get("brand_id") == brand_id:
|
|
candidates.append(data)
|
|
if not candidates:
|
|
raise HTTPException(status_code=404, detail="No snapshots for brand")
|
|
candidates.sort(key=lambda x: x.get("created_at", ""))
|
|
return candidates[-1]
|