"""
Brand Intake Service
- Detects and attributes brand identity from inputs
- Stores sources and snapshots (MVP file-based)
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Any, Dict, List, Optional, Tuple
from datetime import datetime, timezone
import json
import logging
import os
import re
import uuid
from pathlib import Path
from urllib.parse import urlsplit

import yaml

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Both locations can be overridden through environment variables.
DATA_DIR = Path(os.getenv("BRAND_INTAKE_DATA", "/data/brand-intake"))
BRAND_MAP_PATH = Path(os.getenv("BRAND_MAP_PATH", "/app/config/BrandMap.yaml"))

# Source types accepted by POST /brand/intake.
SUPPORTED_SOURCE_TYPES = frozenset(
    {"url", "text", "file", "figma", "drive", "notion"}
)

# Identifiers generated by this service are uuid4 hex strings; anything outside
# this character set cannot name a file this service wrote.
_SAFE_ID_RE = re.compile(r"[A-Za-z0-9_-]+")

app = FastAPI(
    title="Brand Intake Service",
    description="Detects, attributes and stores brand sources",
    version="0.1.0"
)


class IntakeRequest(BaseModel):
    """Request body for POST /brand/intake."""

    # Must be one of SUPPORTED_SOURCE_TYPES, otherwise the endpoint answers 400.
    source_type: str
    # text, filename and url are all searched for brand aliases/keywords.
    text: Optional[str] = None
    url: Optional[str] = None
    filename: Optional[str] = None
    # Stored as payload.raw_ref; falls back to url, then text, then "".
    raw_ref: Optional[str] = None
    mime_type: Optional[str] = None
    # agent_id and workspace_id are compared against brand context rules.
    agent_id: Optional[str] = None
    workspace_id: Optional[str] = None
    project_id: Optional[str] = None
    tags: Optional[List[str]] = None


class IntakeResponse(BaseModel):
    """Response body for POST /brand/intake."""

    # Identifier of the stored source document.
    id: str
    # Full attribution result as returned by _attribute().
    attribution: Dict[str, Any]
    # One of "attributed", "needs_review" or "unattributed".
    status: str
    created_at: str


class BrandMap:
    """Thin accessor around the parsed BrandMap YAML document."""

    def __init__(self, data: Dict[str, Any]):
        # A key that is present but empty in YAML parses to None, so coerce
        # with `or` rather than relying on the .get() default.
        self.data = data or {}
        self.defaults = self.data.get("defaults") or {}
        self.brands = self.data.get("brands") or []

    @property
    def min_confidence(self) -> float:
        """Score at or above which the top candidate is attributed."""
        return float(self.defaults.get("min_confidence", 0.72))

    @property
    def min_confidence_context_override(self) -> float:
        """Lower threshold applied to a candidate that has a context match."""
        return float(self.defaults.get("min_confidence_context_override", 0.55))

    @property
    def weights(self) -> Dict[str, float]:
        """Score contribution per signal name; empty when not configured."""
        return self.defaults.get("weights") or {}


BRAND_MAP: Optional[BrandMap] = None


def load_brand_map() -> BrandMap:
    """Parse BRAND_MAP_PATH, cache the result in BRAND_MAP and return it.

    Raises:
        FileNotFoundError: if BRAND_MAP_PATH does not exist.
        ValueError: if the YAML root is not a mapping.
    """
    global BRAND_MAP
    if not BRAND_MAP_PATH.exists():
        raise FileNotFoundError(f"BrandMap not found: {BRAND_MAP_PATH}")
    data = yaml.safe_load(BRAND_MAP_PATH.read_text(encoding="utf-8"))
    if data is None:
        # safe_load returns None for an empty document; treat it as an empty map.
        data = {}
    if not isinstance(data, dict):
        raise ValueError(f"BrandMap root must be a mapping: {BRAND_MAP_PATH}")
    BRAND_MAP = BrandMap(data)
    return BRAND_MAP


def _ensure_dirs() -> None:
    """Create the sources/ and snapshots/ directories under DATA_DIR if missing."""
    (DATA_DIR / "sources").mkdir(parents=True, exist_ok=True)
    (DATA_DIR / "snapshots").mkdir(parents=True, exist_ok=True)


def _norm(text: str) -> str:
    """Lowercase *text*, trim it and collapse whitespace runs to single spaces."""
    return re.sub(r"\s+", " ", text.strip().lower())


def _match_any(text: str, patterns: List[str]) -> List[str]:
    """Return the patterns that occur in *text* as normalized substrings.

    Empty patterns are skipped; an empty *text* or pattern list yields [].
    """
    found: List[str] = []
    if not text:
        return found
    text_norm = _norm(text)
    for p in patterns or []:
        if not p:
            continue
        # str() guards against YAML scalars that parsed as numbers.
        if _norm(str(p)) in text_norm:
            found.append(p)
    return found


def _domain_from_url(url: str) -> str:
    """Return the host part of *url* without a leading "www.", or "" on failure."""
    if not url:
        return ""
    candidate = url.strip()
    if "://" not in candidate:
        # Without a scheme urlsplit treats the host as part of the path;
        # the "//" prefix makes it parse as a network location.
        candidate = "//" + candidate
    try:
        host = urlsplit(candidate).hostname or ""
    except ValueError:
        # urlsplit rejects malformed bracketed IPv6 hosts.
        return ""
    return re.sub(r"^www\.", "", host)


def _score_brand(brand: Dict[str, Any], req: IntakeRequest, weights: Dict[str, float]) -> Tuple[float, List[str], bool]:
    """Score how well *req* matches one brand entry.

    Returns:
        (score, reasons, has_context_match) where score is capped at 1.0,
        reasons lists the signals that contributed, and has_context_match is
        True when an agent_id or workspace_id context rule matched.
    """
    score = 0.0
    reasons: List[str] = []
    has_context_match = False

    text_blob = " ".join(filter(None, [req.text, req.filename, req.url]))

    domains = brand.get("domains") or []
    aliases = brand.get("aliases") or []
    keywords = brand.get("keywords") or []

    if req.url:
        url_lower = req.url.lower()
        # NOTE(review): this is a substring test on the whole URL, so a
        # configured domain also matches when it appears in a path or inside a
        # longer host name. Kept as-is for compatibility.
        for d in domains:
            if d and d.lower() in url_lower:
                score += weights.get("domain_match", 0)
                reasons.append(f"domain:{d}")
                break  # count the domain signal at most once

    alias_hits = _match_any(text_blob, aliases)
    if alias_hits:
        score += weights.get("alias_match", 0)
        reasons.append("alias")

    keyword_hits = _match_any(text_blob, keywords)
    if keyword_hits:
        score += weights.get("keyword_match", 0)
        reasons.append("keyword")

    # Attachment hint: the filename itself mentions an alias or keyword.
    # Matching against the filename (not the combined blob) keeps a hit in the
    # free text from being counted again just because a file was attached.
    if req.filename and (
        _match_any(req.filename, aliases) or _match_any(req.filename, keywords)
    ):
        score += weights.get("attachment_hint", 0)
        reasons.append("attachment_hint")

    # Context rules: compare the rule value with the matching request field.
    context_values = {
        "agent_id": req.agent_id,
        "workspace_id": req.workspace_id,
    }
    for rule in brand.get("context_rules") or []:
        rule_type = rule.get("type")
        actual = context_values.get(rule_type)
        if not actual:
            # Unknown rule type, or the request did not supply that field.
            continue
        if _norm(str(rule.get("value") or "")) == _norm(actual):
            score += weights.get("context_match", 0)
            reasons.append(f"context:{rule_type}")
            has_context_match = True

    return min(score, 1.0), reasons, has_context_match


def _attribute(req: IntakeRequest) -> Dict[str, Any]:
    """Rank all brands for *req* and decide the attribution status.

    Returns a dict with keys "status", "brand_id", "confidence" and
    "candidates" (sorted by descending score). Loads the brand map on first
    use, so load_brand_map() errors propagate from here.
    """
    brand_map = BRAND_MAP if BRAND_MAP is not None else load_brand_map()

    # Keep the override flag next to its own candidate so that a context match
    # on one brand can never promote a different brand.
    scored: List[Tuple[Dict[str, Any], bool]] = []
    for brand in brand_map.brands:
        score, reasons, has_context = _score_brand(brand, req, brand_map.weights)
        if score <= 0:
            continue
        candidate = {
            "brand_id": brand.get("brand_id"),
            "score": round(score, 3),
            "reasons": reasons
        }
        override = (
            has_context
            and score >= brand_map.min_confidence_context_override
        )
        scored.append((candidate, override))

    # list.sort is stable, so equally scored brands keep their map order.
    scored.sort(key=lambda item: item[0]["score"], reverse=True)
    candidates = [candidate for candidate, _ in scored]

    status = "unattributed"
    brand_id = None
    confidence = 0.0

    if scored:
        top, top_override = scored[0]
        confidence = float(top["score"])
        if confidence >= brand_map.min_confidence or top_override:
            status = "attributed"
            brand_id = top["brand_id"]
        else:
            status = "needs_review"

    return {
        "status": status,
        "brand_id": brand_id,
        "confidence": confidence,
        "candidates": candidates
    }


def _write_json(path: Path, doc: Dict[str, Any]) -> None:
    """Serialize *doc* as indented UTF-8 JSON into *path*."""
    path.write_text(
        json.dumps(doc, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )


@app.get("/")
async def root() -> Dict[str, Any]:
    """Return basic service information; also creates the data directories."""
    _ensure_dirs()
    return {
        "service": "brand-intake",
        "status": "running",
        "brand_map": str(BRAND_MAP_PATH),
        "version": "0.1.0"
    }


@app.get("/health")
async def health() -> Dict[str, Any]:
    """Liveness probe; always reports healthy."""
    return {"status": "healthy"}


@app.post("/brand/intake", response_model=IntakeResponse)
async def brand_intake(req: IntakeRequest) -> IntakeResponse:
    """Attribute an incoming source and persist a source and a snapshot file.

    Raises:
        HTTPException 400: source_type is not in SUPPORTED_SOURCE_TYPES.
        HTTPException 503: the brand map file is missing.
    """
    _ensure_dirs()

    if req.source_type not in SUPPORTED_SOURCE_TYPES:
        raise HTTPException(status_code=400, detail="Unsupported source_type")

    try:
        attribution = _attribute(req)
    except FileNotFoundError as exc:
        logger.exception("Brand map unavailable")
        raise HTTPException(status_code=503, detail="Brand map unavailable") from exc

    source_id = uuid.uuid4().hex
    # Aware UTC clock (utcnow() is deprecated); fixed-width microseconds keep
    # the string sortable chronologically.
    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    source_doc = {
        "id": source_id,
        "created_at": created_at,
        "created_by": "brand-intake",
        "workspace_id": req.workspace_id,
        "project_id": req.project_id,
        "agent_id": req.agent_id,
        "source_type": req.source_type,
        "payload": {
            "raw_ref": req.raw_ref or req.url or req.text or "",
            "mime_type": req.mime_type,
            "filename": req.filename,
            "url": req.url,
            # Only the first 2000 characters of the text are stored.
            "text_excerpt": (req.text or "")[:2000]
        },
        "attribution": attribution,
        "tags": req.tags or []
    }
    _write_json(DATA_DIR / "sources" / f"{source_id}.json", source_doc)

    snapshot_id = uuid.uuid4().hex
    snapshot_doc = {
        "id": snapshot_id,
        "created_at": created_at,
        "brand_id": attribution.get("brand_id") or "unattributed",
        "source_id": source_id,
        "quality": {
            "confidence": attribution.get("confidence", 0.0),
            "warnings": ["extraction_not_implemented"],
            "needs_review": attribution.get("status") != "attributed"
        },
        # Extraction is not implemented yet, so every section is empty.
        "extracted": {
            "palette": {},
            "typography": {},
            "logos": [],
            "web_tokens": {},
            "documents": {},
            "licensing": {}
        }
    }
    _write_json(DATA_DIR / "snapshots" / f"{snapshot_id}.json", snapshot_doc)

    return IntakeResponse(
        id=source_id,
        attribution=attribution,
        status=attribution.get("status", "unattributed"),
        created_at=created_at
    )


@app.get("/brand/sources/{source_id}")
async def get_source(source_id: str) -> Dict[str, Any]:
    """Return the stored source document, or 404 when it does not exist."""
    # source_id comes straight from the URL and is used to build a file path,
    # so reject anything that is not a plain identifier.
    if not _SAFE_ID_RE.fullmatch(source_id):
        raise HTTPException(status_code=404, detail="Source not found")
    path = DATA_DIR / "sources" / f"{source_id}.json"
    if not path.exists():
        raise HTTPException(status_code=404, detail="Source not found")
    return json.loads(path.read_text(encoding="utf-8"))


@app.get("/brand/brands/{brand_id}/latest")
async def latest_brand_snapshot(brand_id: str) -> Dict[str, Any]:
    """Return the snapshot with the greatest created_at for *brand_id*.

    Raises:
        HTTPException 404: no snapshot directory, or no snapshot for the brand.
    """
    snapshot_dir = DATA_DIR / "snapshots"
    if not snapshot_dir.exists():
        raise HTTPException(status_code=404, detail="No snapshots")

    latest: Optional[Dict[str, Any]] = None
    latest_key = ""
    for path in snapshot_dir.glob("*.json"):
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # JSON and decoding errors are ValueError subclasses; one bad file
            # must not take the whole endpoint down.
            logger.warning("Skipping unreadable snapshot %s: %s", path, exc)
            continue
        if not isinstance(data, dict) or data.get("brand_id") != brand_id:
            continue
        key = str(data.get("created_at") or "")
        if latest is None or key >= latest_key:
            latest, latest_key = data, key

    if latest is None:
        raise HTTPException(status_code=404, detail="No snapshots for brand")
    return latest