# breakpilot-compliance/document-crawler/classifiers/llm_classifier.py

"""LLM-based document classification via ai-compliance-sdk."""

import json
import httpx

from config import settings
from .prompts import (
    CLASSIFICATION_SYSTEM_PROMPT,
    CLASSIFICATION_USER_PROMPT,
    VALID_CLASSIFICATIONS,
)
from .keyword_fallback import keyword_classify


def _extract_content(data):
    """Return the assistant message text from a gateway response, or None.

    Three response layouts are probed, in this order:
    ``{"content": ...}``, ``{"message": {"content": ...}}`` and
    ``{"choices": [{"message": {"content": ...}}]}``. The first candidate
    that is a non-empty string wins. Every level is type-checked so that
    ``null`` values, empty ``choices`` lists or non-dict payloads yield
    ``None`` instead of raising.
    """
    if not isinstance(data, dict):
        return None

    message = data.get("message")
    choices = data.get("choices")
    first_choice = choices[0] if isinstance(choices, list) and choices else None
    choice_message = (
        first_choice.get("message") if isinstance(first_choice, dict) else None
    )

    candidates = (
        data.get("content"),
        message.get("content") if isinstance(message, dict) else None,
        choice_message.get("content") if isinstance(choice_message, dict) else None,
    )
    for candidate in candidates:
        if isinstance(candidate, str) and candidate:
            return candidate
    return None


def _parse_classification(content):
    """Turn the model's JSON answer into a result dict, or None if malformed.

    ``content`` is expected to be a JSON object with the optional keys
    ``classification``, ``confidence`` and ``reasoning``. Returns ``None``
    when ``content`` is missing, is not valid JSON, or does not decode to
    an object, so the caller can fall back to another classifier.
    """
    if not isinstance(content, str):
        return None
    try:
        result = json.loads(content)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(result, dict):
        return None

    classification = result.get("classification", "Sonstiges")
    try:
        is_valid = classification in VALID_CLASSIFICATIONS
    except TypeError:
        # An unhashable label (e.g. a list) cannot be looked up in a
        # hash-based container; treat it as unknown.
        is_valid = False
    if not is_valid:
        classification = "Sonstiges"

    try:
        confidence = float(result.get("confidence", 0.5))
    except (TypeError, ValueError, OverflowError):
        # null, non-numeric strings, nested values, or integers too large
        # for a float: use the same default as a missing key.
        confidence = 0.5
    if confidence != confidence:
        # NaN (which json.loads accepts) would slip through min/max unchanged.
        confidence = 0.5

    return {
        "classification": classification,
        "confidence": min(max(confidence, 0.0), 1.0),
        "reasoning": result.get("reasoning", ""),
    }


async def classify_document(
    text: str,
    filename: str,
    tenant_id: str,
    user_id: str = "system",
) -> dict:
    """Classify a document using the LLM gateway.

    The text is truncated to ``settings.LLM_TEXT_LIMIT`` characters, sent
    together with the filename to the gateway's chat endpoint, and the
    model's JSON answer is validated.

    Args:
        text: Extracted document text.
        filename: Name of the document; included in the prompt.
        tenant_id: Sent as the ``X-Tenant-ID`` request header.
        user_id: Sent as the ``X-User-ID`` request header.

    Returns:
        On success, a dict with the keys ``classification`` (a member of
        ``VALID_CLASSIFICATIONS``, or ``"Sonstiges"`` for unknown labels),
        ``confidence`` (float clamped to [0.0, 1.0]) and ``reasoning``.
        If the request fails, the gateway answers with a non-200 status,
        or the response cannot be interpreted, the result of
        ``keyword_classify(text, filename)`` is returned instead (using the
        full, untruncated text).
    """
    truncated = text[: settings.LLM_TEXT_LIMIT]
    user_prompt = CLASSIFICATION_USER_PROMPT.format(
        filename=filename, text=truncated
    )

    # Only the HTTP exchange and body decoding live inside the try block, so
    # unrelated programming errors are not silently turned into a fallback.
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                f"{settings.LLM_GATEWAY_URL}/sdk/v1/llm/chat",
                json={
                    "messages": [
                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 300,
                },
                headers={
                    "X-Tenant-ID": tenant_id,
                    "X-User-ID": user_id,
                    "Content-Type": "application/json",
                },
            )

            if resp.status_code != 200:
                return keyword_classify(text, filename)

            data = resp.json()
    except (httpx.RequestError, ValueError):
        # ValueError covers both invalid JSON and undecodable response bytes.
        return keyword_classify(text, filename)

    parsed = _parse_classification(_extract_content(data))
    if parsed is None:
        # The gateway answered, but not with a usable classification object.
        return keyword_classify(text, filename)
    return parsed