feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions
@@ -0,0 +1,73 @@
+"""LLM-based document classification via ai-compliance-sdk."""
+
+import json
+import httpx
+
+from config import settings
+from .prompts import (
+    CLASSIFICATION_SYSTEM_PROMPT,
+    CLASSIFICATION_USER_PROMPT,
+    VALID_CLASSIFICATIONS,
+)
+from .keyword_fallback import keyword_classify
+
+
+async def classify_document(
+    text: str,
+    filename: str,
+    tenant_id: str,
+    user_id: str = "system",
+) -> dict:
+    """Classify a document using the LLM gateway.
+
+    Returns dict with keys: classification, confidence, reasoning.
+    Falls back to keyword heuristic if LLM is unavailable.
+    """
+    truncated = text[: settings.LLM_TEXT_LIMIT]
+    user_prompt = CLASSIFICATION_USER_PROMPT.format(
+        filename=filename, text=truncated
+    )
+
+    try:
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            resp = await client.post(
+                f"{settings.LLM_GATEWAY_URL}/sdk/v1/llm/chat",
+                json={
+                    "messages": [
+                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
+                        {"role": "user", "content": user_prompt},
+                    ],
+                    "temperature": 0.1,
+                    "max_tokens": 300,
+                },
+                headers={
+                    "X-Tenant-ID": tenant_id,
+                    "X-User-ID": user_id,
+                    "Content-Type": "application/json",
+                },
+            )
+
+            if resp.status_code != 200:
+                return keyword_classify(text, filename)
+
+            data = resp.json()
+            # The SDK returns the assistant message content
+            content = (
+                data.get("content")
+                or data.get("message", {}).get("content")
+                or data.get("choices", [{}])[0].get("message", {}).get("content", "")
+            )
+
+            result = json.loads(content)
+            classification = result.get("classification", "Sonstiges")
+            if classification not in VALID_CLASSIFICATIONS:
+                classification = "Sonstiges"
+
+            return {
+                "classification": classification,
+                "confidence": min(max(float(result.get("confidence", 0.5)), 0.0), 1.0),
+                "reasoning": result.get("reasoning", ""),
+            }
+
+    except (httpx.RequestError, json.JSONDecodeError, KeyError, IndexError):
+        return keyword_classify(text, filename)