New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
"""LLM-based document classification via ai-compliance-sdk."""
|
|
|
|
import json
|
|
import httpx
|
|
|
|
from config import settings
|
|
from .prompts import (
|
|
CLASSIFICATION_SYSTEM_PROMPT,
|
|
CLASSIFICATION_USER_PROMPT,
|
|
VALID_CLASSIFICATIONS,
|
|
)
|
|
from .keyword_fallback import keyword_classify
|
|
|
|
|
|
async def classify_document(
    text: str,
    filename: str,
    tenant_id: str,
    user_id: str = "system",
) -> dict:
    """Classify a document using the LLM gateway.

    The text is cut to ``settings.LLM_TEXT_LIMIT`` characters before it is
    inserted into the user prompt; the untruncated text is what gets passed
    to the keyword fallback.

    Args:
        text: Extracted document text.
        filename: Name of the document, included in the prompt.
        tenant_id: Sent to the gateway as the ``X-Tenant-ID`` header.
        user_id: Sent to the gateway as the ``X-User-ID`` header.

    Returns:
        On a successful LLM round-trip, a dict with keys ``classification``
        (always a member of ``VALID_CLASSIFICATIONS`` or ``"Sonstiges"``),
        ``confidence`` (float clamped to [0.0, 1.0]) and ``reasoning``.
        Otherwise whatever ``keyword_classify(text, filename)`` returns
        (presumably the same shape -- confirm against keyword_fallback).

    The keyword fallback is used when the request fails at transport level,
    when the gateway answers with a non-200 status, or when the response
    body / model output cannot be interpreted.
    """
    truncated = text[: settings.LLM_TEXT_LIMIT]
    user_prompt = CLASSIFICATION_USER_PROMPT.format(
        filename=filename, text=truncated
    )

    # Keep the try body limited to the network call so that only transport
    # problems are handled here.
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(
                f"{settings.LLM_GATEWAY_URL}/sdk/v1/llm/chat",
                json={
                    "messages": [
                        {"role": "system", "content": CLASSIFICATION_SYSTEM_PROMPT},
                        {"role": "user", "content": user_prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 300,
                },
                headers={
                    "X-Tenant-ID": tenant_id,
                    "X-User-ID": user_id,
                    "Content-Type": "application/json",
                },
            )
    except httpx.RequestError:
        return keyword_classify(text, filename)

    if resp.status_code != 200:
        return keyword_classify(text, filename)

    # A 200 response can still carry a body we cannot use (non-JSON, wrong
    # shape, non-numeric confidence, ...). ValueError covers
    # json.JSONDecodeError; the remaining types cover shape mismatches.
    try:
        return _parse_llm_result(_extract_llm_content(resp.json()))
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        return keyword_classify(text, filename)


def _extract_llm_content(data):
    """Return the assistant message text from a gateway response body.

    Looks, in order, at ``data["content"]``, ``data["message"]["content"]``
    and ``data["choices"][0]["message"]["content"]``. Returns ``""`` when
    none of them holds a truthy value.

    Raises:
        TypeError: if the response body is not a JSON object.
    """
    if not isinstance(data, dict):
        raise TypeError("LLM gateway response is not a JSON object")

    content = data.get("content")
    if content:
        return content

    # "message" / "choices" may be present but null or of an unexpected
    # type, so check the shape before descending instead of chaining .get().
    message = data.get("message")
    if isinstance(message, dict) and message.get("content"):
        return message["content"]

    choices = data.get("choices")
    if isinstance(choices, list) and choices and isinstance(choices[0], dict):
        choice_message = choices[0].get("message")
        if isinstance(choice_message, dict):
            return choice_message.get("content") or ""

    return ""


def _parse_llm_result(content):
    """Decode the model's JSON answer into the classification result dict.

    Raises:
        ValueError: if ``content`` is not valid JSON, does not decode to a
            JSON object, or carries a non-numeric / NaN confidence.
        TypeError: if ``content`` or the confidence value has a type that
            cannot be decoded / converted.
    """
    result = json.loads(content)
    if not isinstance(result, dict):
        raise ValueError("LLM answer is not a JSON object")

    classification = result.get("classification", "Sonstiges")
    # The isinstance check also protects the membership test from
    # unhashable values such as lists.
    if not isinstance(classification, str) or classification not in VALID_CLASSIFICATIONS:
        classification = "Sonstiges"

    confidence = float(result.get("confidence", 0.5))
    # json.loads accepts the literal NaN, and NaN would pass through the
    # min/max clamp unchanged; NaN is the only float unequal to itself.
    if confidence != confidence:
        raise ValueError("confidence is NaN")

    return {
        "classification": classification,
        "confidence": min(max(confidence, 0.0), 1.0),
        "reasoning": result.get("reasoning", ""),
    }
|