Some checks failed
Deploy to Coolify / deploy (push) Has been cancelled
SQLAlchemy 2.x requires raw SQL strings to be explicitly wrapped in text(). Fixed 16 instances across 5 route files. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
447 lines
16 KiB
Python
447 lines
16 KiB
Python
"""
|
|
FastAPI routes for Document Import and Gap Analysis.
|
|
|
|
Endpoints:
|
|
- POST /v1/import/analyze: Upload and analyze a compliance document
|
|
- GET /v1/import/documents: List imported documents for a tenant
|
|
- GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, File, Form, Header, UploadFile, HTTPException
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import text
|
|
|
|
from database import SessionLocal
|
|
|
|
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under /v1/import.
router = APIRouter(prefix="/v1/import", tags=["document-import"])

# Base URL of the Ollama server used for optional LLM classification.
# Default targets the Docker host gateway; override via OLLAMA_URL.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")

# Model name sent to Ollama; override via COMPLIANCE_LLM_MODEL.
LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3.5:35b-a3b")
|
|
|
|
# =============================================================================
|
|
# DOCUMENT TYPE DETECTION
|
|
# =============================================================================
|
|
|
|
# Keyword signatures per document category (matched case-insensitively,
# as substrings). Used by detect_document_type() as the non-LLM fallback.
DOCUMENT_TYPE_KEYWORDS = {
    "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"],
    "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"],
    "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 30"],
    "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"],
    "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"],
    "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"],
    "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"],
    "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"],
}


def detect_document_type(text: str) -> tuple[str, float]:
    """Classify a document by counting category keywords in its text.

    Returns a ``(document_type, confidence)`` pair. Confidence grows with
    the number of matched keywords (capped at 0.95); when no keyword of
    any category matches, the fallback ``("OTHER", 0.3)`` is returned.
    """
    lowered = text.lower()

    # Count keyword hits per category, keeping only categories that scored.
    hits = {
        doc_type: matched
        for doc_type, keywords in DOCUMENT_TYPE_KEYWORDS.items()
        if (matched := sum(kw in lowered for kw in keywords)) > 0
    }

    if not hits:
        return "OTHER", 0.3

    # Highest score wins; ties resolve to the first-inserted category,
    # matching dict iteration order.
    winner, count = max(hits.items(), key=lambda item: item[1])
    return winner, min(0.95, 0.5 + count * 0.15)
|
|
|
|
|
|
# =============================================================================
|
|
# GAP ANALYSIS
|
|
# =============================================================================
|
|
|
|
# Rule table for the keyword-based gap analysis. A rule fires when any
# check_keyword appears in the document but none of gap_if_missing does.
GAP_RULES = [
    {
        "category": "AI Act Compliance",
        "regulation": "EU AI Act Art. 6",
        "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"],
        "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"],
        "severity": "CRITICAL",
        "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren",
    },
    {
        "category": "Transparenz",
        "regulation": "DSGVO Art. 13, 14, 22",
        "check_keywords": ["automatisiert", "automated", "profiling"],
        "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"],
        "severity": "HIGH",
        "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen",
    },
    {
        "category": "TOMs",
        "regulation": "DSGVO Art. 32",
        "check_keywords": ["ki", "ai", "cloud", "saas"],
        "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"],
        "severity": "MEDIUM",
        "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern",
    },
    {
        "category": "VVT",
        "regulation": "DSGVO Art. 30",
        "check_keywords": ["verarbeitung", "processing", "daten"],
        "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"],
        "severity": "HIGH",
        "action": "Verarbeitungsverzeichnis aktualisieren",
    },
    {
        "category": "Menschliche Aufsicht",
        "regulation": "EU AI Act Art. 14",
        "check_keywords": ["ki", "ai", "autonom", "autonomous"],
        "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"],
        "severity": "MEDIUM",
        "action": "Prozesse fuer menschliche Aufsicht definieren",
    },
]


def analyze_gaps(text: str, doc_type: str) -> list[dict]:
    """Scan document text against GAP_RULES and return detected gaps.

    A rule contributes a gap only when the document mentions the rule's
    topic (any check_keyword present) yet none of the required elements
    (gap_if_missing) appear. Gap ids are random and unique per call.
    """
    lowered = text.lower()
    findings: list[dict] = []

    for rule in GAP_RULES:
        # Skip rules whose topic the document never mentions.
        if not any(kw in lowered for kw in rule["check_keywords"]):
            continue
        # No gap when at least one required element is already covered.
        if any(kw in lowered for kw in rule["gap_if_missing"]):
            continue
        findings.append({
            "id": f"gap-{uuid.uuid4().hex[:8]}",
            "category": rule["category"],
            "description": f"{rule['category']}: Luecke erkannt",
            "severity": rule["severity"],
            "regulation": rule["regulation"],
            "required_action": rule["action"],
            "related_step_id": doc_type.lower(),
        })

    return findings
|
|
|
|
|
|
# =============================================================================
|
|
# TEXT EXTRACTION
|
|
# =============================================================================
|
|
|
|
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from a PDF given as raw bytes.

    Uses PyMuPDF (fitz) when available. Returns "" (never raises) when
    the library is missing or the PDF cannot be parsed, so callers can
    treat an empty string as "no extractable text".
    """
    try:
        import fitz  # PyMuPDF — optional dependency
    except ImportError:
        logger.warning("PyMuPDF not available, returning empty text")
        return ""

    try:
        doc = fitz.open(stream=content, filetype="pdf")
        try:
            # Join per-page text with newlines between pages.
            return "\n".join(page.get_text() for page in doc)
        finally:
            # Always release the document handle — the previous version
            # leaked it when page.get_text() raised mid-iteration.
            doc.close()
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""
|
|
|
|
|
|
# =============================================================================
|
|
# LLM CLASSIFICATION (optional enhancement)
|
|
# =============================================================================
|
|
|
|
async def classify_with_llm(text: str) -> Optional[tuple[str, float]]:
    """Use Ollama LLM to classify document type (optional, falls back to keywords).

    Sends the first 2000 characters of *text* to the Ollama /api/generate
    endpoint and expects a bare category name back. Returns
    ``(category, 0.85)`` when the answer is one of the known categories,
    otherwise ``None`` so the caller can fall back to keyword detection.
    Never raises: any network/parse error is logged and yields ``None``.
    """
    try:
        # Prompt instructs the model to answer with exactly one category name.
        prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien:
DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER

Antworte NUR mit dem Kategorienamen, nichts anderes.

Dokumenttext (erste 2000 Zeichen):
{text[:2000]}"""

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": LLM_MODEL,
                    "prompt": prompt,
                    "stream": False,
                    # Low temperature + short completion: we only want a label.
                    "options": {"temperature": 0.1, "num_predict": 20},
                },
            )

            if response.status_code == 200:
                result = response.json()
                answer = result.get("response", "").strip().upper()
                # Validate answer against the closed category set before
                # trusting model output.
                valid_types = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB",
                               "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"}
                if answer in valid_types:
                    return answer, 0.85
    except Exception as e:
        # Classification is best-effort; degrade to keyword matching.
        logger.warning(f"LLM classification failed, using keyword fallback: {e}")

    return None
|
|
|
|
|
|
# =============================================================================
|
|
# RESPONSE MODELS
|
|
# =============================================================================
|
|
|
|
class DocumentAnalysisResponse(BaseModel):
    """Response payload for POST /v1/import/analyze."""

    document_id: str          # UUID of the stored document record
    filename: str             # original upload filename
    detected_type: str        # e.g. DSFA, TOM, VVT, ... or OTHER
    confidence: float         # detection confidence in [0, 1]
    extracted_entities: list[str]   # regulatory keywords found in the text
    recommendations: list[str]      # suggested actions derived from gaps
    gap_analysis: dict        # full gap-analysis result (counts + gap list)
|
|
|
|
|
|
class DocumentListResponse(BaseModel):
    """Response payload for the document listing endpoints."""

    documents: list[dict]  # one summary dict per imported document
    total: int             # number of documents returned
|
|
|
|
|
|
# =============================================================================
|
|
# ROUTES
|
|
# =============================================================================
|
|
|
|
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
    file: UploadFile = File(...),
    document_type: str = Form("OTHER"),
    tenant_id: str = Form("default"),
):
    """Upload and analyze a compliance document.

    Pipeline: read upload → extract text (PDF via PyMuPDF, otherwise
    UTF-8 decode) → detect document type (LLM with keyword fallback,
    unless an explicit type was supplied) → keyword entity scan →
    rule-based gap analysis → best-effort persistence. The analysis is
    returned to the client even when the database write fails.

    Raises:
        HTTPException 400 when no filename was provided.
    """
    import json  # stdlib; local import matches this module's style

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Read file content.
    content = await file.read()
    file_size = len(content)

    # Extract text. BUGFIX: the local is named `doc_text`, NOT `text` —
    # a local named `text` shadowed sqlalchemy.text, so every
    # db.execute(text(...)) below raised "'str' object is not callable"
    # and persistence silently failed inside the broad except.
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        doc_text = extract_text_from_pdf(content)
    else:
        # Try to decode as text; undecodable binaries analyze as empty.
        try:
            doc_text = content.decode("utf-8")
        except UnicodeDecodeError:
            doc_text = ""

    # Detect document type.
    if document_type == "OTHER" and doc_text:
        # Try LLM first, fall back to keyword matching.
        llm_result = await classify_with_llm(doc_text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(doc_text)
    else:
        # Caller-supplied type is trusted with full confidence.
        detected_type = document_type
        confidence = 1.0

    # Extract key regulatory entities (case-insensitive keyword scan).
    entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
                       "Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
    text_lower = doc_text.lower()  # hoisted: lower() once, not per keyword
    entities = [kw for kw in entity_keywords if kw.lower() in text_lower]

    # Analyze gaps.
    gaps = analyze_gaps(doc_text, detected_type)

    # Recommendations are the top-5 gap actions (or an "all good" note).
    recommendations = [g["required_action"] for g in gaps[:5]]
    if not recommendations:
        recommendations = ["Dokument erscheint vollstaendig"]

    doc_id = str(uuid.uuid4())

    # Build gap_analysis_result before the DB write so it's always available.
    total_gaps = len(gaps)
    gap_analysis_result = {
        "id": f"analysis-{doc_id[:8]}",
        "total_gaps": total_gaps,
        "critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
        "high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
        "medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
        "low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
        "gaps": gaps,
        "recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
    }

    # Persistence is best-effort: failures are logged, not surfaced.
    db = SessionLocal()
    try:
        db.execute(
            text("""INSERT INTO compliance_imported_documents
                    (id, tenant_id, filename, file_type, file_size, detected_type, detection_confidence,
                     extracted_text, extracted_entities, recommendations, status, analyzed_at)
                    VALUES (:id, :tenant_id, :filename, :file_type, :file_size, :detected_type, :confidence,
                            :text, :entities::jsonb, :recommendations::jsonb, 'analyzed', NOW())"""),
            {
                "id": doc_id,
                "tenant_id": tenant_id,
                "filename": file.filename,
                "file_type": file.content_type or "unknown",
                "file_size": file_size,
                "detected_type": detected_type,
                "confidence": confidence,
                "text": doc_text[:50000],  # limit stored text
                # json.dumps yields valid JSON even when values contain
                # quotes; the old str(...).replace("'", '"') did not.
                "entities": json.dumps(entities),
                "recommendations": json.dumps(recommendations),
            },
        )

        if total_gaps > 0:
            db.execute(
                text("""INSERT INTO compliance_gap_analyses
                        (tenant_id, document_id, total_gaps, critical_gaps, high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
                        VALUES (:tenant_id, :document_id, :total, :critical, :high, :medium, :low, :gaps::jsonb, :packages::jsonb)"""),
                {
                    "tenant_id": tenant_id,
                    "document_id": doc_id,
                    "total": gap_analysis_result["total_gaps"],
                    "critical": gap_analysis_result["critical_gaps"],
                    "high": gap_analysis_result["high_gaps"],
                    "medium": gap_analysis_result["medium_gaps"],
                    "low": gap_analysis_result["low_gaps"],
                    "gaps": json.dumps(gaps),
                    "packages": json.dumps(gap_analysis_result["recommended_packages"]),
                },
            )

        db.commit()
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to persist document analysis: {e}")
    finally:
        db.close()

    return DocumentAnalysisResponse(
        document_id=doc_id,
        filename=file.filename or "unknown",
        detected_type=detected_type,
        confidence=confidence,
        extracted_entities=entities,
        recommendations=recommendations,
        gap_analysis=gap_analysis_result,
    )
|
|
|
|
|
|
@router.get("/gap-analysis/{document_id}")
async def get_gap_analysis(
    document_id: str,
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Get gap analysis for a specific document.

    The tenant is taken from the X-Tenant-ID header when present,
    otherwise from the query parameter.

    Raises:
        HTTPException 404 when no analysis exists for the document/tenant.
    """
    tid = x_tenant_id or tenant_id
    db = SessionLocal()
    try:
        result = db.execute(
            text("SELECT * FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": tid},
        ).fetchone()
        if not result:
            raise HTTPException(status_code=404, detail="Gap analysis not found")
        # BUGFIX: SQLAlchemy 1.4+/2.x Row objects are not directly
        # dict()-convertible; the column-name mapping lives on _mapping.
        return dict(result._mapping)
    finally:
        db.close()
|
|
|
|
|
|
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
    """List all imported documents for a tenant, newest first."""
    session = SessionLocal()
    try:
        rows = session.execute(
            text("""SELECT id, filename, file_type, file_size, detected_type, detection_confidence,
                    extracted_entities, recommendations, status, analyzed_at, created_at
                    FROM compliance_imported_documents
                    WHERE tenant_id = :tenant_id
                    ORDER BY created_at DESC"""),
            {"tenant_id": tenant_id},
        ).fetchall()

        # Column order matches the SELECT list above; the response key for
        # detection_confidence is "confidence".
        columns = (
            "id", "filename", "file_type", "file_size", "detected_type",
            "confidence", "extracted_entities", "recommendations",
            "status", "analyzed_at", "created_at",
        )
        documents = []
        for row in rows:
            record = dict(zip(columns, row))
            record["id"] = str(record["id"])
            record["extracted_entities"] = record["extracted_entities"] or []
            record["recommendations"] = record["recommendations"] or []
            record["analyzed_at"] = str(record["analyzed_at"]) if record["analyzed_at"] else None
            record["created_at"] = str(record["created_at"])
            documents.append(record)

        return DocumentListResponse(documents=documents, total=len(documents))
    finally:
        session.close()
|
|
|
|
|
|
@router.get("", response_model=DocumentListResponse)
async def list_documents_root(
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Alias: GET /v1/import → list documents (proxy-compatible URL)."""
    # Header-supplied tenant wins over the query parameter.
    effective_tenant = x_tenant_id or tenant_id
    return await list_documents(tenant_id=effective_tenant)
|
|
|
|
|
|
@router.delete("/{document_id}")
async def delete_document(
    document_id: str,
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Delete an imported document and its gap analysis.

    Raises 404 when no matching document exists for the tenant and 500
    on unexpected database failures.
    """
    effective_tenant = x_tenant_id or tenant_id
    session = SessionLocal()
    try:
        # Gap analyses reference the document (FK), so remove them first.
        session.execute(
            text("DELETE FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": effective_tenant},
        )
        outcome = session.execute(
            text("DELETE FROM compliance_imported_documents WHERE id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": effective_tenant},
        )
        session.commit()

        if outcome.rowcount == 0:
            raise HTTPException(status_code=404, detail="Document not found")
        return {"success": True, "deleted_id": document_id}
    except HTTPException:
        raise
    except Exception as e:
        session.rollback()
        logger.error(f"Failed to delete document {document_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to delete document")
    finally:
        session.close()
|