breakpilot-compliance/backend-compliance/compliance/api/import_routes.py
"""
FastAPI routes for Document Import and Gap Analysis.
Endpoints:
- POST /v1/import/analyze: Upload and analyze a compliance document
- GET /v1/import/documents: List imported documents for a tenant
- GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document
"""
import json
import logging
import os
import uuid
from typing import Optional
import httpx
from fastapi import APIRouter, File, Form, Header, UploadFile, HTTPException
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/import", tags=["document-import"])
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3.5:35b-a3b")
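# The Ollama endpoint and model come from the environment; for local development the
# defaults above can be overridden, e.g. (illustrative values):
#
#   export OLLAMA_URL=http://localhost:11434
#   export COMPLIANCE_LLM_MODEL=qwen3.5:35b-a3b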
# =============================================================================
# DOCUMENT TYPE DETECTION
# =============================================================================
DOCUMENT_TYPE_KEYWORDS = {
"DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"],
"TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"],
"VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 30"],
"PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"],
"AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"],
"COOKIE_POLICY": ["cookie", "tracking", "einwilligung"],
"RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"],
"AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"],
}
def detect_document_type(text: str) -> tuple[str, float]:
"""Detect document type from extracted text using keyword matching."""
text_lower = text.lower()
scores: dict[str, int] = {}
for doc_type, keywords in DOCUMENT_TYPE_KEYWORDS.items():
score = sum(1 for kw in keywords if kw in text_lower)
if score > 0:
scores[doc_type] = score
if not scores:
return "OTHER", 0.3
best_type = max(scores, key=scores.get)
confidence = min(0.95, 0.5 + scores[best_type] * 0.15)
return best_type, confidence
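# Worked example (illustrative): for a document containing only
# "Datenschutz-Folgenabschaetzung nach Art. 35 DSGVO", the DSFA keyword list scores one
# hit and no other list matches, so the function returns ("DSFA", 0.5 + 1 * 0.15) == ("DSFA", 0.65).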
# =============================================================================
# GAP ANALYSIS
# =============================================================================
GAP_RULES = [
{
"category": "AI Act Compliance",
"regulation": "EU AI Act Art. 6",
"check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"],
"gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"],
"severity": "CRITICAL",
"action": "Risikoklassifizierung fuer KI-Systeme durchfuehren",
},
{
"category": "Transparenz",
"regulation": "DSGVO Art. 13, 14, 22",
"check_keywords": ["automatisiert", "automated", "profiling"],
"gap_if_missing": ["informationspflicht", "information obligation", "transparenz"],
"severity": "HIGH",
"action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen",
},
{
"category": "TOMs",
"regulation": "DSGVO Art. 32",
"check_keywords": ["ki", "ai", "cloud", "saas"],
"gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"],
"severity": "MEDIUM",
"action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern",
},
{
"category": "VVT",
"regulation": "DSGVO Art. 30",
"check_keywords": ["verarbeitung", "processing", "daten"],
"gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"],
"severity": "HIGH",
"action": "Verarbeitungsverzeichnis aktualisieren",
},
{
"category": "Menschliche Aufsicht",
"regulation": "EU AI Act Art. 14",
"check_keywords": ["ki", "ai", "autonom", "autonomous"],
"gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"],
"severity": "MEDIUM",
"action": "Prozesse fuer menschliche Aufsicht definieren",
},
]
def analyze_gaps(text: str, doc_type: str) -> list[dict]:
"""Analyze document text for compliance gaps."""
text_lower = text.lower()
gaps = []
for rule in GAP_RULES:
# Check if rule applies (keywords present in document)
applies = any(kw in text_lower for kw in rule["check_keywords"])
if not applies:
continue
# Check if gap exists (required elements missing)
has_gap = not any(kw in text_lower for kw in rule["gap_if_missing"])
if has_gap:
gaps.append({
"id": f"gap-{uuid.uuid4().hex[:8]}",
"category": rule["category"],
"description": f"{rule['category']}: Luecke erkannt",
"severity": rule["severity"],
"regulation": rule["regulation"],
"required_action": rule["action"],
"related_step_id": doc_type.lower(),
})
return gaps
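# Worked example (illustrative): for a text like "Wir setzen KI und Cloud-Dienste ein",
# the trigger keywords of the AI Act, TOMs and human-oversight rules match while none of
# their gap_if_missing terms appear, so three gaps are reported (one CRITICAL, two MEDIUM);
# the Transparenz and VVT rules are skipped because none of their trigger keywords occur.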
# =============================================================================
# TEXT EXTRACTION
# =============================================================================
def extract_text_from_pdf(content: bytes) -> str:
"""Extract text from PDF using PyMuPDF (fitz)."""
try:
import fitz
doc = fitz.open(stream=content, filetype="pdf")
text_parts = []
for page in doc:
text_parts.append(page.get_text())
doc.close()
return "\n".join(text_parts)
except ImportError:
logger.warning("PyMuPDF not available, returning empty text")
return ""
except Exception as e:
logger.error(f"PDF extraction failed: {e}")
return ""
# =============================================================================
# LLM CLASSIFICATION (optional enhancement)
# =============================================================================
async def classify_with_llm(text: str) -> Optional[tuple[str, float]]:
"""Use Ollama LLM to classify document type (optional, falls back to keywords)."""
try:
prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien:
DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER
Antworte NUR mit dem Kategorienamen, nichts anderes.
Dokumenttext (erste 2000 Zeichen):
{text[:2000]}"""
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/generate",
json={
"model": LLM_MODEL,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.1, "num_predict": 20},
},
)
if response.status_code == 200:
result = response.json()
answer = result.get("response", "").strip().upper()
# Validate answer
valid_types = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB",
"COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"}
if answer in valid_types:
return answer, 0.85
except Exception as e:
logger.warning(f"LLM classification failed, using keyword fallback: {e}")
return None
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class DocumentAnalysisResponse(BaseModel):
document_id: str
filename: str
detected_type: str
confidence: float
extracted_entities: list[str]
recommendations: list[str]
gap_analysis: dict
class DocumentListResponse(BaseModel):
documents: list[dict]
total: int
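# A DocumentAnalysisResponse serializes to JSON roughly like this (values are illustrative):
#
#   {
#     "document_id": "1f2e3d4c-...",
#     "filename": "dsfa.pdf",
#     "detected_type": "DSFA",
#     "confidence": 0.65,
#     "extracted_entities": ["DSGVO", "DSFA"],
#     "recommendations": ["Verarbeitungsverzeichnis aktualisieren"],
#     "gap_analysis": {"id": "analysis-1f2e3d4c", "total_gaps": 1, "critical_gaps": 0, ...}
#   }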
# =============================================================================
# ROUTES
# =============================================================================
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
file: UploadFile = File(...),
document_type: str = Form("OTHER"),
tenant_id: str = Form("default"),
):
"""Upload and analyze a compliance document."""
if not file.filename:
raise HTTPException(status_code=400, detail="No file provided")
# Read file content
content = await file.read()
file_size = len(content)
    # Extract text (stored in extracted_text so it does not shadow sqlalchemy.text,
    # which the raw SQL statements below rely on)
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        extracted_text = extract_text_from_pdf(content)
    else:
        # Try to decode as plain text
        try:
            extracted_text = content.decode("utf-8")
        except UnicodeDecodeError:
            extracted_text = ""
    # Detect document type
    if document_type == "OTHER" and extracted_text:
        # Try LLM first, fall back to keyword matching
        llm_result = await classify_with_llm(extracted_text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(extracted_text)
else:
detected_type = document_type
confidence = 1.0
# Extract key entities
entities = []
entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
"Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
for kw in entity_keywords:
        if kw.lower() in extracted_text.lower():
            entities.append(kw)
    # Analyze gaps
    gaps = analyze_gaps(extracted_text, detected_type)
# Generate recommendations
recommendations = []
if gaps:
recommendations = [g["required_action"] for g in gaps[:5]]
if not recommendations:
recommendations = ["Dokument erscheint vollstaendig"]
# Persist to database
doc_id = str(uuid.uuid4())
# Build gap_analysis_result before DB write so it's always available
total_gaps = len(gaps)
gap_analysis_result = {
"id": f"analysis-{doc_id[:8]}",
"total_gaps": total_gaps,
"critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
"high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
"medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
"low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
"gaps": gaps,
"recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
}
db = SessionLocal()
try:
db.execute(
text("""INSERT INTO compliance_imported_documents
(id, tenant_id, filename, file_type, file_size, detected_type, detection_confidence,
extracted_text, extracted_entities, recommendations, status, analyzed_at)
VALUES (:id, :tenant_id, :filename, :file_type, :file_size, :detected_type, :confidence,
:text, :entities::jsonb, :recommendations::jsonb, 'analyzed', NOW())"""),
{
"id": doc_id,
"tenant_id": tenant_id,
"filename": file.filename,
"file_type": file.content_type or "unknown",
"file_size": file_size,
"detected_type": detected_type,
"confidence": confidence,
"text": text[:50000], # Limit stored text
"entities": str(entities).replace("'", '"'),
"recommendations": str(recommendations).replace("'", '"'),
},
)
if total_gaps > 0:
db.execute(
text("""INSERT INTO compliance_gap_analyses
(tenant_id, document_id, total_gaps, critical_gaps, high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
VALUES (:tenant_id, :document_id, :total, :critical, :high, :medium, :low, :gaps::jsonb, :packages::jsonb)"""),
{
"tenant_id": tenant_id,
"document_id": doc_id,
"total": gap_analysis_result["total_gaps"],
"critical": gap_analysis_result["critical_gaps"],
"high": gap_analysis_result["high_gaps"],
"medium": gap_analysis_result["medium_gaps"],
"low": gap_analysis_result["low_gaps"],
"gaps": json.dumps(gaps),
"packages": json.dumps(gap_analysis_result["recommended_packages"]),
},
)
db.commit()
except Exception as e:
db.rollback()
logger.error(f"Failed to persist document analysis: {e}")
finally:
db.close()
return DocumentAnalysisResponse(
document_id=doc_id,
filename=file.filename or "unknown",
detected_type=detected_type,
confidence=confidence,
extracted_entities=entities,
recommendations=recommendations,
gap_analysis=gap_analysis_result,
)
@router.get("/gap-analysis/{document_id}")
async def get_gap_analysis(
document_id: str,
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Get gap analysis for a specific document."""
tid = x_tenant_id or tenant_id
db = SessionLocal()
try:
result = db.execute(
text("SELECT * FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
{"doc_id": document_id, "tid": tid},
).fetchone()
if not result:
raise HTTPException(status_code=404, detail="Gap analysis not found")
        return dict(result._mapping)
finally:
db.close()
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
"""List all imported documents for a tenant."""
db = SessionLocal()
try:
result = db.execute(
text("""SELECT id, filename, file_type, file_size, detected_type, detection_confidence,
extracted_entities, recommendations, status, analyzed_at, created_at
FROM compliance_imported_documents
WHERE tenant_id = :tenant_id
ORDER BY created_at DESC"""),
{"tenant_id": tenant_id},
)
rows = result.fetchall()
documents = []
for row in rows:
documents.append({
"id": str(row[0]),
"filename": row[1],
"file_type": row[2],
"file_size": row[3],
"detected_type": row[4],
"confidence": row[5],
"extracted_entities": row[6] or [],
"recommendations": row[7] or [],
"status": row[8],
"analyzed_at": str(row[9]) if row[9] else None,
"created_at": str(row[10]),
})
return DocumentListResponse(documents=documents, total=len(documents))
finally:
db.close()
@router.get("", response_model=DocumentListResponse)
async def list_documents_root(
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Alias: GET /v1/import → list documents (proxy-compatible URL)."""
tid = x_tenant_id or tenant_id
return await list_documents(tenant_id=tid)
@router.delete("/{document_id}")
async def delete_document(
document_id: str,
tenant_id: str = "default",
x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
"""Delete an imported document and its gap analysis."""
tid = x_tenant_id or tenant_id
db = SessionLocal()
try:
# Delete gap analysis first (FK dependency)
db.execute(
text("DELETE FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
{"doc_id": document_id, "tid": tid},
)
result = db.execute(
text("DELETE FROM compliance_imported_documents WHERE id = :doc_id AND tenant_id = :tid"),
{"doc_id": document_id, "tid": tid},
)
db.commit()
if result.rowcount == 0:
raise HTTPException(status_code=404, detail="Document not found")
return {"success": True, "deleted_id": document_id}
except HTTPException:
raise
except Exception as e:
db.rollback()
logger.error(f"Failed to delete document {document_id}: {e}")
raise HTTPException(status_code=500, detail="Failed to delete document")
finally:
db.close()
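# Wiring sketch (an assumption about the surrounding application, not defined in this
# module): the router is expected to be mounted on the FastAPI app along these lines:
#
#   from fastapi import FastAPI
#   from compliance.api.import_routes import router as import_router
#
#   app = FastAPI()
#   app.include_router(import_router)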