Files
breakpilot-compliance/backend-compliance/compliance/api/import_routes.py
Benjamin Admin e6d666b89b
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 37s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 22s
CI / test-python-dsms-gateway (push) Successful in 18s
feat: Vorbereitung-Module auf 100% — Persistenz, Backend-Services, UCCA Frontend
Phase A: PostgreSQL State Store (sdk_states Tabelle, InMemory-Fallback)
Phase B: Modules dynamisch vom Backend, Scope DB-Persistenz, Source Policy State
Phase C: UCCA Frontend (3 Seiten, Wizard, RiskScoreGauge), Obligations Live-Daten
Phase D: Document Import (PDF/LLM/Gap-Analyse), System Screening (SBOM/OSV.dev)
Phase E: Company Profile CRUD mit Audit-Logging
Phase F: Tests (Python + TypeScript), flow-data.ts DB-Tabellen aktualisiert

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 11:04:31 +01:00

381 lines
14 KiB
Python

"""
FastAPI routes for Document Import and Gap Analysis.
Endpoints:
- POST /v1/import/analyze: Upload and analyze a compliance document
- GET /v1/import/documents: List imported documents for a tenant
- GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document
"""
import logging
import os
import uuid
from typing import Optional
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from database import SessionLocal
# Module-level logger (named after this module's import path).
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /v1/import and grouped in OpenAPI docs.
router = APIRouter(prefix="/v1/import", tags=["document-import"])
# Ollama endpoint for the optional LLM classifier; the default targets an
# Ollama instance on the Docker host. Overridable via environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model name passed to Ollama's /api/generate endpoint.
LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3:30b-a3b")
# =============================================================================
# DOCUMENT TYPE DETECTION
# =============================================================================
# Keyword lists per document category; matching is case-insensitive substring.
DOCUMENT_TYPE_KEYWORDS = {
    "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"],
    "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"],
    "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 30"],
    "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"],
    "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"],
    "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"],
    "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"],
    "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"],
}


def detect_document_type(text: str) -> tuple[str, float]:
    """Classify a document by counting keyword hits per category.

    Returns a ``(doc_type, confidence)`` pair. Confidence grows with the
    number of matched keywords (0.5 base + 0.15 per hit, capped at 0.95);
    documents matching nothing are reported as ``("OTHER", 0.3)``.
    """
    haystack = text.lower()
    # Keep only categories with at least one keyword hit.
    hits = {
        category: n
        for category, keywords in DOCUMENT_TYPE_KEYWORDS.items()
        if (n := sum(kw in haystack for kw in keywords)) > 0
    }
    if not hits:
        return "OTHER", 0.3
    winner, count = max(hits.items(), key=lambda item: item[1])
    return winner, min(0.95, 0.5 + count * 0.15)
# =============================================================================
# GAP ANALYSIS
# =============================================================================
# Each rule fires only for relevant documents (check_keywords present) and
# reports a gap when none of its required elements (gap_if_missing) appear.
GAP_RULES = [
    {
        "category": "AI Act Compliance",
        "regulation": "EU AI Act Art. 6",
        "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"],
        "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"],
        "severity": "CRITICAL",
        "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren",
    },
    {
        "category": "Transparenz",
        "regulation": "DSGVO Art. 13, 14, 22",
        "check_keywords": ["automatisiert", "automated", "profiling"],
        "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"],
        "severity": "HIGH",
        "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen",
    },
    {
        "category": "TOMs",
        "regulation": "DSGVO Art. 32",
        "check_keywords": ["ki", "ai", "cloud", "saas"],
        "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"],
        "severity": "MEDIUM",
        "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern",
    },
    {
        "category": "VVT",
        "regulation": "DSGVO Art. 30",
        "check_keywords": ["verarbeitung", "processing", "daten"],
        "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"],
        "severity": "HIGH",
        "action": "Verarbeitungsverzeichnis aktualisieren",
    },
    {
        "category": "Menschliche Aufsicht",
        "regulation": "EU AI Act Art. 14",
        "check_keywords": ["ki", "ai", "autonom", "autonomous"],
        "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"],
        "severity": "MEDIUM",
        "action": "Prozesse fuer menschliche Aufsicht definieren",
    },
]


def analyze_gaps(text: str, doc_type: str) -> list[dict]:
    """Run the rule-based compliance gap scan over *text*.

    A rule contributes a gap record only when the document is in scope
    (any ``check_keywords`` substring present, case-insensitive) AND none
    of its ``gap_if_missing`` elements are found. Gap ids are random.
    """
    lowered = text.lower()
    findings: list[dict] = []
    for rule in GAP_RULES:
        if not any(kw in lowered for kw in rule["check_keywords"]):
            continue  # rule does not apply to this document
        if any(kw in lowered for kw in rule["gap_if_missing"]):
            continue  # required element is present — no gap
        findings.append({
            "id": f"gap-{uuid.uuid4().hex[:8]}",
            "category": rule["category"],
            "description": f"{rule['category']}: Luecke erkannt",
            "severity": rule["severity"],
            "regulation": rule["regulation"],
            "required_action": rule["action"],
            "related_step_id": doc_type.lower(),
        })
    return findings
# =============================================================================
# TEXT EXTRACTION
# =============================================================================
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from a PDF byte stream using PyMuPDF (fitz).

    Returns "" when PyMuPDF is not installed or the bytes cannot be parsed,
    so callers can treat extraction as best-effort.
    """
    try:
        import fitz  # PyMuPDF — optional dependency, imported lazily
    except ImportError:
        logger.warning("PyMuPDF not available, returning empty text")
        return ""
    try:
        # Context manager guarantees the document handle is released even if
        # page.get_text() raises part-way through; the original called
        # doc.close() only on the success path and leaked the handle on error.
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""
# =============================================================================
# LLM CLASSIFICATION (optional enhancement)
# =============================================================================
async def classify_with_llm(text: str) -> Optional[tuple[str, float]]:
    """Ask the local Ollama model to classify the document type.

    Returns ``(doc_type, 0.85)`` when the model replies with a known
    category name; returns ``None`` on HTTP errors, unexpected answers, or
    any exception, so the caller can fall back to keyword detection.
    """
    known_categories = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB",
                        "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"}
    prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien:
DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER
Antworte NUR mit dem Kategorienamen, nichts anderes.
Dokumenttext (erste 2000 Zeichen):
{text[:2000]}"""
    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            reply = await http.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": LLM_MODEL,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 20},
                },
            )
            if reply.status_code != 200:
                return None
            candidate = reply.json().get("response", "").strip().upper()
            if candidate in known_categories:
                return candidate, 0.85
    except Exception as e:
        logger.warning(f"LLM classification failed, using keyword fallback: {e}")
    return None
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class DocumentAnalysisResponse(BaseModel):
    """Response payload for POST /v1/import/analyze."""

    document_id: str  # UUID assigned to the stored document
    filename: str  # original upload filename ("unknown" if absent)
    detected_type: str  # a DOCUMENT_TYPE_KEYWORDS key or "OTHER"
    confidence: float  # detection confidence (1.0 when caller supplied the type)
    extracted_entities: list[str]  # regulation/entity keywords found in the text
    recommendations: list[str]  # up to 5 required actions derived from gaps
    gap_analysis: dict  # summary counts plus the individual gap records
class DocumentListResponse(BaseModel):
    """Response payload for GET /v1/import/documents."""

    documents: list[dict]  # serialized rows from compliance_imported_documents
    total: int  # number of documents returned
# =============================================================================
# ROUTES
# =============================================================================
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
    file: UploadFile = File(...),
    document_type: str = Form("OTHER"),
    tenant_id: str = Form("default"),
):
    """Upload and analyze a compliance document.

    Pipeline: extract text (PDF via PyMuPDF, otherwise UTF-8 decode), detect
    the document type (LLM first, keyword fallback) unless the caller already
    supplied one, scan for entities and compliance gaps, then persist the
    results. Persistence is best-effort: DB failures are logged and the
    analysis result is still returned.

    Raises:
        HTTPException(400): when no filename was provided.
    """
    import json  # used for JSONB payloads below; keep local like the original

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    content = await file.read()
    file_size = len(content)

    # --- Text extraction -------------------------------------------------
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        text = extract_text_from_pdf(content)
    else:
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = ""  # binary/unknown format: proceed with empty text

    # --- Document type detection -----------------------------------------
    if document_type == "OTHER" and text:
        # Try LLM first, fall back to keyword matching.
        llm_result = await classify_with_llm(text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(text)
    else:
        detected_type = document_type
        confidence = 1.0

    # --- Entity extraction (lower-case once, not per keyword) ------------
    text_lower = text.lower()
    entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
                       "Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
    entities = [kw for kw in entity_keywords if kw.lower() in text_lower]

    # --- Gap analysis and recommendations --------------------------------
    gaps = analyze_gaps(text, detected_type)
    recommendations = [g["required_action"] for g in gaps[:5]]
    if not recommendations:
        recommendations = ["Dokument erscheint vollstaendig"]

    doc_id = str(uuid.uuid4())
    total_gaps = len(gaps)
    # BUGFIX: build the response payload BEFORE touching the database. The
    # original defined gap_analysis_result inside the try block, so a failure
    # in the first INSERT was swallowed and the final return then raised
    # UnboundLocalError instead of returning the analysis.
    gap_analysis_result = {
        "id": f"analysis-{doc_id[:8]}",
        "total_gaps": total_gaps,
        "critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
        "high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
        "medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
        "low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
        "gaps": gaps,
        "recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
    }

    # --- Persistence (best effort) ----------------------------------------
    db = SessionLocal()
    try:
        db.execute(
            """INSERT INTO compliance_imported_documents
            (id, tenant_id, filename, file_type, file_size, detected_type, detection_confidence,
             extracted_text, extracted_entities, recommendations, status, analyzed_at)
            VALUES (:id, :tenant_id, :filename, :file_type, :file_size, :detected_type, :confidence,
             :text, :entities::jsonb, :recommendations::jsonb, 'analyzed', NOW())""",
            {
                "id": doc_id,
                "tenant_id": tenant_id,
                "filename": file.filename,
                "file_type": file.content_type or "unknown",
                "file_size": file_size,
                "detected_type": detected_type,
                "confidence": confidence,
                "text": text[:50000],  # Limit stored text
                # json.dumps is a proper JSON encoding; the original
                # str(...).replace("'", '"') broke on values containing quotes.
                "entities": json.dumps(entities),
                "recommendations": json.dumps(recommendations),
            },
        )
        if total_gaps > 0:
            db.execute(
                """INSERT INTO compliance_gap_analyses
                (tenant_id, document_id, total_gaps, critical_gaps, high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
                VALUES (:tenant_id, :document_id, :total, :critical, :high, :medium, :low, :gaps::jsonb, :packages::jsonb)""",
                {
                    "tenant_id": tenant_id,
                    "document_id": doc_id,
                    "total": gap_analysis_result["total_gaps"],
                    "critical": gap_analysis_result["critical_gaps"],
                    "high": gap_analysis_result["high_gaps"],
                    "medium": gap_analysis_result["medium_gaps"],
                    "low": gap_analysis_result["low_gaps"],
                    "gaps": json.dumps(gaps),
                    "packages": json.dumps(gap_analysis_result["recommended_packages"]),
                },
            )
        db.commit()
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to persist document analysis: {e}")
    finally:
        db.close()

    return DocumentAnalysisResponse(
        document_id=doc_id,
        filename=file.filename or "unknown",
        detected_type=detected_type,
        confidence=confidence,
        extracted_entities=entities,
        recommendations=recommendations,
        gap_analysis=gap_analysis_result,
    )
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
    """Return all imported documents for *tenant_id*, newest first."""
    session = SessionLocal()
    try:
        rows = session.execute(
            """SELECT id, filename, file_type, file_size, detected_type, detection_confidence,
            extracted_entities, recommendations, status, analyzed_at, created_at
            FROM compliance_imported_documents
            WHERE tenant_id = :tenant_id
            ORDER BY created_at DESC""",
            {"tenant_id": tenant_id},
        ).fetchall()
        # Serialize each row positionally (column order matches the SELECT).
        documents = [
            {
                "id": str(rec[0]),
                "filename": rec[1],
                "file_type": rec[2],
                "file_size": rec[3],
                "detected_type": rec[4],
                "confidence": rec[5],
                "extracted_entities": rec[6] or [],
                "recommendations": rec[7] or [],
                "status": rec[8],
                "analyzed_at": str(rec[9]) if rec[9] else None,
                "created_at": str(rec[10]),
            }
            for rec in rows
        ]
        return DocumentListResponse(documents=documents, total=len(documents))
    finally:
        session.close()