""" FastAPI routes for Document Import and Gap Analysis. Endpoints: - POST /v1/import/analyze: Upload and analyze a compliance document - GET /v1/import/documents: List imported documents for a tenant - GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document """ import logging import os import uuid from typing import Optional import httpx from fastapi import APIRouter, File, Form, Header, UploadFile, HTTPException from pydantic import BaseModel from sqlalchemy import text from database import SessionLocal logger = logging.getLogger(__name__) router = APIRouter(prefix="/v1/import", tags=["document-import"]) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3.5:35b-a3b") # ============================================================================= # DOCUMENT TYPE DETECTION # ============================================================================= DOCUMENT_TYPE_KEYWORDS = { "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"], "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"], "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 
30"], "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"], "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"], "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"], "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"], "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"], } def detect_document_type(text: str) -> tuple[str, float]: """Detect document type from extracted text using keyword matching.""" text_lower = text.lower() scores: dict[str, int] = {} for doc_type, keywords in DOCUMENT_TYPE_KEYWORDS.items(): score = sum(1 for kw in keywords if kw in text_lower) if score > 0: scores[doc_type] = score if not scores: return "OTHER", 0.3 best_type = max(scores, key=scores.get) confidence = min(0.95, 0.5 + scores[best_type] * 0.15) return best_type, confidence # ============================================================================= # GAP ANALYSIS # ============================================================================= GAP_RULES = [ { "category": "AI Act Compliance", "regulation": "EU AI Act Art. 6", "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"], "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"], "severity": "CRITICAL", "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren", }, { "category": "Transparenz", "regulation": "DSGVO Art. 13, 14, 22", "check_keywords": ["automatisiert", "automated", "profiling"], "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"], "severity": "HIGH", "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen", }, { "category": "TOMs", "regulation": "DSGVO Art. 
32", "check_keywords": ["ki", "ai", "cloud", "saas"], "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"], "severity": "MEDIUM", "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern", }, { "category": "VVT", "regulation": "DSGVO Art. 30", "check_keywords": ["verarbeitung", "processing", "daten"], "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"], "severity": "HIGH", "action": "Verarbeitungsverzeichnis aktualisieren", }, { "category": "Menschliche Aufsicht", "regulation": "EU AI Act Art. 14", "check_keywords": ["ki", "ai", "autonom", "autonomous"], "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"], "severity": "MEDIUM", "action": "Prozesse fuer menschliche Aufsicht definieren", }, ] def analyze_gaps(text: str, doc_type: str) -> list[dict]: """Analyze document text for compliance gaps.""" text_lower = text.lower() gaps = [] for rule in GAP_RULES: # Check if rule applies (keywords present in document) applies = any(kw in text_lower for kw in rule["check_keywords"]) if not applies: continue # Check if gap exists (required elements missing) has_gap = not any(kw in text_lower for kw in rule["gap_if_missing"]) if has_gap: gaps.append({ "id": f"gap-{uuid.uuid4().hex[:8]}", "category": rule["category"], "description": f"{rule['category']}: Luecke erkannt", "severity": rule["severity"], "regulation": rule["regulation"], "required_action": rule["action"], "related_step_id": doc_type.lower(), }) return gaps # ============================================================================= # TEXT EXTRACTION # ============================================================================= def extract_text_from_pdf(content: bytes) -> str: """Extract text from PDF using PyMuPDF (fitz).""" try: import fitz doc = fitz.open(stream=content, filetype="pdf") text_parts = [] for page in doc: text_parts.append(page.get_text()) doc.close() return "\n".join(text_parts) except ImportError: 
logger.warning("PyMuPDF not available, returning empty text") return "" except Exception as e: logger.error(f"PDF extraction failed: {e}") return "" # ============================================================================= # LLM CLASSIFICATION (optional enhancement) # ============================================================================= async def classify_with_llm(text: str) -> Optional[tuple[str, float]]: """Use Ollama LLM to classify document type (optional, falls back to keywords).""" try: prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien: DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER Antworte NUR mit dem Kategorienamen, nichts anderes. Dokumenttext (erste 2000 Zeichen): {text[:2000]}""" async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( f"{OLLAMA_URL}/api/generate", json={ "model": LLM_MODEL, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 20}, }, ) if response.status_code == 200: result = response.json() answer = result.get("response", "").strip().upper() # Validate answer valid_types = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB", "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"} if answer in valid_types: return answer, 0.85 except Exception as e: logger.warning(f"LLM classification failed, using keyword fallback: {e}") return None # ============================================================================= # RESPONSE MODELS # ============================================================================= class DocumentAnalysisResponse(BaseModel): document_id: str filename: str detected_type: str confidence: float extracted_entities: list[str] recommendations: list[str] gap_analysis: dict class DocumentListResponse(BaseModel): documents: list[dict] total: int # ============================================================================= # ROUTES # 
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
    file: UploadFile = File(...),
    document_type: str = Form("OTHER"),
    tenant_id: str = Form("default"),
):
    """Upload and analyze a compliance document.

    Extracts text (PDF via PyMuPDF, otherwise UTF-8 decode), detects the
    document type (LLM first, keyword fallback), runs the rule-based gap
    analysis, persists document + gap analysis, and returns the summary.

    Raises:
        HTTPException 400 when no filename is supplied.
    """
    import json

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    content = await file.read()
    file_size = len(content)

    # Extract text. NOTE: the local name must NOT be `text` -- that would
    # shadow the sqlalchemy `text` construct used for the SQL below (the
    # previous code shadowed it, so every INSERT raised "'str' object is
    # not callable" and persistence silently failed inside the except).
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        doc_text = extract_text_from_pdf(content)
    else:
        try:
            doc_text = content.decode("utf-8")
        except UnicodeDecodeError:
            doc_text = ""

    # Detect document type only when the caller did not specify one.
    if document_type == "OTHER" and doc_text:
        llm_result = await classify_with_llm(doc_text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(doc_text)
    else:
        detected_type = document_type
        confidence = 1.0

    # Extract key regulatory entities by case-insensitive keyword matching.
    entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
                       "Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
    doc_text_lower = doc_text.lower()
    entities = [kw for kw in entity_keywords if kw.lower() in doc_text_lower]

    gaps = analyze_gaps(doc_text, detected_type)

    # Top-5 gap actions become the recommendations; a clean document gets a
    # single "looks complete" message.
    recommendations = [g["required_action"] for g in gaps[:5]]
    if not recommendations:
        recommendations = ["Dokument erscheint vollstaendig"]

    doc_id = str(uuid.uuid4())

    # Build gap_analysis_result before the DB write so it is always
    # available for the response, even if persistence fails.
    total_gaps = len(gaps)
    gap_analysis_result = {
        "id": f"analysis-{doc_id[:8]}",
        "total_gaps": total_gaps,
        "critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
        "high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
        "medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
        "low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
        "gaps": gaps,
        "recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
    }

    db = SessionLocal()
    try:
        db.execute(
            text("""INSERT INTO compliance_imported_documents
                (id, tenant_id, filename, file_type, file_size, detected_type,
                 detection_confidence, extracted_text, extracted_entities,
                 recommendations, status, analyzed_at)
                VALUES (:id, :tenant_id, :filename, :file_type, :file_size,
                        :detected_type, :confidence, :text, :entities::jsonb,
                        :recommendations::jsonb, 'analyzed', NOW())"""),
            {
                "id": doc_id,
                "tenant_id": tenant_id,
                "filename": file.filename,
                "file_type": file.content_type or "unknown",
                "file_size": file_size,
                "detected_type": detected_type,
                "confidence": confidence,
                "text": doc_text[:50000],  # Limit stored text
                # json.dumps instead of str().replace("'", '"'): the old
                # approach produced invalid JSON for any value containing
                # an apostrophe or embedded quote.
                "entities": json.dumps(entities),
                "recommendations": json.dumps(recommendations),
            },
        )
        if total_gaps > 0:
            db.execute(
                text("""INSERT INTO compliance_gap_analyses
                    (tenant_id, document_id, total_gaps, critical_gaps,
                     high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
                    VALUES (:tenant_id, :document_id, :total, :critical, :high,
                            :medium, :low, :gaps::jsonb, :packages::jsonb)"""),
                {
                    "tenant_id": tenant_id,
                    "document_id": doc_id,
                    "total": gap_analysis_result["total_gaps"],
                    "critical": gap_analysis_result["critical_gaps"],
                    "high": gap_analysis_result["high_gaps"],
                    "medium": gap_analysis_result["medium_gaps"],
                    "low": gap_analysis_result["low_gaps"],
                    "gaps": json.dumps(gaps),
                    "packages": json.dumps(gap_analysis_result["recommended_packages"]),
                },
            )
        db.commit()
    except Exception as e:
        # Best-effort persistence: analysis is still returned to the caller.
        db.rollback()
        logger.error(f"Failed to persist document analysis: {e}")
    finally:
        db.close()

    return DocumentAnalysisResponse(
        document_id=doc_id,
        filename=file.filename or "unknown",
        detected_type=detected_type,
        confidence=confidence,
        extracted_entities=entities,
        recommendations=recommendations,
        gap_analysis=gap_analysis_result,
    )


@router.get("/gap-analysis/{document_id}")
async def get_gap_analysis(
    document_id: str,
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Get gap analysis for a specific document.

    Tenant resolution: X-Tenant-ID header wins over the query parameter.

    Raises:
        HTTPException 404 when no analysis exists for this document/tenant.
    """
    tid = x_tenant_id or tenant_id
    db = SessionLocal()
    try:
        result = db.execute(
            text("SELECT * FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": tid},
        ).fetchone()
        if not result:
            raise HTTPException(status_code=404, detail="Gap analysis not found")
        # Row objects are not directly dict()-convertible on SQLAlchemy 1.4+;
        # go through ._mapping (NOTE(review): assumes SQLAlchemy >= 1.4 --
        # confirm against project requirements).
        return dict(result._mapping)
    finally:
        db.close()


@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
    """List all imported documents for a tenant, newest first."""
    db = SessionLocal()
    try:
        result = db.execute(
            text("""SELECT id, filename, file_type, file_size, detected_type,
                detection_confidence, extracted_entities, recommendations,
                status, analyzed_at, created_at
                FROM compliance_imported_documents
                WHERE tenant_id = :tenant_id
                ORDER BY created_at DESC"""),
            {"tenant_id": tenant_id},
        )
        rows = result.fetchall()
        documents = []
        # Positional indices match the SELECT column order above.
        for row in rows:
            documents.append({
                "id": str(row[0]),
                "filename": row[1],
                "file_type": row[2],
                "file_size": row[3],
                "detected_type": row[4],
                "confidence": row[5],
                "extracted_entities": row[6] or [],
                "recommendations": row[7] or [],
                "status": row[8],
                "analyzed_at": str(row[9]) if row[9] else None,
                "created_at": str(row[10]),
            })
        return DocumentListResponse(documents=documents, total=len(documents))
    finally:
        db.close()


@router.get("", response_model=DocumentListResponse)
async def list_documents_root(
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Alias: GET /v1/import → list documents (proxy-compatible URL)."""
    tid = x_tenant_id or tenant_id
    return await list_documents(tenant_id=tid)


@router.delete("/{document_id}")
async def delete_document(
    document_id: str,
    tenant_id: str = "default",
    x_tenant_id: Optional[str] = Header(None, alias="X-Tenant-ID"),
):
    """Delete an imported document and its gap analysis.

    Raises:
        HTTPException 404 when the document does not exist for this tenant,
        500 on any database failure (rolled back).
    """
    tid = x_tenant_id or tenant_id
    db = SessionLocal()
    try:
        # Delete gap analysis first (FK dependency).
        db.execute(
            text("DELETE FROM compliance_gap_analyses WHERE document_id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": tid},
        )
        result = db.execute(
            text("DELETE FROM compliance_imported_documents WHERE id = :doc_id AND tenant_id = :tid"),
            {"doc_id": document_id, "tid": tid},
        )
        db.commit()
        if result.rowcount == 0:
            raise HTTPException(status_code=404, detail="Document not found")
        return {"success": True, "deleted_id": document_id}
    except HTTPException:
        # Re-raise the 404 untouched; only unexpected errors become 500s.
        raise
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to delete document {document_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to delete document")
    finally:
        db.close()