""" FastAPI routes for Document Import and Gap Analysis. Endpoints: - POST /v1/import/analyze: Upload and analyze a compliance document - GET /v1/import/documents: List imported documents for a tenant - GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document """ import logging import os import uuid from typing import Optional import httpx from fastapi import APIRouter, File, Form, UploadFile, HTTPException from pydantic import BaseModel from database import SessionLocal logger = logging.getLogger(__name__) router = APIRouter(prefix="/v1/import", tags=["document-import"]) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3:30b-a3b") # ============================================================================= # DOCUMENT TYPE DETECTION # ============================================================================= DOCUMENT_TYPE_KEYWORDS = { "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"], "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"], "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 
30"], "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"], "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"], "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"], "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"], "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"], } def detect_document_type(text: str) -> tuple[str, float]: """Detect document type from extracted text using keyword matching.""" text_lower = text.lower() scores: dict[str, int] = {} for doc_type, keywords in DOCUMENT_TYPE_KEYWORDS.items(): score = sum(1 for kw in keywords if kw in text_lower) if score > 0: scores[doc_type] = score if not scores: return "OTHER", 0.3 best_type = max(scores, key=scores.get) confidence = min(0.95, 0.5 + scores[best_type] * 0.15) return best_type, confidence # ============================================================================= # GAP ANALYSIS # ============================================================================= GAP_RULES = [ { "category": "AI Act Compliance", "regulation": "EU AI Act Art. 6", "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"], "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"], "severity": "CRITICAL", "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren", }, { "category": "Transparenz", "regulation": "DSGVO Art. 13, 14, 22", "check_keywords": ["automatisiert", "automated", "profiling"], "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"], "severity": "HIGH", "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen", }, { "category": "TOMs", "regulation": "DSGVO Art. 
32", "check_keywords": ["ki", "ai", "cloud", "saas"], "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"], "severity": "MEDIUM", "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern", }, { "category": "VVT", "regulation": "DSGVO Art. 30", "check_keywords": ["verarbeitung", "processing", "daten"], "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"], "severity": "HIGH", "action": "Verarbeitungsverzeichnis aktualisieren", }, { "category": "Menschliche Aufsicht", "regulation": "EU AI Act Art. 14", "check_keywords": ["ki", "ai", "autonom", "autonomous"], "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"], "severity": "MEDIUM", "action": "Prozesse fuer menschliche Aufsicht definieren", }, ] def analyze_gaps(text: str, doc_type: str) -> list[dict]: """Analyze document text for compliance gaps.""" text_lower = text.lower() gaps = [] for rule in GAP_RULES: # Check if rule applies (keywords present in document) applies = any(kw in text_lower for kw in rule["check_keywords"]) if not applies: continue # Check if gap exists (required elements missing) has_gap = not any(kw in text_lower for kw in rule["gap_if_missing"]) if has_gap: gaps.append({ "id": f"gap-{uuid.uuid4().hex[:8]}", "category": rule["category"], "description": f"{rule['category']}: Luecke erkannt", "severity": rule["severity"], "regulation": rule["regulation"], "required_action": rule["action"], "related_step_id": doc_type.lower(), }) return gaps # ============================================================================= # TEXT EXTRACTION # ============================================================================= def extract_text_from_pdf(content: bytes) -> str: """Extract text from PDF using PyMuPDF (fitz).""" try: import fitz doc = fitz.open(stream=content, filetype="pdf") text_parts = [] for page in doc: text_parts.append(page.get_text()) doc.close() return "\n".join(text_parts) except ImportError: 
logger.warning("PyMuPDF not available, returning empty text") return "" except Exception as e: logger.error(f"PDF extraction failed: {e}") return "" # ============================================================================= # LLM CLASSIFICATION (optional enhancement) # ============================================================================= async def classify_with_llm(text: str) -> Optional[tuple[str, float]]: """Use Ollama LLM to classify document type (optional, falls back to keywords).""" try: prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien: DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER Antworte NUR mit dem Kategorienamen, nichts anderes. Dokumenttext (erste 2000 Zeichen): {text[:2000]}""" async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( f"{OLLAMA_URL}/api/generate", json={ "model": LLM_MODEL, "prompt": prompt, "stream": False, "options": {"temperature": 0.1, "num_predict": 20}, }, ) if response.status_code == 200: result = response.json() answer = result.get("response", "").strip().upper() # Validate answer valid_types = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB", "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"} if answer in valid_types: return answer, 0.85 except Exception as e: logger.warning(f"LLM classification failed, using keyword fallback: {e}") return None # ============================================================================= # RESPONSE MODELS # ============================================================================= class DocumentAnalysisResponse(BaseModel): document_id: str filename: str detected_type: str confidence: float extracted_entities: list[str] recommendations: list[str] gap_analysis: dict class DocumentListResponse(BaseModel): documents: list[dict] total: int # ============================================================================= # ROUTES # 
# =============================================================================
# ROUTES
# =============================================================================


@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
    file: UploadFile = File(...),
    document_type: str = Form("OTHER"),
    tenant_id: str = Form("default"),
):
    """Upload and analyze a compliance document.

    Extracts text (PDF via PyMuPDF, everything else UTF-8 decoded),
    detects the document type (LLM first, keyword fallback), runs the
    rule-based gap analysis, and persists the document plus gap analysis.
    Persistence is best-effort: database failures are logged and the
    analysis result is still returned to the caller.
    """
    import json

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # Read file content
    content = await file.read()
    file_size = len(content)

    # Extract text
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        text = extract_text_from_pdf(content)
    else:
        # Try to decode as text
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = ""

    # Detect document type only when the caller did not specify one.
    if document_type == "OTHER" and text:
        # Try LLM first, fallback to keywords
        llm_result = await classify_with_llm(text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(text)
    else:
        detected_type = document_type
        confidence = 1.0

    # Extract key entities (fixed vocabulary, case-insensitive match).
    entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
                       "Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
    text_lower = text.lower()
    entities = [kw for kw in entity_keywords if kw.lower() in text_lower]

    # Analyze gaps
    gaps = analyze_gaps(text, detected_type)

    # Generate recommendations (top 5 gap actions, or an "all clear" note).
    recommendations = [g["required_action"] for g in gaps[:5]]
    if not recommendations:
        recommendations = ["Dokument erscheint vollstaendig"]

    doc_id = str(uuid.uuid4())

    # BUGFIX: build the gap-analysis summary BEFORE the persistence block.
    # It was previously assigned inside the try whose exception is swallowed,
    # so a failing first INSERT caused an UnboundLocalError at the return.
    total_gaps = len(gaps)
    gap_analysis_result = {
        "id": f"analysis-{doc_id[:8]}",
        "total_gaps": total_gaps,
        "critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
        "high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
        "medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
        "low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
        "gaps": gaps,
        "recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
    }

    # Persist to database (best-effort; see docstring).
    db = SessionLocal()
    try:
        db.execute(
            """INSERT INTO compliance_imported_documents
               (id, tenant_id, filename, file_type, file_size, detected_type,
                detection_confidence, extracted_text, extracted_entities,
                recommendations, status, analyzed_at)
               VALUES (:id, :tenant_id, :filename, :file_type, :file_size,
                       :detected_type, :confidence, :text, :entities::jsonb,
                       :recommendations::jsonb, 'analyzed', NOW())""",
            {
                "id": doc_id,
                "tenant_id": tenant_id,
                "filename": file.filename,
                "file_type": file.content_type or "unknown",
                "file_size": file_size,
                "detected_type": detected_type,
                "confidence": confidence,
                "text": text[:50000],  # Limit stored text
                # BUGFIX: proper JSON serialization instead of
                # str(...).replace("'", '"'), which breaks on quotes/escapes.
                "entities": json.dumps(entities),
                "recommendations": json.dumps(recommendations),
            },
        )

        # Save gap analysis (only when gaps were found).
        if total_gaps > 0:
            db.execute(
                """INSERT INTO compliance_gap_analyses
                   (tenant_id, document_id, total_gaps, critical_gaps, high_gaps,
                    medium_gaps, low_gaps, gaps, recommended_packages)
                   VALUES (:tenant_id, :document_id, :total, :critical, :high,
                           :medium, :low, :gaps::jsonb, :packages::jsonb)""",
                {
                    "tenant_id": tenant_id,
                    "document_id": doc_id,
                    "total": gap_analysis_result["total_gaps"],
                    "critical": gap_analysis_result["critical_gaps"],
                    "high": gap_analysis_result["high_gaps"],
                    "medium": gap_analysis_result["medium_gaps"],
                    "low": gap_analysis_result["low_gaps"],
                    "gaps": json.dumps(gaps),
                    "packages": json.dumps(gap_analysis_result["recommended_packages"]),
                },
            )
        db.commit()
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to persist document analysis: {e}")
    finally:
        db.close()

    return DocumentAnalysisResponse(
        document_id=doc_id,
        filename=file.filename or "unknown",
        detected_type=detected_type,
        confidence=confidence,
        extracted_entities=entities,
        recommendations=recommendations,
        gap_analysis=gap_analysis_result,
    )


@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
    """List all imported documents for a tenant, newest first."""
    db = SessionLocal()
    try:
        result = db.execute(
            """SELECT id, filename, file_type, file_size, detected_type,
                      detection_confidence, extracted_entities, recommendations,
                      status, analyzed_at, created_at
               FROM compliance_imported_documents
               WHERE tenant_id = :tenant_id
               ORDER BY created_at DESC""",
            {"tenant_id": tenant_id},
        )
        documents = []
        for row in result.fetchall():
            documents.append({
                "id": str(row[0]),
                "filename": row[1],
                "file_type": row[2],
                "file_size": row[3],
                "detected_type": row[4],
                "confidence": row[5],
                "extracted_entities": row[6] or [],
                "recommendations": row[7] or [],
                "status": row[8],
                "analyzed_at": str(row[9]) if row[9] else None,
                "created_at": str(row[10]),
            })
        return DocumentListResponse(documents=documents, total=len(documents))
    finally:
        db.close()