Files
breakpilot-compliance/backend-compliance/compliance/api/import_routes.py
Benjamin Admin e6d666b89b
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 37s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 22s
CI / test-python-dsms-gateway (push) Successful in 18s
feat: Vorbereitung-Module auf 100% — Persistenz, Backend-Services, UCCA Frontend
Phase A: PostgreSQL State Store (sdk_states Tabelle, InMemory-Fallback)
Phase B: Modules dynamisch vom Backend, Scope DB-Persistenz, Source Policy State
Phase C: UCCA Frontend (3 Seiten, Wizard, RiskScoreGauge), Obligations Live-Daten
Phase D: Document Import (PDF/LLM/Gap-Analyse), System Screening (SBOM/OSV.dev)
Phase E: Company Profile CRUD mit Audit-Logging
Phase F: Tests (Python + TypeScript), flow-data.ts DB-Tabellen aktualisiert

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 11:04:31 +01:00

381 lines
14 KiB
Python

"""
FastAPI routes for Document Import and Gap Analysis.
Endpoints:
- POST /v1/import/analyze: Upload and analyze a compliance document
- GET /v1/import/documents: List imported documents for a tenant
- GET /v1/import/gap-analysis/{document_id}: Get gap analysis for a document
"""
import logging
import os
import uuid
from typing import Optional
import httpx
from fastapi import APIRouter, File, Form, UploadFile, HTTPException
from pydantic import BaseModel
from database import SessionLocal
# Module-level logger (named after this module's import path).
logger = logging.getLogger(__name__)
# All endpoints below are mounted under /v1/import and grouped in OpenAPI docs.
router = APIRouter(prefix="/v1/import", tags=["document-import"])
# Ollama endpoint for the optional LLM classifier; the default targets an
# Ollama instance on the Docker host. Overridable via environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model name passed to Ollama's /api/generate endpoint.
LLM_MODEL = os.getenv("COMPLIANCE_LLM_MODEL", "qwen3:30b-a3b")
# =============================================================================
# DOCUMENT TYPE DETECTION
# =============================================================================
# Keyword lists per document category; matching is case-insensitive substring.
DOCUMENT_TYPE_KEYWORDS = {
    "DSFA": ["datenschutz-folgenabschaetzung", "dsfa", "dpia", "privacy impact"],
    "TOM": ["technisch-organisatorische", "tom", "massnahmen", "technical measures"],
    "VVT": ["verarbeitungsverzeichnis", "vvt", "processing activities", "art. 30"],
    "PRIVACY_POLICY": ["datenschutzerklaerung", "privacy policy", "datenschutzhinweis"],
    "AGB": ["allgemeine geschaeftsbedingungen", "agb", "terms and conditions"],
    "COOKIE_POLICY": ["cookie", "tracking", "einwilligung"],
    "RISK_ASSESSMENT": ["risikobewertung", "risk assessment", "risikoanalyse"],
    "AUDIT_REPORT": ["audit", "pruefbericht", "zertifizierung"],
}


def detect_document_type(text: str) -> tuple[str, float]:
    """Classify a document by counting keyword hits per category.

    Returns a ``(doc_type, confidence)`` pair. Confidence grows with the
    number of matched keywords (0.5 base + 0.15 per hit, capped at 0.95);
    documents matching nothing are reported as ``("OTHER", 0.3)``.
    """
    haystack = text.lower()
    # Keep only categories with at least one keyword hit.
    hits = {
        category: n
        for category, keywords in DOCUMENT_TYPE_KEYWORDS.items()
        if (n := sum(kw in haystack for kw in keywords)) > 0
    }
    if not hits:
        return "OTHER", 0.3
    winner, count = max(hits.items(), key=lambda item: item[1])
    return winner, min(0.95, 0.5 + count * 0.15)
# =============================================================================
# GAP ANALYSIS
# =============================================================================
# Each rule fires only for relevant documents (check_keywords present) and
# reports a gap when none of its required elements (gap_if_missing) appear.
GAP_RULES = [
    {
        "category": "AI Act Compliance",
        "regulation": "EU AI Act Art. 6",
        "check_keywords": ["ki", "ai", "kuenstliche intelligenz", "machine learning"],
        "gap_if_missing": ["risikoklassifizierung", "risk classification", "risikokategorie"],
        "severity": "CRITICAL",
        "action": "Risikoklassifizierung fuer KI-Systeme durchfuehren",
    },
    {
        "category": "Transparenz",
        "regulation": "DSGVO Art. 13, 14, 22",
        "check_keywords": ["automatisiert", "automated", "profiling"],
        "gap_if_missing": ["informationspflicht", "information obligation", "transparenz"],
        "severity": "HIGH",
        "action": "Informationspflichten bei automatisierten Entscheidungen ergaenzen",
    },
    {
        "category": "TOMs",
        "regulation": "DSGVO Art. 32",
        "check_keywords": ["ki", "ai", "cloud", "saas"],
        "gap_if_missing": ["technische massnahmen", "verschluesselung", "encryption"],
        "severity": "MEDIUM",
        "action": "Technisch-organisatorische Massnahmen um KI-Aspekte erweitern",
    },
    {
        "category": "VVT",
        "regulation": "DSGVO Art. 30",
        "check_keywords": ["verarbeitung", "processing", "daten"],
        "gap_if_missing": ["verarbeitungsverzeichnis", "vvt", "processing activities"],
        "severity": "HIGH",
        "action": "Verarbeitungsverzeichnis aktualisieren",
    },
    {
        "category": "Menschliche Aufsicht",
        "regulation": "EU AI Act Art. 14",
        "check_keywords": ["ki", "ai", "autonom", "autonomous"],
        "gap_if_missing": ["menschliche aufsicht", "human oversight", "human-in-the-loop"],
        "severity": "MEDIUM",
        "action": "Prozesse fuer menschliche Aufsicht definieren",
    },
]


def analyze_gaps(text: str, doc_type: str) -> list[dict]:
    """Run the rule-based compliance gap scan over *text*.

    A rule contributes a gap record only when the document is in scope
    (any ``check_keywords`` substring present, case-insensitive) AND none
    of its ``gap_if_missing`` elements are found. Gap ids are random.
    """
    lowered = text.lower()
    findings: list[dict] = []
    for rule in GAP_RULES:
        if not any(kw in lowered for kw in rule["check_keywords"]):
            continue  # rule does not apply to this document
        if any(kw in lowered for kw in rule["gap_if_missing"]):
            continue  # required element is present — no gap
        findings.append({
            "id": f"gap-{uuid.uuid4().hex[:8]}",
            "category": rule["category"],
            "description": f"{rule['category']}: Luecke erkannt",
            "severity": rule["severity"],
            "regulation": rule["regulation"],
            "required_action": rule["action"],
            "related_step_id": doc_type.lower(),
        })
    return findings
# =============================================================================
# TEXT EXTRACTION
# =============================================================================
def extract_text_from_pdf(content: bytes) -> str:
    """Extract plain text from a PDF byte stream using PyMuPDF (fitz).

    Returns "" when PyMuPDF is not installed or the bytes cannot be parsed,
    so callers can treat extraction as best-effort.
    """
    try:
        import fitz  # PyMuPDF — optional dependency, imported lazily
    except ImportError:
        logger.warning("PyMuPDF not available, returning empty text")
        return ""
    try:
        # Context manager guarantees the document handle is released even if
        # page.get_text() raises part-way through; the original called
        # doc.close() only on the success path and leaked the handle on error.
        with fitz.open(stream=content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""
# =============================================================================
# LLM CLASSIFICATION (optional enhancement)
# =============================================================================
async def classify_with_llm(text: str) -> Optional[tuple[str, float]]:
    """Ask the local Ollama model to classify the document type.

    Returns ``(doc_type, 0.85)`` when the model replies with a known
    category name; returns ``None`` on HTTP errors, unexpected answers, or
    any exception, so the caller can fall back to keyword detection.
    """
    known_categories = {"DSFA", "TOM", "VVT", "PRIVACY_POLICY", "AGB",
                        "COOKIE_POLICY", "RISK_ASSESSMENT", "AUDIT_REPORT", "OTHER"}
    prompt = f"""Klassifiziere das folgende Dokument in eine dieser Kategorien:
DSFA, TOM, VVT, PRIVACY_POLICY, AGB, COOKIE_POLICY, RISK_ASSESSMENT, AUDIT_REPORT, OTHER
Antworte NUR mit dem Kategorienamen, nichts anderes.
Dokumenttext (erste 2000 Zeichen):
{text[:2000]}"""
    try:
        async with httpx.AsyncClient(timeout=30.0) as http:
            reply = await http.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": LLM_MODEL,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.1, "num_predict": 20},
                },
            )
            if reply.status_code != 200:
                return None
            candidate = reply.json().get("response", "").strip().upper()
            if candidate in known_categories:
                return candidate, 0.85
    except Exception as e:
        logger.warning(f"LLM classification failed, using keyword fallback: {e}")
    return None
# =============================================================================
# RESPONSE MODELS
# =============================================================================
class DocumentAnalysisResponse(BaseModel):
    """Response payload for POST /v1/import/analyze."""

    document_id: str  # UUID assigned to the stored document
    filename: str  # original upload filename ("unknown" if absent)
    detected_type: str  # a DOCUMENT_TYPE_KEYWORDS key or "OTHER"
    confidence: float  # detection confidence (1.0 when caller supplied the type)
    extracted_entities: list[str]  # regulation/entity keywords found in the text
    recommendations: list[str]  # up to 5 required actions derived from gaps
    gap_analysis: dict  # summary counts plus the individual gap records
class DocumentListResponse(BaseModel):
    """Response payload for GET /v1/import/documents."""

    documents: list[dict]  # serialized rows from compliance_imported_documents
    total: int  # number of documents returned
# =============================================================================
# ROUTES
# =============================================================================
@router.post("/analyze", response_model=DocumentAnalysisResponse)
async def analyze_document(
    file: UploadFile = File(...),
    document_type: str = Form("OTHER"),
    tenant_id: str = Form("default"),
):
    """Upload and analyze a compliance document.

    Pipeline: extract text (PDF via PyMuPDF, otherwise UTF-8 decode), detect
    the document type (LLM first, keyword fallback) unless the caller already
    supplied one, scan for entities and compliance gaps, then persist the
    results. Persistence is best-effort: DB failures are logged and the
    analysis result is still returned.

    Raises:
        HTTPException(400): when no filename was provided.
    """
    import json  # used for JSONB payloads below; keep local like the original

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    content = await file.read()
    file_size = len(content)

    # --- Text extraction -------------------------------------------------
    if file.content_type == "application/pdf" or (file.filename and file.filename.endswith(".pdf")):
        text = extract_text_from_pdf(content)
    else:
        try:
            text = content.decode("utf-8")
        except UnicodeDecodeError:
            text = ""  # binary/unknown format: proceed with empty text

    # --- Document type detection -----------------------------------------
    if document_type == "OTHER" and text:
        # Try LLM first, fall back to keyword matching.
        llm_result = await classify_with_llm(text)
        if llm_result:
            detected_type, confidence = llm_result
        else:
            detected_type, confidence = detect_document_type(text)
    else:
        detected_type = document_type
        confidence = 1.0

    # --- Entity extraction (lower-case once, not per keyword) ------------
    text_lower = text.lower()
    entity_keywords = ["DSGVO", "AI Act", "ISO 27001", "NIS2", "BDSG",
                       "Personenbezogene Daten", "Auftragsverarbeitung", "DSFA"]
    entities = [kw for kw in entity_keywords if kw.lower() in text_lower]

    # --- Gap analysis and recommendations --------------------------------
    gaps = analyze_gaps(text, detected_type)
    recommendations = [g["required_action"] for g in gaps[:5]]
    if not recommendations:
        recommendations = ["Dokument erscheint vollstaendig"]

    doc_id = str(uuid.uuid4())
    total_gaps = len(gaps)
    # BUGFIX: build the response payload BEFORE touching the database. The
    # original defined gap_analysis_result inside the try block, so a failure
    # in the first INSERT was swallowed and the final return then raised
    # UnboundLocalError instead of returning the analysis.
    gap_analysis_result = {
        "id": f"analysis-{doc_id[:8]}",
        "total_gaps": total_gaps,
        "critical_gaps": len([g for g in gaps if g["severity"] == "CRITICAL"]),
        "high_gaps": len([g for g in gaps if g["severity"] == "HIGH"]),
        "medium_gaps": len([g for g in gaps if g["severity"] == "MEDIUM"]),
        "low_gaps": len([g for g in gaps if g["severity"] == "LOW"]),
        "gaps": gaps,
        "recommended_packages": ["analyse", "dokumentation"] if total_gaps > 0 else [],
    }

    # --- Persistence (best effort) ----------------------------------------
    db = SessionLocal()
    try:
        db.execute(
            """INSERT INTO compliance_imported_documents
            (id, tenant_id, filename, file_type, file_size, detected_type, detection_confidence,
             extracted_text, extracted_entities, recommendations, status, analyzed_at)
            VALUES (:id, :tenant_id, :filename, :file_type, :file_size, :detected_type, :confidence,
             :text, :entities::jsonb, :recommendations::jsonb, 'analyzed', NOW())""",
            {
                "id": doc_id,
                "tenant_id": tenant_id,
                "filename": file.filename,
                "file_type": file.content_type or "unknown",
                "file_size": file_size,
                "detected_type": detected_type,
                "confidence": confidence,
                "text": text[:50000],  # Limit stored text
                # json.dumps is a proper JSON encoding; the original
                # str(...).replace("'", '"') broke on values containing quotes.
                "entities": json.dumps(entities),
                "recommendations": json.dumps(recommendations),
            },
        )
        if total_gaps > 0:
            db.execute(
                """INSERT INTO compliance_gap_analyses
                (tenant_id, document_id, total_gaps, critical_gaps, high_gaps, medium_gaps, low_gaps, gaps, recommended_packages)
                VALUES (:tenant_id, :document_id, :total, :critical, :high, :medium, :low, :gaps::jsonb, :packages::jsonb)""",
                {
                    "tenant_id": tenant_id,
                    "document_id": doc_id,
                    "total": gap_analysis_result["total_gaps"],
                    "critical": gap_analysis_result["critical_gaps"],
                    "high": gap_analysis_result["high_gaps"],
                    "medium": gap_analysis_result["medium_gaps"],
                    "low": gap_analysis_result["low_gaps"],
                    "gaps": json.dumps(gaps),
                    "packages": json.dumps(gap_analysis_result["recommended_packages"]),
                },
            )
        db.commit()
    except Exception as e:
        db.rollback()
        logger.error(f"Failed to persist document analysis: {e}")
    finally:
        db.close()

    return DocumentAnalysisResponse(
        document_id=doc_id,
        filename=file.filename or "unknown",
        detected_type=detected_type,
        confidence=confidence,
        extracted_entities=entities,
        recommendations=recommendations,
        gap_analysis=gap_analysis_result,
    )
@router.get("/documents", response_model=DocumentListResponse)
async def list_documents(tenant_id: str = "default"):
    """Return all imported documents for *tenant_id*, newest first."""
    session = SessionLocal()
    try:
        rows = session.execute(
            """SELECT id, filename, file_type, file_size, detected_type, detection_confidence,
            extracted_entities, recommendations, status, analyzed_at, created_at
            FROM compliance_imported_documents
            WHERE tenant_id = :tenant_id
            ORDER BY created_at DESC""",
            {"tenant_id": tenant_id},
        ).fetchall()
        # Serialize each row positionally (column order matches the SELECT).
        documents = [
            {
                "id": str(rec[0]),
                "filename": rec[1],
                "file_type": rec[2],
                "file_size": rec[3],
                "detected_type": rec[4],
                "confidence": rec[5],
                "extracted_entities": rec[6] or [],
                "recommendations": rec[7] or [],
                "status": rec[8],
                "analyzed_at": str(rec[9]) if rec[9] else None,
                "created_at": str(rec[10]),
            }
            for rec in rows
        ]
        return DocumentListResponse(documents=documents, total=len(documents))
    finally:
        session.close()