# breakpilot-compliance/backend-compliance/tests/test_import_routes.py

"""Tests for Document Import routes (import_routes.py)."""

import pytest
from unittest.mock import MagicMock, patch, AsyncMock

from compliance.api.import_routes import (
    detect_document_type,
    analyze_gaps,
    extract_text_from_pdf,
)


class TestDetectDocumentType:
    """Exercise ``detect_document_type`` on short, keyword-bearing texts.

    Every test unpacks the detector's result as a ``(doc_type, confidence)``
    pair and checks the label, the confidence, or both.
    """

    def test_dsfa_detection(self):
        """A text mentioning a DSFA is labelled ``DSFA`` with confidence >= 0.5."""
        sample = "Dies ist eine Datenschutz-Folgenabschaetzung (DSFA) nach Art. 35 DSGVO"
        label, score = detect_document_type(sample)
        assert label == "DSFA"
        assert score >= 0.5

    def test_tom_detection(self):
        """A text mentioning TOMs is labelled ``TOM`` with confidence >= 0.5."""
        sample = "Technisch-organisatorische Massnahmen (TOM) zum Schutz personenbezogener Daten"
        label, score = detect_document_type(sample)
        assert label == "TOM"
        assert score >= 0.5

    def test_vvt_detection(self):
        """A processing-register text is labelled ``VVT`` with confidence >= 0.5."""
        sample = "Verarbeitungsverzeichnis nach Art. 30 DSGVO - VVT processing activities"
        label, score = detect_document_type(sample)
        assert label == "VVT"
        assert score >= 0.5

    def test_privacy_policy_detection(self):
        """A privacy-policy text is labelled ``PRIVACY_POLICY`` with confidence >= 0.5."""
        sample = "Datenschutzerklaerung - Privacy Policy fuer unsere Nutzer"
        label, score = detect_document_type(sample)
        assert label == "PRIVACY_POLICY"
        assert score >= 0.5

    def test_unknown_document(self):
        """Filler text is labelled ``OTHER`` with a confidence of exactly 0.3."""
        sample = "Lorem ipsum dolor sit amet"
        label, score = detect_document_type(sample)
        assert label == "OTHER"
        assert score == 0.3

    def test_empty_text(self):
        """The empty string is labelled ``OTHER`` with a confidence of exactly 0.3."""
        label, score = detect_document_type("")
        assert label == "OTHER"
        assert score == 0.3

    def test_confidence_increases_with_more_keywords(self):
        """A text with several DSFA terms scores higher than one with a single term."""
        one_term = "dsfa"
        many_terms = "dsfa dpia datenschutz-folgenabschaetzung privacy impact"
        _, score_one = detect_document_type(one_term)
        _, score_many = detect_document_type(many_terms)
        assert score_many > score_one

    def test_confidence_capped_at_095(self):
        """Confidence never exceeds 0.95, even for a keyword-heavy text."""
        sample = "dsfa dpia datenschutz-folgenabschaetzung privacy impact assessment report analysis"
        _, score = detect_document_type(sample)
        assert score <= 0.95


class TestAnalyzeGaps:
    """Check which gap entries ``analyze_gaps`` reports for sample texts."""

    def test_ai_gap_detected(self):
        """AI wording without a risk classification yields a CRITICAL AI Act gap."""
        found = analyze_gaps("Wir setzen KI und AI in unserer Anwendung ein", "OTHER")
        # Keep only the entries filed under the AI Act category.
        ai_related = [entry for entry in found if entry["category"] == "AI Act Compliance"]
        assert ai_related
        assert ai_related[0]["severity"] == "CRITICAL"

    def test_no_gap_when_requirement_present(self):
        """No AI Act gap is reported when the text names a risk classification."""
        found = analyze_gaps("KI-System mit Risikoklassifizierung nach EU AI Act", "OTHER")
        ai_related = [entry for entry in found if entry["category"] == "AI Act Compliance"]
        assert not ai_related

    def test_tom_gap_detected(self):
        """Cloud/SaaS/KI wording produces at least one ``TOMs`` gap."""
        found = analyze_gaps("Cloud-basiertes SaaS-System mit KI-Funktionen", "OTHER")
        tom_related = [entry for entry in found if entry["category"] == "TOMs"]
        assert tom_related

    def test_no_gaps_for_irrelevant_text(self):
        """A text without any trigger wording produces no gaps at all."""
        found = analyze_gaps("Ein einfacher Flyer ohne Relevanz", "OTHER")
        assert len(found) == 0

    def test_gap_has_required_fields(self):
        """Every reported gap carries the core descriptive fields."""
        found = analyze_gaps("KI-System mit automatisierten Entscheidungen", "OTHER")
        assert len(found) > 0
        expected_fields = ("id", "category", "severity", "regulation", "required_action")
        for entry in found:
            for field in expected_fields:
                assert field in entry


class TestExtractTextFromPdf:
    """Tests for PDF text extraction via ``extract_text_from_pdf``."""

    def test_empty_bytes_returns_empty(self):
        """Empty input yields an empty string."""
        result = extract_text_from_pdf(b"")
        assert result == ""

    def test_invalid_pdf_returns_empty(self):
        """Bytes that are not a PDF yield an empty string."""
        result = extract_text_from_pdf(b"not a pdf")
        assert result == ""

    def test_fitz_import_error(self):
        """With ``fitz`` unimportable, extraction must not raise and returns a ``str``.

        Only the return type is asserted here; the exact value is not pinned.
        """
        import sys

        # A ``None`` entry in ``sys.modules`` makes any ``import fitz`` raise
        # ImportError. ``patch.dict`` restores the mapping on exit, and it
        # distinguishes "key absent" from "key present as None", which a
        # manual ``sys.modules.get`` save/restore cannot do.
        with patch.dict(sys.modules, {"fitz": None}):
            result = extract_text_from_pdf(b"fake pdf content")

        assert isinstance(result, str)


# =============================================================================
# Additional tests — extended coverage
# =============================================================================

class TestDetectDocumentTypeExtended:
    """Edge cases and further document types for ``detect_document_type``."""

    def test_agb_detection(self):
        """Terms-and-conditions wording is labelled ``AGB`` with confidence >= 0.5."""
        sample = "Allgemeine Geschaeftsbedingungen (AGB) fuer die Nutzung unserer Plattform"
        label, score = detect_document_type(sample)
        assert label == "AGB"
        assert score >= 0.5

    def test_cookie_policy_detection(self):
        """Cookie-policy wording is labelled ``COOKIE_POLICY`` with confidence >= 0.5."""
        sample = "Cookie-Richtlinie: Wir setzen Tracking und Einwilligung nach DSGVO ein"
        label, score = detect_document_type(sample)
        assert label == "COOKIE_POLICY"
        assert score >= 0.5

    def test_risk_assessment_detection(self):
        """Risk-assessment wording is labelled ``RISK_ASSESSMENT`` with confidence >= 0.5."""
        sample = "Risikobewertung und Risikoanalyse fuer Cloud-Services"
        label, score = detect_document_type(sample)
        assert label == "RISK_ASSESSMENT"
        assert score >= 0.5

    def test_audit_report_detection(self):
        """Audit-report wording is labelled ``AUDIT_REPORT`` with confidence >= 0.5."""
        sample = "Audit-Pruefbericht nach ISO 27001 Zertifizierung"
        label, score = detect_document_type(sample)
        assert label == "AUDIT_REPORT"
        assert score >= 0.5

    def test_case_insensitive_matching(self):
        """An all-uppercase DSFA text is still labelled ``DSFA``."""
        sample = "DATENSCHUTZ-FOLGENABSCHAETZUNG NACH DSGVO"
        label, _ = detect_document_type(sample)
        assert label == "DSFA"

    def test_returns_tuple(self):
        """The detector returns a two-element tuple."""
        outcome = detect_document_type("some text")
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2

    def test_confidence_is_float(self):
        """The confidence component is a ``float``."""
        _, score = detect_document_type("some text")
        assert isinstance(score, float)

    def test_confidence_minimum_is_03(self):
        """Empty input yields a confidence of exactly 0.3."""
        _, score = detect_document_type("")
        assert score == 0.3

    def test_confidence_maximum_is_095(self):
        """Repeating the DSFA terms many times never pushes confidence past 0.95."""
        dsfa_terms = ["dsfa", "dpia", "datenschutz-folgenabschaetzung", "privacy impact"]
        # Repeat the term list five times to saturate the text with keywords.
        sample = " ".join(dsfa_terms * 5)
        _, score = detect_document_type(sample)
        assert score <= 0.95

    def test_winning_type_has_most_keywords(self):
        """Several TOM terms next to a single ``dsfa`` mention resolve to ``TOM``."""
        sample = "technisch-organisatorische massnahmen tom technical measures dsfa"
        label, _ = detect_document_type(sample)
        assert label == "TOM"

    def test_whitespace_only_text(self):
        """Whitespace-only input is labelled ``OTHER`` with a confidence of exactly 0.3."""
        label, score = detect_document_type("   \n\t  ")
        assert label == "OTHER"
        assert score == 0.3

    def test_numbers_only_text(self):
        """Digits-only input is labelled ``OTHER``."""
        label, _ = detect_document_type("12345 67890")
        assert label == "OTHER"


class TestAnalyzeGapsExtended:
    """Extended tests for gap analysis logic.

    Tests that inspect individual gap entries first assert that the result is
    non-empty, so an unexpectedly empty list fails instead of passing
    vacuously.
    """

    def test_vvt_gap_detected(self):
        """Text about processing personal data yields at least one ``VVT`` gap."""
        text = "Verarbeitung personenbezogener Daten in unserer Plattform"
        gaps = analyze_gaps(text, "OTHER")
        vvt_gaps = [g for g in gaps if g["category"] == "VVT"]
        assert len(vvt_gaps) > 0

    def test_human_oversight_gap_detected(self):
        """AI text without oversight wording yields a ``Menschliche Aufsicht`` gap."""
        text = "KI-System mit autonomen Entscheidungen ohne menschliche Kontrolle"
        gaps = analyze_gaps(text, "OTHER")
        oversight_gaps = [g for g in gaps if g["category"] == "Menschliche Aufsicht"]
        assert len(oversight_gaps) > 0

    def test_no_oversight_gap_when_present(self):
        """No oversight gap is reported when human oversight is mentioned."""
        text = "KI-System mit menschlicher Aufsicht und human-in-the-loop Prozessen"
        gaps = analyze_gaps(text, "OTHER")
        oversight_gaps = [g for g in gaps if g["category"] == "Menschliche Aufsicht"]
        assert len(oversight_gaps) == 0

    def test_transparenz_gap_detected(self):
        """Automated decisions and profiling yield a ``Transparenz`` gap."""
        text = "Wir setzen automatisierte Entscheidungen und Profiling ein"
        gaps = analyze_gaps(text, "OTHER")
        transp_gaps = [g for g in gaps if g["category"] == "Transparenz"]
        assert len(transp_gaps) > 0

    def test_gap_id_is_unique(self):
        """All gap ids within one analysis result are distinct."""
        text = "KI-System mit Verarbeitung und automatisierten Entscheidungen ai cloud"
        gaps = analyze_gaps(text, "OTHER")
        ids = [g["id"] for g in gaps]
        assert len(ids) == len(set(ids))

    def test_gap_id_starts_with_gap(self):
        """The first reported gap id carries the ``gap-`` prefix."""
        text = "KI-Anwendung mit machine learning"
        gaps = analyze_gaps(text, "OTHER")
        # Fail loudly on an empty result instead of silently skipping the check.
        assert gaps, "expected at least one gap for AI-related text"
        assert gaps[0]["id"].startswith("gap-")

    def test_related_step_id_matches_doc_type(self):
        """For doc type ``DSFA`` the first gap's ``related_step_id`` is ``dsfa``."""
        text = "KI-Anwendung mit machine learning"
        gaps = analyze_gaps(text, "DSFA")
        # Fail loudly on an empty result instead of silently skipping the check.
        assert gaps, "expected at least one gap for AI-related text"
        assert gaps[0]["related_step_id"] == "dsfa"

    def test_severity_values_are_valid(self):
        """Every reported severity is one of the four known levels."""
        text = "KI-System mit cloud ai saas automatisierten Entscheidungen profiling"
        gaps = analyze_gaps(text, "OTHER")
        valid_severities = {"CRITICAL", "HIGH", "MEDIUM", "LOW"}
        # An empty list would make the loop below a no-op.
        assert gaps, "expected at least one gap to validate severities against"
        for gap in gaps:
            assert gap["severity"] in valid_severities

    def test_returns_list(self):
        """The analysis result is a ``list``, even for empty input."""
        result = analyze_gaps("", "OTHER")
        assert isinstance(result, list)

    def test_all_gap_fields_present(self):
        """Every reported gap exposes the full set of expected keys."""
        text = "KI ki ai machine learning"
        gaps = analyze_gaps(text, "TOM")
        required_fields = {"id", "category", "description", "severity", "regulation", "required_action", "related_step_id"}
        # An empty list would make the loop below a no-op.
        assert gaps, "expected at least one gap to validate fields against"
        for gap in gaps:
            assert required_fields.issubset(gap.keys())

    def test_no_false_positives_for_empty_text(self):
        """Empty text produces exactly an empty list."""
        gaps = analyze_gaps("", "VVT")
        assert gaps == []

    def test_multiple_gaps_can_be_detected(self):
        """A text touching several topics yields at least two gaps."""
        # Text that triggers multiple rules
        text = "ki ai cloud verarbeitung daten automatisiert profiling"
        gaps = analyze_gaps(text, "OTHER")
        assert len(gaps) >= 2


class TestDocumentTypeKeywords:
    """Sanity checks on the ``DOCUMENT_TYPE_KEYWORDS`` mapping."""

    def test_keywords_dict_not_empty(self):
        """The mapping contains at least one document type."""
        from compliance.api.import_routes import DOCUMENT_TYPE_KEYWORDS as keyword_table
        assert len(keyword_table) > 0

    def test_all_types_have_keywords(self):
        """Each document type maps to a non-empty keyword collection."""
        from compliance.api.import_routes import DOCUMENT_TYPE_KEYWORDS as keyword_table
        for type_name, terms in keyword_table.items():
            assert len(terms) > 0, f"{type_name} has no keywords"

    def test_dsfa_in_keywords(self):
        """``DSFA`` is one of the known document types."""
        from compliance.api.import_routes import DOCUMENT_TYPE_KEYWORDS as keyword_table
        assert "DSFA" in keyword_table

    def test_tom_in_keywords(self):
        """``TOM`` is one of the known document types."""
        from compliance.api.import_routes import DOCUMENT_TYPE_KEYWORDS as keyword_table
        assert "TOM" in keyword_table


class TestGapRules:
    """Structural checks on the ``GAP_RULES`` collection."""

    def test_gap_rules_not_empty(self):
        """At least one gap rule is defined."""
        from compliance.api.import_routes import GAP_RULES as rules
        assert len(rules) > 0

    def test_each_rule_has_required_keys(self):
        """Every rule provides all keys the gap analysis relies on."""
        from compliance.api.import_routes import GAP_RULES as rules
        required = {"category", "regulation", "check_keywords", "gap_if_missing", "severity", "action"}
        for rule in rules:
            # Any key left over after the difference is missing from the rule.
            absent = required.difference(rule.keys())
            assert not absent

    def test_check_keywords_are_lowercase(self):
        """Trigger keywords are stored in lowercase."""
        from compliance.api.import_routes import GAP_RULES as rules
        trigger_terms = (term for rule in rules for term in rule["check_keywords"])
        for kw in trigger_terms:
            assert kw.lower() == kw, f"Keyword '{kw}' is not lowercase"

    def test_gap_if_missing_are_lowercase(self):
        """Requirement keywords are stored in lowercase."""
        from compliance.api.import_routes import GAP_RULES as rules
        requirement_terms = (term for rule in rules for term in rule["gap_if_missing"])
        for kw in requirement_terms:
            assert kw.lower() == kw, f"Keyword '{kw}' is not lowercase"