# breakpilot-compliance/backend-compliance/tests/test_import_routes.py

"""Tests for Document Import routes (import_routes.py)."""

import pytest
from unittest.mock import MagicMock, patch, AsyncMock

from compliance.api.import_routes import (
    detect_document_type,
    analyze_gaps,
    extract_text_from_pdf,
)


class TestDetectDocumentType:
    """Tests for keyword-based document type detection.

    Every test unpacks the result of ``detect_document_type`` as a
    ``(doc_type, confidence)`` pair and checks the type label and/or a
    bound on the confidence value.
    """

    # Confidence these tests expect when the text is classified as "OTHER".
    FALLBACK_CONFIDENCE = 0.3

    @staticmethod
    def _assert_detected(text, expected_type):
        """Assert *text* is classified as *expected_type* with confidence >= 0.5."""
        doc_type, confidence = detect_document_type(text)
        assert doc_type == expected_type
        assert confidence >= 0.5

    def _assert_fallback(self, text):
        """Assert *text* falls back to "OTHER" with the fallback confidence."""
        doc_type, confidence = detect_document_type(text)
        assert doc_type == "OTHER"
        # Compare with a tolerance: exact float equality is brittle if the
        # score is ever produced by arithmetic rather than a literal.
        assert confidence == pytest.approx(self.FALLBACK_CONFIDENCE)

    def test_dsfa_detection(self):
        """A text naming the DSFA is classified as "DSFA"."""
        self._assert_detected(
            "Dies ist eine Datenschutz-Folgenabschaetzung (DSFA) nach Art. 35 DSGVO",
            "DSFA",
        )

    def test_tom_detection(self):
        """A text naming technical-organisational measures is classified as "TOM"."""
        self._assert_detected(
            "Technisch-organisatorische Massnahmen (TOM) zum Schutz personenbezogener Daten",
            "TOM",
        )

    def test_vvt_detection(self):
        """A text naming the record of processing activities is classified as "VVT"."""
        self._assert_detected(
            "Verarbeitungsverzeichnis nach Art. 30 DSGVO - VVT processing activities",
            "VVT",
        )

    def test_privacy_policy_detection(self):
        """A privacy-policy text is classified as "PRIVACY_POLICY"."""
        self._assert_detected(
            "Datenschutzerklaerung - Privacy Policy fuer unsere Nutzer",
            "PRIVACY_POLICY",
        )

    def test_unknown_document(self):
        """Text without any expected keyword falls back to "OTHER"."""
        self._assert_fallback("Lorem ipsum dolor sit amet")

    def test_empty_text(self):
        """An empty string falls back to "OTHER"."""
        self._assert_fallback("")

    def test_confidence_increases_with_more_keywords(self):
        """More matching keywords must yield a strictly higher confidence."""
        _, conf_single = detect_document_type("dsfa")
        _, conf_multi = detect_document_type(
            "dsfa dpia datenschutz-folgenabschaetzung privacy impact"
        )
        assert conf_multi > conf_single

    def test_confidence_capped_at_095(self):
        """Confidence never exceeds 0.95, even for keyword-dense text."""
        _, confidence = detect_document_type(
            "dsfa dpia datenschutz-folgenabschaetzung privacy impact assessment report analysis"
        )
        assert confidence <= 0.95


class TestAnalyzeGaps:
    """Tests for gap analysis rules.

    Every test calls ``analyze_gaps`` with the document type "OTHER" and
    inspects the returned gap entries.
    """

    @staticmethod
    def _gaps_in_category(text, category):
        """Analyze *text* as type "OTHER" and keep only the *category* entries."""
        return [
            entry
            for entry in analyze_gaps(text, "OTHER")
            if entry["category"] == category
        ]

    def test_ai_gap_detected(self):
        """AI usage without a risk classification yields a CRITICAL AI Act gap."""
        matches = self._gaps_in_category(
            "Wir setzen KI und AI in unserer Anwendung ein",
            "AI Act Compliance",
        )
        assert len(matches) > 0
        assert matches[0]["severity"] == "CRITICAL"

    def test_no_gap_when_requirement_present(self):
        """No AI Act gap is reported when the text names a risk classification."""
        matches = self._gaps_in_category(
            "KI-System mit Risikoklassifizierung nach EU AI Act",
            "AI Act Compliance",
        )
        assert len(matches) == 0

    def test_tom_gap_detected(self):
        """A cloud/SaaS description yields at least one gap in the "TOMs" category."""
        matches = self._gaps_in_category(
            "Cloud-basiertes SaaS-System mit KI-Funktionen",
            "TOMs",
        )
        assert len(matches) > 0

    def test_no_gaps_for_irrelevant_text(self):
        """Text unrelated to data processing produces no gaps at all."""
        found = analyze_gaps("Ein einfacher Flyer ohne Datenbezug", "OTHER")
        assert len(found) == 0

    def test_gap_has_required_fields(self):
        """Every reported gap carries the full set of expected keys."""
        expected_keys = ("id", "category", "severity", "regulation", "required_action")
        found = analyze_gaps("KI-System mit automatisierten Entscheidungen", "OTHER")
        assert len(found) > 0
        for entry in found:
            for key in expected_keys:
                assert key in entry


class TestExtractTextFromPdf:
    """Tests for PDF text extraction."""

    def test_empty_bytes_returns_empty(self):
        """An empty byte string yields an empty text result."""
        assert extract_text_from_pdf(b"") == ""

    def test_invalid_pdf_returns_empty(self):
        """Bytes that are not a PDF yield an empty text result."""
        assert extract_text_from_pdf(b"not a pdf") == ""

    def test_fitz_import_error(self):
        """When fitz is not available, a string is still returned."""
        with patch("compliance.api.import_routes.fitz") as fake_fitz:
            fake_fitz.open.side_effect = ImportError("No module")
            # The function under test is expected to handle the ImportError
            # itself rather than let it propagate.
            extracted = extract_text_from_pdf(b"test")
        # NOTE(review): with fitz replaced by a mock the exact failure path may
        # differ from a genuinely missing module, so only the return type is
        # checked here, not its content.
        assert isinstance(extracted, str)