# breakpilot-pwa/backend/tests/test_compliance_pdf_extractor.py

"""
Tests for Compliance PDF Extractor.

Tests cover:
- BSIPDFExtractor.extract_from_file()
- Aspect categorization
- Requirement level detection (MUSS/SOLL/KANN)
- Text parsing and pattern matching
"""

import pytest
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch, mock_open
import sys

# Fall back to a mock when the ``fitz`` module cannot be imported: a MagicMock
# is registered in ``sys.modules`` under the name ``fitz`` so that any later
# ``import fitz`` resolves to it (presumably the import performed by
# ``compliance.services.pdf_extractor`` below -- confirm against that module).
try:
    import fitz
except ImportError:
    fitz = MagicMock()
    sys.modules['fitz'] = fitz

from compliance.services.pdf_extractor import (
    BSIPDFExtractor,
    BSIAspect,
    RequirementLevel,
    AspectCategory,
)


@pytest.fixture
def extractor():
    """Provide a BSIPDFExtractor constructed while ``fitz`` is mocked.

    The ``fitz`` attribute of ``compliance.services.pdf_extractor`` is replaced
    by a MagicMock only for the duration of the constructor call; the patch is
    reverted before the instance is handed to the test.
    """
    fitz_patch = patch("compliance.services.pdf_extractor.fitz", MagicMock())
    with fitz_patch:
        instance = BSIPDFExtractor()
    return instance


@pytest.fixture
def mock_pdf():
    """Build a one-page stand-in for a PDF document.

    The returned MagicMock reports a length of 1 and answers every item
    lookup with the same page mock, whose ``get_text`` returns a short German
    sample containing a section heading and two ``O.Auth_*`` entries.
    """
    page_text = """
    4.2.1 Authentifizierung

    O.Auth_1: Sichere Passwörter
    Die Anwendung MUSS starke Passwörter erzwingen.
    Passwörter MÜSSEN mindestens 8 Zeichen lang sein.

    O.Auth_2: Multi-Faktor-Authentifizierung
    Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.
    """
    page = MagicMock()
    page.get_text = MagicMock(return_value=page_text)

    document = MagicMock()
    document.__len__ = MagicMock(return_value=1)  # single page
    document.__getitem__ = MagicMock(return_value=page)
    return document


# ============================================================================
# BSIPDFExtractor Tests
# ============================================================================

class TestBSIPDFExtractor:
    """Construction and file-level extraction checks for BSIPDFExtractor."""

    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
    def test_extractor_initialization(self):
        """A freshly constructed extractor exists and has a logger."""
        instance = BSIPDFExtractor()
        assert instance is not None
        assert instance.logger is not None

    def test_extractor_requires_pymupdf(self):
        """With ``fitz`` patched to None, construction raises ImportError.

        The error message is expected to mention "PyMuPDF".
        """
        fitz_missing = patch("compliance.services.pdf_extractor.fitz", None)
        with fitz_missing, pytest.raises(ImportError) as excinfo:
            BSIPDFExtractor()
        assert "PyMuPDF" in str(excinfo.value)

    def test_extract_from_nonexistent_file(self, extractor):
        """A path that does not exist is reported as FileNotFoundError."""
        missing_path = "/nonexistent/file.pdf"
        with pytest.raises(FileNotFoundError):
            extractor.extract_from_file(missing_path)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_basic(self, mock_fitz, extractor, mock_pdf):
        """Extraction from an existing file returns a list."""
        mock_fitz.open = MagicMock(return_value=mock_pdf)

        # Only an (empty) file on disk is created here; ``fitz.open`` in the
        # extractor module is the mock above and hands back ``mock_pdf``.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path)
            # Only the return type is checked, not the extracted content.
            assert isinstance(result, list)
        finally:
            # delete=False above means the file must be removed by hand.
            Path(pdf_path).unlink(missing_ok=True)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_with_source_name(self, mock_fitz, extractor, mock_pdf):
        """A caller-supplied source name is stored on the extracted aspects."""
        mock_fitz.open = MagicMock(return_value=mock_pdf)

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(
                pdf_path,
                source_name="BSI-TR-03161-2",
            )
            # The source name is only verified when something was extracted.
            if result:
                first_aspect = result[0]
                assert first_aspect.source_document == "BSI-TR-03161-2"
        finally:
            Path(pdf_path).unlink(missing_ok=True)


# ============================================================================
# Categorization Tests
# ============================================================================

class TestAspectCategorization:
    """Tests for aspect categorization.

    Covers the extractor's ``CATEGORY_MAP`` (aspect-ID prefix -> category)
    and ``CATEGORY_KEYWORDS`` (category -> keyword collection) tables.
    """

    def test_category_map_authentication(self, extractor):
        """The ``O.Auth`` prefix maps to the authentication category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION

    def test_category_map_cryptography(self, extractor):
        """Both ``O.Cryp`` and ``O.Crypto`` map to the cryptography category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY

    def test_category_map_session_management(self, extractor):
        """The ``O.Sess`` prefix maps to the session management category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT

    def test_category_map_input_validation(self, extractor):
        """The ``O.Input`` prefix maps to the input validation category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION

    def test_category_map_sql_injection(self, extractor):
        """The ``O.SQL`` prefix maps to the SQL injection category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION

    def test_category_map_test_aspect(self, extractor):
        """The ``T.`` prefix maps to the test-aspect category."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("T.") == AspectCategory.TEST_ASPECT

    def test_category_keywords_authentication(self, extractor):
        """Authentication keywords include the expected English/German terms."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
        assert "authentication" in keywords
        assert "login" in keywords
        # Either the English or the German spelling is accepted.
        assert "password" in keywords or "passwort" in keywords
        assert "oauth" in keywords

    def test_category_keywords_cryptography(self, extractor):
        """Cryptography keywords include encryption, TLS and a cipher name."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
        # Either the English or the German spelling is accepted.
        assert "encryption" in keywords or "verschlüsselung" in keywords
        assert "tls" in keywords
        assert "aes" in keywords or "rsa" in keywords

    def test_categorize_by_aspect_id(self, extractor):
        """Aspect IDs resolve to the expected category via their prefix.

        For every sample ID the first ``CATEGORY_MAP`` prefix (in map
        iteration order) that the ID starts with must map to the expected
        category. An ID that matches no prefix at all is a failure rather
        than a silent pass.
        """
        test_cases = [
            ("O.Auth_1", AspectCategory.AUTHENTICATION),
            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
            ("T.Auth_1", AspectCategory.TEST_ASPECT),
        ]

        for aspect_id, expected_category in test_cases:
            # Find the first matching prefix in the category map.
            for prefix, category in extractor.CATEGORY_MAP.items():
                if aspect_id.startswith(prefix):
                    assert category == expected_category, (
                        f"{aspect_id!r} matched prefix {prefix!r} with "
                        f"category {category!r}, expected {expected_category!r}"
                    )
                    break
            else:
                # Without this branch an unmatched ID would skip the assert
                # above and the test would pass without checking anything.
                pytest.fail(
                    f"No CATEGORY_MAP prefix matches aspect ID {aspect_id!r}"
                )


# ============================================================================
# Requirement Level Tests
# ============================================================================

class TestRequirementLevelDetection:
    """Checks for the MUSS / SOLL / KANN / DARF NICHT requirement markers."""

    def test_requirement_level_enum(self):
        """Each RequirementLevel member carries its German keyword as value."""
        expected_values = (
            ("MUSS", "MUSS"),
            ("SOLL", "SOLL"),
            ("KANN", "KANN"),
            ("DARF_NICHT", "DARF NICHT"),
        )
        for member_name, keyword in expected_values:
            assert getattr(RequirementLevel, member_name).value == keyword

    def test_requirement_pattern_muss(self, extractor):
        """The requirement pattern finds MUSS in upper and lower case."""
        import re
        requirement_regex = extractor.PATTERNS["requirement"]

        # Upper-case marker first, then the lower-case spelling.
        samples = (
            "Die Anwendung MUSS sichere Passwörter verwenden.",
            "Das System muss verschlüsselt sein.",
        )
        for sentence in samples:
            found = re.findall(requirement_regex, sentence)
            assert len(found) > 0
            assert found[0].upper() == "MUSS"

        # Note: Pattern does not match conjugations like "müssen"
        # since BSI-TR documents use "MUSS" or "muss" as requirement markers

    def test_requirement_pattern_soll(self, extractor):
        """The requirement pattern finds SOLL in upper and lower case."""
        import re
        requirement_regex = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung SOLL MFA unterstützen.",
            "Das System soll Logging implementieren.",
        )
        for sentence in samples:
            found = re.findall(requirement_regex, sentence)
            assert len(found) > 0
            assert found[0].upper() == "SOLL"

    def test_requirement_pattern_kann(self, extractor):
        """The requirement pattern finds KANN in upper and lower case."""
        import re
        requirement_regex = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung KANN biometrische Auth anbieten.",
            "Das System kann zusätzliche Features haben.",
        )
        for sentence in samples:
            found = re.findall(requirement_regex, sentence)
            assert len(found) > 0
            assert found[0].upper() == "KANN"

    def test_requirement_pattern_darf_nicht(self, extractor):
        """Sentences containing DARF NICHT yield at least one match.

        The search is case-insensitive and only the presence of a match is
        checked, not which marker was captured.
        """
        import re
        requirement_regex = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
            "Das System darf nicht unverschlüsselt kommunizieren.",
        )
        for sentence in samples:
            found = re.findall(requirement_regex, sentence, re.IGNORECASE)
            assert len(found) > 0


# ============================================================================
# Pattern Matching Tests
# ============================================================================

class TestPatternMatching:
    """Tests for the regex patterns exposed through ``extractor.PATTERNS``."""

    def test_aspect_id_pattern(self, extractor):
        """The aspect-ID pattern accepts numbered IDs and rejects the rest."""
        import re
        pattern = extractor.PATTERNS["aspect_id"]

        test_cases = [
            ("O.Auth_1", True),
            ("O.Crypto_23", True),
            ("T.Network_5", True),
            ("O.Session_100", True),
            ("InvalidID", False),
            ("O.Auth", False),  # Missing number
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_pattern(self, extractor):
        """The section pattern accepts dotted numbers and rejects plain words."""
        import re
        pattern = extractor.PATTERNS["section"]

        test_cases = [
            ("4.2.1", True),
            ("1.0", True),
            ("10.5.3", True),
            ("invalid", False),
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                # The negative case is listed above, so it has to be checked
                # too; this mirrors test_aspect_id_pattern.
                assert match is None, f"Pattern should not match: {text}"

    def test_section_aspect_pattern(self, extractor):
        """The section-aspect pattern matches a keyword followed by a section.

        Group 1 of every match must be populated.
        """
        import re
        pattern = extractor.PATTERNS["section_aspect"]

        test_cases = [
            "Prüfaspekt 4.2.1",
            "Pruefaspekt 10.5",
            "Anforderung 3.1.2",
        ]

        for text in test_cases:
            match = re.search(pattern, text)
            assert match is not None, f"Pattern should match: {text}"
            assert match.group(1) is not None  # Should capture section number


# ============================================================================
# BSIAspect Model Tests
# ============================================================================

class TestBSIAspectModel:
    """Checks that BSIAspect stores the values it is constructed with."""

    def test_bsi_aspect_creation(self):
        """An aspect built from the required fields exposes them unchanged."""
        required_fields = {
            "aspect_id": "O.Auth_1",
            "title": "Sichere Passwörter",
            "full_text": "Die Anwendung MUSS starke Passwörter erzwingen.",
            "category": AspectCategory.AUTHENTICATION,
            "page_number": 10,
            "section": "4.2.1",
            "requirement_level": RequirementLevel.MUSS,
            "source_document": "BSI-TR-03161-2",
        }
        aspect = BSIAspect(**required_fields)

        assert aspect.aspect_id == "O.Auth_1"
        assert aspect.title == "Sichere Passwörter"
        assert aspect.category == AspectCategory.AUTHENTICATION
        assert aspect.requirement_level == RequirementLevel.MUSS
        assert aspect.page_number == 10

    def test_bsi_aspect_with_optional_fields(self):
        """Context, related aspects and keywords are kept when supplied."""
        required_fields = {
            "aspect_id": "O.Auth_1",
            "title": "Test",
            "full_text": "Test text",
            "category": AspectCategory.AUTHENTICATION,
            "page_number": 1,
            "section": "1.0",
            "requirement_level": RequirementLevel.MUSS,
            "source_document": "Test",
        }
        optional_fields = {
            "context_before": "Context before",
            "context_after": "Context after",
            "related_aspects": ["O.Auth_2", "O.Auth_3"],
            "keywords": ["password", "authentication"],
        }
        aspect = BSIAspect(**required_fields, **optional_fields)

        assert aspect.context_before == "Context before"
        assert aspect.context_after == "Context after"
        assert len(aspect.related_aspects) == 2
        assert "password" in aspect.keywords


# ============================================================================
# Text Extraction Tests
# ============================================================================

class TestTextExtraction:
    """Checks for text-level extraction and requirement matching."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
        """Text with explicit ``O.Auth_*`` IDs is turned into a list."""
        sample_page = """
        4.2 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter erzwingen.

        O.Auth_2: Multi-Faktor
        Die Anwendung SOLL MFA unterstützen.
        """

        extracted = extractor._extract_aspects_from_text(
            text=sample_page,
            page_num=1,
            source_document="Test",
        )

        # Only the return type is verified here, not the extracted entries.
        assert isinstance(extracted, list)

    def test_extract_multiple_requirement_levels(self, extractor):
        """Four sentences with four different markers give at least 4 matches."""
        import re

        sample_text = """
        Das System MUSS verschlüsselt sein.
        Es SOLL Logging aktivieren.
        Es KANN zusätzliche Features haben.
        Es DARF NICHT Passwörter speichern.
        """

        requirement_regex = extractor.PATTERNS["requirement"]
        found = re.findall(requirement_regex, sample_text, re.IGNORECASE)

        # One hit per requirement level is the minimum.
        assert len(found) >= 4


# ============================================================================
# Integration Tests
# ============================================================================

class TestPDFExtractionIntegration:
    """End-to-end check of file extraction against a mocked two-page PDF."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_complete_extraction_workflow(self, mock_fitz, extractor):
        """Extraction returns a list and closes the document exactly once."""
        auth_page_text = """
        4.2.1 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.
        """
        session_page_text = """
        4.2.2 Session Management

        O.Sess_1: Session Timeout
        Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.
        """

        page1 = MagicMock()
        page1.get_text = MagicMock(return_value=auth_page_text)
        page2 = MagicMock()
        page2.get_text = MagicMock(return_value=session_page_text)

        # Two-page document: successive item lookups yield page1, then page2.
        document = MagicMock()
        document.__len__ = MagicMock(return_value=2)
        document.__getitem__ = MagicMock(side_effect=[page1, page2])
        mock_fitz.open = MagicMock(return_value=document)

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(
                pdf_path,
                source_name="BSI-TR-03161-2",
            )

            # Extraction produced a list ...
            assert isinstance(result, list)

            # ... and the document handle was closed exactly once.
            document.close.assert_called_once()
        finally:
            # delete=False above means the file must be removed by hand.
            Path(pdf_path).unlink(missing_ok=True)


if __name__ == "__main__":
    # Direct execution: hand this file to pytest with verbose output.
    pytest_args = [__file__, "-v"]
    pytest.main(pytest_args)