"""
Tests for Compliance PDF Extractor.

Tests cover:
- BSIPDFExtractor.extract_from_file()
- Aspect categorization
- Requirement level detection (MUSS/SOLL/KANN)
- Text parsing and pattern matching
"""

import re
import sys
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, mock_open, patch

import pytest

# Mock fitz if not available, so importing the extractor module below does
# not fail on machines without PyMuPDF installed.
try:
    import fitz
except ImportError:
    fitz = MagicMock()
    sys.modules['fitz'] = fitz

from compliance.services.pdf_extractor import (
    BSIPDFExtractor,
    BSIAspect,
    RequirementLevel,
    AspectCategory,
)


@pytest.fixture
def extractor():
    """Create a BSIPDFExtractor instance.

    ``fitz`` is replaced by a mock only while the instance is constructed;
    tests that call into ``fitz`` afterwards patch it themselves.
    """
    with patch("compliance.services.pdf_extractor.fitz", MagicMock()):
        return BSIPDFExtractor()


@pytest.fixture
def mock_pdf():
    """Create a mock single-page PDF document.

    The document reports a length of 1 and returns the same mock page for
    every index; the page's ``get_text`` yields a short German sample with
    two ``O.Auth_*`` aspects.
    """
    mock_doc = MagicMock()
    mock_doc.__len__ = MagicMock(return_value=1)  # 1 page

    mock_page = MagicMock()
    mock_page.get_text = MagicMock(return_value="""
    4.2.1 Authentifizierung

    O.Auth_1: Sichere Passwörter
    Die Anwendung MUSS starke Passwörter erzwingen.
    Passwörter MÜSSEN mindestens 8 Zeichen lang sein.

    O.Auth_2: Multi-Faktor-Authentifizierung
    Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.
    """)

    mock_doc.__getitem__ = MagicMock(return_value=mock_page)
    return mock_doc


@pytest.fixture
def empty_pdf_path():
    """Yield the path (as ``str``) of an empty temporary ``.pdf`` file.

    The file only has to exist on disk; its content is never read because
    ``fitz.open`` is mocked in the tests that use this fixture. The file is
    removed again after the test, whether it passed or failed.
    """
    # delete=False so the file survives closing the handle and can be
    # reopened by path inside the test.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        pdf_path = Path(tmp.name)
    yield str(pdf_path)
    pdf_path.unlink(missing_ok=True)


# ============================================================================
# BSIPDFExtractor Tests
# ============================================================================

class TestBSIPDFExtractor:
    """Tests for BSIPDFExtractor."""

    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
    def test_extractor_initialization(self):
        """Test that extractor can be initialized."""
        extractor = BSIPDFExtractor()
        assert extractor is not None
        assert extractor.logger is not None

    def test_extractor_requires_pymupdf(self):
        """Test that extractor raises error if PyMuPDF not available."""
        with patch("compliance.services.pdf_extractor.fitz", None):
            with pytest.raises(ImportError) as excinfo:
                BSIPDFExtractor()
            assert "PyMuPDF" in str(excinfo.value)

    def test_extract_from_nonexistent_file(self, extractor):
        """Test extraction from non-existent file raises error."""
        with pytest.raises(FileNotFoundError):
            extractor.extract_from_file("/nonexistent/file.pdf")

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_basic(
        self, mock_fitz, extractor, mock_pdf, empty_pdf_path
    ):
        """Test basic PDF extraction."""
        mock_fitz.open = MagicMock(return_value=mock_pdf)

        aspects = extractor.extract_from_file(empty_pdf_path)

        # Should extract aspects from the mock PDF
        assert isinstance(aspects, list)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_with_source_name(
        self, mock_fitz, extractor, mock_pdf, empty_pdf_path
    ):
        """Test extraction with custom source name."""
        mock_fitz.open = MagicMock(return_value=mock_pdf)

        aspects = extractor.extract_from_file(
            empty_pdf_path, source_name="BSI-TR-03161-2"
        )

        # Should use provided source name
        if aspects:
            assert aspects[0].source_document == "BSI-TR-03161-2"


# ============================================================================
# Categorization Tests
# ============================================================================

class TestAspectCategorization:
    """Tests for aspect categorization."""

    def test_category_map_authentication(self, extractor):
        """Test authentication category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION

    def test_category_map_cryptography(self, extractor):
        """Test cryptography category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY

    def test_category_map_session_management(self, extractor):
        """Test session management category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT

    def test_category_map_input_validation(self, extractor):
        """Test input validation category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION

    def test_category_map_sql_injection(self, extractor):
        """Test SQL injection category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION

    def test_category_map_test_aspect(self, extractor):
        """Test that T.* aspects are categorized as test aspects."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("T.") == AspectCategory.TEST_ASPECT

    def test_category_keywords_authentication(self, extractor):
        """Test authentication keywords are present."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
        assert "authentication" in keywords
        assert "login" in keywords
        assert "password" in keywords or "passwort" in keywords
        assert "oauth" in keywords

    def test_category_keywords_cryptography(self, extractor):
        """Test cryptography keywords are present."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
        assert "encryption" in keywords or "verschlüsselung" in keywords
        assert "tls" in keywords
        assert "aes" in keywords or "rsa" in keywords

    def test_categorize_by_aspect_id(self, extractor):
        """Test categorization based on aspect ID prefix.

        For each sample ID, the first ``CATEGORY_MAP`` prefix the ID starts
        with must map to the expected category. An ID that matches no prefix
        at all is a failure rather than a silent pass.
        """
        # Test various aspect ID patterns
        test_cases = [
            ("O.Auth_1", AspectCategory.AUTHENTICATION),
            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
            ("T.Auth_1", AspectCategory.TEST_ASPECT),
        ]

        for aspect_id, expected_category in test_cases:
            # Find matching prefix in category map
            for prefix, category in extractor.CATEGORY_MAP.items():
                if aspect_id.startswith(prefix):
                    assert category == expected_category, (
                        f"{aspect_id} matched prefix {prefix!r}"
                    )
                    break
            else:
                # Without this branch an unmatched ID would never be checked.
                pytest.fail(f"No CATEGORY_MAP prefix matches: {aspect_id}")


# ============================================================================
# Requirement Level Tests
# ============================================================================

class TestRequirementLevelDetection:
    """Tests for requirement level detection (MUSS/SOLL/KANN)."""

    def test_requirement_level_enum(self):
        """Test RequirementLevel enum values."""
        assert RequirementLevel.MUSS.value == "MUSS"
        assert RequirementLevel.SOLL.value == "SOLL"
        assert RequirementLevel.KANN.value == "KANN"
        assert RequirementLevel.DARF_NICHT.value == "DARF NICHT"

    def test_requirement_pattern_muss(self, extractor):
        """Test MUSS pattern detection."""
        pattern = extractor.PATTERNS["requirement"]

        # Test uppercase MUSS
        text_upper = "Die Anwendung MUSS sichere Passwörter verwenden."
        matches = re.findall(pattern, text_upper)
        assert len(matches) > 0
        assert matches[0].upper() == "MUSS"

        # Test lowercase muss
        text_lower = "Das System muss verschlüsselt sein."
        matches = re.findall(pattern, text_lower)
        assert len(matches) > 0
        assert matches[0].upper() == "MUSS"

        # Note: Pattern does not match conjugations like "müssen"
        # since BSI-TR documents use "MUSS" or "muss" as requirement markers

    def test_requirement_pattern_soll(self, extractor):
        """Test SOLL pattern detection."""
        pattern = extractor.PATTERNS["requirement"]

        test_texts = [
            "Die Anwendung SOLL MFA unterstützen.",
            "Das System soll Logging implementieren.",
        ]

        for text in test_texts:
            matches = re.findall(pattern, text)
            assert len(matches) > 0
            assert matches[0].upper() == "SOLL"

    def test_requirement_pattern_kann(self, extractor):
        """Test KANN pattern detection."""
        pattern = extractor.PATTERNS["requirement"]

        test_texts = [
            "Die Anwendung KANN biometrische Auth anbieten.",
            "Das System kann zusätzliche Features haben.",
        ]

        for text in test_texts:
            matches = re.findall(pattern, text)
            assert len(matches) > 0
            assert matches[0].upper() == "KANN"

    def test_requirement_pattern_darf_nicht(self, extractor):
        """Test DARF NICHT pattern detection."""
        pattern = extractor.PATTERNS["requirement"]

        test_texts = [
            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
            "Das System darf nicht unverschlüsselt kommunizieren.",
        ]

        for text in test_texts:
            matches = re.findall(pattern, text, re.IGNORECASE)
            assert len(matches) > 0


# ============================================================================
# Pattern Matching Tests
# ============================================================================

class TestPatternMatching:
    """Tests for regex pattern matching."""

    def test_aspect_id_pattern(self, extractor):
        """Test aspect ID pattern matching."""
        pattern = extractor.PATTERNS["aspect_id"]

        test_cases = [
            ("O.Auth_1", True),
            ("O.Crypto_23", True),
            ("T.Network_5", True),
            ("O.Session_100", True),
            ("InvalidID", False),
            ("O.Auth", False),  # Missing number
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_pattern(self, extractor):
        """Test section number pattern matching."""
        pattern = extractor.PATTERNS["section"]

        test_cases = [
            ("4.2.1", True),
            ("1.0", True),
            ("10.5.3", True),
            ("invalid", False),
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                # Negative cases must be checked too, as in
                # test_aspect_id_pattern.
                assert match is None, f"Pattern should not match: {text}"

    def test_section_aspect_pattern(self, extractor):
        """Test section-based aspect pattern."""
        pattern = extractor.PATTERNS["section_aspect"]

        test_cases = [
            "Prüfaspekt 4.2.1",
            "Pruefaspekt 10.5",
            "Anforderung 3.1.2",
        ]

        for text in test_cases:
            match = re.search(pattern, text)
            assert match is not None, f"Pattern should match: {text}"
            assert match.group(1) is not None  # Should capture section number


# ============================================================================
# BSIAspect Model Tests
# ============================================================================

class TestBSIAspectModel:
    """Tests for BSIAspect data model."""

    def test_bsi_aspect_creation(self):
        """Test creating a BSIAspect instance."""
        aspect = BSIAspect(
            aspect_id="O.Auth_1",
            title="Sichere Passwörter",
            full_text="Die Anwendung MUSS starke Passwörter erzwingen.",
            category=AspectCategory.AUTHENTICATION,
            page_number=10,
            section="4.2.1",
            requirement_level=RequirementLevel.MUSS,
            source_document="BSI-TR-03161-2",
        )

        assert aspect.aspect_id == "O.Auth_1"
        assert aspect.title == "Sichere Passwörter"
        assert aspect.category == AspectCategory.AUTHENTICATION
        assert aspect.requirement_level == RequirementLevel.MUSS
        assert aspect.page_number == 10

    def test_bsi_aspect_with_optional_fields(self):
        """Test BSIAspect with optional fields."""
        aspect = BSIAspect(
            aspect_id="O.Auth_1",
            title="Test",
            full_text="Test text",
            category=AspectCategory.AUTHENTICATION,
            page_number=1,
            section="1.0",
            requirement_level=RequirementLevel.MUSS,
            source_document="Test",
            context_before="Context before",
            context_after="Context after",
            related_aspects=["O.Auth_2", "O.Auth_3"],
            keywords=["password", "authentication"],
        )

        assert aspect.context_before == "Context before"
        assert aspect.context_after == "Context after"
        assert len(aspect.related_aspects) == 2
        assert "password" in aspect.keywords


# ============================================================================
# Text Extraction Tests
# ============================================================================

class TestTextExtraction:
    """Tests for text extraction logic."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
        """Test extracting aspects that have explicit IDs."""
        text = """
        4.2 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter erzwingen.

        O.Auth_2: Multi-Faktor
        Die Anwendung SOLL MFA unterstützen.
        """

        # Extract aspects from text
        aspects = extractor._extract_aspects_from_text(
            text=text,
            page_num=1,
            source_document="Test"
        )

        # Should find at least the aspects
        assert isinstance(aspects, list)

    def test_extract_multiple_requirement_levels(self, extractor):
        """Test extracting text with multiple requirement levels."""
        text = """
        Das System MUSS verschlüsselt sein.
        Es SOLL Logging aktivieren.
        Es KANN zusätzliche Features haben.
        Es DARF NICHT Passwörter speichern.
        """

        pattern = extractor.PATTERNS["requirement"]
        matches = re.findall(pattern, text, re.IGNORECASE)

        # Should find all 4 requirement levels
        assert len(matches) >= 4


# ============================================================================
# Integration Tests
# ============================================================================

class TestPDFExtractionIntegration:
    """Integration tests for complete PDF extraction workflow."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_complete_extraction_workflow(
        self, mock_fitz, extractor, empty_pdf_path
    ):
        """Test complete extraction from PDF to aspects."""
        # Create mock PDF with realistic content
        mock_doc = MagicMock()
        mock_doc.__len__ = MagicMock(return_value=2)  # 2 pages

        page1 = MagicMock()
        page1.get_text = MagicMock(return_value="""
        4.2.1 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.
        """)

        page2 = MagicMock()
        page2.get_text = MagicMock(return_value="""
        4.2.2 Session Management

        O.Sess_1: Session Timeout
        Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.
        """)

        # side_effect hands out the pages in order, one per index access.
        mock_doc.__getitem__ = MagicMock(side_effect=[page1, page2])
        mock_fitz.open = MagicMock(return_value=mock_doc)

        aspects = extractor.extract_from_file(
            empty_pdf_path, source_name="BSI-TR-03161-2"
        )

        # Verify extraction worked
        assert isinstance(aspects, list)

        # PDF was closed
        mock_doc.close.assert_called_once()


if __name__ == "__main__":
    pytest.main([__file__, "-v"])