fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits, losing 3400+ files across admin-v2, backend, studio-v2, website, klausur-service, and many other services. The partial restore attempt (660295e2) only recovered some files.

This commit restores all missing files from the pre-rebase ref 98933f5e while preserving post-rebase additions (night-scheduler, night-mode UI, NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions
--- a/backend/tests/test_compliance_pdf_extractor.py
+++ b/backend/tests/test_compliance_pdf_extractor.py
@@ -0,0 +1,476 @@
+"""
+Tests for Compliance PDF Extractor.
+
+Tests cover:
+- BSIPDFExtractor.extract_from_file()
+- Aspect categorization
+- Requirement level detection (MUSS/SOLL/KANN)
+- Text parsing and pattern matching
+"""
+
+import pytest
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch, mock_open
+import sys
+
+# Fall back to a MagicMock stand-in when the ``fitz`` module cannot be
+# imported. The stand-in is registered in ``sys.modules`` before the
+# ``compliance.services.pdf_extractor`` import below, so any ``import fitz``
+# executed after this point resolves to the stub instead of failing.
+# NOTE(review): presumably pdf_extractor imports fitz at module level - confirm.
+try:
+    import fitz
+except ImportError:
+    fitz = MagicMock()
+    sys.modules['fitz'] = fitz
+
+from compliance.services.pdf_extractor import (
+    BSIPDFExtractor,
+    BSIAspect,
+    RequirementLevel,
+    AspectCategory,
+)
+
+
+@pytest.fixture
+def extractor():
+    """Create a BSIPDFExtractor instance."""
+    with patch("compliance.services.pdf_extractor.fitz", MagicMock()):
+        return BSIPDFExtractor()
+
+
+@pytest.fixture
+def mock_pdf():
+    """Create a mock PDF document."""
+    mock_doc = MagicMock()
+    mock_doc.__len__ = MagicMock(return_value=1)  # 1 page
+    mock_page = MagicMock()
+    mock_page.get_text = MagicMock(return_value="""
+    4.2.1 Authentifizierung
+
+    O.Auth_1: Sichere Passwörter
+    Die Anwendung MUSS starke Passwörter erzwingen.
+    Passwörter MÜSSEN mindestens 8 Zeichen lang sein.
+
+    O.Auth_2: Multi-Faktor-Authentifizierung
+    Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.
+    """)
+    mock_doc.__getitem__ = MagicMock(return_value=mock_page)
+    return mock_doc
+
+
+# ============================================================================
+# BSIPDFExtractor Tests
+# ============================================================================
+
+class TestBSIPDFExtractor:
+    """Tests for BSIPDFExtractor."""
+
+    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
+    def test_extractor_initialization(self):
+        """Test that extractor can be initialized."""
+        extractor = BSIPDFExtractor()
+        assert extractor is not None
+        assert extractor.logger is not None
+
+    def test_extractor_requires_pymupdf(self):
+        """Test that extractor raises error if PyMuPDF not available."""
+        with patch("compliance.services.pdf_extractor.fitz", None):
+            with pytest.raises(ImportError) as excinfo:
+                BSIPDFExtractor()
+            assert "PyMuPDF" in str(excinfo.value)
+
+    def test_extract_from_nonexistent_file(self, extractor):
+        """Test extraction from non-existent file raises error."""
+        with pytest.raises(FileNotFoundError):
+            extractor.extract_from_file("/nonexistent/file.pdf")
+
+    @patch("compliance.services.pdf_extractor.fitz")
+    def test_extract_from_file_basic(self, mock_fitz, extractor, mock_pdf):
+        """Test basic PDF extraction."""
+        mock_fitz.open = MagicMock(return_value=mock_pdf)
+
+        # Create a temporary PDF file
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            aspects = extractor.extract_from_file(tmp_path)
+            assert isinstance(aspects, list)
+            # Should extract aspects from the mock PDF
+        finally:
+            Path(tmp_path).unlink(missing_ok=True)
+
+    @patch("compliance.services.pdf_extractor.fitz")
+    def test_extract_from_file_with_source_name(self, mock_fitz, extractor, mock_pdf):
+        """Test extraction with custom source name."""
+        mock_fitz.open = MagicMock(return_value=mock_pdf)
+
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            aspects = extractor.extract_from_file(tmp_path, source_name="BSI-TR-03161-2")
+            # Should use provided source name
+            if aspects:
+                assert aspects[0].source_document == "BSI-TR-03161-2"
+        finally:
+            Path(tmp_path).unlink(missing_ok=True)
+
+
+# ============================================================================
+# Categorization Tests
+# ============================================================================
+
+class TestAspectCategorization:
+    """Tests for aspect categorization."""
+
+    def test_category_map_authentication(self, extractor):
+        """Test authentication category detection."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION
+
+    def test_category_map_cryptography(self, extractor):
+        """Test cryptography category detection."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
+        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY
+
+    def test_category_map_session_management(self, extractor):
+        """Test session management category detection."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT
+
+    def test_category_map_input_validation(self, extractor):
+        """Test input validation category detection."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION
+
+    def test_category_map_sql_injection(self, extractor):
+        """Test SQL injection category detection."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION
+
+    def test_category_map_test_aspect(self, extractor):
+        """Test that T.* aspects are categorized as test aspects."""
+        category_map = extractor.CATEGORY_MAP
+        assert category_map.get("T.") == AspectCategory.TEST_ASPECT
+
+    def test_category_keywords_authentication(self, extractor):
+        """Test authentication keywords are present."""
+        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
+        assert "authentication" in keywords
+        assert "login" in keywords
+        assert "password" in keywords or "passwort" in keywords
+        assert "oauth" in keywords
+
+    def test_category_keywords_cryptography(self, extractor):
+        """Test cryptography keywords are present."""
+        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
+        assert "encryption" in keywords or "verschlüsselung" in keywords
+        assert "tls" in keywords
+        assert "aes" in keywords or "rsa" in keywords
+
+    def test_categorize_by_aspect_id(self, extractor):
+        """Test categorization based on aspect ID prefix."""
+        # Test various aspect ID patterns
+        test_cases = [
+            ("O.Auth_1", AspectCategory.AUTHENTICATION),
+            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
+            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
+            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
+            ("T.Auth_1", AspectCategory.TEST_ASPECT),
+        ]
+
+        for aspect_id, expected_category in test_cases:
+            # Find matching prefix in category map
+            for prefix, category in extractor.CATEGORY_MAP.items():
+                if aspect_id.startswith(prefix):
+                    assert category == expected_category
+                    break
+
+
+# ============================================================================
+# Requirement Level Tests
+# ============================================================================
+
+class TestRequirementLevelDetection:
+    """Tests for requirement level detection (MUSS/SOLL/KANN)."""
+
+    def test_requirement_level_enum(self):
+        """Test RequirementLevel enum values."""
+        assert RequirementLevel.MUSS.value == "MUSS"
+        assert RequirementLevel.SOLL.value == "SOLL"
+        assert RequirementLevel.KANN.value == "KANN"
+        assert RequirementLevel.DARF_NICHT.value == "DARF NICHT"
+
+    def test_requirement_pattern_muss(self, extractor):
+        """Test MUSS pattern detection."""
+        import re
+        pattern = extractor.PATTERNS["requirement"]
+
+        # Test uppercase MUSS
+        text_upper = "Die Anwendung MUSS sichere Passwörter verwenden."
+        matches = re.findall(pattern, text_upper)
+        assert len(matches) > 0
+        assert matches[0].upper() == "MUSS"
+
+        # Test lowercase muss
+        text_lower = "Das System muss verschlüsselt sein."
+        matches = re.findall(pattern, text_lower)
+        assert len(matches) > 0
+        assert matches[0].upper() == "MUSS"
+
+        # Note: Pattern does not match conjugations like "müssen"
+        # since BSI-TR documents use "MUSS" or "muss" as requirement markers
+
+    def test_requirement_pattern_soll(self, extractor):
+        """Test SOLL pattern detection."""
+        import re
+        pattern = extractor.PATTERNS["requirement"]
+
+        test_texts = [
+            "Die Anwendung SOLL MFA unterstützen.",
+            "Das System soll Logging implementieren.",
+        ]
+
+        for text in test_texts:
+            matches = re.findall(pattern, text)
+            assert len(matches) > 0
+            assert matches[0].upper() == "SOLL"
+
+    def test_requirement_pattern_kann(self, extractor):
+        """Test KANN pattern detection."""
+        import re
+        pattern = extractor.PATTERNS["requirement"]
+
+        test_texts = [
+            "Die Anwendung KANN biometrische Auth anbieten.",
+            "Das System kann zusätzliche Features haben.",
+        ]
+
+        for text in test_texts:
+            matches = re.findall(pattern, text)
+            assert len(matches) > 0
+            assert matches[0].upper() == "KANN"
+
+    def test_requirement_pattern_darf_nicht(self, extractor):
+        """Test DARF NICHT pattern detection."""
+        import re
+        pattern = extractor.PATTERNS["requirement"]
+
+        test_texts = [
+            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
+            "Das System darf nicht unverschlüsselt kommunizieren.",
+        ]
+
+        for text in test_texts:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            assert len(matches) > 0
+
+
+# ============================================================================
+# Pattern Matching Tests
+# ============================================================================
+
+class TestPatternMatching:
+    """Tests for regex pattern matching."""
+
+    def test_aspect_id_pattern(self, extractor):
+        """Test aspect ID pattern matching."""
+        import re
+        pattern = extractor.PATTERNS["aspect_id"]
+
+        test_cases = [
+            ("O.Auth_1", True),
+            ("O.Crypto_23", True),
+            ("T.Network_5", True),
+            ("O.Session_100", True),
+            ("InvalidID", False),
+            ("O.Auth", False),  # Missing number
+        ]
+
+        for text, should_match in test_cases:
+            match = re.search(pattern, text)
+            if should_match:
+                assert match is not None, f"Pattern should match: {text}"
+            else:
+                assert match is None, f"Pattern should not match: {text}"
+
+    def test_section_pattern(self, extractor):
+        """Test section number pattern matching."""
+        import re
+        pattern = extractor.PATTERNS["section"]
+
+        test_cases = [
+            ("4.2.1", True),
+            ("1.0", True),
+            ("10.5.3", True),
+            ("invalid", False),
+        ]
+
+        for text, should_match in test_cases:
+            match = re.search(pattern, text)
+            if should_match:
+                assert match is not None, f"Pattern should match: {text}"
+
+    def test_section_aspect_pattern(self, extractor):
+        """Test section-based aspect pattern."""
+        import re
+        pattern = extractor.PATTERNS["section_aspect"]
+
+        test_cases = [
+            "Prüfaspekt 4.2.1",
+            "Pruefaspekt 10.5",
+            "Anforderung 3.1.2",
+        ]
+
+        for text in test_cases:
+            match = re.search(pattern, text)
+            assert match is not None, f"Pattern should match: {text}"
+            assert match.group(1) is not None  # Should capture section number
+
+
+# ============================================================================
+# BSIAspect Model Tests
+# ============================================================================
+
+class TestBSIAspectModel:
+    """Tests for BSIAspect data model."""
+
+    def test_bsi_aspect_creation(self):
+        """Test creating a BSIAspect instance."""
+        aspect = BSIAspect(
+            aspect_id="O.Auth_1",
+            title="Sichere Passwörter",
+            full_text="Die Anwendung MUSS starke Passwörter erzwingen.",
+            category=AspectCategory.AUTHENTICATION,
+            page_number=10,
+            section="4.2.1",
+            requirement_level=RequirementLevel.MUSS,
+            source_document="BSI-TR-03161-2",
+        )
+
+        assert aspect.aspect_id == "O.Auth_1"
+        assert aspect.title == "Sichere Passwörter"
+        assert aspect.category == AspectCategory.AUTHENTICATION
+        assert aspect.requirement_level == RequirementLevel.MUSS
+        assert aspect.page_number == 10
+
+    def test_bsi_aspect_with_optional_fields(self):
+        """Test BSIAspect with optional fields."""
+        aspect = BSIAspect(
+            aspect_id="O.Auth_1",
+            title="Test",
+            full_text="Test text",
+            category=AspectCategory.AUTHENTICATION,
+            page_number=1,
+            section="1.0",
+            requirement_level=RequirementLevel.MUSS,
+            source_document="Test",
+            context_before="Context before",
+            context_after="Context after",
+            related_aspects=["O.Auth_2", "O.Auth_3"],
+            keywords=["password", "authentication"],
+        )
+
+        assert aspect.context_before == "Context before"
+        assert aspect.context_after == "Context after"
+        assert len(aspect.related_aspects) == 2
+        assert "password" in aspect.keywords
+
+
+# ============================================================================
+# Text Extraction Tests
+# ============================================================================
+
+class TestTextExtraction:
+    """Tests for text extraction logic."""
+
+    @patch("compliance.services.pdf_extractor.fitz")
+    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
+        """Test extracting aspects that have explicit IDs."""
+        text = """
+        4.2 Authentifizierung
+
+        O.Auth_1: Sichere Passwörter
+        Die Anwendung MUSS starke Passwörter erzwingen.
+
+        O.Auth_2: Multi-Faktor
+        Die Anwendung SOLL MFA unterstützen.
+        """
+
+        # Extract aspects from text
+        aspects = extractor._extract_aspects_from_text(
+            text=text,
+            page_num=1,
+            source_document="Test"
+        )
+
+        # Should find at least the aspects
+        assert isinstance(aspects, list)
+
+    def test_extract_multiple_requirement_levels(self, extractor):
+        """Test extracting text with multiple requirement levels."""
+        text = """
+        Das System MUSS verschlüsselt sein.
+        Es SOLL Logging aktivieren.
+        Es KANN zusätzliche Features haben.
+        Es DARF NICHT Passwörter speichern.
+        """
+
+        import re
+        pattern = extractor.PATTERNS["requirement"]
+        matches = re.findall(pattern, text, re.IGNORECASE)
+
+        # Should find all 4 requirement levels
+        assert len(matches) >= 4
+
+
+# ============================================================================
+# Integration Tests
+# ============================================================================
+
+class TestPDFExtractionIntegration:
+    """Integration tests for complete PDF extraction workflow."""
+
+    @patch("compliance.services.pdf_extractor.fitz")
+    def test_complete_extraction_workflow(self, mock_fitz, extractor):
+        """Test complete extraction from PDF to aspects."""
+        # Create mock PDF with realistic content
+        mock_doc = MagicMock()
+        mock_doc.__len__ = MagicMock(return_value=2)  # 2 pages
+
+        page1 = MagicMock()
+        page1.get_text = MagicMock(return_value="""
+        4.2.1 Authentifizierung
+
+        O.Auth_1: Sichere Passwörter
+        Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.
+        """)
+
+        page2 = MagicMock()
+        page2.get_text = MagicMock(return_value="""
+        4.2.2 Session Management
+
+        O.Sess_1: Session Timeout
+        Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.
+        """)
+
+        mock_doc.__getitem__ = MagicMock(side_effect=[page1, page2])
+        mock_fitz.open = MagicMock(return_value=mock_doc)
+
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            aspects = extractor.extract_from_file(tmp_path, source_name="BSI-TR-03161-2")
+
+            # Verify extraction worked
+            assert isinstance(aspects, list)
+
+            # PDF was closed
+            mock_doc.close.assert_called_once()
+        finally:
+            Path(tmp_path).unlink(missing_ok=True)
+
+
+# Allow running this test module directly as a script: hands this file's
+# path to pytest with the verbose flag.
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])