fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
476
backend/tests/test_compliance_pdf_extractor.py
Normal file
476
backend/tests/test_compliance_pdf_extractor.py
Normal file
@@ -0,0 +1,476 @@
|
||||
"""
|
||||
Tests for Compliance PDF Extractor.
|
||||
|
||||
Tests cover:
|
||||
- BSIPDFExtractor.extract_from_file()
|
||||
- Aspect categorization
|
||||
- Requirement level detection (MUSS/SOLL/KANN)
|
||||
- Text parsing and pattern matching
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch, mock_open
|
||||
import sys
|
||||
|
||||
# PyMuPDF is imported under the module name "fitz". When it is not
# installed, register a MagicMock under that name in sys.modules so that a
# later ``import fitz`` resolves to the stand-in instead of failing.
try:
    import fitz
except ImportError:
    fitz = sys.modules['fitz'] = MagicMock()
|
||||
|
||||
from compliance.services.pdf_extractor import (
|
||||
BSIPDFExtractor,
|
||||
BSIAspect,
|
||||
RequirementLevel,
|
||||
AspectCategory,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def extractor():
    """Provide a BSIPDFExtractor constructed while ``fitz`` is mocked.

    The patch on ``compliance.services.pdf_extractor.fitz`` is only active
    during construction; it has been undone by the time the test receives
    the instance.
    """
    fitz_stub = MagicMock()
    with patch("compliance.services.pdf_extractor.fitz", fitz_stub):
        instance = BSIPDFExtractor()
    return instance
|
||||
|
||||
|
||||
@pytest.fixture
def mock_pdf():
    """Provide a MagicMock standing in for a one-page PDF document.

    ``len()`` of the mock reports 1 and every index lookup yields the same
    page mock, whose ``get_text()`` returns a short German excerpt with two
    aspect IDs (``O.Auth_1`` and ``O.Auth_2``).
    """
    page = MagicMock()
    page.get_text.return_value = """
    4.2.1 Authentifizierung

    O.Auth_1: Sichere Passwörter
    Die Anwendung MUSS starke Passwörter erzwingen.
    Passwörter MÜSSEN mindestens 8 Zeichen lang sein.

    O.Auth_2: Multi-Faktor-Authentifizierung
    Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.
    """

    document = MagicMock()
    document.__len__.return_value = 1  # single page
    document.__getitem__.return_value = page
    return document
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# BSIPDFExtractor Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestBSIPDFExtractor:
    """Construction and file-level extraction checks for BSIPDFExtractor."""

    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
    def test_extractor_initialization(self):
        """A freshly constructed extractor exists and exposes a logger."""
        instance = BSIPDFExtractor()
        assert instance is not None
        assert instance.logger is not None

    def test_extractor_requires_pymupdf(self):
        """With ``fitz`` patched to None, construction raises ImportError."""
        with patch("compliance.services.pdf_extractor.fitz", None):
            with pytest.raises(ImportError) as excinfo:
                BSIPDFExtractor()
            # Inspect the exception only after the raises-block has exited;
            # a statement placed after the raising call inside that block
            # would never execute.
            assert "PyMuPDF" in str(excinfo.value)

    def test_extract_from_nonexistent_file(self, extractor):
        """A path that does not exist raises FileNotFoundError."""
        missing_path = "/nonexistent/file.pdf"
        with pytest.raises(FileNotFoundError):
            extractor.extract_from_file(missing_path)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_basic(self, mock_fitz, extractor, mock_pdf):
        """Extraction from an existing file returns a list."""
        mock_fitz.open.return_value = mock_pdf

        # An empty placeholder file on disk; the document content itself is
        # supplied by the mocked ``fitz.open``.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path)
            assert isinstance(result, list)
        finally:
            Path(pdf_path).unlink(missing_ok=True)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_with_source_name(self, mock_fitz, extractor, mock_pdf):
        """A caller-supplied source name ends up on the first aspect."""
        mock_fitz.open.return_value = mock_pdf

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name="BSI-TR-03161-2")
            # Only checked when something was extracted; an empty result
            # is tolerated by this test.
            if result:
                assert result[0].source_document == "BSI-TR-03161-2"
        finally:
            Path(pdf_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Categorization Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestAspectCategorization:
    """Checks on the extractor's CATEGORY_MAP prefixes and CATEGORY_KEYWORDS."""

    def test_category_map_authentication(self, extractor):
        """The ``O.Auth`` prefix maps to AUTHENTICATION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION

    def test_category_map_cryptography(self, extractor):
        """Both ``O.Cryp`` and ``O.Crypto`` map to CRYPTOGRAPHY."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY

    def test_category_map_session_management(self, extractor):
        """The ``O.Sess`` prefix maps to SESSION_MANAGEMENT."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT

    def test_category_map_input_validation(self, extractor):
        """The ``O.Input`` prefix maps to INPUT_VALIDATION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION

    def test_category_map_sql_injection(self, extractor):
        """The ``O.SQL`` prefix maps to SQL_INJECTION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION

    def test_category_map_test_aspect(self, extractor):
        """The ``T.`` prefix maps to TEST_ASPECT."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("T.") == AspectCategory.TEST_ASPECT

    def test_category_keywords_authentication(self, extractor):
        """Authentication keywords include the expected English/German terms."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
        assert "authentication" in keywords
        assert "login" in keywords
        assert "password" in keywords or "passwort" in keywords
        assert "oauth" in keywords

    def test_category_keywords_cryptography(self, extractor):
        """Cryptography keywords include the expected English/German terms."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
        assert "encryption" in keywords or "verschlüsselung" in keywords
        assert "tls" in keywords
        assert "aes" in keywords or "rsa" in keywords

    def test_categorize_by_aspect_id(self, extractor):
        """Each sample aspect ID resolves to its category via a map prefix.

        The first CATEGORY_MAP prefix (in the map's iteration order) that the
        aspect ID starts with decides the category. An ID that no prefix
        matches is a failure; previously such an ID was skipped silently
        and the test passed without checking anything for it.
        """
        test_cases = [
            ("O.Auth_1", AspectCategory.AUTHENTICATION),
            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
            ("T.Auth_1", AspectCategory.TEST_ASPECT),
        ]

        for aspect_id, expected_category in test_cases:
            # None is the "no prefix matched" sentinel.
            matched_category = next(
                (
                    category
                    for prefix, category in extractor.CATEGORY_MAP.items()
                    if aspect_id.startswith(prefix)
                ),
                None,
            )
            assert matched_category is not None, (
                f"No CATEGORY_MAP prefix matches: {aspect_id}"
            )
            assert matched_category == expected_category, (
                f"Wrong category for {aspect_id}: {matched_category}"
            )
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Requirement Level Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestRequirementLevelDetection:
    """Checks for the MUSS / SOLL / KANN / DARF NICHT requirement markers."""

    def test_requirement_level_enum(self):
        """Each RequirementLevel member carries its German marker as value."""
        expected_values = [
            (RequirementLevel.MUSS, "MUSS"),
            (RequirementLevel.SOLL, "SOLL"),
            (RequirementLevel.KANN, "KANN"),
            (RequirementLevel.DARF_NICHT, "DARF NICHT"),
        ]
        for level, marker in expected_values:
            assert level.value == marker

    def test_requirement_pattern_muss(self, extractor):
        """The requirement pattern finds "MUSS" in upper and lower case."""
        import re
        requirement_re = extractor.PATTERNS["requirement"]

        # Conjugated forms such as "müssen" are deliberately not covered
        # here; the samples use the bare marker "MUSS" / "muss" only.
        samples = (
            "Die Anwendung MUSS sichere Passwörter verwenden.",  # uppercase
            "Das System muss verschlüsselt sein.",  # lowercase
        )
        for sentence in samples:
            found = re.findall(requirement_re, sentence)
            assert len(found) > 0
            assert found[0].upper() == "MUSS"

    def test_requirement_pattern_soll(self, extractor):
        """The requirement pattern finds "SOLL" in upper and lower case."""
        import re
        requirement_re = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung SOLL MFA unterstützen.",
            "Das System soll Logging implementieren.",
        )
        for sentence in samples:
            found = re.findall(requirement_re, sentence)
            assert len(found) > 0
            assert found[0].upper() == "SOLL"

    def test_requirement_pattern_kann(self, extractor):
        """The requirement pattern finds "KANN" in upper and lower case."""
        import re
        requirement_re = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung KANN biometrische Auth anbieten.",
            "Das System kann zusätzliche Features haben.",
        )
        for sentence in samples:
            found = re.findall(requirement_re, sentence)
            assert len(found) > 0
            assert found[0].upper() == "KANN"

    def test_requirement_pattern_darf_nicht(self, extractor):
        """With IGNORECASE, the pattern finds something in DARF NICHT texts."""
        import re
        requirement_re = extractor.PATTERNS["requirement"]

        samples = (
            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
            "Das System darf nicht unverschlüsselt kommunizieren.",
        )
        for sentence in samples:
            found = re.findall(requirement_re, sentence, re.IGNORECASE)
            assert len(found) > 0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Pattern Matching Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestPatternMatching:
    """Checks for the extractor's regex entries in ``PATTERNS``."""

    def test_aspect_id_pattern(self, extractor):
        """``PATTERNS["aspect_id"]`` accepts numbered IDs and rejects others."""
        import re
        pattern = extractor.PATTERNS["aspect_id"]

        test_cases = [
            ("O.Auth_1", True),
            ("O.Crypto_23", True),
            ("T.Network_5", True),
            ("O.Session_100", True),
            ("InvalidID", False),
            ("O.Auth", False),  # Missing number
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_pattern(self, extractor):
        """``PATTERNS["section"]`` accepts dotted numbers and rejects plain words.

        The negative case was listed in the test data but never asserted;
        it is now checked the same way as in ``test_aspect_id_pattern``.
        """
        import re
        pattern = extractor.PATTERNS["section"]

        test_cases = [
            ("4.2.1", True),
            ("1.0", True),
            ("10.5.3", True),
            ("invalid", False),
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_aspect_pattern(self, extractor):
        """``PATTERNS["section_aspect"]`` matches and captures group 1."""
        import re
        pattern = extractor.PATTERNS["section_aspect"]

        test_cases = [
            "Prüfaspekt 4.2.1",
            "Pruefaspekt 10.5",
            "Anforderung 3.1.2",
        ]

        for text in test_cases:
            match = re.search(pattern, text)
            assert match is not None, f"Pattern should match: {text}"
            # Group 1 is expected to hold the section number.
            assert match.group(1) is not None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# BSIAspect Model Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestBSIAspectModel:
    """Checks that BSIAspect stores the values it is constructed with."""

    def test_bsi_aspect_creation(self):
        """Required fields are readable back from a new BSIAspect."""
        required_fields = dict(
            aspect_id="O.Auth_1",
            title="Sichere Passwörter",
            full_text="Die Anwendung MUSS starke Passwörter erzwingen.",
            category=AspectCategory.AUTHENTICATION,
            page_number=10,
            section="4.2.1",
            requirement_level=RequirementLevel.MUSS,
            source_document="BSI-TR-03161-2",
        )
        model = BSIAspect(**required_fields)

        assert model.aspect_id == "O.Auth_1"
        assert model.title == "Sichere Passwörter"
        assert model.category == AspectCategory.AUTHENTICATION
        assert model.requirement_level == RequirementLevel.MUSS
        assert model.page_number == 10

    def test_bsi_aspect_with_optional_fields(self):
        """Optional context, relation and keyword fields are stored too."""
        required_fields = dict(
            aspect_id="O.Auth_1",
            title="Test",
            full_text="Test text",
            category=AspectCategory.AUTHENTICATION,
            page_number=1,
            section="1.0",
            requirement_level=RequirementLevel.MUSS,
            source_document="Test",
        )
        optional_fields = dict(
            context_before="Context before",
            context_after="Context after",
            related_aspects=["O.Auth_2", "O.Auth_3"],
            keywords=["password", "authentication"],
        )
        model = BSIAspect(**required_fields, **optional_fields)

        assert model.context_before == "Context before"
        assert model.context_after == "Context after"
        assert len(model.related_aspects) == 2
        assert "password" in model.keywords
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Text Extraction Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestTextExtraction:
    """Checks for text-level aspect extraction and requirement matching."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
        """``_extract_aspects_from_text`` returns a list for text with IDs."""
        sample_text = """
        4.2 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter erzwingen.

        O.Auth_2: Multi-Faktor
        Die Anwendung SOLL MFA unterstützen.
        """

        result = extractor._extract_aspects_from_text(
            text=sample_text,
            page_num=1,
            source_document="Test",
        )

        # Only the return type is pinned; the number of aspects is not.
        assert isinstance(result, list)

    def test_extract_multiple_requirement_levels(self, extractor):
        """The requirement pattern yields at least four hits on four markers."""
        import re

        sample_text = """
        Das System MUSS verschlüsselt sein.
        Es SOLL Logging aktivieren.
        Es KANN zusätzliche Features haben.
        Es DARF NICHT Passwörter speichern.
        """

        found = re.findall(extractor.PATTERNS["requirement"], sample_text, re.IGNORECASE)

        # One hit per line (MUSS, SOLL, KANN, DARF NICHT) at minimum.
        assert len(found) >= 4
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Integration Tests
|
||||
# ============================================================================
|
||||
|
||||
class TestPDFExtractionIntegration:
    """End-to-end check of ``extract_from_file`` against a mocked document."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_complete_extraction_workflow(self, mock_fitz, extractor):
        """A two-page mocked PDF is extracted to a list and then closed."""
        first_page = MagicMock()
        first_page.get_text.return_value = """
        4.2.1 Authentifizierung

        O.Auth_1: Sichere Passwörter
        Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.
        """

        second_page = MagicMock()
        second_page.get_text.return_value = """
        4.2.2 Session Management

        O.Sess_1: Session Timeout
        Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.
        """

        document = MagicMock()
        document.__len__.return_value = 2  # two pages
        # Successive index lookups hand out the pages in order, regardless
        # of the index that is requested.
        document.__getitem__.side_effect = [first_page, second_page]
        mock_fitz.open.return_value = document

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name="BSI-TR-03161-2")

            # Extraction produced a list ...
            assert isinstance(result, list)

            # ... and the document was closed exactly once.
            document.close.assert_called_once()
        finally:
            Path(pdf_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hand pytest's return code to the interpreter so that running this file
    # directly reports test failures through the process exit status instead
    # of always exiting with 0.
    sys.exit(pytest.main([__file__, "-v"]))
|
||||
Reference in New Issue
Block a user