A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
477 lines
17 KiB
Python
477 lines
17 KiB
Python
"""
|
|
Tests for Compliance PDF Extractor.
|
|
|
|
Tests cover:
|
|
- BSIPDFExtractor.extract_from_file()
|
|
- Aspect categorization
|
|
- Requirement level detection (MUSS/SOLL/KANN)
|
|
- Text parsing and pattern matching
|
|
"""
|
|
|
|
import pytest
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch, mock_open
|
|
import sys
|
|
|
|
# PyMuPDF ("fitz") may be missing in the test environment. When the import
# fails, register a MagicMock under the same module name so that later
# ``import fitz`` statements resolve to the stand-in.
try:
    import fitz
except ImportError:
    fitz = sys.modules['fitz'] = MagicMock()
|
|
|
|
from compliance.services.pdf_extractor import (
|
|
BSIPDFExtractor,
|
|
BSIAspect,
|
|
RequirementLevel,
|
|
AspectCategory,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def extractor():
    """Provide a BSIPDFExtractor constructed while ``fitz`` is mocked.

    The extractor module's ``fitz`` attribute is replaced by a MagicMock only
    for the duration of the constructor call; the patch is undone before the
    instance is handed to the test.
    """
    fitz_patch = patch("compliance.services.pdf_extractor.fitz", MagicMock())
    with fitz_patch:
        instance = BSIPDFExtractor()
    return instance
|
|
|
|
|
|
@pytest.fixture
def mock_pdf():
    """Build a MagicMock that stands in for a one-page PDF document.

    ``len()`` of the document reports 1, and indexing it with any key returns
    the same mocked page. The page's ``get_text()`` yields a short German
    excerpt containing two aspect IDs (O.Auth_1 and O.Auth_2).
    """
    page_text = (
        "\n"
        "4.2.1 Authentifizierung\n"
        "\n"
        "O.Auth_1: Sichere Passwörter\n"
        "Die Anwendung MUSS starke Passwörter erzwingen.\n"
        "Passwörter MÜSSEN mindestens 8 Zeichen lang sein.\n"
        "\n"
        "O.Auth_2: Multi-Faktor-Authentifizierung\n"
        "Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.\n"
    )

    page = MagicMock()
    page.get_text = MagicMock(return_value=page_text)

    document = MagicMock()
    document.__len__ = MagicMock(return_value=1)  # single page
    document.__getitem__ = MagicMock(return_value=page)
    return document
|
|
|
|
|
|
# ============================================================================
|
|
# BSIPDFExtractor Tests
|
|
# ============================================================================
|
|
|
|
class TestBSIPDFExtractor:
    """Construction and ``extract_from_file`` behaviour of BSIPDFExtractor."""

    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
    def test_extractor_initialization(self):
        """An extractor can be constructed and exposes a logger."""
        instance = BSIPDFExtractor()
        assert instance is not None
        assert instance.logger is not None

    def test_extractor_requires_pymupdf(self):
        """With ``fitz`` set to None, construction raises ImportError."""
        fitz_missing = patch("compliance.services.pdf_extractor.fitz", None)
        # The error message is expected to mention "PyMuPDF".
        with fitz_missing, pytest.raises(ImportError, match="PyMuPDF"):
            BSIPDFExtractor()

    def test_extract_from_nonexistent_file(self, extractor):
        """A path that does not exist is rejected with FileNotFoundError."""
        missing_path = "/nonexistent/file.pdf"
        with pytest.raises(FileNotFoundError):
            extractor.extract_from_file(missing_path)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_basic(self, mock_fitz, extractor, mock_pdf):
        """Extraction from an existing file returns a list."""
        mock_fitz.open.return_value = mock_pdf

        # An empty placeholder file on disk; ``fitz.open`` is mocked, so the
        # document content comes from ``mock_pdf`` rather than this file.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path)
            assert isinstance(result, list)
        finally:
            # Always remove the placeholder, even when an assertion fails.
            Path(pdf_path).unlink(missing_ok=True)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_with_source_name(self, mock_fitz, extractor, mock_pdf):
        """A caller-supplied source name is stored on the extracted aspects."""
        mock_fitz.open.return_value = mock_pdf
        source_name = "BSI-TR-03161-2"

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name=source_name)
            # Only checked when something was extracted from the mock text.
            if result:
                assert result[0].source_document == source_name
        finally:
            Path(pdf_path).unlink(missing_ok=True)
|
|
|
|
|
|
# ============================================================================
|
|
# Categorization Tests
|
|
# ============================================================================
|
|
|
|
class TestAspectCategorization:
    """Checks on the extractor's CATEGORY_MAP and CATEGORY_KEYWORDS tables."""

    def test_category_map_authentication(self, extractor):
        """The "O.Auth" prefix maps to AUTHENTICATION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION

    def test_category_map_cryptography(self, extractor):
        """Both "O.Cryp" and "O.Crypto" map to CRYPTOGRAPHY."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY

    def test_category_map_session_management(self, extractor):
        """The "O.Sess" prefix maps to SESSION_MANAGEMENT."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT

    def test_category_map_input_validation(self, extractor):
        """The "O.Input" prefix maps to INPUT_VALIDATION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION

    def test_category_map_sql_injection(self, extractor):
        """The "O.SQL" prefix maps to SQL_INJECTION."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION

    def test_category_map_test_aspect(self, extractor):
        """The "T." prefix maps to TEST_ASPECT."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("T.") == AspectCategory.TEST_ASPECT

    def test_category_keywords_authentication(self, extractor):
        """The authentication keyword collection contains the core terms."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
        assert "authentication" in keywords
        assert "login" in keywords
        # Either the English or the German spelling is accepted.
        assert "password" in keywords or "passwort" in keywords
        assert "oauth" in keywords

    def test_category_keywords_cryptography(self, extractor):
        """The cryptography keyword collection contains the core terms."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
        # Either the English or the German spelling is accepted.
        assert "encryption" in keywords or "verschlüsselung" in keywords
        assert "tls" in keywords
        assert "aes" in keywords or "rsa" in keywords

    def test_categorize_by_aspect_id(self, extractor):
        """Each sample aspect ID resolves to the expected category by prefix.

        The first CATEGORY_MAP prefix (in map iteration order) that the ID
        starts with decides the category. An ID that no prefix matches fails
        the test; previously such an ID was skipped without any assertion.
        """
        test_cases = [
            ("O.Auth_1", AspectCategory.AUTHENTICATION),
            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
            ("T.Auth_1", AspectCategory.TEST_ASPECT),
        ]

        for aspect_id, expected_category in test_cases:
            resolved = [
                category
                for prefix, category in extractor.CATEGORY_MAP.items()
                if aspect_id.startswith(prefix)
            ]
            assert resolved, f"No CATEGORY_MAP prefix matches: {aspect_id}"
            assert resolved[0] == expected_category
|
|
|
|
|
|
# ============================================================================
|
|
# Requirement Level Tests
|
|
# ============================================================================
|
|
|
|
class TestRequirementLevelDetection:
    """Tests for the RequirementLevel enum and the "requirement" pattern."""

    @staticmethod
    def _assert_first_level(pattern, sentences, expected):
        """Assert every sentence matches and its first hit upper-cases to *expected*."""
        import re

        for sentence in sentences:
            found = re.findall(pattern, sentence)
            assert len(found) > 0
            assert found[0].upper() == expected

    def test_requirement_level_enum(self):
        """Each RequirementLevel member carries its German keyword as value."""
        assert RequirementLevel.MUSS.value == "MUSS"
        assert RequirementLevel.SOLL.value == "SOLL"
        assert RequirementLevel.KANN.value == "KANN"
        assert RequirementLevel.DARF_NICHT.value == "DARF NICHT"

    def test_requirement_pattern_muss(self, extractor):
        """"MUSS" is detected in upper- and lower-case spelling."""
        self._assert_first_level(
            extractor.PATTERNS["requirement"],
            (
                "Die Anwendung MUSS sichere Passwörter verwenden.",  # upper case
                "Das System muss verschlüsselt sein.",  # lower case
            ),
            "MUSS",
        )
        # Note: the pattern does not match conjugations like "müssen",
        # since BSI-TR documents use "MUSS" or "muss" as requirement markers.

    def test_requirement_pattern_soll(self, extractor):
        """"SOLL" is detected in upper- and lower-case spelling."""
        self._assert_first_level(
            extractor.PATTERNS["requirement"],
            (
                "Die Anwendung SOLL MFA unterstützen.",
                "Das System soll Logging implementieren.",
            ),
            "SOLL",
        )

    def test_requirement_pattern_kann(self, extractor):
        """"KANN" is detected in upper- and lower-case spelling."""
        self._assert_first_level(
            extractor.PATTERNS["requirement"],
            (
                "Die Anwendung KANN biometrische Auth anbieten.",
                "Das System kann zusätzliche Features haben.",
            ),
            "KANN",
        )

    def test_requirement_pattern_darf_nicht(self, extractor):
        """"DARF NICHT" sentences produce at least one case-insensitive match."""
        import re

        requirement_pattern = extractor.PATTERNS["requirement"]
        sentences = (
            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
            "Das System darf nicht unverschlüsselt kommunizieren.",
        )

        for sentence in sentences:
            found = re.findall(requirement_pattern, sentence, re.IGNORECASE)
            assert len(found) > 0
|
|
|
|
|
|
# ============================================================================
|
|
# Pattern Matching Tests
|
|
# ============================================================================
|
|
|
|
class TestPatternMatching:
    """Tests for the extractor's regular expressions in ``PATTERNS``."""

    def test_aspect_id_pattern(self, extractor):
        """The "aspect_id" pattern accepts well-formed IDs and rejects others."""
        import re

        pattern = extractor.PATTERNS["aspect_id"]

        test_cases = [
            ("O.Auth_1", True),
            ("O.Crypto_23", True),
            ("T.Network_5", True),
            ("O.Session_100", True),
            ("InvalidID", False),
            ("O.Auth", False),  # Missing number
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_pattern(self, extractor):
        """The "section" pattern accepts dotted numbers and rejects plain words.

        The negative case in ``test_cases`` is asserted as well; previously
        it was listed but never checked.
        """
        import re

        pattern = extractor.PATTERNS["section"]

        test_cases = [
            ("4.2.1", True),
            ("1.0", True),
            ("10.5.3", True),
            ("invalid", False),
        ]

        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_aspect_pattern(self, extractor):
        """The "section_aspect" pattern matches labelled section references."""
        import re

        pattern = extractor.PATTERNS["section_aspect"]

        test_cases = [
            "Prüfaspekt 4.2.1",
            "Pruefaspekt 10.5",
            "Anforderung 3.1.2",
        ]

        for text in test_cases:
            match = re.search(pattern, text)
            assert match is not None, f"Pattern should match: {text}"
            # Group 1 is expected to capture the section number.
            assert match.group(1) is not None
|
|
|
|
|
|
# ============================================================================
|
|
# BSIAspect Model Tests
|
|
# ============================================================================
|
|
|
|
class TestBSIAspectModel:
    """Tests for the BSIAspect data model."""

    def test_bsi_aspect_creation(self):
        """Required fields passed to the constructor are readable afterwards."""
        fields = {
            "aspect_id": "O.Auth_1",
            "title": "Sichere Passwörter",
            "full_text": "Die Anwendung MUSS starke Passwörter erzwingen.",
            "category": AspectCategory.AUTHENTICATION,
            "page_number": 10,
            "section": "4.2.1",
            "requirement_level": RequirementLevel.MUSS,
            "source_document": "BSI-TR-03161-2",
        }
        aspect = BSIAspect(**fields)

        assert aspect.aspect_id == "O.Auth_1"
        assert aspect.title == "Sichere Passwörter"
        assert aspect.category == AspectCategory.AUTHENTICATION
        assert aspect.requirement_level == RequirementLevel.MUSS
        assert aspect.page_number == 10

    def test_bsi_aspect_with_optional_fields(self):
        """Optional context, relation and keyword fields are stored as given."""
        required = {
            "aspect_id": "O.Auth_1",
            "title": "Test",
            "full_text": "Test text",
            "category": AspectCategory.AUTHENTICATION,
            "page_number": 1,
            "section": "1.0",
            "requirement_level": RequirementLevel.MUSS,
            "source_document": "Test",
        }
        optional = {
            "context_before": "Context before",
            "context_after": "Context after",
            "related_aspects": ["O.Auth_2", "O.Auth_3"],
            "keywords": ["password", "authentication"],
        }
        aspect = BSIAspect(**required, **optional)

        assert aspect.context_before == "Context before"
        assert aspect.context_after == "Context after"
        assert len(aspect.related_aspects) == 2
        assert "password" in aspect.keywords
|
|
|
|
|
|
# ============================================================================
|
|
# Text Extraction Tests
|
|
# ============================================================================
|
|
|
|
class TestTextExtraction:
    """Tests for the text-level extraction logic."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
        """Text containing explicit aspect IDs yields a list."""
        sample_text = (
            "\n"
            "4.2 Authentifizierung\n"
            "\n"
            "O.Auth_1: Sichere Passwörter\n"
            "Die Anwendung MUSS starke Passwörter erzwingen.\n"
            "\n"
            "O.Auth_2: Multi-Faktor\n"
            "Die Anwendung SOLL MFA unterstützen.\n"
        )

        result = extractor._extract_aspects_from_text(
            text=sample_text,
            page_num=1,
            source_document="Test",
        )

        # Only the return type is checked here, not the extracted content.
        assert isinstance(result, list)

    def test_extract_multiple_requirement_levels(self, extractor):
        """All four requirement keywords are found in one block of text."""
        import re

        sample_text = (
            "\n"
            "Das System MUSS verschlüsselt sein.\n"
            "Es SOLL Logging aktivieren.\n"
            "Es KANN zusätzliche Features haben.\n"
            "Es DARF NICHT Passwörter speichern.\n"
        )

        requirement_pattern = extractor.PATTERNS["requirement"]
        found = re.findall(requirement_pattern, sample_text, re.IGNORECASE)

        # One hit per sentence at minimum: MUSS, SOLL, KANN, DARF NICHT.
        assert len(found) >= 4
|
|
|
|
|
|
# ============================================================================
|
|
# Integration Tests
|
|
# ============================================================================
|
|
|
|
class TestPDFExtractionIntegration:
    """Integration tests for the complete PDF extraction workflow."""

    @staticmethod
    def _page_returning(text):
        """Create a mocked page whose ``get_text()`` returns *text*."""
        page = MagicMock()
        page.get_text = MagicMock(return_value=text)
        return page

    @patch("compliance.services.pdf_extractor.fitz")
    def test_complete_extraction_workflow(self, mock_fitz, extractor):
        """A two-page mocked PDF is extracted to a list and then closed."""
        first_page = self._page_returning(
            "\n"
            "4.2.1 Authentifizierung\n"
            "\n"
            "O.Auth_1: Sichere Passwörter\n"
            "Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.\n"
        )
        second_page = self._page_returning(
            "\n"
            "4.2.2 Session Management\n"
            "\n"
            "O.Sess_1: Session Timeout\n"
            "Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.\n"
        )

        document = MagicMock()
        document.__len__ = MagicMock(return_value=2)  # two pages
        # Pages are handed out in call order, whatever index is requested.
        document.__getitem__ = MagicMock(side_effect=[first_page, second_page])
        mock_fitz.open = MagicMock(return_value=document)

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name="BSI-TR-03161-2")

            # Extraction produced a list ...
            assert isinstance(result, list)

            # ... and the document was closed exactly once.
            document.close.assert_called_once()
        finally:
            Path(pdf_path).unlink(missing_ok=True)
|
|
|
|
|
|
# Allow running this test module directly as a script (verbose output).
if __name__ == "__main__":
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|