This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/tests/test_compliance_pdf_extractor.py
Benjamin Admin 21a844cb8a fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 09:51:32 +01:00

477 lines
17 KiB
Python

"""
Tests for Compliance PDF Extractor.
Tests cover:
- BSIPDFExtractor.extract_from_file()
- Aspect categorization
- Requirement level detection (MUSS/SOLL/KANN)
- Text parsing and pattern matching
"""
import pytest
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch, mock_open
import sys
# Mock fitz if not available.
# The stub is also registered in sys.modules so that a later ``import fitz``
# (for example from the module under test) resolves to the same MagicMock
# instead of raising ImportError.
try:
    import fitz
except ImportError:
    fitz = MagicMock()
    sys.modules['fitz'] = fitz
from compliance.services.pdf_extractor import (
BSIPDFExtractor,
BSIAspect,
RequirementLevel,
AspectCategory,
)
@pytest.fixture
def extractor():
    """Provide a BSIPDFExtractor constructed while ``fitz`` is stubbed out.

    The stub replaces ``compliance.services.pdf_extractor.fitz`` only for the
    duration of the constructor call; the patch is undone before the instance
    is handed to the requesting test.
    """
    fitz_stub = MagicMock()
    with patch("compliance.services.pdf_extractor.fitz", fitz_stub):
        instance = BSIPDFExtractor()
    return instance
@pytest.fixture
def mock_pdf():
    """Provide a one-page stand-in for a PDF document.

    ``len(doc)`` reports 1 and every index lookup returns the same page mock,
    whose ``get_text()`` yields a short German sample with two aspect IDs.
    """
    page_text = """
4.2.1 Authentifizierung
O.Auth_1: Sichere Passwörter
Die Anwendung MUSS starke Passwörter erzwingen.
Passwörter MÜSSEN mindestens 8 Zeichen lang sein.
O.Auth_2: Multi-Faktor-Authentifizierung
Die Anwendung SOLL Multi-Faktor-Authentifizierung unterstützen.
"""
    page = MagicMock()
    page.get_text = MagicMock(return_value=page_text)

    document = MagicMock()
    document.__len__ = MagicMock(return_value=1)  # 1 page
    document.__getitem__ = MagicMock(return_value=page)
    return document
# ============================================================================
# BSIPDFExtractor Tests
# ============================================================================
class TestBSIPDFExtractor:
    """Construction and ``extract_from_file`` tests for BSIPDFExtractor."""

    @patch("compliance.services.pdf_extractor.fitz", MagicMock())
    def test_extractor_initialization(self):
        """A freshly built extractor exists and exposes a logger."""
        instance = BSIPDFExtractor()
        assert instance is not None
        assert instance.logger is not None

    def test_extractor_requires_pymupdf(self):
        """With ``fitz`` set to None, construction raises ImportError naming PyMuPDF."""
        with patch("compliance.services.pdf_extractor.fitz", None):
            with pytest.raises(ImportError) as excinfo:
                BSIPDFExtractor()
        # Checked after the raises-block so the assertion is actually executed.
        message = str(excinfo.value)
        assert "PyMuPDF" in message

    def test_extract_from_nonexistent_file(self, extractor):
        """A path that does not exist makes extraction raise FileNotFoundError."""
        missing_path = "/nonexistent/file.pdf"
        with pytest.raises(FileNotFoundError):
            extractor.extract_from_file(missing_path)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_basic(self, mock_fitz, extractor, mock_pdf):
        """Extraction from an existing file returns a list."""
        opener = MagicMock(return_value=mock_pdf)
        mock_fitz.open = opener

        # An empty placeholder file on disk; ``fitz.open`` is replaced above
        # and hands back the mock document.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path)
            assert isinstance(result, list)
        finally:
            # delete=False above means the file must be removed by hand.
            Path(pdf_path).unlink(missing_ok=True)

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_from_file_with_source_name(self, mock_fitz, extractor, mock_pdf):
        """A caller-supplied source name ends up on the extracted aspects."""
        opener = MagicMock(return_value=mock_pdf)
        mock_fitz.open = opener

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name="BSI-TR-03161-2")
            # Only checked when something was extracted; an empty result passes.
            if result:
                first = result[0]
                assert first.source_document == "BSI-TR-03161-2"
        finally:
            Path(pdf_path).unlink(missing_ok=True)
# ============================================================================
# Categorization Tests
# ============================================================================
class TestAspectCategorization:
    """Tests for aspect categorization via ``CATEGORY_MAP`` and ``CATEGORY_KEYWORDS``."""

    def test_category_map_authentication(self, extractor):
        """Test authentication category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Auth") == AspectCategory.AUTHENTICATION

    def test_category_map_cryptography(self, extractor):
        """Test cryptography category detection (both prefix spellings)."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Cryp") == AspectCategory.CRYPTOGRAPHY
        assert category_map.get("O.Crypto") == AspectCategory.CRYPTOGRAPHY

    def test_category_map_session_management(self, extractor):
        """Test session management category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Sess") == AspectCategory.SESSION_MANAGEMENT

    def test_category_map_input_validation(self, extractor):
        """Test input validation category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.Input") == AspectCategory.INPUT_VALIDATION

    def test_category_map_sql_injection(self, extractor):
        """Test SQL injection category detection."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("O.SQL") == AspectCategory.SQL_INJECTION

    def test_category_map_test_aspect(self, extractor):
        """Test that T.* aspects are categorized as test aspects."""
        category_map = extractor.CATEGORY_MAP
        assert category_map.get("T.") == AspectCategory.TEST_ASPECT

    def test_category_keywords_authentication(self, extractor):
        """Test authentication keywords are present."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.AUTHENTICATION]
        assert "authentication" in keywords
        assert "login" in keywords
        # Either the English or the German spelling is accepted.
        assert "password" in keywords or "passwort" in keywords
        assert "oauth" in keywords

    def test_category_keywords_cryptography(self, extractor):
        """Test cryptography keywords are present."""
        keywords = extractor.CATEGORY_KEYWORDS[AspectCategory.CRYPTOGRAPHY]
        assert "encryption" in keywords or "verschlüsselung" in keywords
        assert "tls" in keywords
        assert "aes" in keywords or "rsa" in keywords

    def test_categorize_by_aspect_id(self, extractor):
        """Test categorization based on aspect ID prefix.

        For each sample ID the first ``CATEGORY_MAP`` prefix (in the map's
        iteration order) that the ID starts with must map to the expected
        category. An ID that matches no prefix at all is a failure; without
        that check the test would pass without asserting anything.
        """
        test_cases = [
            ("O.Auth_1", AspectCategory.AUTHENTICATION),
            ("O.Crypto_2", AspectCategory.CRYPTOGRAPHY),
            ("O.Sess_3", AspectCategory.SESSION_MANAGEMENT),
            ("O.Input_4", AspectCategory.INPUT_VALIDATION),
            ("T.Auth_1", AspectCategory.TEST_ASPECT),
        ]
        for aspect_id, expected_category in test_cases:
            # Find matching prefix in category map
            for prefix, category in extractor.CATEGORY_MAP.items():
                if aspect_id.startswith(prefix):
                    assert category == expected_category, (
                        f"{aspect_id} matched prefix {prefix!r} -> {category}, "
                        f"expected {expected_category}"
                    )
                    break
            else:
                # Loop finished without ``break``: no prefix matched.
                raise AssertionError(
                    f"No CATEGORY_MAP prefix matches aspect ID: {aspect_id}"
                )
# ============================================================================
# Requirement Level Tests
# ============================================================================
class TestRequirementLevelDetection:
    """Checks for the requirement-level enum and the ``requirement`` regex.

    Covers the four BSI modal keywords: MUSS, SOLL, KANN and DARF NICHT.
    """

    def test_requirement_level_enum(self):
        """Each enum member carries its keyword as the value."""
        expected_pairs = [
            (RequirementLevel.MUSS, "MUSS"),
            (RequirementLevel.SOLL, "SOLL"),
            (RequirementLevel.KANN, "KANN"),
            (RequirementLevel.DARF_NICHT, "DARF NICHT"),
        ]
        for member, keyword in expected_pairs:
            assert member.value == keyword

    def test_requirement_pattern_muss(self, extractor):
        """The pattern finds MUSS in upper- and lowercase sentences."""
        import re

        requirement_re = extractor.PATTERNS["requirement"]
        sentences = (
            "Die Anwendung MUSS sichere Passwörter verwenden.",  # uppercase MUSS
            "Das System muss verschlüsselt sein.",  # lowercase muss
        )
        for sentence in sentences:
            found = re.findall(requirement_re, sentence)
            assert found
            assert found[0].upper() == "MUSS"
        # Note: Pattern does not match conjugations like "müssen"
        # since BSI-TR documents use "MUSS" or "muss" as requirement markers

    def test_requirement_pattern_soll(self, extractor):
        """The pattern finds SOLL in upper- and lowercase sentences."""
        import re

        requirement_re = extractor.PATTERNS["requirement"]
        sentences = (
            "Die Anwendung SOLL MFA unterstützen.",
            "Das System soll Logging implementieren.",
        )
        for sentence in sentences:
            found = re.findall(requirement_re, sentence)
            assert found
            assert found[0].upper() == "SOLL"

    def test_requirement_pattern_kann(self, extractor):
        """The pattern finds KANN in upper- and lowercase sentences."""
        import re

        requirement_re = extractor.PATTERNS["requirement"]
        sentences = (
            "Die Anwendung KANN biometrische Auth anbieten.",
            "Das System kann zusätzliche Features haben.",
        )
        for sentence in sentences:
            found = re.findall(requirement_re, sentence)
            assert found
            assert found[0].upper() == "KANN"

    def test_requirement_pattern_darf_nicht(self, extractor):
        """The pattern finds DARF NICHT when searched case-insensitively."""
        import re

        requirement_re = extractor.PATTERNS["requirement"]
        sentences = (
            "Die Anwendung DARF NICHT Passwörter im Klartext speichern.",
            "Das System darf nicht unverschlüsselt kommunizieren.",
        )
        for sentence in sentences:
            # Unlike the other keyword tests, this one passes IGNORECASE
            # and only requires that something matched.
            found = re.findall(requirement_re, sentence, re.IGNORECASE)
            assert found
# ============================================================================
# Pattern Matching Tests
# ============================================================================
class TestPatternMatching:
    """Tests for the regex patterns exposed via ``extractor.PATTERNS``."""

    def test_aspect_id_pattern(self, extractor):
        """Test aspect ID pattern matching, including inputs that must not match."""
        import re

        pattern = extractor.PATTERNS["aspect_id"]
        test_cases = [
            ("O.Auth_1", True),
            ("O.Crypto_23", True),
            ("T.Network_5", True),
            ("O.Session_100", True),
            ("InvalidID", False),
            ("O.Auth", False),  # Missing number
        ]
        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_pattern(self, extractor):
        """Test section number pattern matching.

        Both directions are checked: section numbers must match, and the
        negative sample must not. The negative case used to be listed but
        never asserted.
        """
        import re

        pattern = extractor.PATTERNS["section"]
        test_cases = [
            ("4.2.1", True),
            ("1.0", True),
            ("10.5.3", True),
            ("invalid", False),
        ]
        for text, should_match in test_cases:
            match = re.search(pattern, text)
            if should_match:
                assert match is not None, f"Pattern should match: {text}"
            else:
                assert match is None, f"Pattern should not match: {text}"

    def test_section_aspect_pattern(self, extractor):
        """Test section-based aspect pattern and its captured section number."""
        import re

        pattern = extractor.PATTERNS["section_aspect"]
        test_cases = [
            "Prüfaspekt 4.2.1",
            "Pruefaspekt 10.5",
            "Anforderung 3.1.2",
        ]
        for text in test_cases:
            match = re.search(pattern, text)
            assert match is not None, f"Pattern should match: {text}"
            assert match.group(1) is not None  # Should capture section number
# ============================================================================
# BSIAspect Model Tests
# ============================================================================
class TestBSIAspectModel:
    """Checks that BSIAspect stores the values it is constructed with."""

    def test_bsi_aspect_creation(self):
        """Required fields are readable back from the instance."""
        fields = dict(
            aspect_id="O.Auth_1",
            title="Sichere Passwörter",
            full_text="Die Anwendung MUSS starke Passwörter erzwingen.",
            category=AspectCategory.AUTHENTICATION,
            page_number=10,
            section="4.2.1",
            requirement_level=RequirementLevel.MUSS,
            source_document="BSI-TR-03161-2",
        )
        created = BSIAspect(**fields)

        assert created.aspect_id == "O.Auth_1"
        assert created.title == "Sichere Passwörter"
        assert created.category == AspectCategory.AUTHENTICATION
        assert created.requirement_level == RequirementLevel.MUSS
        assert created.page_number == 10

    def test_bsi_aspect_with_optional_fields(self):
        """Context, related aspects and keywords are stored when supplied."""
        required = dict(
            aspect_id="O.Auth_1",
            title="Test",
            full_text="Test text",
            category=AspectCategory.AUTHENTICATION,
            page_number=1,
            section="1.0",
            requirement_level=RequirementLevel.MUSS,
            source_document="Test",
        )
        optional = dict(
            context_before="Context before",
            context_after="Context after",
            related_aspects=["O.Auth_2", "O.Auth_3"],
            keywords=["password", "authentication"],
        )
        created = BSIAspect(**required, **optional)

        assert created.context_before == "Context before"
        assert created.context_after == "Context after"
        assert len(created.related_aspects) == 2
        assert "password" in created.keywords
# ============================================================================
# Text Extraction Tests
# ============================================================================
class TestTextExtraction:
    """Checks for text-level extraction and requirement keyword counting."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_extract_aspects_from_text_with_ids(self, mock_fitz, extractor):
        """Text containing explicit aspect IDs yields a list of aspects."""
        sample = """
4.2 Authentifizierung
O.Auth_1: Sichere Passwörter
Die Anwendung MUSS starke Passwörter erzwingen.
O.Auth_2: Multi-Faktor
Die Anwendung SOLL MFA unterstützen.
"""
        call_args = {"text": sample, "page_num": 1, "source_document": "Test"}
        found = extractor._extract_aspects_from_text(**call_args)
        # Only the return type is pinned down here.
        assert isinstance(found, list)

    def test_extract_multiple_requirement_levels(self, extractor):
        """All four requirement keywords are found in one block of text."""
        import re

        sample = """
Das System MUSS verschlüsselt sein.
Es SOLL Logging aktivieren.
Es KANN zusätzliche Features haben.
Es DARF NICHT Passwörter speichern.
"""
        requirement_re = extractor.PATTERNS["requirement"]
        hits = re.findall(requirement_re, sample, re.IGNORECASE)
        # Should find all 4 requirement levels
        assert len(hits) >= 4
# ============================================================================
# Integration Tests
# ============================================================================
class TestPDFExtractionIntegration:
    """End-to-end style test of ``extract_from_file`` against a mocked two-page PDF."""

    @patch("compliance.services.pdf_extractor.fitz")
    def test_complete_extraction_workflow(self, mock_fitz, extractor):
        """Extraction returns a list and closes the document exactly once."""

        def make_page(text):
            # A page mock whose get_text() returns the given text.
            page = MagicMock()
            page.get_text = MagicMock(return_value=text)
            return page

        first_page = make_page("""
4.2.1 Authentifizierung
O.Auth_1: Sichere Passwörter
Die Anwendung MUSS starke Passwörter mit mindestens 8 Zeichen erzwingen.
""")
        second_page = make_page("""
4.2.2 Session Management
O.Sess_1: Session Timeout
Die Anwendung SOLL nach 15 Minuten Inaktivität die Session beenden.
""")

        document = MagicMock()
        document.__len__ = MagicMock(return_value=2)  # 2 pages
        # Successive index lookups hand out page one, then page two.
        document.__getitem__ = MagicMock(side_effect=[first_page, second_page])
        mock_fitz.open = MagicMock(return_value=document)

        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
            pdf_path = handle.name

        try:
            result = extractor.extract_from_file(pdf_path, source_name="BSI-TR-03161-2")
            # Verify extraction worked
            assert isinstance(result, list)
            # PDF was closed
            document.close.assert_called_once()
        finally:
            # delete=False above means the file must be removed by hand.
            Path(pdf_path).unlink(missing_ok=True)
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode.
    _pytest_args = [__file__, "-v"]
    pytest.main(_pytest_args)