Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
222 lines
8.1 KiB
Python
222 lines
8.1 KiB
Python
"""Tests for citation_backfill.py — article/paragraph enrichment."""
|
|
import hashlib
|
|
import json
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from compliance.services.citation_backfill import (
|
|
CitationBackfill,
|
|
MatchResult,
|
|
_parse_concatenated_source,
|
|
_parse_json,
|
|
)
|
|
from compliance.services.rag_client import RAGSearchResult
|
|
|
|
|
|
# =============================================================================
|
|
# Unit tests: _parse_concatenated_source
|
|
# =============================================================================
|
|
|
|
|
|
class TestParseConcatenatedSource:
    """Checks for ``_parse_concatenated_source``.

    Positive cases expect a dict with the regulation ``name`` and the trailing
    ``article`` reference; inputs without an article reference expect ``None``.
    """

    def test_dsgvo_art(self):
        """A short name followed by ``Art. <n>`` is split into name and article."""
        expected = {"name": "DSGVO", "article": "Art. 35"}
        assert _parse_concatenated_source("DSGVO Art. 35") == expected

    def test_nis2_artikel(self):
        """The spelled-out ``Artikel`` form keeps its ``Abs.`` suffix in the article."""
        expected = {"name": "NIS2", "article": "Artikel 21 Abs. 2"}
        assert _parse_concatenated_source("NIS2 Artikel 21 Abs. 2") == expected

    def test_long_name_with_article(self):
        """A multi-word name containing parentheses is preserved in full."""
        parsed = _parse_concatenated_source(
            "Verordnung (EU) 2024/1689 (KI-Verordnung) Art. 6"
        )
        expected = {
            "name": "Verordnung (EU) 2024/1689 (KI-Verordnung)",
            "article": "Art. 6",
        }
        assert parsed == expected

    def test_paragraph_sign(self):
        """A ``§`` reference is returned as the article."""
        expected = {"name": "BDSG", "article": "§ 42"}
        assert _parse_concatenated_source("BDSG § 42") == expected

    def test_paragraph_sign_with_abs(self):
        """A ``§`` reference keeps its ``Abs.`` suffix in the article."""
        expected = {"name": "TTDSG", "article": "§ 25 Abs. 1"}
        assert _parse_concatenated_source("TTDSG § 25 Abs. 1") == expected

    def test_no_article(self):
        """A bare regulation abbreviation yields ``None``."""
        assert _parse_concatenated_source("DSGVO") is None

    def test_empty_string(self):
        """An empty string yields ``None``."""
        assert _parse_concatenated_source("") is None

    def test_none(self):
        """``None`` as input yields ``None``."""
        assert _parse_concatenated_source(None) is None

    def test_just_name_no_article(self):
        """A multi-word name without any article reference yields ``None``."""
        assert _parse_concatenated_source("Cyber Resilience Act") is None
|
|
|
|
|
|
# =============================================================================
|
|
# Unit tests: _parse_json
|
|
# =============================================================================
|
|
|
|
|
|
class TestParseJson:
    """Checks for ``_parse_json`` on plain, fenced, embedded and unusable input."""

    def test_direct_json(self):
        """A string that is exactly one JSON object is decoded to a dict."""
        expected = {"article": "Art. 35", "paragraph": "Abs. 1"}
        assert _parse_json('{"article": "Art. 35", "paragraph": "Abs. 1"}') == expected

    def test_markdown_code_block(self):
        """A JSON object wrapped in a Markdown ``json`` code fence is decoded."""
        fenced = '```json\n{"article": "§ 42", "paragraph": ""}\n```'
        expected = {"article": "§ 42", "paragraph": ""}
        assert _parse_json(fenced) == expected

    def test_text_with_json(self):
        """A JSON object embedded in surrounding prose is extracted and decoded."""
        sentence = 'Der Artikel ist {"article": "Art. 6", "paragraph": "Abs. 2"} wie beschrieben.'
        expected = {"article": "Art. 6", "paragraph": "Abs. 2"}
        assert _parse_json(sentence) == expected

    def test_empty(self):
        """Both the empty string and ``None`` yield ``None``."""
        for blank in ("", None):
            assert _parse_json(blank) is None

    def test_no_json(self):
        """Plain text without any JSON object yields ``None``."""
        assert _parse_json("Das ist kein JSON.") is None
|
|
|
|
|
|
# =============================================================================
|
|
# Integration tests: CitationBackfill matching
|
|
# =============================================================================
|
|
|
|
|
|
def _make_rag_chunk(text="Test text", article="Art. 35", paragraph="Abs. 1",
                    regulation_code="eu_2016_679", regulation_name="DSGVO"):
    """Build a ``RAGSearchResult`` fixture for the matching tests.

    Only the text, article, paragraph, regulation code and regulation name are
    configurable; every other field is filled with a fixed placeholder value
    (note that ``regulation_short`` is always ``"DSGVO"``, independent of
    ``regulation_name``).
    """
    chunk_fields = {
        "text": text,
        "regulation_code": regulation_code,
        "regulation_name": regulation_name,
        "regulation_short": "DSGVO",
        "category": "datenschutz",
        "article": article,
        "paragraph": paragraph,
        "source_url": "https://example.com",
        "score": 0.0,
        "collection": "bp_compliance_gesetze",
    }
    return RAGSearchResult(**chunk_fields)
|
|
|
|
|
|
class TestCitationBackfillMatching:
    """Exercises ``CitationBackfill`` with a mocked database and RAG client.

    Covers ``_match_control`` (hash, regex, LLM and no-match outcomes),
    ``_update_control`` and ``_load_controls_needing_backfill``.
    """

    def setup_method(self):
        """Create fresh mocks and a ``CitationBackfill`` wired to them per test."""
        self.db = MagicMock()
        self.rag = MagicMock()
        self.backfill = CitationBackfill(db=self.db, rag_client=self.rag)

    @pytest.mark.asyncio
    async def test_hash_match(self):
        """A control whose source text hash is in the RAG index matches via ``hash``."""
        source_text = "Dies ist ein Gesetzestext mit spezifischen Anforderungen an die Datensicherheit."
        indexed_chunk = _make_rag_chunk(text=source_text, article="Art. 32", paragraph="Abs. 1")
        # Seed the index with the SHA-256 hex digest of the control's source text.
        digest = hashlib.sha256(source_text.encode()).hexdigest()
        self.backfill._rag_index = {digest: indexed_chunk}

        control = {
            "control_id": "DATA-001",
            "source_original_text": source_text,
            "source_citation": {"source": "DSGVO Art. 32"},
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        matched = await self.backfill._match_control(control)

        assert matched is not None
        assert matched.method == "hash"
        assert matched.article == "Art. 32"
        assert matched.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_regex_match(self):
        """With an empty index, the concatenated source string is parsed via ``regex``."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "NET-010",
            # The control carries no original text at all.
            "source_original_text": None,
            "source_citation": {"source": "NIS2 Artikel 21"},
            "generation_metadata": {},
        }

        matched = await self.backfill._match_control(control)

        assert matched is not None
        assert matched.method == "regex"
        assert matched.article == "Artikel 21"

    @pytest.mark.asyncio
    async def test_llm_match(self):
        """With ``_llm_ollama`` patched to answer in JSON, the match method is ``llm``."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "AUTH-005",
            "source_original_text": "Verantwortliche muessen geeignete technische Massnahmen treffen...",
            # The source string names the regulation only, without an article.
            "source_citation": {"source": "DSGVO"},
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        llm_target = "compliance.services.citation_backfill._llm_ollama"
        with patch(llm_target, new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = '{"article": "Art. 25", "paragraph": "Abs. 1"}'
            matched = await self.backfill._match_control(control)

        assert matched is not None
        assert matched.method == "llm"
        assert matched.article == "Art. 25"
        assert matched.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_no_match(self):
        """Without source text and without a parseable source the result is ``None``."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "SEC-001",
            "source_original_text": None,
            "source_citation": {"source": "Unknown Source"},
            "generation_metadata": {},
        }

        assert await self.backfill._match_control(control) is None

    def test_update_control_cleans_source(self):
        """``_update_control`` writes a cleaned citation and backfill metadata to the DB."""
        control = {
            "id": "test-uuid-123",
            "control_id": "DATA-001",
            "source_citation": {"source": "DSGVO Art. 32", "license": "EU_LAW"},
            "generation_metadata": {"processing_path": "structured"},
        }
        found = MatchResult(article="Art. 32", paragraph="Abs. 1", method="hash")

        self.backfill._update_control(control, found)

        # Bind parameters are taken from the keyword arguments when present,
        # otherwise from the second positional argument of ``db.execute``.
        positional, keyword = self.db.execute.call_args
        bound = keyword if keyword else positional[1]
        stored_citation = json.loads(bound["citation"])
        stored_metadata = json.loads(bound["metadata"])

        # The article part is stripped from the stored source name.
        assert stored_citation["source"] == "DSGVO"
        assert stored_citation["article"] == "Art. 32"
        assert stored_citation["paragraph"] == "Abs. 1"
        assert stored_metadata["source_paragraph"] == "Abs. 1"
        assert stored_metadata["backfill_method"] == "hash"
        assert "backfill_at" in stored_metadata

    def test_rule3_not_loaded(self):
        """The load query filters on license rules 1 and 2 and a non-null citation."""
        # Stand-in for the query result: no column keys and no rows.
        empty_rows = MagicMock(keys=lambda: [], __iter__=lambda s: iter([]))
        self.db.execute.return_value = empty_rows

        self.backfill._load_controls_needing_backfill()

        statement = self.db.execute.call_args[0][0]
        issued_sql = str(statement.text)
        for fragment in ("license_rule IN (1, 2)", "source_citation IS NOT NULL"):
            assert fragment in issued_sql
|