feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion on all checkpoints passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
221
backend-compliance/tests/test_citation_backfill.py
Normal file
221
backend-compliance/tests/test_citation_backfill.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""Tests for citation_backfill.py — article/paragraph enrichment."""
|
||||
import hashlib
|
||||
import json
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from compliance.services.citation_backfill import (
|
||||
CitationBackfill,
|
||||
MatchResult,
|
||||
_parse_concatenated_source,
|
||||
_parse_json,
|
||||
)
|
||||
from compliance.services.rag_client import RAGSearchResult
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit tests: _parse_concatenated_source
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestParseConcatenatedSource:
    """Unit tests for ``_parse_concatenated_source``.

    Each case feeds one source string (or ``None``) to the parser and checks
    either the resulting ``{"name": ..., "article": ...}`` mapping or that the
    parser returns ``None``.
    """

    def test_dsgvo_art(self):
        """A short name followed by ``Art. <n>`` is split into name and article."""
        expected = {"name": "DSGVO", "article": "Art. 35"}
        assert _parse_concatenated_source("DSGVO Art. 35") == expected

    def test_nis2_artikel(self):
        """The spelled-out ``Artikel`` form keeps its trailing ``Abs.`` part."""
        expected = {"name": "NIS2", "article": "Artikel 21 Abs. 2"}
        assert _parse_concatenated_source("NIS2 Artikel 21 Abs. 2") == expected

    def test_long_name_with_article(self):
        """A multi-word name containing parentheses is kept intact as the name."""
        source = "Verordnung (EU) 2024/1689 (KI-Verordnung) Art. 6"
        expected = {
            "name": "Verordnung (EU) 2024/1689 (KI-Verordnung)",
            "article": "Art. 6",
        }
        assert _parse_concatenated_source(source) == expected

    def test_paragraph_sign(self):
        """A ``§`` reference is reported in the ``article`` field."""
        expected = {"name": "BDSG", "article": "§ 42"}
        assert _parse_concatenated_source("BDSG § 42") == expected

    def test_paragraph_sign_with_abs(self):
        """A ``§`` reference keeps its trailing ``Abs.`` part."""
        expected = {"name": "TTDSG", "article": "§ 25 Abs. 1"}
        assert _parse_concatenated_source("TTDSG § 25 Abs. 1") == expected

    def test_no_article(self):
        """A bare regulation name yields ``None``."""
        assert _parse_concatenated_source("DSGVO") is None

    def test_empty_string(self):
        """The empty string yields ``None``."""
        assert _parse_concatenated_source("") is None

    def test_none(self):
        """``None`` as input yields ``None``."""
        assert _parse_concatenated_source(None) is None

    def test_just_name_no_article(self):
        """A multi-word name without any article marker yields ``None``."""
        assert _parse_concatenated_source("Cyber Resilience Act") is None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit tests: _parse_json
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestParseJson:
    """Unit tests for ``_parse_json``.

    The cases cover a plain JSON object, one wrapped in a markdown code fence,
    one embedded in surrounding text, and inputs from which no object can be
    obtained.
    """

    def test_direct_json(self):
        """A string that is exactly a JSON object is parsed to the matching dict."""
        payload = '{"article": "Art. 35", "paragraph": "Abs. 1"}'
        expected = {"article": "Art. 35", "paragraph": "Abs. 1"}
        assert _parse_json(payload) == expected

    def test_markdown_code_block(self):
        """A JSON object inside a ```json fenced block is still parsed."""
        fenced = '```json\n{"article": "§ 42", "paragraph": ""}\n```'
        expected = {"article": "§ 42", "paragraph": ""}
        assert _parse_json(fenced) == expected

    def test_text_with_json(self):
        """A JSON object embedded in a sentence is extracted and parsed."""
        sentence = 'Der Artikel ist {"article": "Art. 6", "paragraph": "Abs. 2"} wie beschrieben.'
        expected = {"article": "Art. 6", "paragraph": "Abs. 2"}
        assert _parse_json(sentence) == expected

    def test_empty(self):
        """Both the empty string and ``None`` yield ``None``."""
        for blank in ("", None):
            assert _parse_json(blank) is None

    def test_no_json(self):
        """Text without any JSON object yields ``None``."""
        plain_text = "Das ist kein JSON."
        assert _parse_json(plain_text) is None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Integration tests: CitationBackfill matching
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _make_rag_chunk(text="Test text", article="Art. 35", paragraph="Abs. 1",
                    regulation_code="eu_2016_679", regulation_name="DSGVO"):
    """Build a ``RAGSearchResult`` fixture for the matching tests.

    The five parameters are passed through to the result unchanged; every
    other field is filled with a fixed placeholder value.
    """
    fields = {
        # Caller-controlled values.
        "text": text,
        "regulation_code": regulation_code,
        "regulation_name": regulation_name,
        "article": article,
        "paragraph": paragraph,
        # Fixed placeholders shared by every fixture.
        "regulation_short": "DSGVO",
        "category": "datenschutz",
        "source_url": "https://example.com",
        "score": 0.0,
        "collection": "bp_compliance_gesetze",
    }
    return RAGSearchResult(**fields)
|
||||
|
||||
|
||||
class TestCitationBackfillMatching:
    """Tests for ``CitationBackfill`` with a mocked database and RAG client.

    Covers the hash, regex and LLM outcomes of ``_match_control``, the
    no-match case, the parameters ``_update_control`` hands to the database,
    and the SQL issued by ``_load_controls_needing_backfill``.
    """

    def setup_method(self):
        """Create fresh mocks and a ``CitationBackfill`` wired to them."""
        self.db, self.rag = MagicMock(), MagicMock()
        self.backfill = CitationBackfill(db=self.db, rag_client=self.rag)

    @pytest.mark.asyncio
    async def test_hash_match(self):
        """Tier 1: exact text hash matches a RAG chunk."""
        legal_text = "Dies ist ein Gesetzestext mit spezifischen Anforderungen an die Datensicherheit."
        indexed_chunk = _make_rag_chunk(text=legal_text, article="Art. 32", paragraph="Abs. 1")

        # Seed the index under the SHA-256 hex digest of the encoded text.
        digest = hashlib.sha256(legal_text.encode()).hexdigest()
        self.backfill._rag_index = {digest: indexed_chunk}

        control = {
            "control_id": "DATA-001",
            "source_original_text": legal_text,
            "source_citation": {"source": "DSGVO Art. 32"},
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        outcome = await self.backfill._match_control(control)

        assert outcome is not None
        assert outcome.method == "hash"
        assert outcome.article == "Art. 32"
        assert outcome.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_regex_match(self):
        """Tier 2: regex parses concatenated source when no hash match."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "NET-010",
            # No original text is available for this control.
            "source_original_text": None,
            "source_citation": {"source": "NIS2 Artikel 21"},
            "generation_metadata": {},
        }

        outcome = await self.backfill._match_control(control)

        assert outcome is not None
        assert outcome.method == "regex"
        assert outcome.article == "Artikel 21"

    @pytest.mark.asyncio
    async def test_llm_match(self):
        """Tier 3: Ollama LLM identifies article/paragraph."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "AUTH-005",
            "source_original_text": "Verantwortliche muessen geeignete technische Massnahmen treffen...",
            # The source carries no article part.
            "source_citation": {"source": "DSGVO"},
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        llm_target = "compliance.services.citation_backfill._llm_ollama"
        with patch(llm_target, new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = '{"article": "Art. 25", "paragraph": "Abs. 1"}'
            outcome = await self.backfill._match_control(control)

        assert outcome is not None
        assert outcome.method == "llm"
        assert outcome.article == "Art. 25"
        assert outcome.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_no_match(self):
        """No match when no source text and no parseable source."""
        self.backfill._rag_index = {}

        control = {
            "control_id": "SEC-001",
            "source_original_text": None,
            "source_citation": {"source": "Unknown Source"},
            "generation_metadata": {},
        }

        assert await self.backfill._match_control(control) is None

    def test_update_control_cleans_source(self):
        """_update_control splits concatenated source and adds article/paragraph."""
        control = {
            "id": "test-uuid-123",
            "control_id": "DATA-001",
            "source_citation": {"source": "DSGVO Art. 32", "license": "EU_LAW"},
            "generation_metadata": {"processing_path": "structured"},
        }
        found = MatchResult(article="Art. 32", paragraph="Abs. 1", method="hash")

        self.backfill._update_control(control, found)

        # The bind parameters may arrive as keyword arguments or as the
        # second positional argument of db.execute; accept either.
        recorded = self.db.execute.call_args
        bind_params = recorded.kwargs or recorded.args[1]
        citation = json.loads(bind_params["citation"])
        metadata = json.loads(bind_params["metadata"])

        # The article part has been removed from the source name.
        assert citation["source"] == "DSGVO"
        assert citation["article"] == "Art. 32"
        assert citation["paragraph"] == "Abs. 1"
        assert metadata["source_paragraph"] == "Abs. 1"
        assert metadata["backfill_method"] == "hash"
        assert "backfill_at" in metadata

    def test_rule3_not_loaded(self):
        """Verify the SQL query only loads Rule 1+2 controls."""
        # Stand-in for the query result: no column keys and no rows.
        empty_result = MagicMock(keys=lambda: [], __iter__=lambda s: iter([]))
        self.db.execute.return_value = empty_result

        self.backfill._load_controls_needing_backfill()

        statement = self.db.execute.call_args.args[0]
        sql_text = str(statement.text)
        assert "license_rule IN (1, 2)" in sql_text
        assert "source_citation IS NOT NULL" in sql_text
|
||||
Reference in New Issue
Block a user