Files
breakpilot-core/control-pipeline/tests/test_d3_metadata.py
T
Benjamin Admin 93099b2770 feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page
from embedding service chunks_with_metadata into Qdrant payloads.

D3: Control generator prefers section > article > section_title from
Qdrant, adds page to source_citation and generation_metadata.

D4: Validated with real BGB §§ 312-312k text. Found and fixed critical
bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing
only the first chunk per document to have metadata. All subsequent
chunks lost section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable
names, duplicate dict key, bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-01 20:34:00 +02:00

167 lines
5.6 KiB
Python

"""Tests for D3: Structural metadata flow (section priority, page in citation)."""
import json
from typing import Optional
from services.rag_client import RAGSearchResult
def _make_chunk(
    article: str = "",
    paragraph: str = "",
    page: Optional[int] = None,
) -> RAGSearchResult:
    """Build a ``RAGSearchResult`` fixture with fixed placeholder data.

    Only the structural fields exercised by these tests are configurable;
    every other field receives a constant placeholder value.

    Args:
        article: Value passed through as the result's ``article``.
        paragraph: Value passed through as the result's ``paragraph``.
        page: Value passed through as the result's ``page``; defaults to
            ``None``.

    Returns:
        A ``RAGSearchResult`` built from the placeholders and the overrides.
    """
    # Constant fields that no test in this module varies.
    placeholder_fields = {
        "text": "Test chunk text",
        "regulation_code": "DSGVO",
        "regulation_name": "Datenschutz-Grundverordnung",
        "regulation_short": "DSGVO",
        "category": "data_protection",
        "source_url": "https://example.com",
        "score": 0.95,
        "collection": "bp_compliance_de",
    }
    return RAGSearchResult(
        article=article,
        paragraph=paragraph,
        page=page,
        **placeholder_fields,
    )
class TestRAGSearchResultPage:
    """Checks that the ``page`` attribute set via ``_make_chunk`` round-trips."""

    def test_page_default_none(self):
        """Without an explicit page the attribute is ``None``."""
        assert _make_chunk().page is None

    def test_page_set(self):
        """An explicit page number is readable from the result."""
        expected_page = 42
        result = _make_chunk(page=expected_page)
        assert result.page == 42

    def test_page_zero(self):
        """Page ``0`` is kept as ``0`` (compared by equality, not identity)."""
        result = _make_chunk(page=0)
        assert result.page == 0
class TestQdrantPayloadPriority:
    """Priority order when picking the article label from a payload dict.

    The order exercised here is ``section``, then ``article``, then
    ``section_title``.
    """

    @staticmethod
    def _resolve_article(payload: dict) -> str:
        """Return the first truthy value among the three label keys.

        Keys are consulted lazily in priority order; when none is truthy the
        value of the last key consulted (``section_title``) is returned.
        """
        label = ""
        for key in ("section", "article", "section_title"):
            label = payload.get(key, "")
            if label:
                break
        return label

    def test_section_preferred_over_article(self):
        """A non-empty ``section`` wins over the other two keys."""
        payload = {
            "section": "§ 312k",
            "article": "Art. 312",
            "section_title": "Kuendigungsbutton",
        }
        assert self._resolve_article(payload) == "§ 312k"

    def test_article_fallback_when_no_section(self):
        """An empty ``section`` falls back to ``article``."""
        payload = {"section": "", "article": "Art. 35", "section_title": ""}
        assert self._resolve_article(payload) == "Art. 35"

    def test_section_title_last_resort(self):
        """``section_title`` is used only when both other keys are empty."""
        payload = {
            "section": "",
            "article": "",
            "section_title": "Informationspflichten",
        }
        assert self._resolve_article(payload) == "Informationspflichten"

    def test_all_empty(self):
        """All three keys empty yields the empty string."""
        payload = {"section": "", "article": "", "section_title": ""}
        assert self._resolve_article(payload) == ""

    def test_page_from_payload(self):
        """A ``page`` key present in the payload is returned by ``get``."""
        payload = {"page": 847}
        assert payload.get("page") == 847

    def test_page_none_from_payload(self):
        """A missing ``page`` key yields ``None`` from ``get``."""
        payload = {}
        assert payload.get("page") is None
class TestSourceCitationPage:
    """The citation dict built from a chunk carries the chunk's ``page``."""

    def _build_citation(self, chunk: RAGSearchResult) -> dict:
        """Build a citation dict from *chunk*.

        Per the original test author, this mirrors the citation-building
        logic in control_generator.py (not visible from this file).
        """
        url = chunk.source_url
        citation = {
            "source": chunk.regulation_name,
            "article": chunk.article,
            "paragraph": chunk.paragraph,
            "page": chunk.page,
            "license": "free_use",
            "source_type": "law",
        }
        # A falsy URL is normalized to the empty string.
        citation["url"] = url if url else ""
        return citation

    def test_citation_with_page(self):
        """A chunk with a page produces a citation with that page."""
        source_chunk = _make_chunk(article="§ 312k", paragraph="Abs. 1", page=847)
        assert self._build_citation(source_chunk)["page"] == 847

    def test_citation_without_page(self):
        """A chunk without a page produces ``page: None``."""
        source_chunk = _make_chunk(article="§ 312k", paragraph="Abs. 1")
        assert self._build_citation(source_chunk)["page"] is None

    def test_citation_serializable(self):
        """The citation survives a JSON round trip with its page intact."""
        source_chunk = _make_chunk(article="Art. 35", page=12)
        round_tripped = json.loads(json.dumps(self._build_citation(source_chunk)))
        assert round_tripped["page"] == 12
class TestFormatCitation:
    """Formatting of a JSON citation string, including the page number."""

    def _format_citation(self, citation) -> str:
        """Render *citation* as a single display string.

        Per the original test author, this mirrors ``_format_citation`` in
        decomposition_pass.py (not visible from this file).

        Falsy input yields ``""``. A string that decodes to a JSON object is
        rendered as its truthy ``source``, ``article`` and ``paragraph``
        values followed by ``S. <page>`` when ``page`` is not ``None``, all
        joined by single spaces. An object that yields no parts returns the
        input unchanged, as does a string that is not valid JSON. Anything
        else is returned via ``str()``.
        """
        if not citation:
            return ""
        if not isinstance(citation, str):
            return str(citation)
        try:
            decoded = json.loads(citation)
            if isinstance(decoded, dict):
                parts = [
                    decoded[field]
                    for field in ("source", "article", "paragraph")
                    if decoded.get(field)
                ]
                # Page 0 is a valid page, so test against None, not truthiness.
                if decoded.get("page") is not None:
                    parts.append(f"S. {decoded['page']}")
                # The join stays inside the try: a non-string part raises
                # TypeError, which falls back to the raw citation.
                return " ".join(parts) if parts else citation
        except (json.JSONDecodeError, TypeError):
            return citation
        # Valid JSON that is not an object (list, number, ...).
        return str(citation)

    def test_format_with_page(self):
        """All fields present: the page is appended as ``S. <n>``."""
        encoded = json.dumps({
            "source": "DSGVO",
            "article": "Art. 35",
            "paragraph": "Abs. 1",
            "page": 42,
        })
        assert self._format_citation(encoded) == "DSGVO Art. 35 Abs. 1 S. 42"

    def test_format_without_page(self):
        """No ``page`` key and an empty paragraph: both are omitted."""
        encoded = json.dumps({
            "source": "BGB",
            "article": "§ 312k",
            "paragraph": "",
        })
        assert self._format_citation(encoded) == "BGB § 312k"

    def test_format_page_zero(self):
        """Page ``0`` is still rendered."""
        encoded = json.dumps({
            "source": "BGB",
            "article": "§ 1",
            "paragraph": "",
            "page": 0,
        })
        assert self._format_citation(encoded) == "BGB § 1 S. 0"

    def test_format_empty_citation(self):
        """Empty string and ``None`` both format to the empty string."""
        for empty_value in ("", None):
            assert self._format_citation(empty_value) == ""