93099b2770
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
167 lines
5.6 KiB
Python
167 lines
5.6 KiB
Python
"""Tests for D3: Structural metadata flow (section priority, page in citation)."""
|
|
|
|
import json
|
|
from typing import Optional
|
|
|
|
from services.rag_client import RAGSearchResult
|
|
|
|
|
|
def _make_chunk(
    article: str = "",
    paragraph: str = "",
    page: Optional[int] = None,
) -> RAGSearchResult:
    """Create a ``RAGSearchResult`` fixture for the tests in this module.

    Args:
        article: Value stored in the result's ``article`` field.
        paragraph: Value stored in the result's ``paragraph`` field.
        page: Value stored in the result's ``page`` field; ``None`` when
            the caller does not supply one.

    Returns:
        A result whose remaining fields are fixed placeholder values.
    """
    # Everything the tests do not vary is pinned here so each test only
    # has to state the fields it actually cares about.
    fixed_fields = {
        "text": "Test chunk text",
        "regulation_code": "DSGVO",
        "regulation_name": "Datenschutz-Grundverordnung",
        "regulation_short": "DSGVO",
        "category": "data_protection",
        "source_url": "https://example.com",
        "score": 0.95,
        "collection": "bp_compliance_de",
    }
    return RAGSearchResult(
        article=article,
        paragraph=paragraph,
        page=page,
        **fixed_fields,
    )
|
|
|
|
|
|
class TestRAGSearchResultPage:
    """Checks the ``page`` attribute carried by ``RAGSearchResult``."""

    def test_page_default_none(self):
        """A fixture built without a page reports ``None``."""
        assert _make_chunk().page is None

    def test_page_set(self):
        """An explicit page number is returned unchanged."""
        expected_page = 42
        result = _make_chunk(page=expected_page)
        assert result.page == expected_page

    def test_page_zero(self):
        """Page ``0`` is stored as given."""
        result = _make_chunk(page=0)
        assert result.page == 0
|
|
|
|
|
|
class TestQdrantPayloadPriority:
    """section (D2) should take priority over article (legacy).

    The fallback chain lives in one helper so every test exercises the
    same expression instead of its own copy of it.
    """

    @staticmethod
    def _resolve_article(payload: dict) -> str:
        """Apply the section > article > section_title fallback to a payload.

        Args:
            payload: Dict of payload fields; any of the three keys may be
                absent.

        Returns:
            The first truthy value among ``section``, ``article`` and
            ``section_title`` (in that order). When none is truthy, the
            ``section_title`` value is returned, which is ``""`` when the
            key is absent.
        """
        return (
            payload.get("section", "")
            or payload.get("article", "")
            or payload.get("section_title", "")
        )

    def test_section_preferred_over_article(self):
        """A non-empty section wins even when the other fields are set."""
        payload = {
            "section": "§ 312k",
            "article": "Art. 312",
            "section_title": "Kuendigungsbutton",
        }
        assert self._resolve_article(payload) == "§ 312k"

    def test_article_fallback_when_no_section(self):
        """An empty section falls back to article."""
        payload = {"section": "", "article": "Art. 35", "section_title": ""}
        assert self._resolve_article(payload) == "Art. 35"

    def test_section_title_last_resort(self):
        """section_title is used only when section and article are empty."""
        payload = {
            "section": "",
            "article": "",
            "section_title": "Informationspflichten",
        }
        assert self._resolve_article(payload) == "Informationspflichten"

    def test_all_empty(self):
        """All three fields empty yields an empty string."""
        payload = {"section": "", "article": "", "section_title": ""}
        assert self._resolve_article(payload) == ""

    def test_missing_keys_fall_through(self):
        """Absent keys behave like empty values thanks to the ``""`` defaults."""
        assert self._resolve_article({"article": "Art. 5"}) == "Art. 5"
        assert (
            self._resolve_article({"section_title": "Widerrufsrecht"})
            == "Widerrufsrecht"
        )
        assert self._resolve_article({}) == ""

    def test_page_from_payload(self):
        """A stored page number is read back with ``get``."""
        payload = {"page": 847}
        assert payload.get("page") == 847

    def test_page_none_from_payload(self):
        """A payload without ``page`` yields ``None``."""
        payload = {}
        assert payload.get("page") is None
|
|
|
|
|
|
class TestSourceCitationPage:
    """The source_citation dict should carry ``page`` when one is available."""

    def _build_citation(self, chunk: RAGSearchResult) -> dict:
        """Mirrors the citation-building logic from control_generator.py.

        Args:
            chunk: Search result whose fields are copied into the citation.

        Returns:
            A dict with the keys ``source``, ``article``, ``paragraph``,
            ``page``, ``license``, ``source_type`` and ``url``.
        """
        return dict(
            source=chunk.regulation_name,
            article=chunk.article,
            paragraph=chunk.paragraph,
            page=chunk.page,
            license="free_use",
            source_type="law",
            # A falsy source_url is normalised to an empty string.
            url=chunk.source_url or "",
        )

    def test_citation_with_page(self):
        """The chunk's page number is copied into the citation."""
        built = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1", page=847)
        )
        assert built["page"] == 847

    def test_citation_without_page(self):
        """Without a page the key is still present and holds ``None``."""
        built = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1")
        )
        assert built["page"] is None

    def test_citation_serializable(self):
        """The citation survives a JSON round trip with its page intact."""
        built = self._build_citation(_make_chunk(article="Art. 35", page=12))
        round_tripped = json.loads(json.dumps(built))
        assert round_tripped["page"] == 12
|
|
|
|
|
|
class TestFormatCitation:
    """_format_citation should include page number."""

    def _format_citation(self, citation) -> str:
        """Mirrors _format_citation from decomposition_pass.py.

        Args:
            citation: Usually a JSON-encoded citation dict; any other value
                is also accepted.

        Returns:
            ``""`` for a falsy citation. For a string holding a JSON object,
            the non-empty ``source``, ``article`` and ``paragraph`` values
            plus ``"S. <page>"`` (when ``page`` is not null) joined by
            spaces, or the original string when none of those fields
            contribute. A string that is not valid JSON is returned
            unchanged. Anything else is passed through ``str``.
        """
        if not citation:
            return ""
        if isinstance(citation, str):
            try:
                c = json.loads(citation)
                if isinstance(c, dict):
                    parts = []
                    if c.get("source"):
                        parts.append(c["source"])
                    if c.get("article"):
                        parts.append(c["article"])
                    if c.get("paragraph"):
                        parts.append(c["paragraph"])
                    # Compared against None rather than truthiness so that
                    # page 0 is still rendered.
                    if c.get("page") is not None:
                        parts.append(f"S. {c['page']}")
                    return " ".join(parts) if parts else citation
            except (json.JSONDecodeError, TypeError):
                # Not JSON, or a field that cannot be joined as text:
                # hand the raw string back.
                return citation
        return str(citation)

    def test_format_with_page(self):
        """All fields present: page is appended as ``S. <n>``."""
        citation = json.dumps({
            "source": "DSGVO",
            "article": "Art. 35",
            "paragraph": "Abs. 1",
            "page": 42,
        })
        result = self._format_citation(citation)
        assert result == "DSGVO Art. 35 Abs. 1 S. 42"

    def test_format_without_page(self):
        """No page key and an empty paragraph: both are left out."""
        citation = json.dumps({
            "source": "BGB",
            "article": "§ 312k",
            "paragraph": "",
        })
        result = self._format_citation(citation)
        assert result == "BGB § 312k"

    def test_format_page_zero(self):
        """Page 0 is rendered, not dropped."""
        citation = json.dumps({
            "source": "BGB",
            "article": "§ 1",
            "paragraph": "",
            "page": 0,
        })
        result = self._format_citation(citation)
        assert result == "BGB § 1 S. 0"

    def test_format_null_page(self):
        """An explicit null page is omitted instead of rendering ``S. None``."""
        citation = json.dumps({
            "source": "BGB",
            "article": "§ 312k",
            "paragraph": "Abs. 1",
            "page": None,
        })
        result = self._format_citation(citation)
        assert result == "BGB § 312k Abs. 1"

    def test_format_plain_string_passthrough(self):
        """A citation that is not JSON is returned unchanged."""
        assert self._format_citation("BGB § 312k") == "BGB § 312k"

    def test_format_json_without_known_fields(self):
        """A JSON object with no recognised fields yields the original string."""
        citation = json.dumps({"license": "free_use"})
        assert self._format_citation(citation) == citation

    def test_format_empty_citation(self):
        """Empty string and ``None`` both format to an empty string."""
        assert self._format_citation("") == ""
        assert self._format_citation(None) == ""
|