feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,166 @@
|
||||
"""Tests for D3: Structural metadata flow (section priority, page in citation)."""
|
||||
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from services.rag_client import RAGSearchResult
|
||||
|
||||
|
||||
def _make_chunk(
    article: str = "",
    paragraph: str = "",
    page: Optional[int] = None,
) -> RAGSearchResult:
    """Create a RAGSearchResult populated with fixed sample values.

    Only ``article``, ``paragraph`` and ``page`` come from the caller; every
    other field is a constant placeholder so tests can focus on the
    structural-metadata fields.
    """
    fields = {
        "text": "Test chunk text",
        "regulation_code": "DSGVO",
        "regulation_name": "Datenschutz-Grundverordnung",
        "regulation_short": "DSGVO",
        "category": "data_protection",
        "article": article,
        "paragraph": paragraph,
        "source_url": "https://example.com",
        "score": 0.95,
        "collection": "bp_compliance_de",
        "page": page,
    }
    return RAGSearchResult(**fields)
|
||||
|
||||
|
||||
class TestRAGSearchResultPage:
    """Checks the ``page`` attribute of results built by ``_make_chunk``."""

    def test_page_default_none(self):
        """Leaving ``page`` out yields ``None``."""
        result = _make_chunk()
        assert result.page is None

    def test_page_set(self):
        """An explicit page number is stored unchanged."""
        result = _make_chunk(page=42)
        assert result.page == 42

    def test_page_zero(self):
        """Page 0 is kept as 0 (it must not be confused with a missing page)."""
        result = _make_chunk(page=0)
        assert result.page == 0
|
||||
|
||||
|
||||
class TestQdrantPayloadPriority:
    """section (D2) should take priority over article (legacy).

    The fallback order exercised here is section -> article -> section_title.
    NOTE(review): these tests evaluate a local copy of the fallback
    expression, not the production code path itself -- confirm it stays in
    sync with the control generator.
    """

    @staticmethod
    def _resolve_article(payload: dict) -> str:
        """Return the first truthy value of section, article, section_title.

        Defined once so that the priority order under test lives in a single
        place instead of being repeated in every test method.

        Args:
            payload: Mapping that may contain the keys ``section``,
                ``article`` and ``section_title``.

        Returns:
            The first truthy value in priority order; when none is truthy,
            the ``section_title`` value (``""`` if that key is missing).
        """
        return (
            payload.get("section", "")
            or payload.get("article", "")
            or payload.get("section_title", "")
        )

    def test_section_preferred_over_article(self):
        """A non-empty section wins even when the other fields are set."""
        payload = {"section": "§ 312k", "article": "Art. 312", "section_title": "Kuendigungsbutton"}
        assert self._resolve_article(payload) == "§ 312k"

    def test_article_fallback_when_no_section(self):
        """An empty section falls back to article."""
        payload = {"section": "", "article": "Art. 35", "section_title": ""}
        assert self._resolve_article(payload) == "Art. 35"

    def test_section_title_last_resort(self):
        """section_title is used only when section and article are empty."""
        payload = {"section": "", "article": "", "section_title": "Informationspflichten"}
        assert self._resolve_article(payload) == "Informationspflichten"

    def test_all_empty(self):
        """All three fields empty resolves to the empty string."""
        payload = {"section": "", "article": "", "section_title": ""}
        assert self._resolve_article(payload) == ""

    def test_page_from_payload(self):
        """A page stored in the payload is read back via ``get``."""
        payload = {"page": 847}
        assert payload.get("page") == 847

    def test_page_none_from_payload(self):
        """A payload without a page key yields ``None``."""
        payload = {}
        assert payload.get("page") is None
|
||||
|
||||
|
||||
class TestSourceCitationPage:
    """The source_citation dict exposes the chunk's page under ``"page"``."""

    def _build_citation(self, chunk: RAGSearchResult) -> dict:
        """Assemble a citation dict from a chunk's attributes.

        Stated by the original author to mirror the citation-building logic
        in control_generator.py (not verifiable from this file).
        """
        url = chunk.source_url or ""
        return dict(
            source=chunk.regulation_name,
            article=chunk.article,
            paragraph=chunk.paragraph,
            page=chunk.page,
            license="free_use",
            source_type="law",
            url=url,
        )

    def test_citation_with_page(self):
        """The page given to the chunk appears in the citation."""
        built = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1", page=847)
        )
        assert built["page"] == 847

    def test_citation_without_page(self):
        """Without a page the citation still has the key, set to None."""
        built = self._build_citation(
            _make_chunk(article="§ 312k", paragraph="Abs. 1")
        )
        assert built["page"] is None

    def test_citation_serializable(self):
        """The citation survives a JSON round trip with its page intact."""
        built = self._build_citation(_make_chunk(article="Art. 35", page=12))
        round_tripped = json.loads(json.dumps(built))
        assert round_tripped["page"] == 12
|
||||
|
||||
|
||||
class TestFormatCitation:
    """_format_citation should include page number."""

    def _format_citation(self, citation) -> str:
        """Render a citation as a single human-readable string.

        Stated by the original author to mirror _format_citation from
        decomposition_pass.py (not verifiable from this file).

        Args:
            citation: Usually a JSON string encoding a dict with optional
                ``source``, ``article``, ``paragraph`` and ``page`` keys.
                Any other value is accepted as well.

        Returns:
            ``""`` for a falsy input. For a JSON object string, the truthy
            source/article/paragraph values joined by spaces, followed by
            ``"S. <page>"`` whenever ``page`` is present and not None (so
            page 0 is rendered). The input string itself when it is not
            valid JSON, is not a JSON object, or yields no parts.
            ``str(citation)`` for non-string input.
        """
        if not citation:
            return ""
        if not isinstance(citation, str):
            return str(citation)

        try:
            c = json.loads(citation)
            if not isinstance(c, dict):
                # Valid JSON but not an object: hand the raw string back.
                return citation
            parts = [c[k] for k in ("source", "article", "paragraph") if c.get(k)]
            # Explicit None check so that page 0 is still rendered.
            if c.get("page") is not None:
                parts.append(f"S. {c['page']}")
            # The join stays inside the try on purpose: non-string parts
            # raise TypeError, which falls back to the raw string below.
            return " ".join(parts) if parts else citation
        except (json.JSONDecodeError, TypeError):
            return citation

    def test_format_with_page(self):
        """All four fields present are rendered in order, page last."""
        citation = json.dumps({
            "source": "DSGVO",
            "article": "Art. 35",
            "paragraph": "Abs. 1",
            "page": 42,
        })
        result = self._format_citation(citation)
        assert result == "DSGVO Art. 35 Abs. 1 S. 42"

    def test_format_without_page(self):
        """A missing page and an empty paragraph are both omitted."""
        citation = json.dumps({
            "source": "BGB",
            "article": "§ 312k",
            "paragraph": "",
        })
        result = self._format_citation(citation)
        assert result == "BGB § 312k"

    def test_format_page_zero(self):
        """Page 0 is rendered rather than treated as missing."""
        citation = json.dumps({
            "source": "BGB",
            "article": "§ 1",
            "paragraph": "",
            "page": 0,
        })
        result = self._format_citation(citation)
        assert result == "BGB § 1 S. 0"

    def test_format_empty_citation(self):
        """Empty string and None both format to the empty string."""
        assert self._format_citation("") == ""
        assert self._format_citation(None) == ""
|
||||
Reference in New Issue
Block a user