93099b2770
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
173 lines
5.2 KiB
Python
173 lines
5.2 KiB
Python
"""Tests for document upload payload building — structural metadata (D2)."""
|
|
|
|
# Mirror the constant from api/documents.py to avoid heavy import chain
# (api → jose, qdrant_client, minio, etc.)
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")


def _build_payload(
    chunk: str,
    index: int,
    chunks_meta: list[dict],
    extra_metadata: "dict | None" = None,
) -> dict:
    """Replicate the payload-building logic from documents.py for unit testing.

    The payload is assembled in three layers, later layers winning:

    1. fixed document-level test values plus ``chunk_index`` / ``chunk_text``;
    2. ``extra_metadata`` (if given), unpacked on top of the fixed values;
    3. the structural fields listed in ``_STRUCT_FIELDS`` taken from
       ``chunks_meta[index]``.

    Args:
        chunk: Text of the chunk, stored under ``"chunk_text"``.
        index: Position of the chunk, stored under ``"chunk_index"`` and used
            to look up the matching entry in ``chunks_meta``.
        chunks_meta: Per-chunk metadata dicts. May be shorter than the number
            of chunks; chunks without a matching entry get no structural
            fields.
        extra_metadata: Optional manually supplied key/value pairs merged
            into the payload. ``None`` is treated as an empty dict.

    Returns:
        The payload dict for the chunk.
    """
    payload = {
        "document_id": "test-doc-id",
        "object_name": "test/path.pdf",
        "filename": "path.pdf",
        "chunk_index": index,
        "chunk_text": chunk,
        "data_type": "law",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        **(extra_metadata or {}),
    }
    # Only chunks that have an aligned metadata entry receive structural fields.
    if index < len(chunks_meta):
        meta = chunks_meta[index]
        for field in _STRUCT_FIELDS:
            value = meta.get(field)
            # Skip only None and "" so that legitimate falsy values such as
            # paragraph_num=0 or page=0 are still stored.
            if value is not None and value != "":
                payload[field] = value
    return payload
|
|
|
|
|
|
class TestPayloadStructuralMetadata:
    """Tests for structural metadata merging into Qdrant payloads."""

    def test_payload_contains_structural_metadata(self):
        """Metadata fields from chunks_with_metadata land in the payload."""
        meta = [
            {
                "text": "chunk text",
                "section": "§ 312k",
                "section_title": "Kuendigungsbutton",
                "paragraph": "Abs. 1",
                "paragraph_num": 1,
                "page": 847,
                "index": 0,
            }
        ]

        payload = _build_payload("chunk text", 0, meta)

        assert payload["section"] == "§ 312k"
        assert payload["section_title"] == "Kuendigungsbutton"
        assert payload["paragraph"] == "Abs. 1"
        assert payload["paragraph_num"] == 1
        assert payload["page"] == 847

    def test_payload_without_metadata_backwards_compat(self):
        """Empty metadata list → payload has no structural fields."""
        payload = _build_payload("chunk text", 0, [])

        for field in _STRUCT_FIELDS:
            assert field not in payload

    def test_payload_skips_empty_values(self):
        """Empty string and None values are NOT added to payload."""
        meta = [
            {
                "text": "chunk text",
                "section": "",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": None,
                "page": None,
                "index": 0,
            }
        ]

        payload = _build_payload("chunk text", 0, meta)

        for field in _STRUCT_FIELDS:
            assert field not in payload

    def test_metadata_overrides_extra_metadata(self):
        """Auto-extracted metadata takes precedence over manual extra_metadata."""
        meta = [
            {
                "text": "chunk text",
                "section": "§ 25",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": None,
                "page": None,
                "index": 0,
            }
        ]
        extra = {"section": "manual-value"}

        payload = _build_payload("chunk text", 0, meta, extra_metadata=extra)

        assert payload["section"] == "§ 25"

    def test_partial_metadata_alignment(self):
        """3 chunks but only 2 metadata entries → third payload has no structural fields."""
        meta = [
            {
                "text": "c1",
                "section": "§ 1",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": None,
                "page": None,
                "index": 0,
            },
            {
                "text": "c2",
                "section": "§ 2",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": None,
                "page": None,
                "index": 1,
            },
        ]

        p0 = _build_payload("c1", 0, meta)
        p1 = _build_payload("c2", 1, meta)
        p2 = _build_payload("c3", 2, meta)

        assert p0["section"] == "§ 1"
        assert p1["section"] == "§ 2"
        # The chunk without a metadata entry must carry none of the
        # structural fields, not merely lack "section".
        for field in _STRUCT_FIELDS:
            assert field not in p2

    def test_zero_paragraph_num_is_kept(self):
        """paragraph_num=0 is a valid value and should be stored."""
        meta = [
            {
                "text": "chunk",
                "section": "",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": 0,
                "page": None,
                "index": 0,
            }
        ]

        payload = _build_payload("chunk", 0, meta)

        # 0 is not None and not "" → should be stored
        assert payload["paragraph_num"] == 0

    def test_page_zero_is_kept(self):
        """page=0 is a valid value (first page) and should be stored."""
        meta = [
            {
                "text": "chunk",
                "section": "",
                "section_title": "",
                "paragraph": "",
                "paragraph_num": None,
                "page": 0,
                "index": 0,
            }
        ]

        payload = _build_payload("chunk", 0, meta)

        assert payload["page"] == 0
|