feat(pipeline): structural metadata end-to-end (Blocks D2-D4)

D2: RAG service stores section/section_title/paragraph/paragraph_num/page
from embedding service chunks_with_metadata into Qdrant payloads.

D3: Control generator prefers section > article > section_title from
Qdrant, adds page to source_citation and generation_metadata.

D4: Validated with real BGB §§ 312-312k text. Found and fixed critical
bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing
only the first chunk per document to have metadata. All subsequent
chunks lost section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable
names, duplicate dict key, bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-01 20:34:00 +02:00
parent da21339e76
commit 93099b2770
15 changed files with 1086 additions and 25 deletions
View File
+7
View File
@@ -0,0 +1,7 @@
"""Shared test fixtures for rag-service tests."""
import os
import sys
# Ensure rag-service root is on sys.path so imports resolve
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+172
View File
@@ -0,0 +1,172 @@
"""Tests for document upload payload building — structural metadata (D2)."""
# Mirror the constant from api/documents.py to avoid heavy import chain
# (api → jose, qdrant_client, minio, etc.)
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
def _build_payload(
chunk: str,
index: int,
chunks_meta: list[dict],
extra_metadata: "dict | None" = None,
) -> dict:
"""Replicate the payload-building logic from documents.py for unit testing."""
payload = {
"document_id": "test-doc-id",
"object_name": "test/path.pdf",
"filename": "path.pdf",
"chunk_index": index,
"chunk_text": chunk,
"data_type": "law",
"bundesland": "bund",
"use_case": "compliance",
"year": "2026",
**(extra_metadata or {}),
}
if index < len(chunks_meta):
meta = chunks_meta[index]
for field in _STRUCT_FIELDS:
value = meta.get(field)
if value is not None and value != "":
payload[field] = value
return payload
class TestPayloadStructuralMetadata:
"""Tests for structural metadata merging into Qdrant payloads."""
def test_payload_contains_structural_metadata(self):
"""Metadata fields from chunks_with_metadata land in the payload."""
meta = [
{
"text": "chunk text",
"section": "§ 312k",
"section_title": "Kuendigungsbutton",
"paragraph": "Abs. 1",
"paragraph_num": 1,
"page": 847,
"index": 0,
}
]
payload = _build_payload("chunk text", 0, meta)
assert payload["section"] == "§ 312k"
assert payload["section_title"] == "Kuendigungsbutton"
assert payload["paragraph"] == "Abs. 1"
assert payload["paragraph_num"] == 1
assert payload["page"] == 847
def test_payload_without_metadata_backwards_compat(self):
"""Empty metadata list → payload has no structural fields."""
payload = _build_payload("chunk text", 0, [])
for field in _STRUCT_FIELDS:
assert field not in payload
def test_payload_skips_empty_values(self):
"""Empty string and None values are NOT added to payload."""
meta = [
{
"text": "chunk text",
"section": "",
"section_title": "",
"paragraph": "",
"paragraph_num": None,
"page": None,
"index": 0,
}
]
payload = _build_payload("chunk text", 0, meta)
for field in _STRUCT_FIELDS:
assert field not in payload
def test_metadata_overrides_extra_metadata(self):
"""Auto-extracted metadata takes precedence over manual extra_metadata."""
meta = [
{
"text": "chunk text",
"section": "§ 25",
"section_title": "",
"paragraph": "",
"paragraph_num": None,
"page": None,
"index": 0,
}
]
extra = {"section": "manual-value"}
payload = _build_payload("chunk text", 0, meta, extra_metadata=extra)
assert payload["section"] == "§ 25"
def test_partial_metadata_alignment(self):
"""3 chunks but only 2 metadata entries → third payload has no structural fields."""
meta = [
{
"text": "c1",
"section": "§ 1",
"section_title": "",
"paragraph": "",
"paragraph_num": None,
"page": None,
"index": 0,
},
{
"text": "c2",
"section": "§ 2",
"section_title": "",
"paragraph": "",
"paragraph_num": None,
"page": None,
"index": 1,
},
]
p0 = _build_payload("c1", 0, meta)
p1 = _build_payload("c2", 1, meta)
p2 = _build_payload("c3", 2, meta)
assert p0["section"] == "§ 1"
assert p1["section"] == "§ 2"
assert "section" not in p2
def test_zero_paragraph_num_is_kept(self):
"""paragraph_num=0 is a valid value and should be stored."""
meta = [
{
"text": "chunk",
"section": "",
"section_title": "",
"paragraph": "",
"paragraph_num": 0,
"page": None,
"index": 0,
}
]
payload = _build_payload("chunk", 0, meta)
# 0 is not None and not "" → should be stored
assert payload["paragraph_num"] == 0
def test_page_zero_is_kept(self):
"""page=0 is a valid value (first page) and should be stored."""
meta = [
{
"text": "chunk",
"section": "",
"section_title": "",
"paragraph": "",
"paragraph_num": None,
"page": 0,
"index": 0,
}
]
payload = _build_payload("chunk", 0, meta)
assert payload["page"] == 0
+135
View File
@@ -0,0 +1,135 @@
"""Tests for EmbeddingClient.chunk_text() — ChunkResult with metadata (D2)."""
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from embedding_client import ChunkResult, EmbeddingClient
@pytest.fixture
def client():
with patch("embedding_client.settings") as mock_settings:
mock_settings.EMBEDDING_SERVICE_URL = "http://localhost:8087"
return EmbeddingClient()
def _mock_response(json_data: dict, status_code: int = 200):
"""Create a mock httpx response (sync methods like .json() and .raise_for_status())."""
resp = MagicMock()
resp.status_code = status_code
resp.json.return_value = json_data
return resp
@pytest.mark.asyncio
async def test_chunk_text_returns_chunk_result(client):
"""chunk_text returns ChunkResult with both chunks and metadata."""
mock_json = {
"chunks": ["chunk1 text", "chunk2 text"],
"chunks_with_metadata": [
{
"text": "chunk1 text",
"section": "§ 25",
"section_title": "Informationspflichten",
"paragraph": "Abs. 1",
"paragraph_num": 1,
"page": None,
"index": 0,
},
{
"text": "chunk2 text",
"section": "§ 25",
"section_title": "Informationspflichten",
"paragraph": "Abs. 2",
"paragraph_num": 2,
"page": None,
"index": 1,
},
],
"count": 2,
"strategy": "recursive",
}
with patch("httpx.AsyncClient") as mock_client_cls:
mock_client = AsyncMock()
mock_client.post.return_value = _mock_response(mock_json)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client_cls.return_value = mock_client
result = await client.chunk_text("some legal text")
assert isinstance(result, ChunkResult)
assert result.chunks == ["chunk1 text", "chunk2 text"]
assert len(result.chunks_with_metadata) == 2
assert result.chunks_with_metadata[0]["section"] == "§ 25"
assert result.chunks_with_metadata[1]["paragraph"] == "Abs. 2"
@pytest.mark.asyncio
async def test_chunk_text_without_metadata_field(client):
"""Embedding service response without chunks_with_metadata → empty list."""
mock_json = {
"chunks": ["chunk1"],
"count": 1,
"strategy": "semantic",
}
with patch("httpx.AsyncClient") as mock_client_cls:
mock_client = AsyncMock()
mock_client.post.return_value = _mock_response(mock_json)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client_cls.return_value = mock_client
result = await client.chunk_text("text", strategy="semantic")
assert isinstance(result, ChunkResult)
assert result.chunks == ["chunk1"]
assert result.chunks_with_metadata == []
@pytest.mark.asyncio
async def test_chunk_text_with_null_metadata(client):
"""chunks_with_metadata: null in response → empty list."""
mock_json = {
"chunks": ["chunk1"],
"chunks_with_metadata": None,
"count": 1,
"strategy": "recursive",
}
with patch("httpx.AsyncClient") as mock_client_cls:
mock_client = AsyncMock()
mock_client.post.return_value = _mock_response(mock_json)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client_cls.return_value = mock_client
result = await client.chunk_text("text")
assert result.chunks_with_metadata == []
@pytest.mark.asyncio
async def test_chunk_text_empty(client):
"""Empty text → empty chunks and metadata."""
mock_json = {
"chunks": [],
"chunks_with_metadata": [],
"count": 0,
"strategy": "recursive",
}
with patch("httpx.AsyncClient") as mock_client_cls:
mock_client = AsyncMock()
mock_client.post.return_value = _mock_response(mock_json)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
mock_client_cls.return_value = mock_client
result = await client.chunk_text("")
assert result.chunks == []
assert result.chunks_with_metadata == []