feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: RAG service stores section/section_title/paragraph/paragraph_num/page from embedding service chunks_with_metadata into Qdrant payloads. D3: Control generator prefers section > article > section_title from Qdrant, adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed critical bug where Phase 3 overlap destroyed the [§ ...] section prefix, causing only the first chunk per document to have metadata. All subsequent chunks lost section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,9 @@ logger = logging.getLogger("rag-service.api.documents")
|
||||
|
||||
router = APIRouter(prefix="/api/v1/documents")
|
||||
|
||||
# Structural metadata fields from embedding-service chunks_with_metadata (D2)
|
||||
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||
|
||||
|
||||
# ---- Request / Response models --------------------------------------------
|
||||
|
||||
@@ -110,7 +113,7 @@ async def upload_document(
|
||||
|
||||
# --- Chunk ---
|
||||
try:
|
||||
chunks = await embedding_client.chunk_text(
|
||||
chunk_result = await embedding_client.chunk_text(
|
||||
text=text,
|
||||
strategy=chunk_strategy,
|
||||
chunk_size=chunk_size,
|
||||
@@ -120,6 +123,9 @@ async def upload_document(
|
||||
logger.error("Chunking failed: %s", exc)
|
||||
raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}")
|
||||
|
||||
chunks = chunk_result.chunks
|
||||
chunks_meta = chunk_result.chunks_with_metadata
|
||||
|
||||
if not chunks:
|
||||
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
|
||||
|
||||
@@ -154,6 +160,13 @@ async def upload_document(
|
||||
"year": year,
|
||||
**extra_metadata,
|
||||
}
|
||||
# Merge structural metadata from embedding service (D2)
|
||||
if i < len(chunks_meta):
|
||||
meta = chunks_meta[i]
|
||||
for field in _STRUCT_FIELDS:
|
||||
value = meta.get(field)
|
||||
if value is not None and value != "":
|
||||
payload[field] = value
|
||||
payloads.append(payload)
|
||||
|
||||
# --- Index in Qdrant ---
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -19,6 +19,14 @@ _OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
|
||||
_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
|
||||
|
||||
|
||||
@dataclass
class ChunkResult:
    """Result from the embedding service /chunk endpoint."""

    # Plain chunk texts, in document order.
    chunks: list[str]
    # One metadata dict per chunk (structural fields such as section / page);
    # empty list when the service response omits or nulls chunks_with_metadata.
    chunks_with_metadata: list[dict]
|
||||
|
||||
|
||||
class EmbeddingClient:
|
||||
"""
|
||||
Hybrid client:
|
||||
@@ -120,10 +128,10 @@ class EmbeddingClient:
|
||||
strategy: str = "recursive",
|
||||
chunk_size: int = 512,
|
||||
overlap: int = 50,
|
||||
) -> list[str]:
|
||||
) -> ChunkResult:
|
||||
"""
|
||||
Ask the embedding service to chunk a long text.
|
||||
Returns a list of chunk strings.
|
||||
Returns ChunkResult with plain chunks and structural metadata.
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
|
||||
response = await client.post(
|
||||
@@ -137,7 +145,10 @@ class EmbeddingClient:
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data.get("chunks", [])
|
||||
return ChunkResult(
|
||||
chunks=data.get("chunks", []),
|
||||
chunks_with_metadata=data.get("chunks_with_metadata") or [],
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# PDF extraction (via embedding-service)
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
"""Shared test fixtures for rag-service tests."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Ensure rag-service root is on sys.path so imports resolve
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
@@ -0,0 +1,172 @@
|
||||
"""Tests for document upload payload building — structural metadata (D2)."""
|
||||
|
||||
# Mirror the constant from api/documents.py to avoid heavy import chain
|
||||
# (api → jose, qdrant_client, minio, etc.)
|
||||
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||
|
||||
|
||||
def _build_payload(
|
||||
chunk: str,
|
||||
index: int,
|
||||
chunks_meta: list[dict],
|
||||
extra_metadata: "dict | None" = None,
|
||||
) -> dict:
|
||||
"""Replicate the payload-building logic from documents.py for unit testing."""
|
||||
payload = {
|
||||
"document_id": "test-doc-id",
|
||||
"object_name": "test/path.pdf",
|
||||
"filename": "path.pdf",
|
||||
"chunk_index": index,
|
||||
"chunk_text": chunk,
|
||||
"data_type": "law",
|
||||
"bundesland": "bund",
|
||||
"use_case": "compliance",
|
||||
"year": "2026",
|
||||
**(extra_metadata or {}),
|
||||
}
|
||||
if index < len(chunks_meta):
|
||||
meta = chunks_meta[index]
|
||||
for field in _STRUCT_FIELDS:
|
||||
value = meta.get(field)
|
||||
if value is not None and value != "":
|
||||
payload[field] = value
|
||||
return payload
|
||||
|
||||
|
||||
class TestPayloadStructuralMetadata:
    """Tests for structural metadata merging into Qdrant payloads."""

    @staticmethod
    def _meta_entry(index: int = 0, text: str = "chunk text", **overrides) -> dict:
        """Build one chunks_with_metadata entry; structural fields default to empty."""
        entry = {
            "text": text,
            "section": "",
            "section_title": "",
            "paragraph": "",
            "paragraph_num": None,
            "page": None,
            "index": index,
        }
        entry.update(overrides)
        return entry

    def test_payload_contains_structural_metadata(self):
        """Metadata fields from chunks_with_metadata land in the payload."""
        meta = [
            self._meta_entry(
                section="§ 312k",
                section_title="Kuendigungsbutton",
                paragraph="Abs. 1",
                paragraph_num=1,
                page=847,
            )
        ]

        payload = _build_payload("chunk text", 0, meta)

        expected = {
            "section": "§ 312k",
            "section_title": "Kuendigungsbutton",
            "paragraph": "Abs. 1",
            "paragraph_num": 1,
            "page": 847,
        }
        for field, want in expected.items():
            assert payload[field] == want

    def test_payload_without_metadata_backwards_compat(self):
        """Empty metadata list → payload has no structural fields."""
        payload = _build_payload("chunk text", 0, [])

        assert not any(field in payload for field in _STRUCT_FIELDS)

    def test_payload_skips_empty_values(self):
        """Empty string and None values are NOT added to payload."""
        payload = _build_payload("chunk text", 0, [self._meta_entry()])

        assert not any(field in payload for field in _STRUCT_FIELDS)

    def test_metadata_overrides_extra_metadata(self):
        """Auto-extracted metadata takes precedence over manual extra_metadata."""
        meta = [self._meta_entry(section="§ 25")]

        payload = _build_payload(
            "chunk text", 0, meta, extra_metadata={"section": "manual-value"}
        )

        assert payload["section"] == "§ 25"

    def test_partial_metadata_alignment(self):
        """3 chunks but only 2 metadata entries → third payload has no structural fields."""
        meta = [
            self._meta_entry(index=0, text="c1", section="§ 1"),
            self._meta_entry(index=1, text="c2", section="§ 2"),
        ]

        payloads = [_build_payload(f"c{i + 1}", i, meta) for i in range(3)]

        assert payloads[0]["section"] == "§ 1"
        assert payloads[1]["section"] == "§ 2"
        assert "section" not in payloads[2]

    def test_zero_paragraph_num_is_kept(self):
        """paragraph_num=0 is a valid value and should be stored."""
        meta = [self._meta_entry(text="chunk", paragraph_num=0)]

        payload = _build_payload("chunk", 0, meta)

        # 0 is not None and not "" → should be stored
        assert payload["paragraph_num"] == 0

    def test_page_zero_is_kept(self):
        """page=0 is a valid value (first page) and should be stored."""
        meta = [self._meta_entry(text="chunk", page=0)]

        payload = _build_payload("chunk", 0, meta)

        assert payload["page"] == 0
|
||||
@@ -0,0 +1,135 @@
|
||||
"""Tests for EmbeddingClient.chunk_text() — ChunkResult with metadata (D2)."""
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from embedding_client import ChunkResult, EmbeddingClient
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """EmbeddingClient wired to a fixed service URL via patched module settings."""
    # Patch is only active while the instance is constructed; the URL is read
    # during __init__, so the returned client keeps the fixed value.
    with patch("embedding_client.settings") as mock_settings:
        mock_settings.EMBEDDING_SERVICE_URL = "http://localhost:8087"
        return EmbeddingClient()
|
||||
|
||||
|
||||
def _mock_response(json_data: dict, status_code: int = 200):
|
||||
"""Create a mock httpx response (sync methods like .json() and .raise_for_status())."""
|
||||
resp = MagicMock()
|
||||
resp.status_code = status_code
|
||||
resp.json.return_value = json_data
|
||||
return resp
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_returns_chunk_result(client):
    """chunk_text returns ChunkResult with both chunks and metadata."""
    # Mirrors the embedding-service /chunk response: parallel lists of plain
    # chunk strings and per-chunk structural metadata dicts.
    mock_json = {
        "chunks": ["chunk1 text", "chunk2 text"],
        "chunks_with_metadata": [
            {
                "text": "chunk1 text",
                "section": "§ 25",
                "section_title": "Informationspflichten",
                "paragraph": "Abs. 1",
                "paragraph_num": 1,
                "page": None,
                "index": 0,
            },
            {
                "text": "chunk2 text",
                "section": "§ 25",
                "section_title": "Informationspflichten",
                "paragraph": "Abs. 2",
                "paragraph_num": 2,
                "page": None,
                "index": 1,
            },
        ],
        "count": 2,
        "strategy": "recursive",
    }

    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        # chunk_text enters `async with httpx.AsyncClient(...)`, so the mock
        # must implement the async context-manager protocol and yield itself.
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        result = await client.chunk_text("some legal text")

    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1 text", "chunk2 text"]
    assert len(result.chunks_with_metadata) == 2
    assert result.chunks_with_metadata[0]["section"] == "§ 25"
    assert result.chunks_with_metadata[1]["paragraph"] == "Abs. 2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_without_metadata_field(client):
    """Embedding service response without chunks_with_metadata → empty list."""
    # Backwards-compat path: no chunks_with_metadata key in the response at all.
    mock_json = {
        "chunks": ["chunk1"],
        "count": 1,
        "strategy": "semantic",
    }

    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        # Async context-manager protocol for `async with httpx.AsyncClient()`.
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        result = await client.chunk_text("text", strategy="semantic")

    assert isinstance(result, ChunkResult)
    assert result.chunks == ["chunk1"]
    assert result.chunks_with_metadata == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_with_null_metadata(client):
    """chunks_with_metadata: null in response → empty list."""
    # JSON null decodes to Python None; the client must coerce it to [].
    mock_json = {
        "chunks": ["chunk1"],
        "chunks_with_metadata": None,
        "count": 1,
        "strategy": "recursive",
    }

    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        # Async context-manager protocol for `async with httpx.AsyncClient()`.
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        result = await client.chunk_text("text")

    assert result.chunks_with_metadata == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_chunk_text_empty(client):
    """Empty text → empty chunks and metadata."""
    mock_json = {
        "chunks": [],
        "chunks_with_metadata": [],
        "count": 0,
        "strategy": "recursive",
    }

    with patch("httpx.AsyncClient") as mock_client_cls:
        mock_client = AsyncMock()
        mock_client.post.return_value = _mock_response(mock_json)
        # Async context-manager protocol for `async with httpx.AsyncClient()`.
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_cls.return_value = mock_client

        result = await client.chunk_text("")

    assert result.chunks == []
    assert result.chunks_with_metadata == []
|
||||
Reference in New Issue
Block a user