feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: The RAG service stores section/section_title/paragraph/paragraph_num/page from the embedding service's chunks_with_metadata in Qdrant payloads. D3: The control generator prefers section > article > section_title from Qdrant and adds page to source_citation and generation_metadata. D4: Validated with real BGB §§ 312-312k text. Found and fixed a critical bug where the Phase 3 overlap destroyed the [§ ...] section prefix, so only the first chunk per document kept its metadata and all subsequent chunks lost their section info. Also fixes pre-existing lint issues (unused imports, ambiguous variable names, duplicate dict key, bare except). 456 tests passing (58 embedding + 387 pipeline + 11 rag-service). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -19,6 +19,14 @@ _OLLAMA_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "bge-m3")
|
||||
_EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", "32"))
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkResult:
|
||||
"""Result of the embedding service /chunk endpoint: plain chunk texts plus per-chunk metadata."""
|
||||
|
||||
# Plain chunk strings; the client fills this from the "chunks" key of the response JSON (empty list when the key is absent).
chunks: list[str]
|
||||
# Per-chunk metadata dicts; the client fills this from the "chunks_with_metadata" key and falls back to an empty list when it is absent or falsy.
# NOTE(review): the dict schema is not visible here -- presumably section/section_title/paragraph/paragraph_num/page; confirm against the embedding service.
chunks_with_metadata: list[dict]
|
||||
|
||||
|
||||
class EmbeddingClient:
|
||||
"""
|
||||
Hybrid client:
|
||||
@@ -120,10 +128,10 @@ class EmbeddingClient:
|
||||
strategy: str = "recursive",
|
||||
chunk_size: int = 512,
|
||||
overlap: int = 50,
|
||||
) -> list[str]:
|
||||
) -> ChunkResult:
|
||||
"""
|
||||
Ask the embedding service to chunk a long text.
|
||||
Returns a list of chunk strings.
|
||||
Returns ChunkResult with plain chunks and structural metadata.
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=_TIMEOUT) as client:
|
||||
response = await client.post(
|
||||
@@ -137,7 +145,10 @@ class EmbeddingClient:
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data.get("chunks", [])
|
||||
return ChunkResult(
|
||||
chunks=data.get("chunks", []),
|
||||
chunks_with_metadata=data.get("chunks_with_metadata") or [],
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# PDF extraction (via embedding-service)
|
||||
|
||||
Reference in New Issue
Block a user