feat(pipeline): structural metadata end-to-end (Blocks D2-D4)
D2: The RAG service stores section/section_title/paragraph/paragraph_num/page from the embedding service's chunks_with_metadata in the Qdrant payloads.

D3: The control generator prefers section > article > section_title from Qdrant, and adds page to source_citation and generation_metadata.

D4: Validated with real BGB §§ 312-312k text. Found and fixed a critical bug where the Phase 3 overlap destroyed the [§ ...] section prefix, so only the first chunk of each document kept its metadata; all subsequent chunks lost their section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable names, a duplicate dict key, a bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,9 @@ logger = logging.getLogger("rag-service.api.documents")
|
||||
|
||||
router = APIRouter(prefix="/api/v1/documents")
|
||||
|
||||
# Structural metadata fields from embedding-service chunks_with_metadata (D2)
|
||||
_STRUCT_FIELDS = ("section", "section_title", "paragraph", "paragraph_num", "page")
|
||||
|
||||
|
||||
# ---- Request / Response models --------------------------------------------
|
||||
|
||||
@@ -110,7 +113,7 @@ async def upload_document(
|
||||
|
||||
# --- Chunk ---
|
||||
try:
|
||||
chunks = await embedding_client.chunk_text(
|
||||
chunk_result = await embedding_client.chunk_text(
|
||||
text=text,
|
||||
strategy=chunk_strategy,
|
||||
chunk_size=chunk_size,
|
||||
@@ -120,6 +123,9 @@ async def upload_document(
|
||||
logger.error("Chunking failed: %s", exc)
|
||||
raise HTTPException(status_code=500, detail=f"Chunking failed: {exc}")
|
||||
|
||||
chunks = chunk_result.chunks
|
||||
chunks_meta = chunk_result.chunks_with_metadata
|
||||
|
||||
if not chunks:
|
||||
raise HTTPException(status_code=400, detail="Chunking produced zero chunks")
|
||||
|
||||
@@ -154,6 +160,13 @@ async def upload_document(
|
||||
"year": year,
|
||||
**extra_metadata,
|
||||
}
|
||||
# Merge structural metadata from embedding service (D2)
|
||||
if i < len(chunks_meta):
|
||||
meta = chunks_meta[i]
|
||||
for field in _STRUCT_FIELDS:
|
||||
value = meta.get(field)
|
||||
if value is not None and value != "":
|
||||
payload[field] = value
|
||||
payloads.append(payload)
|
||||
|
||||
# --- Index in Qdrant ---
|
||||
|
||||
Reference in New Issue
Block a user