feat(pipeline): structural metadata end-to-end (Blocks D2-D4)

D2: RAG service stores section/section_title/paragraph/paragraph_num/page
from embedding service chunks_with_metadata into Qdrant payloads.

D3: Control generator fills the chunk's article field preferring
section > article > section_title from the Qdrant payload, and adds the
page to source_citation (key "page") and generation_metadata (key
"source_page").

D4: Validated with real BGB §§ 312-312k text. Found and fixed a critical
bug where the Phase 3 overlap destroyed the [§ ...] section prefix, so
only the first chunk of each document kept its metadata and all
subsequent chunks lost their section info.

Also fixes pre-existing lint issues (unused imports, ambiguous variable
names, duplicate dict key, bare except).

456 tests passing (58 embedding + 387 pipeline + 11 rag-service).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-01 20:34:00 +02:00
parent da21339e76
commit 93099b2770
15 changed files with 1086 additions and 25 deletions
+13 -7
View File
@@ -25,8 +25,7 @@ import re
import uuid
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional
import httpx
from pydantic import BaseModel
@@ -34,7 +33,7 @@ from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .similarity_detector import check_similarity, SimilarityReport
from .similarity_detector import check_similarity
logger = logging.getLogger(__name__)
@@ -1019,11 +1018,12 @@ class ControlGeneratorPipeline:
regulation_name=reg_name,
regulation_short=reg_short,
category=payload.get("category", "") or payload.get("data_type", ""),
article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
article=payload.get("section", "") or payload.get("article", "") or payload.get("section_title", ""),
paragraph=payload.get("paragraph", ""),
source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
score=0.0,
collection=collection,
page=payload.get("page"),
)
all_results.append(chunk)
collection_new += 1
@@ -1127,6 +1127,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": license_info.get("license", ""),
"source_type": license_info.get("source_type", "law"),
"url": chunk.source_url or "",
@@ -1141,6 +1142,7 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
}
return control
@@ -1194,6 +1196,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": license_info.get("license", ""),
"license_notice": attribution,
"source_type": license_info.get("source_type", "standard"),
@@ -1209,6 +1212,7 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
}
return control
@@ -1368,6 +1372,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"page": chunk.page,
"license": lic.get("license", ""),
"license_notice": lic.get("attribution", ""),
"source_type": lic.get("source_type", "law"),
@@ -1384,6 +1389,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
"source_regulation": chunk.regulation_code,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"source_page": chunk.page,
"batch_size": len(chunks),
"document_grouped": same_doc,
}
@@ -1479,14 +1485,14 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
) -> list[Optional[GeneratedControl]]:
"""Process a batch of (chunk, license_info) through stages 3-5."""
# Split by license rule: Rule 1+2 → structure, Rule 3 → reform
structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
structure_items = [(c, lic) for c, lic in batch_items if lic["rule"] in (1, 2)]
reform_items = [(c, lic) for c, lic in batch_items if lic["rule"] == 3]
all_controls: dict[int, Optional[GeneratedControl]] = {}
if structure_items:
s_chunks = [c for c, _ in structure_items]
s_lics = [l for _, l in structure_items]
s_lics = [lic for _, lic in structure_items]
try:
s_controls = await self._structure_batch(s_chunks, s_lics)
except Exception as e:
@@ -24,7 +24,6 @@ import json
import logging
import os
import re
import uuid
from dataclasses import dataclass, field
from typing import Optional
@@ -56,7 +55,7 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# Patterns are defined in normative_patterns.py and imported here
# with local aliases for backward compatibility.
from .normative_patterns import (
from .normative_patterns import ( # noqa: E402
PFLICHT_RE as _PFLICHT_RE,
EMPFEHLUNG_RE as _EMPFEHLUNG_RE,
KANN_RE as _KANN_RE,
@@ -3472,7 +3471,7 @@ class DecompositionPass:
"category": atomic.category,
"parent_uuid": parent_uuid,
"gen_meta": json.dumps({
"decomposition_source": candidate_id,
"decomposition_source_id": candidate_id,
"decomposition_method": "pass0b",
"engine_version": "v2",
"action_object_class": getattr(atomic, "domain", ""),
@@ -4104,6 +4103,8 @@ def _format_citation(citation) -> str:
parts.append(c["article"])
if c.get("paragraph"):
parts.append(c["paragraph"])
if c.get("page") is not None:
parts.append(f"S. {c['page']}")
return " ".join(parts) if parts else citation
except (json.JSONDecodeError, TypeError):
return citation
+1
View File
@@ -34,6 +34,7 @@ class RAGSearchResult:
source_url: str
score: float
collection: str = ""
page: Optional[int] = None
class ComplianceRAGClient: