feat(rag): Legal-Metadaten — article_label + deterministische IDs + chunk_hash

Neues pures Modul legal_metadata.py (nur stdlib, lokal+CI testbar): Normalisierung
section->article (Spec §3), strikte Header-Extraktion (Datum/Seiten-Rauschen -> kein Falsch-Zitat),
citation_style pro Regulierung (EU/CH=article, DE=paragraph), Urteil=Aktenzeichen statt §,
camelCase-Klarnamen (ProdHaftG), deterministische uuid5-Point-ID + chunk_hash (sha256).
documents.py verdrahtet build_legal_fields in den Payload-Build + document_version.
10 Tests gruen. Vertrag: rag_reingest_spec.md (§2/§3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-20 14:35:07 +02:00
parent adb7c6802c
commit dac2a9f685
3 changed files with 285 additions and 9 deletions
+30 -9
View File
@@ -10,6 +10,7 @@ from embedding_client import embedding_client
from html_utils import decode_html_bytes, looks_like_html, strip_html
from minio_client_wrapper import minio_wrapper
from qdrant_client_wrapper import qdrant_wrapper
from legal_metadata import build_legal_fields, compute_chunk_hash, deterministic_point_id
logger = logging.getLogger("rag-service.api.documents")
@@ -49,6 +50,7 @@ async def upload_document(
chunk_size: int = Form(default=512),
chunk_overlap: int = Form(default=50),
metadata_json: Optional[str] = Form(default=None),
document_version: str = Form(default="1"),
):
"""
Upload a document:
@@ -153,36 +155,55 @@ async def upload_document(
except json.JSONDecodeError:
logger.warning("Invalid metadata_json, ignoring")
# --- Build payloads ---
# --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) ---
reg_code = (
extra_metadata.get("regulation_code")
or extra_metadata.get("regulation_short")
or extra_metadata.get("regulation_id")
or ""
).strip()
payloads = []
ids = []
for i, chunk in enumerate(chunks):
meta = chunks_meta[i] if i < len(chunks_meta) else {}
legal = build_legal_fields(
meta,
reg_code,
chunk,
citation_style=extra_metadata.get("citation_style"),
display_name=extra_metadata.get("regulation_short"),
source_type=extra_metadata.get("source_type"),
)
payload = {
"document_id": document_id,
"document_version": document_version,
"object_name": object_name,
"filename": filename,
"chunk_index": i,
"chunk_text": chunk,
"chunk_hash": compute_chunk_hash(chunk),
"data_type": data_type,
"bundesland": bundesland,
"use_case": use_case,
"year": year,
**extra_metadata,
**{k: v for k, v in legal.items() if v not in (None, "")},
}
# Merge structural metadata from embedding service (D2)
if i < len(chunks_meta):
meta = chunks_meta[i]
for field in _STRUCT_FIELDS:
value = meta.get(field)
if value is not None and value != "":
payload[field] = value
# Seite aus den Struktur-Metadaten uebernehmen (nicht Teil von legal)
if meta.get("page") not in (None, ""):
payload["page"] = meta["page"]
payloads.append(payload)
ids.append(
deterministic_point_id(reg_code, legal["article"], legal["paragraph"], i, document_version)
)
# --- Index in Qdrant ---
# --- Index in Qdrant (deterministische IDs fuer stabilen Re-Link) ---
try:
indexed = await qdrant_wrapper.index_documents(
collection=collection,
vectors=embeddings,
payloads=payloads,
ids=ids,
)
except Exception as exc:
logger.error("Qdrant indexing failed: %s", exc)