feat(rag): Legal-Metadaten — article_label + deterministische IDs + chunk_hash
Neues pures Modul legal_metadata.py (nur stdlib, lokal+CI testbar): §3-Normalisierung section->article, strikte Header-Extraktion (Datum/Seiten-Rauschen -> kein Falsch-Zitat), citation_style pro Regulierung (EU/CH=article, DE=paragraph), Urteil=Aktenzeichen statt §, camelCase-Klarnamen (ProdHaftG), deterministische uuid5-Point-ID + chunk_hash (sha256).

documents.py verdrahtet build_legal_fields in den Payload-Build + document_version.

10 Tests gruen. Vertrag: rag_reingest_spec.md (§2/§3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,7 @@ from embedding_client import embedding_client
|
||||
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
||||
from minio_client_wrapper import minio_wrapper
|
||||
from qdrant_client_wrapper import qdrant_wrapper
|
||||
from legal_metadata import build_legal_fields, compute_chunk_hash, deterministic_point_id
|
||||
|
||||
logger = logging.getLogger("rag-service.api.documents")
|
||||
|
||||
@@ -49,6 +50,7 @@ async def upload_document(
|
||||
chunk_size: int = Form(default=512),
|
||||
chunk_overlap: int = Form(default=50),
|
||||
metadata_json: Optional[str] = Form(default=None),
|
||||
document_version: str = Form(default="1"),
|
||||
):
|
||||
"""
|
||||
Upload a document:
|
||||
@@ -153,36 +155,55 @@ async def upload_document(
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("Invalid metadata_json, ignoring")
|
||||
|
||||
# --- Build payloads ---
|
||||
# --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) ---
|
||||
reg_code = (
|
||||
extra_metadata.get("regulation_code")
|
||||
or extra_metadata.get("regulation_short")
|
||||
or extra_metadata.get("regulation_id")
|
||||
or ""
|
||||
).strip()
|
||||
payloads = []
|
||||
ids = []
|
||||
for i, chunk in enumerate(chunks):
|
||||
meta = chunks_meta[i] if i < len(chunks_meta) else {}
|
||||
legal = build_legal_fields(
|
||||
meta,
|
||||
reg_code,
|
||||
chunk,
|
||||
citation_style=extra_metadata.get("citation_style"),
|
||||
display_name=extra_metadata.get("regulation_short"),
|
||||
source_type=extra_metadata.get("source_type"),
|
||||
)
|
||||
payload = {
|
||||
"document_id": document_id,
|
||||
"document_version": document_version,
|
||||
"object_name": object_name,
|
||||
"filename": filename,
|
||||
"chunk_index": i,
|
||||
"chunk_text": chunk,
|
||||
"chunk_hash": compute_chunk_hash(chunk),
|
||||
"data_type": data_type,
|
||||
"bundesland": bundesland,
|
||||
"use_case": use_case,
|
||||
"year": year,
|
||||
**extra_metadata,
|
||||
**{k: v for k, v in legal.items() if v not in (None, "")},
|
||||
}
|
||||
# Merge structural metadata from embedding service (D2)
|
||||
if i < len(chunks_meta):
|
||||
meta = chunks_meta[i]
|
||||
for field in _STRUCT_FIELDS:
|
||||
value = meta.get(field)
|
||||
if value is not None and value != "":
|
||||
payload[field] = value
|
||||
# Seite aus den Struktur-Metadaten uebernehmen (nicht Teil von legal)
|
||||
if meta.get("page") not in (None, ""):
|
||||
payload["page"] = meta["page"]
|
||||
payloads.append(payload)
|
||||
ids.append(
|
||||
deterministic_point_id(reg_code, legal["article"], legal["paragraph"], i, document_version)
|
||||
)
|
||||
|
||||
# --- Index in Qdrant ---
|
||||
# --- Index in Qdrant (deterministische IDs fuer stabilen Re-Link) ---
|
||||
try:
|
||||
indexed = await qdrant_wrapper.index_documents(
|
||||
collection=collection,
|
||||
vectors=embeddings,
|
||||
payloads=payloads,
|
||||
ids=ids,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("Qdrant indexing failed: %s", exc)
|
||||
|
||||
Reference in New Issue
Block a user