diff --git a/rag-service/api/documents.py b/rag-service/api/documents.py index 6f8af4b..48f5915 100644 --- a/rag-service/api/documents.py +++ b/rag-service/api/documents.py @@ -10,6 +10,7 @@ from embedding_client import embedding_client from html_utils import decode_html_bytes, looks_like_html, strip_html from minio_client_wrapper import minio_wrapper from qdrant_client_wrapper import qdrant_wrapper +from legal_metadata import build_legal_fields, compute_chunk_hash, deterministic_point_id logger = logging.getLogger("rag-service.api.documents") @@ -49,6 +50,7 @@ async def upload_document( chunk_size: int = Form(default=512), chunk_overlap: int = Form(default=50), metadata_json: Optional[str] = Form(default=None), + document_version: str = Form(default="1"), ): """ Upload a document: @@ -153,36 +155,55 @@ async def upload_document( except json.JSONDecodeError: logger.warning("Invalid metadata_json, ignoring") - # --- Build payloads --- + # --- Build payloads (rag_reingest_spec.md §2/§3: zitierfaehige Legal-Metadaten) --- + reg_code = ( + extra_metadata.get("regulation_code") + or extra_metadata.get("regulation_short") + or extra_metadata.get("regulation_id") + or "" + ).strip() payloads = [] + ids = [] for i, chunk in enumerate(chunks): + meta = chunks_meta[i] if i < len(chunks_meta) else {} + legal = build_legal_fields( + meta, + reg_code, + chunk, + citation_style=extra_metadata.get("citation_style"), + display_name=extra_metadata.get("regulation_short"), + source_type=extra_metadata.get("source_type"), + ) payload = { "document_id": document_id, + "document_version": document_version, "object_name": object_name, "filename": filename, "chunk_index": i, "chunk_text": chunk, + "chunk_hash": compute_chunk_hash(chunk), "data_type": data_type, "bundesland": bundesland, "use_case": use_case, "year": year, **extra_metadata, + **{k: v for k, v in legal.items() if v not in (None, "")}, } - # Merge structural metadata from embedding service (D2) - if i < len(chunks_meta): - 
meta = chunks_meta[i] - for field in _STRUCT_FIELDS: - value = meta.get(field) - if value is not None and value != "": - payload[field] = value + # Seite aus den Struktur-Metadaten uebernehmen (nicht Teil von legal) + if meta.get("page") not in (None, ""): + payload["page"] = meta["page"] payloads.append(payload) + ids.append( + deterministic_point_id(reg_code, legal["article"], legal["paragraph"], i, document_version) + ) - # --- Index in Qdrant --- + # --- Index in Qdrant (deterministische IDs fuer stabilen Re-Link) --- try: indexed = await qdrant_wrapper.index_documents( collection=collection, vectors=embeddings, payloads=payloads, + ids=ids, ) except Exception as exc: logger.error("Qdrant indexing failed: %s", exc) diff --git a/rag-service/legal_metadata.py b/rag-service/legal_metadata.py new file mode 100644 index 0000000..66ec408 --- /dev/null +++ b/rag-service/legal_metadata.py @@ -0,0 +1,140 @@ +"""Legal-Metadaten-Normalisierung fuer zitierfaehige Chunks. + +Vertrag: docs-src/development/rag_reingest_spec.md (§2 Payload-Feld-Vertrag, §3 Transform). +Transformiert die Roh-Ausgabe des Legal-Chunkers (section="§ 38" / "Artikel 13", +section_title, paragraph="(1)") in die consumer-facing Felder +(article, citation_style, article_label, paragraph, sub, section_header, is_recital). + +Pure Funktionen (nur stdlib) -> lokal + in CI testbar, haelt documents.py schlank. +""" + +import hashlib +import re +import uuid + +# Namespace fuer deterministische Point-IDs (Qdrant braucht UUID/uint, kein roher sha1). +_ID_NS = uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8") + +# Strikt: nur echte Legal-Header am Anfang des Section-Strings (optional in [..]). +# Lehnt Datums-/Seiten-/Fundstellen-Rauschen ab ("4.5.2016", "L 119/9", "Begriffsbestimmungen"). 
# Strict match of a genuine legal header at the very start of the section string
# (optionally wrapped in "[..]"). Group 1 = marker, group 2 = number, group 3 = optional
# single-letter suffix ("13a", "312 b"). The suffix is only accepted when it is NOT
# followed by another letter or digit; otherwise the first letter of a trailing title
# would be swallowed ("§ 38 Datenschutzbeauftragte" must yield "38", not "38D").
_LEGAL_HEADER_RE = re.compile(
    r"^\s*\[?\s*(§|artikel|art\.?)\s*(\d+)(?:\s?([a-z])(?![^\W_]))?",
    re.IGNORECASE,
)

# Finest citation granularity. The leading \b keeps "Absatz 2" / "Umsatz 5" from being
# misread as "satz 2" / "satz 5".
_SUB_RE = re.compile(r"\b(lit\.\s*[a-z]+|Satz\s*\d+|Nr\.\s*\d+)", re.IGNORECASE)

# Stand-alone article marker ("Artikel", "Article", "Art", "Art."): must start at a word
# boundary and must not continue with a letter, so "Vertragspartner", "Quartal" or
# "Artenschutz" do not count as an article reference.
_ARTICLE_WORD_RE = re.compile(r"\b(?:artikel|article|art)(?![^\W\d_])", re.IGNORECASE)

# Court rulings / orders: never build a §/Art. citation for them. The decision text quotes
# sections of other statutes (cross references), which would produce a wrong label such
# as "AZ § 87". What gets cited is the case number (display_name / regulation_short).
_RULING_TYPES = {"urteil", "ruling", "court_decision", "beschluss"}


def detect_citation_style(section: str) -> str:
    """Guess the citation style from a raw section string.

    Returns 'paragraph' when the string contains '§', 'article' when it contains a
    stand-alone article marker ('Artikel 13', 'Art. 13'), and 'paragraph' as the
    fallback for everything else (including empty / None input).
    """
    text = section or ""
    if "§" in text:
        return "paragraph"
    if _ARTICLE_WORD_RE.search(text):
        return "article"
    return "paragraph"


def normalize_article(section: str) -> str:
    """Extract the article/section number from a genuine legal header.

    '§ 38' -> '38', 'Artikel 13' -> '13', '[Artikel 17]' -> '17', 'Art. 13a' -> '13a',
    '§ 38 Datenschutzbeauftragte' -> '38'. Strings that do not start with a legal
    header (dates, page marks, bare titles) yield '' so no false citation is produced.
    """
    match = _LEGAL_HEADER_RE.match(section or "")
    if match is None:
        return ""
    return match.group(2) + (match.group(3) or "")


def normalize_paragraph(paragraph: str) -> str:
    """Return the first run of digits: '(1)' -> '1', 'Abs. 2' -> '2', '' -> ''."""
    match = re.search(r"\d+", paragraph or "")
    return match.group(0) if match else ""


def extract_sub(text: str) -> str:
    """Best-effort finest granularity: first 'lit. c' / 'Satz 2' / 'Nr. 3' in *text*.

    Internal whitespace of the hit is collapsed to single spaces; '' when nothing is
    found. Only stand-alone markers count ('Absatz 2' is not 'Satz 2').
    """
    match = _SUB_RE.search(text or "")
    if match is None:
        return ""
    return re.sub(r"\s+", " ", match.group(1)).strip()


def is_recital(section: str, section_title: str) -> bool:
    """Return True when section or title mentions a recital (DE or EN keyword)."""
    blob = f"{section} {section_title}".lower()
    return any(keyword in blob for keyword in ("erwaegungsgrund", "erwägungsgrund", "recital"))


def format_article_label(
    regulation_label: str, citation_style: str, article: str, paragraph: str, sub: str
) -> str:
    """Build the printable citation, e.g. 'ProdHaftG § 1' or 'Art. 13 Abs. 1 lit. c DSGVO'.

    `regulation_label` is the display name in its original spelling and is NOT
    upper-cased, so mixed-case names such as 'ProdHaftG' are kept as given. Without an
    article only the stripped label is returned. Style 'article' puts the regulation
    last; every other style uses the '§' form with the regulation first.
    """
    code = (regulation_label or "").strip()
    if not article:
        return code
    abs_part = f" Abs. {paragraph}" if paragraph else ""
    sub_part = f" {sub}" if sub else ""
    if citation_style == "article":
        return f"Art. {article}{abs_part}{sub_part} {code}".strip()
    return f"{code} § {article}{abs_part}{sub_part}".strip()


def compute_chunk_hash(text: str) -> str:
    """Return the sha256 hex digest of the whitespace-normalised chunk text.

    Leading/trailing whitespace is stripped and every whitespace run becomes a single
    space before hashing, so formatting-only differences give the same hash.
    """
    normalised = re.sub(r"\s+", " ", (text or "").strip())
    return hashlib.sha256(normalised.encode("utf-8")).hexdigest()


def deterministic_point_id(
    regulation_code: str, article: str, paragraph: str, chunk_index: int, document_version: str
) -> str:
    """Return a deterministic point ID (uuid5 string) for a chunk.

    The ID is derived from the '|'-joined arguments under the module-level namespace
    `_ID_NS`, so identical inputs always give the same ID and a different
    `document_version` gives a different one.
    """
    raw = f"{regulation_code}|{article}|{paragraph}|{chunk_index}|{document_version}"
    return str(uuid.uuid5(_ID_NS, raw))


def build_legal_fields(
    struct_meta: dict,
    regulation_code: str,
    chunk_text: str = "",
    citation_style: str = None,
    display_name: str = None,
    source_type: str = None,
) -> dict:
    """Turn raw chunker metadata into the consumer-facing citation fields.

    Args:
        struct_meta: Raw structural metadata; the keys 'section' and 'section_title'
            are read. None is treated as an empty dict.
        regulation_code: Regulation identifier; returned upper-cased as 'regulation_code'.
        chunk_text: Accepted for interface compatibility; deliberately not inspected
            (see the note on paragraph/sub below).
        citation_style: Per-regulation override, 'article' or 'paragraph'. Any other
            value is ignored and the style is guessed from the section string, which is
            unreliable when §/Art. references are mixed.
        display_name: Printable name used in the label; falls back to `regulation_code`.
        source_type: When it names a ruling type (see `_RULING_TYPES`), no §/Art.
            citation is built and the label is just the printable name.

    Returns:
        dict with the keys regulation_code, citation_style, article, paragraph, sub,
        is_recital, section_header and article_label. 'paragraph' and 'sub' are always
        '' here: they are not guessed from the chunk text because cross references
        ("Artikel 8 Absatz 1") would produce wrong citations, so labels stay
        article-precise.
    """
    sm = struct_meta or {}
    section = sm.get("section", "") or ""
    section_title = sm.get("section_title", "") or ""
    printable = (display_name or regulation_code or "").strip()
    explicit_style = citation_style if citation_style in ("article", "paragraph") else ""

    if (source_type or "").strip().lower() in _RULING_TYPES:
        # Ruling: no §/Art. citation (cross references would falsify it); label = case number.
        style = explicit_style
        article = ""
        label = printable
    else:
        style = explicit_style or detect_citation_style(section)
        article = normalize_article(section)
        label = format_article_label(printable, style, article, "", "")

    return {
        "regulation_code": (regulation_code or "").upper(),
        "citation_style": style,
        "article": article,
        "paragraph": "",
        "sub": "",
        "is_recital": is_recital(section, section_title),
        "section_header": section_title,
        "article_label": label,
    }
"""Tests for legal_metadata (rag_reingest_spec.md §3 normalization)."""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import legal_metadata as L


def test_detect_citation_style():
    """Section markers map to the expected citation style."""
    expected = {
        "§ 38": "paragraph",
        "Artikel 13": "article",
        "Art. 13": "article",
    }
    for section, style in expected.items():
        assert L.detect_citation_style(section) == style


def test_normalize_article():
    """Genuine headers yield their number; anything else yields ''."""
    cases = [
        ("§ 38", "38"),
        ("Artikel 13", "13"),
        ("Art. 13a", "13a"),
        ("[Artikel 17]", "17"),
        ("", ""),
        # Strict: dates / page marks / titles are NOT a citation -> empty
        ("4.5.2016 DE Amtsblatt der Europaeischen Union L 119/9", ""),
        ("Begriffsbestimmungen", ""),
    ]
    for section, article in cases:
        assert L.normalize_article(section) == article


def test_normalize_paragraph():
    """The first digit run is returned, '' when there is none."""
    for raw, wanted in (("(1)", "1"), ("Abs. 2", "2"), ("", "")):
        assert L.normalize_paragraph(raw) == wanted


def test_extract_sub():
    """lit./Satz markers are found; plain text yields ''."""
    cases = (
        ("... lit. c ...", "lit. c"),
        ("Satz 2 des Absatzes", "Satz 2"),
        ("kein Sub hier", ""),
    )
    for text, wanted in cases:
        assert L.extract_sub(text) == wanted


def test_format_article_label_paragraph_style():
    """The display name is not upper-cased, so mixed case survives."""
    cases = (
        (("ProdHaftG", "paragraph", "1", "", ""), "ProdHaftG § 1"),
        (("BDSG", "paragraph", "38", "1", ""), "BDSG § 38 Abs. 1"),
        (("BDSG", "paragraph", "38", "", ""), "BDSG § 38"),
    )
    for args, label in cases:
        assert L.format_article_label(*args) == label


def test_format_article_label_article_style():
    """Article style puts the regulation name last."""
    cases = (
        (("DSGVO", "article", "13", "1", "lit. c"), "Art. 13 Abs. 1 lit. c DSGVO"),
        (("DSGVO", "article", "30", "", ""), "Art. 30 DSGVO"),
        # Swiss law: display name + Art.
        (("RevDSG", "article", "12", "", ""), "Art. 12 RevDSG"),
    )
    for args, label in cases:
        assert L.format_article_label(*args) == label


def test_compute_chunk_hash_stable_and_ws_insensitive():
    """Trailing whitespace does not change the 64-char hex digest."""
    with_newline = L.compute_chunk_hash("Hallo Welt\n")
    plain = L.compute_chunk_hash("Hallo Welt")
    assert with_newline == plain
    assert len(with_newline) == 64


def test_deterministic_point_id_stable():
    """Same inputs -> same ID; a different chunk index -> different ID."""
    first = L.deterministic_point_id("BDSG", "38", "1", 0, "2026-06-19")
    repeat = L.deterministic_point_id("BDSG", "38", "1", 0, "2026-06-19")
    other_chunk = L.deterministic_point_id("BDSG", "38", "1", 1, "2026-06-19")
    assert first == repeat
    assert first != other_chunk
    assert len(first) == 36  # UUID


def test_build_legal_fields_citation_style_override():
    """An explicit per-regulation style wins over the '§' in the section string."""
    raw_meta = {"section": "§ 4", "section_title": "Begriffsbestimmungen", "paragraph": ""}
    out = L.build_legal_fields(raw_meta, "DSGVO", citation_style="article")
    assert out["citation_style"] == "article"
    assert out["article"] == "4"
    assert out["article_label"] == "Art. 4 DSGVO"


def test_build_legal_fields_integration():
    """End-to-end field mapping for a German '§' statute."""
    raw_meta = {
        "section": "§ 38",
        "section_title": "Datenschutzbeauftragte",
        "paragraph": "(1)",
    }
    out = L.build_legal_fields(
        raw_meta,
        "bdsg",
        chunk_text="... in der Regel mindestens 20 Personen ...",
        display_name="BDSG",
    )
    expected = {
        "citation_style": "paragraph",
        "article": "38",
        "paragraph": "",  # article-precise, no guessed paragraph
        "regulation_code": "BDSG",  # field value stays upper case
        "article_label": "BDSG § 38",
        "section_header": "Datenschutzbeauftragte",
    }
    for key, value in expected.items():
        assert out[key] == value
    assert out["is_recital"] is False


def test_build_legal_fields_camelcase_display_name():
    """Mixed-case display name survives in the label; regulation_code stays upper case."""
    raw_meta = {"section": "§ 1", "section_title": "Haftung"}
    out = L.build_legal_fields(raw_meta, "PRODHAFTG", display_name="ProdHaftG")
    assert out["regulation_code"] == "PRODHAFTG"
    assert out["article_label"] == "ProdHaftG § 1"


def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
    """Ruling: a cross reference like '§ 87 BetrVG' must NOT create a label 'AZ § 87'."""
    raw_meta = {"section": "§ 87", "section_title": "Mitbestimmung"}
    out = L.build_legal_fields(
        raw_meta,
        "BAG_1_ABR_22_21",
        source_type="urteil",
        display_name="BAG, 1 ABR 22/21",
    )
    assert out["article"] == ""
    assert out["article_label"] == "BAG, 1 ABR 22/21"
    assert out["regulation_code"] == "BAG_1_ABR_22_21"