feat(rag): regulation_short Casing-Normalisierung am Ingest-Rand
Der Re-Ingest leitet regulation_short z. T. per title()-Casing aus Dateinamen ab
('dsgvo' -> 'Dsgvo', 'osha otm' -> 'Osha Otm'). Dadurch landen falsch geschriebene
Akronyme sowohl im Payload als auch im article_label ('Art. 37 Dsgvo').
NEU: normalize_regulation_short() in legal_metadata arbeitet token-basiert mit einem
kuratierten Akronym-Set: Nur gelistete Akronyme werden grossgeschrieben, legitimes
Mixed-Case (GeschGehG, MuSchG, GoBD, MiCA, eIDAS, EuGH) bleibt unberuehrt.
Die Funktion wird am Ingest-Rand in documents.py angewandt und greift damit fuer das
Payload-Feld und den display_name.
+13 Tests gruen. Bestandsdaten brauchen einen separaten, einmaligen Qdrant-Patch.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -113,3 +113,27 @@ def test_build_legal_fields_ruling_uses_aktenzeichen_not_paragraph():
|
||||
assert out["article"] == ""
|
||||
assert out["article_label"] == "BAG, 1 ABR 22/21"
|
||||
assert out["regulation_code"] == "BAG_1_ABR_22_21"
|
||||
|
||||
|
||||
def test_normalize_regulation_short():
    """Check ``L.normalize_regulation_short`` against a table of inputs.

    Three groups are covered, in this order:

    * title-cased acronyms that must come back fully upper-cased, token by
      token, while the remaining tokens are left as they are;
    * legitimate mixed-case names that must be returned unchanged;
    * a whole-value override plus the empty inputs (``""`` and ``None``),
      both of which must yield the empty string.
    """
    normalize = L.normalize_regulation_short

    # Title-cased acronyms -> expected upper-case form (token based).
    acronym_cases = (
        ("Dsgvo", "DSGVO"),
        ("Gpsr", "GPSR"),
        ("Ifrs", "IFRS"),
        ("EU Mdr", "EU MDR"),
        ("OWASP Api TOP10", "OWASP API TOP10"),
        (
            "Osha Otm Section-4-safety-hazards Chapter-1",
            "OSHA OTM Section-4-safety-hazards Chapter-1",
        ),
    )
    for raw, expected in acronym_cases:
        assert normalize(raw) == expected

    # Legitimate mixed case must pass through untouched.
    mixed_case_values = (
        "GeschGehG",
        "MuSchG",
        "GoBD",
        "MiCA",
        "eIDAS",
        "EuGH C-252/21 (Meta)",
    )
    for value in mixed_case_values:
        assert normalize(value) == value

    # Whole-value override and empty inputs.
    special_cases = (
        ("Dataact", "Data Act"),
        ("", ""),
        (None, ""),
    )
    for raw, expected in special_cases:
        assert normalize(raw) == expected
|
||||
|
||||
Reference in New Issue
Block a user