Improve object normalization: shorter keys, synonym expansion, qualifier stripping

- Truncate object keys to 40 chars (was 80) at underscore boundary
- Strip German qualifying prepositional phrases (bei/für/gemäß/von/zur/...)
- Add 65 new synonym mappings for near-duplicate patterns found in analysis
- Strip trailing noise tokens (articles/prepositions)
- Add _truncate_at_boundary() helper and _QUALIFYING_PHRASE_RE regex
- 11 new tests for normalization improvements (227 total pass)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-28 08:55:48 +01:00
parent fb2cf29b34
commit f8d9919b97
2 changed files with 194 additions and 1 deletions

View File

@@ -974,6 +974,76 @@ class TestObjectNormalization:
assert "ue" in result
assert "ä" not in result
# --- New tests for improved normalization (2026-03-28) ---
def test_qualifying_phrase_stripped(self):
    """An object with a trailing 'bei ...' qualifier normalizes like the bare object."""
    bare_text = "Eskalationsprozess"
    text_with_qualifier = "Eskalationsprozess bei Schwellenwertüberschreitung"
    # The bare form is normalized first, then the qualified one.
    assert _normalize_object(bare_text) == _normalize_object(text_with_qualifier)
def test_fuer_phrase_stripped(self):
    """Appending 'für kritische Systeme' must not change the normalized result."""
    expected = _normalize_object("Backup-Verfahren")
    actual = _normalize_object("Backup-Verfahren für kritische Systeme")
    assert expected == actual
def test_gemaess_phrase_stripped(self):
    """Appending 'gemäß Artikel 32' must not change the normalized result."""
    without_reference, with_reference = (
        _normalize_object(text)
        for text in ("Verschlüsselung", "Verschlüsselung gemäß Artikel 32")
    )
    assert without_reference == with_reference
def test_truncation_at_40_chars(self):
    """The normalized form of an over-long object is at most 40 characters long."""
    oversized = "interner_eskalationsprozess_bei_schwellenwertueberschreitung_und_mehr"
    normalized = _normalize_object(oversized)
    # Only the length limit is checked here, not where the cut happens.
    assert len(normalized) <= 40
def test_near_synonym_erkennung(self):
    """'Früherkennung von ...' and 'frühzeitige Erkennung von ...' normalize identically."""
    compound_form = _normalize_object("Früherkennung von Anomalien")
    phrase_form = _normalize_object("frühzeitige Erkennung von Angriffen")
    assert compound_form == phrase_form
def test_near_synonym_eskalation(self):
    """'Eskalationsprozess' and 'Eskalationsverfahren' normalize identically."""
    assert _normalize_object("Eskalationsprozess") == _normalize_object(
        "Eskalationsverfahren"
    )
def test_near_synonym_meldeprozess(self):
    """'Meldeprozess' and 'Meldeverfahren' normalize identically."""
    process_key, procedure_key = (
        _normalize_object(term) for term in ("Meldeprozess", "Meldeverfahren")
    )
    assert process_key == procedure_key
def test_near_synonym_ueberwachung(self):
    """The German 'Überwachung' and the English 'Monitoring' normalize identically."""
    german_key = _normalize_object("Überwachung")
    english_key = _normalize_object("Monitoring")
    assert german_key == english_key
def test_trailing_noise_stripped(self):
    """Normalizing 'Schutz der' must not leave a dangling '_der' suffix."""
    normalized = _normalize_object("Schutz der")
    has_dangling_article = normalized.endswith("_der")
    assert not has_dangling_article
def test_vendor_synonyms(self):
    """'Lieferant', 'Dienstleister' and 'Auftragsverarbeiter' all normalize identically."""
    terms = ("Lieferant", "Dienstleister", "Auftragsverarbeiter")
    # Terms are normalized in the listed order.
    supplier, provider, processor = [_normalize_object(term) for term in terms]
    assert supplier == provider == processor
def test_patch_mgmt_synonyms(self):
    """'Patchmanagement' and 'Softwareaktualisierung' normalize identically."""
    patch_key = _normalize_object("Patchmanagement")
    update_key = _normalize_object("Softwareaktualisierung")
    assert patch_key == update_key
# ---------------------------------------------------------------------------
# GAP 5: OUTPUT VALIDATOR TESTS