Improve object normalization: shorter keys, synonym expansion, qualifier stripping
- Truncate object keys to 40 chars (was 80) at underscore boundary
- Strip German qualifying prepositional phrases (bei/für/gemäß/von/zur/...)
- Add 65 new synonym mappings for near-duplicate patterns found in analysis
- Strip trailing noise tokens (articles/prepositions)
- Add _truncate_at_boundary() helper and _QUALIFYING_PHRASE_RE regex
- 11 new tests for normalization improvements (227 total pass)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -974,6 +974,76 @@ class TestObjectNormalization:
|
||||
assert "ue" in result
|
||||
assert "ä" not in result
|
||||
|
||||
# --- New tests for improved normalization (2026-03-28) ---
|
||||
|
||||
def test_qualifying_phrase_stripped(self):
    """A trailing 'bei ...' qualifier must not change the normalized key."""
    plain_key = _normalize_object("Eskalationsprozess")
    # Same noun, followed by a prepositional qualifier introduced by "bei".
    key_with_qualifier = _normalize_object(
        "Eskalationsprozess bei Schwellenwertüberschreitung"
    )
    assert key_with_qualifier == plain_key
|
||||
|
||||
def test_fuer_phrase_stripped(self):
    """A trailing 'für ...' qualifier must not change the normalized key."""
    plain_key = _normalize_object("Backup-Verfahren")
    key_with_qualifier = _normalize_object("Backup-Verfahren für kritische Systeme")
    assert key_with_qualifier == plain_key
|
||||
|
||||
def test_gemaess_phrase_stripped(self):
    """A trailing 'gemäß ...' qualifier must not change the normalized key."""
    plain_key = _normalize_object("Verschlüsselung")
    key_with_qualifier = _normalize_object("Verschlüsselung gemäß Artikel 32")
    assert key_with_qualifier == plain_key
|
||||
|
||||
def test_truncation_at_40_chars(self):
    """An over-long object is normalized to at most 40 characters."""
    # Already underscore-separated input that is well beyond the 40-char limit.
    oversized = "interner_eskalationsprozess_bei_schwellenwertueberschreitung_und_mehr"
    assert len(_normalize_object(oversized)) <= 40
|
||||
|
||||
def test_near_synonym_erkennung(self):
    """'Früherkennung ...' and 'frühzeitige Erkennung ...' yield the same key.

    The two inputs also carry different 'von ...' qualifiers, so equality
    additionally requires that those qualifiers do not affect the result.
    """
    compound_form = _normalize_object("Früherkennung von Anomalien")
    phrase_form = _normalize_object("frühzeitige Erkennung von Angriffen")
    assert compound_form == phrase_form
|
||||
|
||||
def test_near_synonym_eskalation(self):
    """'Eskalationsprozess' and 'Eskalationsverfahren' yield the same key."""
    process_key = _normalize_object("Eskalationsprozess")
    procedure_key = _normalize_object("Eskalationsverfahren")
    assert process_key == procedure_key
|
||||
|
||||
def test_near_synonym_meldeprozess(self):
    """'Meldeprozess' and 'Meldeverfahren' yield the same key."""
    process_key = _normalize_object("Meldeprozess")
    procedure_key = _normalize_object("Meldeverfahren")
    assert process_key == procedure_key
|
||||
|
||||
def test_near_synonym_ueberwachung(self):
    """German 'Überwachung' and English 'Monitoring' yield the same key."""
    german_key = _normalize_object("Überwachung")
    english_key = _normalize_object("Monitoring")
    assert german_key == english_key
|
||||
|
||||
def test_trailing_noise_stripped(self):
    """A dangling article at the end must not survive as a '_der' suffix."""
    normalized = _normalize_object("Schutz der")
    assert not normalized.endswith("_der")
|
||||
|
||||
def test_vendor_synonyms(self):
    """Lieferant, Dienstleister and Auftragsverarbeiter share one key."""
    supplier_key = _normalize_object("Lieferant")
    provider_key = _normalize_object("Dienstleister")
    processor_key = _normalize_object("Auftragsverarbeiter")
    # All three terms must normalize to the identical key.
    assert supplier_key == provider_key == processor_key
|
||||
|
||||
def test_patch_mgmt_synonyms(self):
    """'Patchmanagement' and 'Softwareaktualisierung' yield the same key."""
    patch_key = _normalize_object("Patchmanagement")
    update_key = _normalize_object("Softwareaktualisierung")
    assert patch_key == update_key
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GAP 5: OUTPUT VALIDATOR TESTS
|
||||
|
||||
Reference in New Issue
Block a user