Compare commits

...

3 Commits

Author SHA1 Message Date
Benjamin Admin
447ec08509 Add migration 082: widen source_article to TEXT, fix pass0b query filters
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 40s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 5s
- source_article/source_regulation VARCHAR(100) → TEXT for long NIST refs
- Pass 0b NOT EXISTS queries now skip deprecated/duplicate controls
- Duplicate Guard excludes deprecated/duplicate from existence check

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 12:47:26 +01:00
Benjamin Admin
8cb1dc1108 Fix pass0b queries to skip deprecated/duplicate controls
The NOT EXISTS check and Duplicate Guard now exclude deprecated and
duplicate controls, enabling clean re-runs after invalidation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 09:09:16 +01:00
Benjamin Admin
f8d9919b97 Improve object normalization: shorter keys, synonym expansion, qualifier stripping
- Truncate object keys to 40 chars (was 80) at underscore boundary
- Strip German qualifying prepositional phrases (bei/für/gemäß/von/zur/...)
- Add 65 new synonym mappings for near-duplicate patterns found in analysis
- Strip trailing noise tokens (articles/prepositions)
- Add _truncate_at_boundary() helper and _QUALIFYING_PHRASE_RE regex
- 11 new tests for normalization improvements (227 total pass)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 08:55:48 +01:00
3 changed files with 201 additions and 2 deletions

View File

@@ -1415,6 +1415,74 @@ _OBJECT_SYNONYMS: dict[str, str] = {
"zugriff": "access_control",
"einwilligung": "consent",
"zustimmung": "consent",
# Near-synonym expansions found via heavy-control analysis (2026-03-28)
"erkennung": "detection",
"früherkennung": "detection",
"frühzeitige erkennung": "detection",
"frühzeitigen erkennung": "detection",
"detektion": "detection",
"eskalation": "escalation",
"eskalationsprozess": "escalation",
"eskalationsverfahren": "escalation",
"benachrichtigungsprozess": "notification",
"benachrichtigungsverfahren": "notification",
"meldeprozess": "notification",
"meldeverfahren": "notification",
"meldesystem": "notification",
"benachrichtigungssystem": "notification",
"überwachung": "monitoring",
"monitoring": "monitoring",
"kontinuierliche überwachung": "monitoring",
"laufende überwachung": "monitoring",
"prüfung": "audit",
"überprüfung": "audit",
"kontrolle": "control_check",
"sicherheitskontrolle": "control_check",
"dokumentation": "documentation",
"aufzeichnungspflicht": "documentation",
"protokollierung": "logging",
"logführung": "logging",
"logmanagement": "logging",
"wiederherstellung": "recovery",
"notfallwiederherstellung": "recovery",
"disaster recovery": "recovery",
"notfallplan": "contingency_plan",
"notfallplanung": "contingency_plan",
"wiederanlaufplan": "contingency_plan",
"klassifizierung": "classification",
"kategorisierung": "classification",
"einstufung": "classification",
"segmentierung": "segmentation",
"netzwerksegmentierung": "segmentation",
"netzwerk-segmentierung": "segmentation",
"trennung": "segmentation",
"isolierung": "isolation",
"patch": "patch_mgmt",
"patchmanagement": "patch_mgmt",
"patch-management": "patch_mgmt",
"aktualisierung": "patch_mgmt",
"softwareaktualisierung": "patch_mgmt",
"härtung": "hardening",
"systemhärtung": "hardening",
"härtungsmaßnahme": "hardening",
"löschung": "deletion",
"datenlöschung": "deletion",
"löschkonzept": "deletion",
"anonymisierung": "anonymization",
"pseudonymisierung": "pseudonymization",
"zugangssteuerung": "access_control",
"zugangskontrolle": "access_control",
"zugriffssteuerung": "access_control",
"zugriffskontrolle": "access_control",
"schlüsselmanagement": "key_mgmt",
"schlüsselverwaltung": "key_mgmt",
"key management": "key_mgmt",
"zertifikatsverwaltung": "cert_mgmt",
"zertifikatsmanagement": "cert_mgmt",
"lieferant": "vendor",
"dienstleister": "vendor",
"auftragsverarbeiter": "vendor",
"drittanbieter": "vendor",
}
@@ -1435,12 +1503,20 @@ def _normalize_object(object_raw: str) -> str:
Applies synonym mapping to collapse German terms to canonical forms
(e.g., 'Richtlinie' -> 'policy', 'Verzeichnis' -> 'register').
Then strips qualifying prepositional phrases that would create
near-duplicate keys (e.g., 'bei Schwellenwertüberschreitung').
Truncates to 40 chars to collapse overly specific variants.
"""
if not object_raw:
return "unknown"
obj_lower = object_raw.strip().lower()
# Strip qualifying prepositional phrases that don't change core identity.
# These create near-duplicate keys like "eskalationsprozess" vs
# "eskalationsprozess bei schwellenwertüberschreitung".
obj_lower = _QUALIFYING_PHRASE_RE.sub("", obj_lower).strip()
# Synonym mapping — find the longest matching synonym
best_match = ""
best_canonical = ""
@@ -1456,7 +1532,54 @@ def _normalize_object(object_raw: str) -> str:
for src, dst in [("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")]:
obj = obj.replace(src, dst)
obj = re.sub(r"[^a-z0-9_]", "", obj)
return obj[:80] or "unknown"
# Strip trailing noise tokens (articles/prepositions stuck at the end)
obj = re.sub(r"(_(?:der|die|das|des|dem|den|fuer|bei|von|zur|zum|mit|auf|in|und|oder|aus|an|ueber|nach|gegen|unter|vor|zwischen|als|durch|ohne|wie))+$", "", obj)
# Truncate at 40 chars (at underscore boundary) to collapse
# overly specific suffixes that create near-duplicate keys.
obj = _truncate_at_boundary(obj, 40)
return obj or "unknown"
# Regex to strip German qualifying prepositional phrases from object text.
# Matches patterns like "bei schwellenwertüberschreitung",
# "für kritische systeme", "gemäß artikel 32" etc.
# NOTE: every alternative is anchored by a leading "\s+", so a phrase at the
# very start of the string is never stripped — only trailing qualifiers.
# The final ".*$" swallows everything after the first matching preposition,
# collapsing e.g. "eskalationsprozess bei X und Y" down to
# "eskalationsprozess".
_QUALIFYING_PHRASE_RE = re.compile(
r"\s+(?:"
r"bei\s+\w+"
r"|für\s+(?:die\s+|den\s+|das\s+|kritische\s+)?\w+"
r"|gemäß\s+\w+"
r"|nach\s+\w+"
r"|von\s+\w+"
r"|im\s+(?:falle?\s+|rahmen\s+)?\w+"
r"|mit\s+(?:den\s+|der\s+|dem\s+)?\w+"
r"|auf\s+(?:basis|grundlage)\s+\w+"
# "zur" is only stripped with a known purpose-noun, to avoid eating
# object-defining phrases; "\s*\w*" lets the bare noun match on its own.
r"|zur\s+(?:einhaltung|sicherstellung|gewährleistung|vermeidung|erfüllung)\s*\w*"
r"|durch\s+(?:den\s+|die\s+|das\s+)?\w+"
r"|über\s+(?:den\s+|die\s+|das\s+)?\w+"
r"|unter\s+\w+"
r"|zwischen\s+\w+"
r"|innerhalb\s+\w+"
r"|gegenüber\s+\w+"
r"|hinsichtlich\s+\w+"
r"|bezüglich\s+\w+"
r"|einschließlich\s+\w+"
r").*$",
# Input is lowercased upstream before substitution; IGNORECASE is a
# safety net for any future caller passing mixed-case text.
re.IGNORECASE,
)
def _truncate_at_boundary(text: str, max_len: int) -> str:
"""Truncate text at the last underscore boundary within max_len."""
if len(text) <= max_len:
return text
truncated = text[:max_len]
last_sep = truncated.rfind("_")
if last_sep > max_len // 2:
return truncated[:last_sep]
return truncated
# ── 7b. Framework / Composite Detection ──────────────────────────────────
@@ -2327,6 +2450,7 @@ class DecompositionPass:
SELECT 1 FROM canonical_controls ac
WHERE ac.parent_control_uuid = oc.parent_control_uuid
AND ac.decomposition_method = 'pass0b'
AND ac.release_state NOT IN ('deprecated', 'duplicate')
AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
)
"""
@@ -2902,7 +3026,7 @@ class DecompositionPass:
SELECT id::text FROM canonical_controls
WHERE parent_control_uuid = CAST(:parent AS uuid)
AND generation_metadata->>'merge_group_hint' = :hint
AND release_state != 'rejected'
AND release_state NOT IN ('rejected', 'deprecated', 'duplicate')
LIMIT 1
"""),
{"parent": parent_uuid, "hint": merge_hint},
@@ -3168,6 +3292,7 @@ class DecompositionPass:
SELECT 1 FROM canonical_controls ac
WHERE ac.parent_control_uuid = oc.parent_control_uuid
AND ac.decomposition_method = 'pass0b'
AND ac.release_state NOT IN ('deprecated', 'duplicate')
AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
)
"""

View File

@@ -0,0 +1,4 @@
-- Migration 082: widen control_parent_links reference columns.
-- source_article and source_regulation were VARCHAR(100), which overflows on
-- long NIST references such as
-- "SC-22 (und weitere redaktionelle Änderungen SC-7, SC-14, SC-17, ...)".
-- TEXT removes the length cap; no data rewrite or semantic change otherwise.
ALTER TABLE control_parent_links ALTER COLUMN source_article TYPE TEXT;
ALTER TABLE control_parent_links ALTER COLUMN source_regulation TYPE TEXT;

View File

@@ -974,6 +974,76 @@ class TestObjectNormalization:
assert "ue" in result
assert "ä" not in result
# --- New tests for improved normalization (2026-03-28) ---
def test_qualifying_phrase_stripped(self):
    """A trailing 'bei X' qualifier must not change the normalized key."""
    plain = _normalize_object("Eskalationsprozess")
    with_qualifier = _normalize_object(
        "Eskalationsprozess bei Schwellenwertüberschreitung"
    )
    assert with_qualifier == plain
def test_fuer_phrase_stripped(self):
    """A trailing 'für kritische Systeme' qualifier must be dropped."""
    expected = _normalize_object("Backup-Verfahren")
    actual = _normalize_object("Backup-Verfahren für kritische Systeme")
    assert actual == expected
def test_gemaess_phrase_stripped(self):
    """A trailing 'gemäß Artikel 32' qualifier must be dropped."""
    expected = _normalize_object("Verschlüsselung")
    actual = _normalize_object("Verschlüsselung gemäß Artikel 32")
    assert actual == expected
def test_truncation_at_40_chars(self):
    """Overlong object keys are capped at 40 characters."""
    verbose_key = (
        "interner_eskalationsprozess_bei_schwellenwertueberschreitung_und_mehr"
    )
    normalized = _normalize_object(verbose_key)
    assert len(normalized) <= 40
def test_near_synonym_erkennung(self):
    """Variants of 'Erkennung' collapse onto one canonical key."""
    first = _normalize_object("Früherkennung von Anomalien")
    second = _normalize_object("frühzeitige Erkennung von Angriffen")
    assert first == second
def test_near_synonym_eskalation(self):
    """'-prozess' and '-verfahren' escalation variants collapse."""
    first = _normalize_object("Eskalationsprozess")
    second = _normalize_object("Eskalationsverfahren")
    assert first == second
def test_near_synonym_meldeprozess(self):
    """Reporting-process variants collapse to the notification key."""
    first = _normalize_object("Meldeprozess")
    second = _normalize_object("Meldeverfahren")
    assert first == second
def test_near_synonym_ueberwachung(self):
    """German 'Überwachung' and English 'Monitoring' collapse."""
    first = _normalize_object("Überwachung")
    second = _normalize_object("Monitoring")
    assert first == second
def test_trailing_noise_stripped(self):
    """A dangling article at the end of the key is removed."""
    key = _normalize_object("Schutz der")
    suffix = "_der"
    assert not key.endswith(suffix)
def test_vendor_synonyms(self):
    """All third-party-supplier terms map onto a single vendor key."""
    keys = {
        _normalize_object(term)
        for term in ("Lieferant", "Dienstleister", "Auftragsverarbeiter")
    }
    assert len(keys) == 1
def test_patch_mgmt_synonyms(self):
    """Patch-management wording variants collapse to one key."""
    first = _normalize_object("Patchmanagement")
    second = _normalize_object("Softwareaktualisierung")
    assert first == second
# ---------------------------------------------------------------------------
# GAP 5: OUTPUT VALIDATOR TESTS