fix(pipeline): improve quality metrics heuristics

- Fix truncated title detection: only flag near-200-char titles or mid-word cutoffs
- Fix evidence leak detection: match patterns at the start of the title, not keyword substrings
  (the verb "nachweisen" is a valid action; "Nachweis vorliegen" is evidence)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-26 09:53:52 +02:00
parent d583971afd
commit 5aaa62dca7

View File

@@ -2392,9 +2392,13 @@ async def get_quality_metrics(
duplicate_rate = round(dup_rows / max(total_with_mk, 1) * 100, 1)
# Evidence leak rate: controls that look like evidence
evidence_keywords = ("nachweis", "screenshot", "export", "zertifizierung",
"auditbericht", "prüfbericht", "protokoll")
# Evidence leak rate: controls whose TITLE starts with evidence-type nouns
# "nachweisen" (verb) is a valid action; "Nachweis vorliegen" is evidence
evidence_start_patterns = (
"nachweis vorli", "nachweis muss vorl", "screenshot",
"export der", "auditbericht", "prüfbericht",
"log-auszug", "jira-ticket", "sbom-nachweis",
)
evidence_count = 0
rows = db.execute(text(f"""
SELECT title FROM compliance.canonical_controls cc
@@ -2404,17 +2408,23 @@ async def get_quality_metrics(
""")).fetchall()
for row in rows:
title_lower = (row[0] or "").lower()
if any(kw in title_lower for kw in evidence_keywords):
if any(title_lower.startswith(p) for p in evidence_start_patterns):
evidence_count += 1
evidence_leak_rate = round(evidence_count / max(total, 1) * 100, 1)
# Truncated title rate: titles ending mid-word (heuristic)
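# Truncated title rate: titles that appear cut off (heuristic)
# Counted as truncated: length >= 195 (near the 200-char limit), or length >= 75
# with a final whitespace-separated token of at most 2 chars that does not end in ".", "!" or "?"
# NOT truncation: German titles ending with complete verbs (infinitives etc.)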
truncated_count = 0
for row in rows:
title = (row[0] or "").strip()
if title and len(title) >= 75:
# Likely truncated if it's close to max and doesn't end with a word boundary
if not title[-1] in ".!?)\"'":
if title and len(title) >= 195:
# Near the 200-char limit — likely truncated
truncated_count += 1
elif title and len(title) >= 75:
# Check for mid-word cutoff: last "word" has <3 chars and title is long
last_word = title.split()[-1] if title.split() else ""
if len(last_word) <= 2 and not last_word.endswith((".", "!", "?")):
truncated_count += 1
truncated_title_rate = round(truncated_count / max(total, 1) * 100, 1)