fix(pipeline): improve quality metrics heuristics

- Fix truncated title detection: only flag near-200-char titles or mid-word
  cutoffs
- Fix evidence leak detection: check title start patterns, not keyword
  substrings (the verb "nachweisen" is a valid action; "Nachweis vorliegen"
  is evidence)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-26 09:53:52 +02:00
parent d583971afd
commit 5aaa62dca7
1 changed files with 18 additions and 8 deletions
@@ -2392,9 +2392,13 @@ async def get_quality_metrics(

        duplicate_rate = round(dup_rows / max(total_with_mk, 1) * 100, 1)

-        # Evidence leak rate: controls that look like evidence
-        evidence_keywords = ("nachweis", "screenshot", "export", "zertifizierung",
-                             "auditbericht", "prüfbericht", "protokoll")
+        # Evidence leak rate: controls whose TITLE starts with evidence-type nouns
+        # "nachweisen" (verb) is a valid action; "Nachweis vorliegen" is evidence
+        evidence_start_patterns = (
+            "nachweis vorli", "nachweis muss vorl", "screenshot",
+            "export der", "auditbericht", "prüfbericht",
+            "log-auszug", "jira-ticket", "sbom-nachweis",
+        )
        evidence_count = 0
        rows = db.execute(text(f"""
            SELECT title FROM compliance.canonical_controls cc
@@ -2404,17 +2408,23 @@ async def get_quality_metrics(
        """)).fetchall()
        for row in rows:
            title_lower = (row[0] or "").lower()
-            if any(kw in title_lower for kw in evidence_keywords):
+            if any(title_lower.startswith(p) for p in evidence_start_patterns):
                evidence_count += 1
        evidence_leak_rate = round(evidence_count / max(total, 1) * 100, 1)

-        # Truncated title rate: titles ending mid-word (heuristic)
+        # Truncated title rate: titles cut off mid-word
+        # True truncation: ends with incomplete word fragments (no space before last char)
+        # NOT truncation: German titles ending with complete verbs (infinitives etc.)
        truncated_count = 0
        for row in rows:
            title = (row[0] or "").strip()
-            if title and len(title) >= 75:
-                # Likely truncated if it's close to max and doesn't end with a word boundary
-                if not title[-1] in ".!?)\"'":
+            if title and len(title) >= 195:
+                # Near the 200-char limit — likely truncated
+                truncated_count += 1
+            elif title and len(title) >= 75:
+                # Check for mid-word cutoff: last "word" has <3 chars and title is long
+                last_word = title.split()[-1] if title.split() else ""
+                if len(last_word) <= 2 and not last_word.endswith((".", "!", "?")):
                    truncated_count += 1
        truncated_title_rate = round(truncated_count / max(total, 1) * 100, 1)