fix(pipeline): improve quality metrics heuristics
- Fix truncated title detection: only flag near-200-char titles or mid-word cutoffs
- Fix evidence leak detection: match evidence patterns at the start of the title instead of keyword substrings anywhere in it
(the verb "nachweisen" is a valid action; "Nachweis vorliegen" describes evidence)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2392,9 +2392,13 @@ async def get_quality_metrics(
|
||||
|
||||
duplicate_rate = round(dup_rows / max(total_with_mk, 1) * 100, 1)
|
||||
|
||||
# Evidence leak rate: controls that look like evidence
|
||||
evidence_keywords = ("nachweis", "screenshot", "export", "zertifizierung",
|
||||
"auditbericht", "prüfbericht", "protokoll")
|
||||
# Evidence leak rate: controls whose TITLE starts with evidence-type nouns
|
||||
# "nachweisen" (verb) is a valid action; "Nachweis vorliegen" is evidence
|
||||
evidence_start_patterns = (
|
||||
"nachweis vorli", "nachweis muss vorl", "screenshot",
|
||||
"export der", "auditbericht", "prüfbericht",
|
||||
"log-auszug", "jira-ticket", "sbom-nachweis",
|
||||
)
|
||||
evidence_count = 0
|
||||
rows = db.execute(text(f"""
|
||||
SELECT title FROM compliance.canonical_controls cc
|
||||
@@ -2404,17 +2408,23 @@ async def get_quality_metrics(
|
||||
""")).fetchall()
|
||||
for row in rows:
|
||||
title_lower = (row[0] or "").lower()
|
||||
if any(kw in title_lower for kw in evidence_keywords):
|
||||
if any(title_lower.startswith(p) for p in evidence_start_patterns):
|
||||
evidence_count += 1
|
||||
evidence_leak_rate = round(evidence_count / max(total, 1) * 100, 1)
|
||||
|
||||
# Truncated title rate: titles ending mid-word (heuristic)
|
||||
# Truncated title rate: titles cut off mid-word
|
||||
# True truncation: ends with incomplete word fragments (no space before last char)
|
||||
# NOT truncation: German titles ending with complete verbs (infinitives etc.)
|
||||
truncated_count = 0
|
||||
for row in rows:
|
||||
title = (row[0] or "").strip()
|
||||
if title and len(title) >= 75:
|
||||
# Likely truncated if it's close to max and doesn't end with a word boundary
|
||||
if not title[-1] in ".!?)\"'":
|
||||
if title and len(title) >= 195:
|
||||
# Near the 200-char limit — likely truncated
|
||||
truncated_count += 1
|
||||
elif title and len(title) >= 75:
|
||||
# Check for mid-word cutoff: last "word" has <3 chars and title is long
|
||||
last_word = title.split()[-1] if title.split() else ""
|
||||
if len(last_word) <= 2 and not last_word.endswith((".", "!", "?")):
|
||||
truncated_count += 1
|
||||
truncated_title_rate = round(truncated_count / max(total, 1) * 100, 1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user