fix(pipeline): improve quality metrics heuristics
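- Fix truncated title detection: only flag titles near the 200-char limit (195+ chars) or long titles (75+ chars) that end in a mid-word cutoff (last word of at most 2 chars without closing punctuation)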
- Fix evidence leak detection: match evidence-type patterns at the start of the title instead of matching keywords as substrings anywhere in it
(the verb "nachweisen" is a valid action, whereas "Nachweis vorliegen" is evidence)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2392,9 +2392,13 @@ async def get_quality_metrics(
|
|||||||
|
|
||||||
duplicate_rate = round(dup_rows / max(total_with_mk, 1) * 100, 1)
|
duplicate_rate = round(dup_rows / max(total_with_mk, 1) * 100, 1)
|
||||||
|
|
||||||
# Evidence leak rate: controls that look like evidence
|
# Evidence leak rate: controls whose TITLE starts with evidence-type nouns
|
||||||
evidence_keywords = ("nachweis", "screenshot", "export", "zertifizierung",
|
# "nachweisen" (verb) is a valid action; "Nachweis vorliegen" is evidence
|
||||||
"auditbericht", "prüfbericht", "protokoll")
|
evidence_start_patterns = (
|
||||||
|
"nachweis vorli", "nachweis muss vorl", "screenshot",
|
||||||
|
"export der", "auditbericht", "prüfbericht",
|
||||||
|
"log-auszug", "jira-ticket", "sbom-nachweis",
|
||||||
|
)
|
||||||
evidence_count = 0
|
evidence_count = 0
|
||||||
rows = db.execute(text(f"""
|
rows = db.execute(text(f"""
|
||||||
SELECT title FROM compliance.canonical_controls cc
|
SELECT title FROM compliance.canonical_controls cc
|
||||||
@@ -2404,17 +2408,23 @@ async def get_quality_metrics(
|
|||||||
""")).fetchall()
|
""")).fetchall()
|
||||||
for row in rows:
|
for row in rows:
|
||||||
title_lower = (row[0] or "").lower()
|
title_lower = (row[0] or "").lower()
|
||||||
if any(kw in title_lower for kw in evidence_keywords):
|
if any(title_lower.startswith(p) for p in evidence_start_patterns):
|
||||||
evidence_count += 1
|
evidence_count += 1
|
||||||
evidence_leak_rate = round(evidence_count / max(total, 1) * 100, 1)
|
evidence_leak_rate = round(evidence_count / max(total, 1) * 100, 1)
|
||||||
|
|
||||||
# Truncated title rate: titles ending mid-word (heuristic)
|
# Truncated title rate: titles cut off mid-word
|
||||||
|
# True truncation: ends with incomplete word fragments (no space before last char)
|
||||||
|
# NOT truncation: German titles ending with complete verbs (infinitives etc.)
|
||||||
truncated_count = 0
|
truncated_count = 0
|
||||||
for row in rows:
|
for row in rows:
|
||||||
title = (row[0] or "").strip()
|
title = (row[0] or "").strip()
|
||||||
if title and len(title) >= 75:
|
if title and len(title) >= 195:
|
||||||
# Likely truncated if it's close to max and doesn't end with a word boundary
|
# Near the 200-char limit — likely truncated
|
||||||
if not title[-1] in ".!?)\"'":
|
truncated_count += 1
|
||||||
|
elif title and len(title) >= 75:
|
||||||
|
# Check for mid-word cutoff: last "word" has <3 chars and title is long
|
||||||
|
last_word = title.split()[-1] if title.split() else ""
|
||||||
|
if len(last_word) <= 2 and not last_word.endswith((".", "!", "?")):
|
||||||
truncated_count += 1
|
truncated_count += 1
|
||||||
truncated_title_rate = round(truncated_count / max(total, 1) * 100, 1)
|
truncated_title_rate = round(truncated_count / max(total, 1) * 100, 1)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user