feat(qa): recital detection, review split, duplicate comparison
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Add _detect_recital() to the QA pipeline. It flags controls whose source_original_text contains Erwägungsgrund (recital) markers instead of article text (28% of controls with source text are affected).

- Recital detection via regex + phrase matching in QA validation
- 10 new tests (TestRecitalDetection), 81 total
- ReviewCompare component for side-by-side duplicate comparison
- Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs
- MkDocs: recital detection documentation
- Detection script for bulk analysis (scripts/find_recital_controls.py)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
79
scripts/find_recital_controls.py
Normal file
79
scripts/find_recital_controls.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
|
||||
instead of actual article text — indicates wrong article tagging in RAG chunks."""
|
||||
|
||||
import sqlalchemy
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
# Standalone recital number followed by a line break, e.g. "(125)\n".
# The pattern is not anchored, so the marker may appear anywhere in a line
# as long as only whitespace follows it before the newline.
recital_re = re.compile(r'\((\d{1,3})\)\s*\n')

# Controls that carry both a source text and a citation; the column order
# here is the tuple layout expected by find_recital_suspects().
CONTROLS_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state,
           generation_metadata::text
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""


def _cell(value):
    """Return *value* as a string for tabular output, mapping None to ""."""
    return "" if value is None else str(value)


def _parse_citation(citation_json):
    """Decode the citation JSON text into a dict.

    Returns an empty dict when the text is empty or decodes to something
    other than a JSON object (e.g. JSON ``null``), so callers can always
    use ``.get()``. Malformed JSON still raises ``ValueError``.
    """
    if not citation_json:
        return {}
    citation = json.loads(citation_json)
    return citation if isinstance(citation, dict) else {}


def find_recital_suspects(rows):
    """Collect controls whose source text contains recital markers.

    Args:
        rows: iterable of 7-tuples in CONTROLS_QUERY column order
            (control_id, title, citation JSON text, source text,
            pipeline_version, release_state, generation metadata text).

    Returns:
        A list of dicts, one per suspicious control, in input order, with
        the keys ``control_id``, ``title`` (truncated to 80 characters),
        ``claimed_article``, ``claimed_paragraph``, ``recitals_found``
        (at most the first five recital numbers), ``v`` and ``state``.
    """
    suspects = []
    for cid, title, citation_json, orig, pv, state, _meta_json in rows:
        if not orig:
            continue

        recital_matches = recital_re.findall(orig)
        if not recital_matches:
            continue

        citation = _parse_citation(citation_json)
        suspects.append({
            "control_id": cid,
            # NULL titles and JSON-null citation fields become "" so that
            # slicing and column formatting cannot fail on None.
            "title": _cell(title)[:80],
            "claimed_article": _cell(citation.get("article")),
            "claimed_paragraph": _cell(citation.get("paragraph")),
            "recitals_found": recital_matches[:5],
            "v": pv,
            "state": state,
        })
    return suspects


def format_report(total_checked, suspects):
    """Build the report as a list of output lines.

    Args:
        total_checked: number of controls that were examined.
        suspects: list of dicts as returned by find_recital_suspects().

    Returns:
        The summary lines, followed by a table (header, rule, one row per
        suspect) when there is at least one suspect.
    """
    lines = [
        "=== Ergebnis ===",
        f"Geprueft: {total_checked} Controls mit source_original_text",
        f"Erwaegungsgrund-Verdacht: {len(suspects)}",
        "",
    ]
    if not suspects:
        return lines

    lines.append(
        f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel"
    )
    lines.append("-" * 120)
    for s in suspects:
        recital_col = "(" + ",".join(s["recitals_found"]) + ")"
        # Article (10) + space + paragraph (7) fill the 18-wide "Behauptet"
        # column; the recital cell is padded to the 20-wide header so the
        # following columns line up with their headings.
        lines.append(
            f"{_cell(s['control_id']):<12} "
            f"{_cell(s['claimed_article']):<10} "
            f"{_cell(s['claimed_paragraph']):<7} "
            f"{recital_col:<20} "
            f"v{s['v']} "
            f"{_cell(s['state']):<15} "
            f"{s['title']}"
        )
    return lines


def main():
    """Query the controls from DATABASE_URL and print the recital report.

    Exits with status 1 when DATABASE_URL is not set.
    """
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        # SystemExit instead of exit(): the latter is an interactive helper
        # injected by the site module and is not guaranteed to exist.
        raise SystemExit(1)

    engine = sqlalchemy.create_engine(url)
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        rows = conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()

    for line in format_report(len(rows), find_recital_suspects(rows)):
        print(line)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user