Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 34s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 20s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Add _detect_recital() to QA pipeline — flags controls where source_original_text contains Erwägungsgrund markers instead of article text (28% of controls with source text affected).

- Recital detection via regex + phrase matching in QA validation
- 10 new tests (TestRecitalDetection), 81 total
- ReviewCompare component for side-by-side duplicate comparison
- Review mode split: Duplikat-Verdacht vs Rule-3-ohne-Anchor tabs
- MkDocs: recital detection documentation
- Detection script for bulk analysis (scripts/find_recital_controls.py)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
80 lines
2.9 KiB
Python
80 lines
2.9 KiB
Python
"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
|
|
instead of actual article text — indicates wrong article tagging in RAG chunks."""
|
|
|
|
import json
import os
import re
import sys

import sqlalchemy

# Recital marker: a parenthesised 1-3 digit number followed only by optional
# whitespace and a newline, e.g. "(125)\n". The pattern is NOT anchored to the
# start of a line; it matches anywhere in the text.
RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

# Controls that have both a source text and a citation. JSON columns are cast
# to text so they can be decoded explicitly with json.loads().
CONTROLS_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state,
           generation_metadata::text
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""

# Column widths here must stay in sync with format_row().
REPORT_HEADER = (
    f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel"
)


def _cell(value):
    """Return *value* as a string suitable for a padded report column.

    ``None`` (SQL NULL / JSON null) becomes an empty string so that string
    format specs such as ``:<10`` do not raise ``TypeError``.
    """
    return "" if value is None else str(value)


def find_recital_markers(text):
    """Return the recital numbers (as strings) found in *text*.

    An empty or ``None`` text yields an empty list.
    """
    if not text:
        return []
    return RECITAL_RE.findall(text)


def build_suspect(row):
    """Build a report entry for one result row of CONTROLS_QUERY.

    Args:
        row: 7-item sequence ``(control_id, title, citation_json,
            source_original_text, pipeline_version, release_state,
            generation_metadata_json)``.

    Returns:
        A dict describing the suspect control, or ``None`` when the source
        text contains no recital marker.
    """
    cid, title, citation_json, orig, pv, state, _meta_json = row

    recitals = find_recital_markers(orig)
    if not recitals:
        return None

    citation = json.loads(citation_json) if citation_json else {}
    if not isinstance(citation, dict):
        # A citation stored as a JSON list/string has no article/paragraph keys.
        citation = {}

    return {
        "control_id": _cell(cid),
        "title": _cell(title)[:80],
        "claimed_article": _cell(citation.get("article")),
        "claimed_paragraph": _cell(citation.get("paragraph")),
        "recitals_found": recitals[:5],  # cap to keep the report line short
        "v": pv,
        "state": _cell(state),
    }


def format_row(suspect):
    """Format one suspect dict as a report line aligned with REPORT_HEADER."""
    recitals = "(" + ",".join(suspect["recitals_found"]) + ")"
    # Article (10) + space + paragraph (7) together span the 18-wide
    # "Behauptet" header column; the recital list is padded to the 20-wide
    # "Recitals" header column.
    return (
        f"{suspect['control_id']:<12} {suspect['claimed_article']:<10} "
        f"{suspect['claimed_paragraph']:<7} {recitals:<20} "
        f"v{suspect['v']} {suspect['state']:<15} {suspect['title']}"
    )


def fetch_controls(url):
    """Return all rows of CONTROLS_QUERY from the database at *url*."""
    engine = sqlalchemy.create_engine(url)
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        return conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()


def print_report(total, suspects):
    """Print the summary and, if there are any, the table of suspect controls."""
    print("=== Ergebnis ===")
    print(f"Geprueft: {total} Controls mit source_original_text")
    print(f"Erwaegungsgrund-Verdacht: {len(suspects)}")
    print()

    if suspects:
        print(REPORT_HEADER)
        print("-" * 120)
        for suspect in suspects:
            print(format_row(suspect))


def main():
    """Script entry point: load controls, detect recital markers, print report.

    Exits with status 1 when DATABASE_URL is not set.
    """
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        # sys.exit instead of the site-provided exit(), which is meant for the
        # interactive interpreter and is not guaranteed to be available.
        sys.exit(1)

    rows = fetch_controls(url)
    suspects = [s for s in map(build_suspect, rows) if s is not None]
    print_report(len(rows), suspects)


if __name__ == "__main__":
    main()