"""Find controls whose source_original_text contains recital (Erwägungsgrund) markers.

Recital markers where actual article text is expected indicate that the RAG
chunk behind the control was tagged with the wrong article.

The script reads the connection string from the DATABASE_URL environment
variable, queries canonical_controls (search_path: compliance, public) and
prints a report to stdout. It exits with status 1 when DATABASE_URL is unset.
"""
import json
import os
import re
import sys

# A parenthesised 1-3 digit number followed only by optional whitespace and a
# newline, e.g. "(125)\n". The digits are captured. The pattern is NOT anchored
# to the start of a line, so a marker at the end of a longer line matches too.
RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

# Maximum number of recital numbers kept per control for the report.
MAX_RECITALS_SHOWN = 5

# Maximum number of title characters kept per control for the report.
MAX_TITLE_LENGTH = 80

# Column order matters: find_recital_suspects() unpacks rows positionally.
CONTROLS_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""


def _text(value) -> str:
    """Return *value* as a string, mapping None to the empty string.

    Width format specs such as ``:<10`` raise TypeError on None, so every
    value that ends up in a padded report column goes through this helper.
    """
    return "" if value is None else str(value)


def parse_citation(citation_json) -> dict:
    """Decode the JSON text of a source citation.

    Args:
        citation_json: JSON document as a string, or None/empty.

    Returns:
        The decoded object, or an empty dict when the input is empty or the
        document is valid JSON but not an object (e.g. a list or a string).

    Raises:
        json.JSONDecodeError: if the text is not valid JSON.
    """
    if not citation_json:
        return {}
    citation = json.loads(citation_json)
    return citation if isinstance(citation, dict) else {}


def find_recital_suspects(rows) -> list:
    """Return one report record per row whose original text has recital markers.

    Args:
        rows: iterable of sequences whose first six items are
            (control_id, title, citation_json, source_original_text,
            pipeline_version, release_state). Extra trailing items are ignored.

    Returns:
        A list of dicts with the keys control_id, title, claimed_article,
        claimed_paragraph, recitals_found, v and state, in input order.
    """
    suspects = []
    for row in rows:
        cid, title, citation_json, orig, pv, state, *_ = row
        if not orig:
            continue

        recitals = RECITAL_RE.findall(orig)
        if not recitals:
            continue

        # The citation is only needed for display, so decode it lazily.
        citation = parse_citation(citation_json)
        suspects.append({
            "control_id": cid,
            "title": _text(title)[:MAX_TITLE_LENGTH],
            # `or ""` also covers an explicit JSON null, which .get() with a
            # default would pass through as None.
            "claimed_article": citation.get("article") or "",
            "claimed_paragraph": citation.get("paragraph") or "",
            "recitals_found": recitals[:MAX_RECITALS_SHOWN],
            "v": pv,
            "state": state,
        })
    return suspects


def format_suspect_row(suspect) -> str:
    """Format one record from find_recital_suspects() as a report table row."""
    recitals = ",".join(suspect["recitals_found"])
    # "(…)" padded to at least 17 characters; longer lists are not truncated.
    recital_col = f"({recitals})".ljust(17)
    return (
        f"{_text(suspect['control_id']):<12} "
        f"{_text(suspect['claimed_article']):<10} "
        f"{_text(suspect['claimed_paragraph']):<7} "
        f"{recital_col} v{suspect['v']} "
        f"{_text(suspect['state']):<15} "
        f"{_text(suspect['title'])}"
    )


def render_report(total, suspects) -> list:
    """Build the report as a list of output lines.

    Args:
        total: number of controls that were checked.
        suspects: records as returned by find_recital_suspects().

    Returns:
        The summary lines, followed by a table (header, rule, one row per
        suspect) only when there is at least one suspect.
    """
    lines = [
        "=== Ergebnis ===",
        f"Geprueft: {total} Controls mit source_original_text",
        f"Erwaegungsgrund-Verdacht: {len(suspects)}",
        "",
    ]
    if suspects:
        lines.append(
            f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel"
        )
        lines.append("-" * 120)
        lines.extend(format_suspect_row(s) for s in suspects)
    return lines


def fetch_controls(url) -> list:
    """Load all controls that have both original source text and a citation.

    Args:
        url: SQLAlchemy database URL.

    Returns:
        All result rows of CONTROLS_QUERY.
    """
    # Imported here so the pure analysis helpers above stay importable
    # without the database driver installed.
    import sqlalchemy

    engine = sqlalchemy.create_engine(url)
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        return conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()


def main() -> int:
    """Run the check and print the report; return the process exit status."""
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        return 1

    rows = fetch_controls(url)
    suspects = find_recital_suspects(rows)
    for line in render_report(len(rows), suspects):
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())