# breakpilot-compliance/scripts/find_recital_controls.py

"""Find controls where source_original_text contains Erwägungsgrund (recital) markers
instead of actual article text — indicates wrong article tagging in RAG chunks."""

import sqlalchemy
import os
import json
import re

# Matches a parenthesised 1-3 digit number that is followed only by optional
# whitespace and a line break, e.g. "(125)\n". The number is captured.
# NOTE: the pattern is not anchored to the start of a line, so a "(1)" at the
# end of any line matches as well.
RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

# Column order must stay in sync with the tuple unpacking in
# find_recital_suspects().
CONTROLS_QUERY = """
    SELECT control_id, title,
           source_citation::text,
           source_original_text,
           pipeline_version, release_state,
           generation_metadata::text
    FROM canonical_controls
    WHERE source_original_text IS NOT NULL
      AND source_original_text != ''
      AND source_citation IS NOT NULL
    ORDER BY control_id
"""


def parse_citation(citation_json):
    """Decode the citation JSON text into a dict.

    Args:
        citation_json: JSON text of the citation, or a falsy value.

    Returns:
        The decoded object if it is a dict; otherwise an empty dict (this
        covers empty input and JSON values such as ``null`` or a list).

    Raises:
        json.JSONDecodeError: if the text is not valid JSON.
    """
    if not citation_json:
        return {}
    citation = json.loads(citation_json)
    return citation if isinstance(citation, dict) else {}


def find_recital_suspects(rows):
    """Collect the controls whose source text contains recital markers.

    Args:
        rows: iterable of 7-tuples in CONTROLS_QUERY column order:
            (control_id, title, citation JSON text, source text,
            pipeline version, release state, generation metadata text).

    Returns:
        A list of dicts (keys: control_id, title, claimed_article,
        claimed_paragraph, recitals_found, v, state), one per row whose
        source text matches RECITAL_RE, in input order. The title is cut to
        80 characters and at most five recital numbers are kept.
    """
    suspects = []
    for cid, title, citation_json, orig, pv, state, _meta_json in rows:
        if not orig:
            continue

        recital_matches = RECITAL_RE.findall(orig)
        if not recital_matches:
            continue

        citation = parse_citation(citation_json)
        suspects.append({
            "control_id": cid,
            # NULL titles and missing/null citation fields become "" so the
            # report formatting cannot fail on None.
            "title": (title or "")[:80],
            "claimed_article": str(citation.get("article") or ""),
            "claimed_paragraph": str(citation.get("paragraph") or ""),
            "recitals_found": recital_matches[:5],
            "v": pv,
            "state": state,
        })
    return suspects


def format_suspect_line(s):
    """Render one suspect dict as a table row aligned with the report header.

    Article (10) + space + paragraph (7) fill the 18-wide "Behauptet" column;
    the parenthesised recital list is padded to the 20-wide "Recitals" column.
    """
    recital_col = f"({','.join(s['recitals_found'])})"
    state = s["state"] or ""
    return (
        f"{s['control_id']:<12} {s['claimed_article']:<10} "
        f"{s['claimed_paragraph']:<7} {recital_col:<20} "
        f"v{s['v']} {state:<15} {s['title']}"
    )


def main():
    """Query canonical_controls and print the recital-suspect report.

    Reads the connection URL from the DATABASE_URL environment variable and
    exits with status 1 if it is unset or empty.
    """
    url = os.environ.get("DATABASE_URL", "")
    if not url:
        print("DATABASE_URL not set")
        raise SystemExit(1)

    engine = sqlalchemy.create_engine(url)

    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("SET search_path TO compliance,public"))
        rows = conn.execute(sqlalchemy.text(CONTROLS_QUERY)).fetchall()

    suspects_recital = find_recital_suspects(rows)

    print("=== Ergebnis ===")
    print(f"Geprueft: {len(rows)} Controls mit source_original_text")
    print(f"Erwaegungsgrund-Verdacht: {len(suspects_recital)}")
    print()

    if suspects_recital:
        print(f"{'Control':<12} {'Behauptet':<18} {'Recitals':<20} {'v':>2} {'State':<15} Titel")
        print("-" * 120)
        for s in suspects_recital:
            print(format_suspect_line(s))


if __name__ == "__main__":
    main()