Files
breakpilot-compliance/scripts/qa/backfill_job_66228863.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

262 lines
11 KiB
Python

"""
Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
were generated with Rule 3 (restricted): no source_citation, no source_original_text,
release_state=too_close, customer_visible=False.
This script:
1. Finds all 216 chunk→control pairs from the job
2. Fetches original chunk text from Qdrant (via chunk_hash)
3. Extracts article/paragraph references from chunk text
4. Updates each control: license_rule=1, source_citation, source_original_text,
release_state=draft, customer_visible=True, generation_metadata
5. Updates processed_chunks to reflect the corrected license_rule
"""
import hashlib
import json
import os
import re
import sys
from sqlalchemy import create_engine, text
# Try httpx first (available in container), fall back to requests.
# Both branches define the same helper so the rest of the script is
# transport-agnostic.
try:
    import httpx

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        Raises httpx.HTTPStatusError on non-2xx responses instead of silently
        decoding an error payload (the original swallowed HTTP errors).
        """
        resp = httpx.post(url, json=json_data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
except ImportError:
    import requests

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        requests fallback; raises requests.HTTPError on non-2xx responses.
        """
        resp = requests.post(url, json=json_data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
# ── Configuration ──────────────────────────────────────────────────────────
# Required: Postgres DSN for SQLAlchemy; the script fails fast if unset.
DB_URL = os.environ['DATABASE_URL']
# Qdrant endpoint; default targets the Docker host from inside a container.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
# The processing job whose generated controls are being backfilled.
JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
# Pass --dry-run on the command line to preview changes without writing.
DRY_RUN = '--dry-run' in sys.argv
# License metadata stamped into each corrected control's source_citation
# (Rule 1 — see module docstring for the Rule 3 → Rule 1 correction).
LICENSE_INFO = {
    "license": "EU_LAW",
    "rule": 1,
    "source_type": "law",
    "name": "Batterieverordnung",
}
# Article/paragraph extraction patterns (German legal citation forms).
ARTICLE_PATTERN = re.compile(
    r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
    re.IGNORECASE
)
PARAGRAPH_PATTERN = re.compile(
    r'(?:Absatz|Abs\.?)\s+(\d+)',
    re.IGNORECASE
)
# Also match "Artikel X Absatz Y" or "(Y)" after article
ARTICLE_TITLE_PATTERN = re.compile(
    r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
    re.IGNORECASE
)


def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
    """Return normalized ("Art. N", "Abs. M") for the first article and
    paragraph references found in *chunk_text*.

    Either element is the empty string when no corresponding reference
    occurs in the text.
    """
    art_hit = ARTICLE_PATTERN.search(chunk_text)
    par_hit = PARAGRAPH_PATTERN.search(chunk_text)
    article = "" if art_hit is None else f"Art. {art_hit.group(1)}"
    paragraph = "" if par_hit is None else f"Abs. {par_hit.group(1)}"
    return article, paragraph
def main():
    """Run the backfill for job 66228863 (see module docstring).

    Reads chunk→control pairs from Postgres, joins them to their original
    chunk text in Qdrant via sha256, then rewrites each control's license
    fields inside a single transaction. Honors the --dry-run flag.
    """
    # search_path makes unqualified table names resolve in the compliance
    # schema first.
    engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
    # One transaction for the whole run: commits on clean exit, rolls back
    # if the block raises.
    with engine.begin() as conn:
        # ── Step 1: Get all chunk→control pairs ────────────────────────
        # generated_control_ids is a jsonb array; unnest it so every result
        # row is exactly one (chunk, control) pair.
        rows = conn.execute(text("""
            SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
                   jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
                   pc.id as chunk_row_id
            FROM compliance.canonical_processed_chunks pc
            WHERE pc.job_id = :job_id
              AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
        """), {"job_id": JOB_ID}).fetchall()
        print(f"Found {len(rows)} chunk→control pairs")
        # ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
        # Several controls can come from the same chunk, so dedupe first.
        chunk_hashes = set()
        for row in rows:
            chunk_hashes.add(row[0])
        print(f"Unique chunk hashes: {len(chunk_hashes)}")
        # ── Step 3: Fetch all chunks from Qdrant in batches ───────────
        # Build a hash→text+metadata map by scrolling the collection
        hash_to_qdrant = {}  # chunk_hash → {text, regulation_name_de, ...}
        collection = "bp_compliance_ce"
        offset = None  # Qdrant scroll cursor; None = start from the beginning
        batch_num = 0
        print(f"Fetching chunks from Qdrant ({collection})...")
        while True:
            params = {
                "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
                "limit": 200,
                # Only the payload fields we need; vectors are dead weight here.
                "with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
                                 "source", "celex", "chunk_index"],
                "with_vectors": False,
            }
            if offset:
                params["offset"] = offset
            result = http_post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                params,
                timeout=30,
            )
            points = result.get("result", {}).get("points", [])
            next_offset = result.get("result", {}).get("next_page_offset")
            batch_num += 1
            for p in points:
                text_content = p["payload"].get("chunk_text", "")
                # Assumes chunk_hash in Postgres is sha256(chunk_text) — the
                # whole join between the two stores relies on this. TODO confirm
                # against the ingestion pipeline.
                h = hashlib.sha256(text_content.encode()).hexdigest()
                if h in chunk_hashes:
                    hash_to_qdrant[h] = {
                        "text": text_content,
                        "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
                        "regulation_short": p["payload"].get("regulation_short", "BattVO"),
                        "source": p["payload"].get("source", ""),
                        "celex": p["payload"].get("celex", ""),
                        "chunk_index": p["payload"].get("chunk_index"),
                    }
            # Single-line progress indicator (\r overwrites the previous line).
            sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
            sys.stdout.flush()
            # Stop when the collection is exhausted or all needed chunks found.
            if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
                break
            offset = next_offset
        print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")
        # ── Step 4: Update controls ───────────────────────────────────
        updated = 0
        skipped = 0
        errors = 0
        for row in rows:
            chunk_hash = row[0]
            regulation_code = row[1]
            control_id = row[3]  # row[2] (collection) is not used in this step
            chunk_row_id = row[4]
            qdrant_data = hash_to_qdrant.get(chunk_hash)
            if not qdrant_data:
                # Chunk text no longer in Qdrant (or hash mismatch): skip, don't guess.
                print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
                skipped += 1
                continue
            chunk_text = qdrant_data["text"]
            source_name = qdrant_data["regulation_name_de"]
            article, paragraph = extract_article_paragraph(chunk_text)
            source_citation = {
                "source": source_name,
                "article": article,
                "paragraph": paragraph,
                "license": LICENSE_INFO["license"],
                "source_type": LICENSE_INFO["source_type"],
                # EUR-Lex deep link, only when the chunk carries a CELEX number.
                "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
            }
            # Build updated generation_metadata (preserve existing fields)
            new_meta_patch = {
                "license_rule": 1,
                "source_regulation": regulation_code,
                "source_article": article,
                "source_paragraph": paragraph,
                "backfill_reason": "LICENSE_MAP missing eu_2023_1542",
                "backfill_date": "2026-03-19",
            }
            if DRY_RUN:
                # Print only the first few candidates to keep output readable;
                # still count every row so the summary matches a real run.
                if updated < 3:
                    print(f"\n [DRY RUN] Would update control {control_id}")
                    print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
                    print(f" article: {article}, paragraph: {paragraph}")
                    print(f" text[:80]: {chunk_text[:80]}")
                updated += 1
                continue
            # NOTE(review): this per-row try/except runs inside one
            # engine.begin() transaction — in Postgres a failed statement
            # aborts the transaction, so every later update in this loop
            # would also fail. Confirm this is the intended failure mode.
            try:
                # Update the control
                conn.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET license_rule = 1,
                        source_original_text = :source_text,
                        source_citation = CAST(:citation AS jsonb),
                        customer_visible = true,
                        release_state = CASE
                            WHEN release_state = 'too_close' THEN 'draft'
                            ELSE release_state
                        END,
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
                        updated_at = NOW()
                    WHERE id = :control_id
                """), {
                    "control_id": control_id,
                    "source_text": chunk_text,
                    "citation": json.dumps(source_citation, ensure_ascii=False),
                    "meta_patch": json.dumps(new_meta_patch),
                })
                # Update the processed_chunk record too
                conn.execute(text("""
                    UPDATE compliance.canonical_processed_chunks
                    SET license_rule = 1,
                        source_license = 'EU_LAW',
                        processing_path = 'structured_batch'
                    WHERE id = :chunk_id
                """), {"chunk_id": chunk_row_id})
                updated += 1
            except Exception as e:
                print(f"\n ERROR updating control {control_id}: {e}")
                errors += 1
        print(f"\n\n=== BACKFILL COMPLETE ===")
        print(f" Updated: {updated}")
        print(f" Skipped: {skipped} (no Qdrant match)")
        print(f" Errors: {errors}")
        print(f" Dry run: {DRY_RUN}")
        if DRY_RUN:
            print("\n Run without --dry-run to apply changes.")
        # ── Step 5: Verify ────────────────────────────────────────────
        # Re-aggregate the affected controls so the operator can eyeball
        # that release_state/license_rule/customer_visible look right.
        if not DRY_RUN:
            r = conn.execute(text("""
                WITH ctrl_ids AS (
                    SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
                    FROM compliance.canonical_processed_chunks
                    WHERE job_id = :job_id
                      AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
                )
                SELECT release_state, license_rule, customer_visible, count(*)
                FROM compliance.canonical_controls c
                JOIN ctrl_ids ci ON c.id = ci.ctrl_id
                GROUP BY release_state, license_rule, customer_visible
                ORDER BY release_state
            """), {"job_id": JOB_ID})
            print("\n=== Verification ===")
            for row in r.fetchall():
                print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}")


if __name__ == "__main__":
    main()