""" Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3. eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls were generated with Rule 3 (restricted): no source_citation, no source_original_text, release_state=too_close, customer_visible=False. This script: 1. Finds all 216 chunk→control pairs from the job 2. Fetches original chunk text from Qdrant (via chunk_hash) 3. Extracts article/paragraph references from chunk text 4. Updates each control: license_rule=1, source_citation, source_original_text, release_state=draft, customer_visible=True, generation_metadata 5. Updates processed_chunks to reflect the corrected license_rule """ import hashlib import json import os import re import sys from sqlalchemy import create_engine, text # Try httpx first (available in container), fall back to requests try: import httpx def http_post(url, json_data, timeout=30): return httpx.post(url, json=json_data, timeout=timeout).json() except ImportError: import requests def http_post(url, json_data, timeout=30): return requests.post(url, json=json_data, timeout=timeout).json() # ── Configuration ────────────────────────────────────────────────────────── DB_URL = os.environ['DATABASE_URL'] QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333') JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2' DRY_RUN = '--dry-run' in sys.argv LICENSE_INFO = { "license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung", } # Article/paragraph extraction patterns ARTICLE_PATTERN = re.compile( r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)', re.IGNORECASE ) PARAGRAPH_PATTERN = re.compile( r'(?:Absatz|Abs\.?)\s+(\d+)', re.IGNORECASE ) # Also match "Artikel X Absatz Y" or "(Y)" after article ARTICLE_TITLE_PATTERN = re.compile( r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)', re.IGNORECASE ) def extract_article_paragraph(chunk_text: str) -> tuple[str, str]: """Extract the most prominent article and paragraph from chunk text.""" articles = ARTICLE_PATTERN.findall(chunk_text) paragraphs = PARAGRAPH_PATTERN.findall(chunk_text) # Take the first (most prominent) article mention article = f"Art. {articles[0]}" if articles else "" paragraph = f"Abs. {paragraphs[0]}" if paragraphs else "" return article, paragraph def main(): engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"}) with engine.begin() as conn: # ── Step 1: Get all chunk→control pairs ──────────────────────── rows = conn.execute(text(""" SELECT pc.chunk_hash, pc.regulation_code, pc.collection, jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id, pc.id as chunk_row_id FROM compliance.canonical_processed_chunks pc WHERE pc.job_id = :job_id AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0 """), {"job_id": JOB_ID}).fetchall() print(f"Found {len(rows)} chunk→control pairs") # ── Step 2: Collect unique chunk hashes for Qdrant lookup ────── chunk_hashes = set() for row in rows: chunk_hashes.add(row[0]) print(f"Unique chunk hashes: {len(chunk_hashes)}") # ── Step 3: Fetch all chunks from Qdrant in batches ─────────── # Build a hash→text+metadata map by scrolling the collection hash_to_qdrant = {} # chunk_hash → {text, regulation_name_de, ...} collection = "bp_compliance_ce" offset = None batch_num = 0 print(f"Fetching chunks from Qdrant ({collection})...") while True: params = { "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]}, "limit": 200, "with_payload": ["chunk_text", "regulation_name_de", "regulation_short", "source", "celex", "chunk_index"], "with_vectors": False, } if offset: params["offset"] = offset result = http_post( f"{QDRANT_URL}/collections/{collection}/points/scroll", params, timeout=30, ) points = result.get("result", {}).get("points", []) next_offset = result.get("result", {}).get("next_page_offset") batch_num += 1 for p in points: text_content = p["payload"].get("chunk_text", "") h = hashlib.sha256(text_content.encode()).hexdigest() if h in chunk_hashes: hash_to_qdrant[h] = { "text": text_content, "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"), "regulation_short": p["payload"].get("regulation_short", "BattVO"), "source": p["payload"].get("source", ""), "celex": p["payload"].get("celex", ""), "chunk_index": p["payload"].get("chunk_index"), } sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}") sys.stdout.flush() if not next_offset or len(hash_to_qdrant) == len(chunk_hashes): break offset = next_offset print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant") # ── Step 4: Update controls ─────────────────────────────────── updated = 0 skipped = 0 errors = 0 for row in rows: chunk_hash = row[0] regulation_code = row[1] control_id = row[3] chunk_row_id = row[4] qdrant_data = hash_to_qdrant.get(chunk_hash) if not qdrant_data: print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})") skipped += 1 continue chunk_text = qdrant_data["text"] source_name = qdrant_data["regulation_name_de"] article, paragraph = extract_article_paragraph(chunk_text) source_citation = { "source": source_name, "article": article, "paragraph": paragraph, "license": LICENSE_INFO["license"], "source_type": LICENSE_INFO["source_type"], "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "", } # Build updated generation_metadata (preserve existing fields) new_meta_patch = { "license_rule": 1, "source_regulation": regulation_code, "source_article": article, "source_paragraph": paragraph, "backfill_reason": "LICENSE_MAP missing eu_2023_1542", "backfill_date": "2026-03-19", } if DRY_RUN: if updated < 3: print(f"\n [DRY RUN] Would update control {control_id}") print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}") print(f" article: {article}, paragraph: {paragraph}") print(f" text[:80]: {chunk_text[:80]}") updated += 1 continue try: # Update the control conn.execute(text(""" UPDATE compliance.canonical_controls SET license_rule = 1, source_original_text = :source_text, source_citation = CAST(:citation AS jsonb), customer_visible = true, release_state = CASE WHEN release_state = 'too_close' THEN 'draft' ELSE release_state END, generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb), updated_at = NOW() WHERE id = :control_id """), { "control_id": control_id, "source_text": chunk_text, "citation": json.dumps(source_citation, ensure_ascii=False), "meta_patch": json.dumps(new_meta_patch), }) # Update the processed_chunk record too conn.execute(text(""" UPDATE compliance.canonical_processed_chunks SET license_rule = 1, source_license = 'EU_LAW', processing_path = 'structured_batch' WHERE id = :chunk_id """), {"chunk_id": chunk_row_id}) updated += 1 except Exception as e: print(f"\n ERROR updating control {control_id}: {e}") errors += 1 print(f"\n\n=== BACKFILL COMPLETE ===") print(f" Updated: {updated}") print(f" Skipped: {skipped} (no Qdrant match)") print(f" Errors: {errors}") print(f" Dry run: {DRY_RUN}") if DRY_RUN: print("\n Run without --dry-run to apply changes.") # ── Step 5: Verify ──────────────────────────────────────────── if not DRY_RUN: r = conn.execute(text(""" WITH ctrl_ids AS ( SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id FROM compliance.canonical_processed_chunks WHERE job_id = :job_id AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0 ) SELECT release_state, license_rule, customer_visible, count(*) FROM compliance.canonical_controls c JOIN ctrl_ids ci ON c.id = ci.ctrl_id GROUP BY release_state, license_rule, customer_visible ORDER BY release_state """), {"job_id": JOB_ID}) print("\n=== Verification ===") for row in r.fetchall(): print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}") if __name__ == "__main__": main()