chore(qa): add PDF-based control QA scripts and results

QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00
parent 1cc34c23d9
commit 9b0f25c105
12 changed files with 30839 additions and 0 deletions
@@ -0,0 +1,261 @@
+"""
+Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
+
+eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
+were generated with Rule 3 (restricted): no source_citation, no source_original_text,
+release_state=too_close, customer_visible=False.
+
+This script:
+1. Finds all 216 chunk→control pairs from the job
+2. Fetches original chunk text from Qdrant (via chunk_hash)
+3. Extracts article/paragraph references from chunk text
+4. Updates each control: license_rule=1, source_citation, source_original_text,
+   release_state=draft, customer_visible=True, generation_metadata
+5. Updates processed_chunks to reflect the corrected license_rule
+"""
+
+import hashlib
+import json
+import os
+import re
+import sys
+from sqlalchemy import create_engine, text
+
+# Try httpx first (available in container), fall back to requests
+try:
+    import httpx
+    def http_post(url, json_data, timeout=30):
+        return httpx.post(url, json=json_data, timeout=timeout).json()
+except ImportError:
+    import requests
+    def http_post(url, json_data, timeout=30):
+        return requests.post(url, json=json_data, timeout=timeout).json()
+
+# ── Configuration ──────────────────────────────────────────────────────────
+
+DB_URL = os.environ['DATABASE_URL']
+QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
+JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
+DRY_RUN = '--dry-run' in sys.argv
+
+LICENSE_INFO = {
+    "license": "EU_LAW",
+    "rule": 1,
+    "source_type": "law",
+    "name": "Batterieverordnung",
+}
+
+# Article/paragraph extraction patterns
+ARTICLE_PATTERN = re.compile(
+    r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
+    re.IGNORECASE
+)
+PARAGRAPH_PATTERN = re.compile(
+    r'(?:Absatz|Abs\.?)\s+(\d+)',
+    re.IGNORECASE
+)
+# Also match "Artikel X Absatz Y" or "(Y)" after article
+ARTICLE_TITLE_PATTERN = re.compile(
+    r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
+    re.IGNORECASE
+)
+
+def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
+    """Extract the most prominent article and paragraph from chunk text."""
+    articles = ARTICLE_PATTERN.findall(chunk_text)
+    paragraphs = PARAGRAPH_PATTERN.findall(chunk_text)
+
+    # Take the first (most prominent) article mention
+    article = f"Art. {articles[0]}" if articles else ""
+    paragraph = f"Abs. {paragraphs[0]}" if paragraphs else ""
+    return article, paragraph
+
+
+def main():
+    engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
+
+    with engine.begin() as conn:
+        # ── Step 1: Get all chunk→control pairs ────────────────────────
+        rows = conn.execute(text("""
+            SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
+                   jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
+                   pc.id as chunk_row_id
+            FROM compliance.canonical_processed_chunks pc
+            WHERE pc.job_id = :job_id
+              AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
+        """), {"job_id": JOB_ID}).fetchall()
+
+        print(f"Found {len(rows)} chunk→control pairs")
+
+        # ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
+        chunk_hashes = set()
+        for row in rows:
+            chunk_hashes.add(row[0])
+        print(f"Unique chunk hashes: {len(chunk_hashes)}")
+
+        # ── Step 3: Fetch all chunks from Qdrant in batches ───────────
+        # Build a hash→text+metadata map by scrolling the collection
+        hash_to_qdrant = {}  # chunk_hash → {text, regulation_name_de, ...}
+        collection = "bp_compliance_ce"
+        offset = None
+        batch_num = 0
+
+        print(f"Fetching chunks from Qdrant ({collection})...")
+        while True:
+            params = {
+                "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
+                "limit": 200,
+                "with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
+                                 "source", "celex", "chunk_index"],
+                "with_vectors": False,
+            }
+            if offset:
+                params["offset"] = offset
+
+            result = http_post(
+                f"{QDRANT_URL}/collections/{collection}/points/scroll",
+                params,
+                timeout=30,
+            )
+            points = result.get("result", {}).get("points", [])
+            next_offset = result.get("result", {}).get("next_page_offset")
+            batch_num += 1
+
+            for p in points:
+                text_content = p["payload"].get("chunk_text", "")
+                h = hashlib.sha256(text_content.encode()).hexdigest()
+                if h in chunk_hashes:
+                    hash_to_qdrant[h] = {
+                        "text": text_content,
+                        "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
+                        "regulation_short": p["payload"].get("regulation_short", "BattVO"),
+                        "source": p["payload"].get("source", ""),
+                        "celex": p["payload"].get("celex", ""),
+                        "chunk_index": p["payload"].get("chunk_index"),
+                    }
+
+            sys.stdout.write(f"\r  Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
+            sys.stdout.flush()
+
+            if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
+                break
+            offset = next_offset
+
+        print(f"\n  Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")
+
+        # ── Step 4: Update controls ───────────────────────────────────
+        updated = 0
+        skipped = 0
+        errors = 0
+
+        for row in rows:
+            chunk_hash = row[0]
+            regulation_code = row[1]
+            control_id = row[3]
+            chunk_row_id = row[4]
+
+            qdrant_data = hash_to_qdrant.get(chunk_hash)
+            if not qdrant_data:
+                print(f"\n  WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
+                skipped += 1
+                continue
+
+            chunk_text = qdrant_data["text"]
+            source_name = qdrant_data["regulation_name_de"]
+            article, paragraph = extract_article_paragraph(chunk_text)
+
+            source_citation = {
+                "source": source_name,
+                "article": article,
+                "paragraph": paragraph,
+                "license": LICENSE_INFO["license"],
+                "source_type": LICENSE_INFO["source_type"],
+                "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
+            }
+
+            # Build updated generation_metadata (preserve existing fields)
+            new_meta_patch = {
+                "license_rule": 1,
+                "source_regulation": regulation_code,
+                "source_article": article,
+                "source_paragraph": paragraph,
+                "backfill_reason": "LICENSE_MAP missing eu_2023_1542",
+                "backfill_date": "2026-03-19",
+            }
+
+            if DRY_RUN:
+                if updated < 3:
+                    print(f"\n  [DRY RUN] Would update control {control_id}")
+                    print(f"    citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
+                    print(f"    article: {article}, paragraph: {paragraph}")
+                    print(f"    text[:80]: {chunk_text[:80]}")
+                updated += 1
+                continue
+
+            try:
+                # Update the control
+                conn.execute(text("""
+                    UPDATE compliance.canonical_controls
+                    SET license_rule = 1,
+                        source_original_text = :source_text,
+                        source_citation = CAST(:citation AS jsonb),
+                        customer_visible = true,
+                        release_state = CASE
+                            WHEN release_state = 'too_close' THEN 'draft'
+                            ELSE release_state
+                        END,
+                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
+                        updated_at = NOW()
+                    WHERE id = :control_id
+                """), {
+                    "control_id": control_id,
+                    "source_text": chunk_text,
+                    "citation": json.dumps(source_citation, ensure_ascii=False),
+                    "meta_patch": json.dumps(new_meta_patch),
+                })
+
+                # Update the processed_chunk record too
+                conn.execute(text("""
+                    UPDATE compliance.canonical_processed_chunks
+                    SET license_rule = 1,
+                        source_license = 'EU_LAW',
+                        processing_path = 'structured_batch'
+                    WHERE id = :chunk_id
+                """), {"chunk_id": chunk_row_id})
+
+                updated += 1
+            except Exception as e:
+                print(f"\n  ERROR updating control {control_id}: {e}")
+                errors += 1
+
+        print(f"\n\n=== BACKFILL COMPLETE ===")
+        print(f"  Updated:  {updated}")
+        print(f"  Skipped:  {skipped} (no Qdrant match)")
+        print(f"  Errors:   {errors}")
+        print(f"  Dry run:  {DRY_RUN}")
+
+        if DRY_RUN:
+            print("\n  Run without --dry-run to apply changes.")
+
+        # ── Step 5: Verify ────────────────────────────────────────────
+        if not DRY_RUN:
+            r = conn.execute(text("""
+                WITH ctrl_ids AS (
+                    SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
+                    FROM compliance.canonical_processed_chunks
+                    WHERE job_id = :job_id
+                      AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
+                )
+                SELECT release_state, license_rule, customer_visible, count(*)
+                FROM compliance.canonical_controls c
+                JOIN ctrl_ids ci ON c.id = ci.ctrl_id
+                GROUP BY release_state, license_rule, customer_visible
+                ORDER BY release_state
+            """), {"job_id": JOB_ID})
+            print("\n=== Verification ===")
+            for row in r.fetchall():
+                print(f"  {str(row[0]):20s} rule={row[1]} visible={row[2]}  count={row[3]}")
+
+
+if __name__ == "__main__":
+    main()