Files
breakpilot-compliance/scripts/qa/backfill_job_66228863.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (preamble) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

262 lines
11 KiB
Python

"""
Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
were generated with Rule 3 (restricted): no source_citation, no source_original_text,
release_state=too_close, customer_visible=False.
This script:
1. Finds all 216 chunk→control pairs from the job
2. Fetches original chunk text from Qdrant (via chunk_hash)
3. Extracts article/paragraph references from chunk text
4. Updates each control: license_rule=1, source_citation, source_original_text,
release_state=draft, customer_visible=True, generation_metadata
5. Updates processed_chunks to reflect the corrected license_rule
"""
import hashlib
import json
import os
import re
import sys
from sqlalchemy import create_engine, text
# Try httpx first (available in container), fall back to requests.
# Both branches define the same helper so the rest of the script is
# transport-agnostic.
try:
    import httpx

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        Raises httpx.HTTPStatusError on non-2xx responses instead of silently
        decoding an error payload (the original swallowed HTTP errors).
        """
        resp = httpx.post(url, json=json_data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
except ImportError:
    import requests

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        requests fallback; raises requests.HTTPError on non-2xx responses.
        """
        resp = requests.post(url, json=json_data, timeout=timeout)
        resp.raise_for_status()
        return resp.json()
# ── Configuration ──────────────────────────────────────────────────────────
# Required: Postgres DSN for SQLAlchemy; the script fails fast if unset.
DB_URL = os.environ['DATABASE_URL']
# Qdrant endpoint; default targets the Docker host from inside a container.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
# The processing job whose generated controls are being backfilled.
JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
# Pass --dry-run on the command line to preview changes without writing.
DRY_RUN = '--dry-run' in sys.argv
# License metadata stamped into each corrected control's source_citation
# (Rule 1 — see module docstring for the Rule 3 → Rule 1 correction).
LICENSE_INFO = {
    "license": "EU_LAW",
    "rule": 1,
    "source_type": "law",
    "name": "Batterieverordnung",
}
# Article/paragraph extraction patterns (German legal citation forms).
ARTICLE_PATTERN = re.compile(
    r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
    re.IGNORECASE
)
PARAGRAPH_PATTERN = re.compile(
    r'(?:Absatz|Abs\.?)\s+(\d+)',
    re.IGNORECASE
)
# Also match "Artikel X Absatz Y" or "(Y)" after article
ARTICLE_TITLE_PATTERN = re.compile(
    r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
    re.IGNORECASE
)


def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
    """Return normalized ("Art. N", "Abs. M") for the first article and
    paragraph references found in *chunk_text*.

    Either element is the empty string when no corresponding reference
    occurs in the text.
    """
    art_hit = ARTICLE_PATTERN.search(chunk_text)
    par_hit = PARAGRAPH_PATTERN.search(chunk_text)
    article = "" if art_hit is None else f"Art. {art_hit.group(1)}"
    paragraph = "" if par_hit is None else f"Abs. {par_hit.group(1)}"
    return article, paragraph
def main():
    """Run the backfill for job 66228863 (see module docstring).

    Reads chunk→control pairs from Postgres, joins them to their original
    chunk text in Qdrant via sha256, then rewrites each control's license
    fields inside a single transaction. Honors the --dry-run flag.
    """
    # search_path makes unqualified table names resolve in the compliance
    # schema first.
    engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
    # One transaction for the whole run: commits on clean exit, rolls back
    # if the block raises.
    with engine.begin() as conn:
        # ── Step 1: Get all chunk→control pairs ────────────────────────
        # generated_control_ids is a jsonb array; unnest it so every result
        # row is exactly one (chunk, control) pair.
        rows = conn.execute(text("""
            SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
                   jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
                   pc.id as chunk_row_id
            FROM compliance.canonical_processed_chunks pc
            WHERE pc.job_id = :job_id
              AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
        """), {"job_id": JOB_ID}).fetchall()
        print(f"Found {len(rows)} chunk→control pairs")
        # ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
        # Several controls can come from the same chunk, so dedupe first.
        chunk_hashes = set()
        for row in rows:
            chunk_hashes.add(row[0])
        print(f"Unique chunk hashes: {len(chunk_hashes)}")
        # ── Step 3: Fetch all chunks from Qdrant in batches ───────────
        # Build a hash→text+metadata map by scrolling the collection
        hash_to_qdrant = {}  # chunk_hash → {text, regulation_name_de, ...}
        collection = "bp_compliance_ce"
        offset = None  # Qdrant scroll cursor; None = start from the beginning
        batch_num = 0
        print(f"Fetching chunks from Qdrant ({collection})...")
        while True:
            params = {
                "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
                "limit": 200,
                # Only the payload fields we need; vectors are dead weight here.
                "with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
                                 "source", "celex", "chunk_index"],
                "with_vectors": False,
            }
            if offset:
                params["offset"] = offset
            result = http_post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                params,
                timeout=30,
            )
            points = result.get("result", {}).get("points", [])
            next_offset = result.get("result", {}).get("next_page_offset")
            batch_num += 1
            for p in points:
                text_content = p["payload"].get("chunk_text", "")
                # Assumes chunk_hash in Postgres is sha256(chunk_text) — the
                # whole join between the two stores relies on this. TODO confirm
                # against the ingestion pipeline.
                h = hashlib.sha256(text_content.encode()).hexdigest()
                if h in chunk_hashes:
                    hash_to_qdrant[h] = {
                        "text": text_content,
                        "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
                        "regulation_short": p["payload"].get("regulation_short", "BattVO"),
                        "source": p["payload"].get("source", ""),
                        "celex": p["payload"].get("celex", ""),
                        "chunk_index": p["payload"].get("chunk_index"),
                    }
            # Single-line progress indicator (\r overwrites the previous line).
            sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
            sys.stdout.flush()
            # Stop when the collection is exhausted or all needed chunks found.
            if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
                break
            offset = next_offset
        print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")
        # ── Step 4: Update controls ───────────────────────────────────
        updated = 0
        skipped = 0
        errors = 0
        for row in rows:
            chunk_hash = row[0]
            regulation_code = row[1]
            control_id = row[3]  # row[2] (collection) is not used in this step
            chunk_row_id = row[4]
            qdrant_data = hash_to_qdrant.get(chunk_hash)
            if not qdrant_data:
                # Chunk text no longer in Qdrant (or hash mismatch): skip, don't guess.
                print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
                skipped += 1
                continue
            chunk_text = qdrant_data["text"]
            source_name = qdrant_data["regulation_name_de"]
            article, paragraph = extract_article_paragraph(chunk_text)
            source_citation = {
                "source": source_name,
                "article": article,
                "paragraph": paragraph,
                "license": LICENSE_INFO["license"],
                "source_type": LICENSE_INFO["source_type"],
                # EUR-Lex deep link, only when the chunk carries a CELEX number.
                "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
            }
            # Build updated generation_metadata (preserve existing fields)
            new_meta_patch = {
                "license_rule": 1,
                "source_regulation": regulation_code,
                "source_article": article,
                "source_paragraph": paragraph,
                "backfill_reason": "LICENSE_MAP missing eu_2023_1542",
                "backfill_date": "2026-03-19",
            }
            if DRY_RUN:
                # Print only the first few candidates to keep output readable;
                # still count every row so the summary matches a real run.
                if updated < 3:
                    print(f"\n [DRY RUN] Would update control {control_id}")
                    print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
                    print(f" article: {article}, paragraph: {paragraph}")
                    print(f" text[:80]: {chunk_text[:80]}")
                updated += 1
                continue
            # NOTE(review): this per-row try/except runs inside one
            # engine.begin() transaction — in Postgres a failed statement
            # aborts the transaction, so every later update in this loop
            # would also fail. Confirm this is the intended failure mode.
            try:
                # Update the control
                conn.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET license_rule = 1,
                        source_original_text = :source_text,
                        source_citation = CAST(:citation AS jsonb),
                        customer_visible = true,
                        release_state = CASE
                            WHEN release_state = 'too_close' THEN 'draft'
                            ELSE release_state
                        END,
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
                        updated_at = NOW()
                    WHERE id = :control_id
                """), {
                    "control_id": control_id,
                    "source_text": chunk_text,
                    "citation": json.dumps(source_citation, ensure_ascii=False),
                    "meta_patch": json.dumps(new_meta_patch),
                })
                # Update the processed_chunk record too
                conn.execute(text("""
                    UPDATE compliance.canonical_processed_chunks
                    SET license_rule = 1,
                        source_license = 'EU_LAW',
                        processing_path = 'structured_batch'
                    WHERE id = :chunk_id
                """), {"chunk_id": chunk_row_id})
                updated += 1
            except Exception as e:
                print(f"\n ERROR updating control {control_id}: {e}")
                errors += 1
        print(f"\n\n=== BACKFILL COMPLETE ===")
        print(f" Updated: {updated}")
        print(f" Skipped: {skipped} (no Qdrant match)")
        print(f" Errors: {errors}")
        print(f" Dry run: {DRY_RUN}")
        if DRY_RUN:
            print("\n Run without --dry-run to apply changes.")
        # ── Step 5: Verify ────────────────────────────────────────────
        # Re-aggregate the affected controls so the operator can eyeball
        # that release_state/license_rule/customer_visible look right.
        if not DRY_RUN:
            r = conn.execute(text("""
                WITH ctrl_ids AS (
                    SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
                    FROM compliance.canonical_processed_chunks
                    WHERE job_id = :job_id
                      AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
                )
                SELECT release_state, license_rule, customer_visible, count(*)
                FROM compliance.canonical_controls c
                JOIN ctrl_ids ci ON c.id = ci.ctrl_id
                GROUP BY release_state, license_rule, customer_visible
                ORDER BY release_state
            """), {"job_id": JOB_ID})
            print("\n=== Verification ===")
            for row in r.fetchall():
                print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}")


if __name__ == "__main__":
    main()