Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
262 lines
11 KiB
Python
262 lines
11 KiB
Python
"""
|
|
Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
|
|
|
|
eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
|
|
were generated with Rule 3 (restricted): no source_citation, no source_original_text,
|
|
release_state=too_close, customer_visible=False.
|
|
|
|
This script:
|
|
1. Finds all 216 chunk→control pairs from the job
|
|
2. Fetches original chunk text from Qdrant (via chunk_hash)
|
|
3. Extracts article/paragraph references from chunk text
|
|
4. Updates each control: license_rule=1, source_citation, source_original_text,
|
|
release_state=draft, customer_visible=True, generation_metadata
|
|
5. Updates processed_chunks to reflect the corrected license_rule
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from sqlalchemy import create_engine, text
|
|
|
|
# Try httpx first (available in container), fall back to requests.
# Both libraries expose the same post(url, json=..., timeout=...) -> Response
# API, so the wrapper body is identical either way.
try:
    import httpx

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        Raises:
            httpx.HTTPStatusError: on a non-2xx response. Previously the
                error payload was decoded and returned like a success, which
                let a failing Qdrant call look like an empty result.
        """
        response = httpx.post(url, json=json_data, timeout=timeout)
        response.raise_for_status()  # fail loudly instead of parsing an error payload
        return response.json()

except ImportError:
    import requests

    def http_post(url, json_data, timeout=30):
        """POST *json_data* as JSON to *url* and return the decoded JSON body.

        Raises:
            requests.HTTPError: on a non-2xx response (see httpx variant).
        """
        response = requests.post(url, json=json_data, timeout=timeout)
        response.raise_for_status()  # fail loudly instead of parsing an error payload
        return response.json()
|
|
|
|
# ── Configuration ──────────────────────────────────────────────────────────

# Required: SQLAlchemy database URL. Intentionally uses [] (not .get) so the
# script fails immediately with a KeyError when DATABASE_URL is unset.
DB_URL = os.environ['DATABASE_URL']
# Qdrant HTTP endpoint; default targets the host from inside a container.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
# The generation job whose controls were mis-processed (see module docstring).
JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
# Pass --dry-run on the command line to preview changes without writing.
DRY_RUN = '--dry-run' in sys.argv

# License attributes applied to every corrected control (Rule 1 = EU law,
# full text may be cited and shown to customers).
LICENSE_INFO = {
    "license": "EU_LAW",
    "rule": 1,
    "source_type": "law",
    "name": "Batterieverordnung",
}
|
|
|
|
# Article/paragraph extraction patterns (German legal citation forms).
# "Artikel 12" / "Art. 12" / "Art 12a" → captures the article number.
ARTICLE_PATTERN = re.compile(
    r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
    re.IGNORECASE
)
# "Absatz 3" / "Abs. 3" → captures the paragraph number.
PARAGRAPH_PATTERN = re.compile(
    r'(?:Absatz|Abs\.?)\s+(\d+)',
    re.IGNORECASE
)
# Matches "Artikel X" followed by its heading on the next line.
# NOTE(review): not referenced anywhere in this script — kept for parity.
ARTICLE_TITLE_PATTERN = re.compile(
    r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
    re.IGNORECASE
)


def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
    """Return ("Art. N", "Abs. M") for the first article/paragraph mention.

    Either element is "" when the corresponding reference does not occur
    anywhere in *chunk_text*.
    """
    first_article = ARTICLE_PATTERN.search(chunk_text)
    first_paragraph = PARAGRAPH_PATTERN.search(chunk_text)
    return (
        f"Art. {first_article.group(1)}" if first_article else "",
        f"Abs. {first_paragraph.group(1)}" if first_paragraph else "",
    )
|
|
|
|
|
|
def main() -> None:
    """Backfill Rule-1 license data onto the controls from job JOB_ID.

    Reads chunk→control mappings from Postgres, recovers each chunk's
    original text from Qdrant (matched by SHA-256 of the chunk text),
    extracts article/paragraph citations, and updates both the controls
    and their processed_chunks rows inside a single transaction.
    Honors the --dry-run flag (DRY_RUN): previews a few updates and
    writes nothing.
    """
    # search_path lets the raw SQL below resolve unqualified names too.
    engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})

    # engine.begin(): one transaction for the whole backfill — either all
    # updates commit together or an unhandled error rolls everything back.
    with engine.begin() as conn:
        # ── Step 1: Get all chunk→control pairs ────────────────────────
        # One output row per (chunk, generated control) pair; a chunk that
        # produced several controls appears several times.
        # Columns: 0=chunk_hash, 1=regulation_code, 2=collection (unused),
        #          3=control_id, 4=chunk_row_id
        rows = conn.execute(text("""
            SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
                   jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
                   pc.id as chunk_row_id
            FROM compliance.canonical_processed_chunks pc
            WHERE pc.job_id = :job_id
              AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
        """), {"job_id": JOB_ID}).fetchall()

        print(f"Found {len(rows)} chunk→control pairs")

        # ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
        chunk_hashes = set()
        for row in rows:
            chunk_hashes.add(row[0])
        print(f"Unique chunk hashes: {len(chunk_hashes)}")

        # ── Step 3: Fetch all chunks from Qdrant in batches ───────────
        # Build a hash→text+metadata map by scrolling the collection.
        # Qdrant stores no chunk_hash payload field, so we re-hash each
        # point's chunk_text locally and match against chunk_hashes.
        hash_to_qdrant = {}  # chunk_hash → {text, regulation_name_de, ...}
        collection = "bp_compliance_ce"
        offset = None
        batch_num = 0

        print(f"Fetching chunks from Qdrant ({collection})...")
        while True:
            params = {
                "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
                "limit": 200,
                "with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
                                 "source", "celex", "chunk_index"],
                "with_vectors": False,
            }
            # NOTE(review): a falsy offset (e.g. 0) would be dropped here;
            # assumed offsets are point IDs/UUIDs in this collection — TODO confirm.
            if offset:
                params["offset"] = offset

            result = http_post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                params,
                timeout=30,
            )
            points = result.get("result", {}).get("points", [])
            next_offset = result.get("result", {}).get("next_page_offset")
            batch_num += 1

            for p in points:
                text_content = p["payload"].get("chunk_text", "")
                # chunk_hash in Postgres is assumed to be sha256(chunk_text) —
                # this recomputation is the join key between the two stores.
                h = hashlib.sha256(text_content.encode()).hexdigest()
                if h in chunk_hashes:
                    hash_to_qdrant[h] = {
                        "text": text_content,
                        "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
                        "regulation_short": p["payload"].get("regulation_short", "BattVO"),
                        "source": p["payload"].get("source", ""),
                        "celex": p["payload"].get("celex", ""),
                        "chunk_index": p["payload"].get("chunk_index"),
                    }

            # Single-line progress indicator (overwritten via \r each batch).
            sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
            sys.stdout.flush()

            # Stop when the collection is exhausted or every hash is matched.
            if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
                break
            offset = next_offset

        print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")

        # ── Step 4: Update controls ───────────────────────────────────
        updated = 0
        skipped = 0
        errors = 0

        for row in rows:
            chunk_hash = row[0]
            regulation_code = row[1]
            # row[2] (collection) intentionally unused here.
            control_id = row[3]
            chunk_row_id = row[4]

            qdrant_data = hash_to_qdrant.get(chunk_hash)
            if not qdrant_data:
                # Chunk text no longer present in Qdrant — cannot backfill
                # source_original_text, so leave this control untouched.
                print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
                skipped += 1
                continue

            chunk_text = qdrant_data["text"]
            source_name = qdrant_data["regulation_name_de"]
            article, paragraph = extract_article_paragraph(chunk_text)

            # Citation stored as jsonb on the control; url is empty when the
            # point carries no CELEX identifier.
            source_citation = {
                "source": source_name,
                "article": article,
                "paragraph": paragraph,
                "license": LICENSE_INFO["license"],
                "source_type": LICENSE_INFO["source_type"],
                "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
            }

            # Build updated generation_metadata (preserve existing fields —
            # merged via jsonb || in the UPDATE below).
            new_meta_patch = {
                "license_rule": 1,
                "source_regulation": regulation_code,
                "source_article": article,
                "source_paragraph": paragraph,
                "backfill_reason": "LICENSE_MAP missing eu_2023_1542",
                "backfill_date": "2026-03-19",
            }

            if DRY_RUN:
                # Preview only the first three would-be updates, but count all.
                if updated < 3:
                    print(f"\n [DRY RUN] Would update control {control_id}")
                    print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
                    print(f" article: {article}, paragraph: {paragraph}")
                    print(f" text[:80]: {chunk_text[:80]}")
                updated += 1
                continue

            try:
                # Update the control. release_state is only flipped for the
                # 'too_close' state this bug produced; other states (e.g.
                # already-reviewed controls) are preserved.
                conn.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET license_rule = 1,
                        source_original_text = :source_text,
                        source_citation = CAST(:citation AS jsonb),
                        customer_visible = true,
                        release_state = CASE
                            WHEN release_state = 'too_close' THEN 'draft'
                            ELSE release_state
                        END,
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
                        updated_at = NOW()
                    WHERE id = :control_id
                """), {
                    "control_id": control_id,
                    "source_text": chunk_text,
                    "citation": json.dumps(source_citation, ensure_ascii=False),
                    "meta_patch": json.dumps(new_meta_patch),
                })

                # Update the processed_chunk record too, so reprocessing
                # audits see the corrected license_rule.
                conn.execute(text("""
                    UPDATE compliance.canonical_processed_chunks
                    SET license_rule = 1,
                        source_license = 'EU_LAW',
                        processing_path = 'structured_batch'
                    WHERE id = :chunk_id
                """), {"chunk_id": chunk_row_id})

                updated += 1
            except Exception as e:
                # Best-effort per control: log and continue with the rest.
                print(f"\n ERROR updating control {control_id}: {e}")
                errors += 1

        print(f"\n\n=== BACKFILL COMPLETE ===")
        print(f" Updated: {updated}")
        print(f" Skipped: {skipped} (no Qdrant match)")
        print(f" Errors: {errors}")
        print(f" Dry run: {DRY_RUN}")

        if DRY_RUN:
            print("\n Run without --dry-run to apply changes.")

        # ── Step 5: Verify ────────────────────────────────────────────
        # Re-query the affected controls and summarize their state so the
        # operator can eyeball that everything landed on rule=1 / visible.
        if not DRY_RUN:
            r = conn.execute(text("""
                WITH ctrl_ids AS (
                    SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
                    FROM compliance.canonical_processed_chunks
                    WHERE job_id = :job_id
                      AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
                )
                SELECT release_state, license_rule, customer_visible, count(*)
                FROM compliance.canonical_controls c
                JOIN ctrl_ids ci ON c.id = ci.ctrl_id
                GROUP BY release_state, license_rule, customer_visible
                ORDER BY release_state
            """), {"job_id": JOB_ID})
            print("\n=== Verification ===")
            for row in r.fetchall():
                print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point; supports an optional --dry-run CLI flag.
    main()
|