chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
261
scripts/qa/backfill_job_66228863.py
Normal file
261
scripts/qa/backfill_job_66228863.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
|
||||
|
||||
eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
|
||||
were generated with Rule 3 (restricted): no source_citation, no source_original_text,
|
||||
release_state=too_close, customer_visible=False.
|
||||
|
||||
This script:
|
||||
1. Finds all 216 chunk→control pairs from the job
|
||||
2. Fetches original chunk text from Qdrant (via chunk_hash)
|
||||
3. Extracts article/paragraph references from chunk text
|
||||
4. Updates each control: license_rule=1, source_citation, source_original_text,
|
||||
release_state=draft, customer_visible=True, generation_metadata
|
||||
5. Updates processed_chunks to reflect the corrected license_rule
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from sqlalchemy import create_engine, text
|
||||
|
||||
# Try httpx first (available in container), fall back to requests
|
||||
try:
|
||||
import httpx
|
||||
def http_post(url, json_data, timeout=30):
|
||||
return httpx.post(url, json=json_data, timeout=timeout).json()
|
||||
except ImportError:
|
||||
import requests
|
||||
def http_post(url, json_data, timeout=30):
|
||||
return requests.post(url, json=json_data, timeout=timeout).json()
|
||||
|
||||
# ── Configuration ──────────────────────────────────────────────────────────
|
||||
|
||||
DB_URL = os.environ['DATABASE_URL']
|
||||
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
|
||||
JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
|
||||
DRY_RUN = '--dry-run' in sys.argv
|
||||
|
||||
LICENSE_INFO = {
|
||||
"license": "EU_LAW",
|
||||
"rule": 1,
|
||||
"source_type": "law",
|
||||
"name": "Batterieverordnung",
|
||||
}
|
||||
|
||||
# Article/paragraph extraction patterns
|
||||
ARTICLE_PATTERN = re.compile(
|
||||
r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
|
||||
re.IGNORECASE
|
||||
)
|
||||
PARAGRAPH_PATTERN = re.compile(
|
||||
r'(?:Absatz|Abs\.?)\s+(\d+)',
|
||||
re.IGNORECASE
|
||||
)
|
||||
# Also match "Artikel X Absatz Y" or "(Y)" after article
|
||||
ARTICLE_TITLE_PATTERN = re.compile(
|
||||
r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
|
||||
"""Extract the most prominent article and paragraph from chunk text."""
|
||||
articles = ARTICLE_PATTERN.findall(chunk_text)
|
||||
paragraphs = PARAGRAPH_PATTERN.findall(chunk_text)
|
||||
|
||||
# Take the first (most prominent) article mention
|
||||
article = f"Art. {articles[0]}" if articles else ""
|
||||
paragraph = f"Abs. {paragraphs[0]}" if paragraphs else ""
|
||||
return article, paragraph
|
||||
|
||||
|
||||
def main():
|
||||
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
|
||||
|
||||
with engine.begin() as conn:
|
||||
# ── Step 1: Get all chunk→control pairs ────────────────────────
|
||||
rows = conn.execute(text("""
|
||||
SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
|
||||
jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
|
||||
pc.id as chunk_row_id
|
||||
FROM compliance.canonical_processed_chunks pc
|
||||
WHERE pc.job_id = :job_id
|
||||
AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
|
||||
"""), {"job_id": JOB_ID}).fetchall()
|
||||
|
||||
print(f"Found {len(rows)} chunk→control pairs")
|
||||
|
||||
# ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
|
||||
chunk_hashes = set()
|
||||
for row in rows:
|
||||
chunk_hashes.add(row[0])
|
||||
print(f"Unique chunk hashes: {len(chunk_hashes)}")
|
||||
|
||||
# ── Step 3: Fetch all chunks from Qdrant in batches ───────────
|
||||
# Build a hash→text+metadata map by scrolling the collection
|
||||
hash_to_qdrant = {} # chunk_hash → {text, regulation_name_de, ...}
|
||||
collection = "bp_compliance_ce"
|
||||
offset = None
|
||||
batch_num = 0
|
||||
|
||||
print(f"Fetching chunks from Qdrant ({collection})...")
|
||||
while True:
|
||||
params = {
|
||||
"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
|
||||
"limit": 200,
|
||||
"with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
|
||||
"source", "celex", "chunk_index"],
|
||||
"with_vectors": False,
|
||||
}
|
||||
if offset:
|
||||
params["offset"] = offset
|
||||
|
||||
result = http_post(
|
||||
f"{QDRANT_URL}/collections/{collection}/points/scroll",
|
||||
params,
|
||||
timeout=30,
|
||||
)
|
||||
points = result.get("result", {}).get("points", [])
|
||||
next_offset = result.get("result", {}).get("next_page_offset")
|
||||
batch_num += 1
|
||||
|
||||
for p in points:
|
||||
text_content = p["payload"].get("chunk_text", "")
|
||||
h = hashlib.sha256(text_content.encode()).hexdigest()
|
||||
if h in chunk_hashes:
|
||||
hash_to_qdrant[h] = {
|
||||
"text": text_content,
|
||||
"regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
|
||||
"regulation_short": p["payload"].get("regulation_short", "BattVO"),
|
||||
"source": p["payload"].get("source", ""),
|
||||
"celex": p["payload"].get("celex", ""),
|
||||
"chunk_index": p["payload"].get("chunk_index"),
|
||||
}
|
||||
|
||||
sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
|
||||
sys.stdout.flush()
|
||||
|
||||
if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
|
||||
break
|
||||
offset = next_offset
|
||||
|
||||
print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")
|
||||
|
||||
# ── Step 4: Update controls ───────────────────────────────────
|
||||
updated = 0
|
||||
skipped = 0
|
||||
errors = 0
|
||||
|
||||
for row in rows:
|
||||
chunk_hash = row[0]
|
||||
regulation_code = row[1]
|
||||
control_id = row[3]
|
||||
chunk_row_id = row[4]
|
||||
|
||||
qdrant_data = hash_to_qdrant.get(chunk_hash)
|
||||
if not qdrant_data:
|
||||
print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
chunk_text = qdrant_data["text"]
|
||||
source_name = qdrant_data["regulation_name_de"]
|
||||
article, paragraph = extract_article_paragraph(chunk_text)
|
||||
|
||||
source_citation = {
|
||||
"source": source_name,
|
||||
"article": article,
|
||||
"paragraph": paragraph,
|
||||
"license": LICENSE_INFO["license"],
|
||||
"source_type": LICENSE_INFO["source_type"],
|
||||
"url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
|
||||
}
|
||||
|
||||
# Build updated generation_metadata (preserve existing fields)
|
||||
new_meta_patch = {
|
||||
"license_rule": 1,
|
||||
"source_regulation": regulation_code,
|
||||
"source_article": article,
|
||||
"source_paragraph": paragraph,
|
||||
"backfill_reason": "LICENSE_MAP missing eu_2023_1542",
|
||||
"backfill_date": "2026-03-19",
|
||||
}
|
||||
|
||||
if DRY_RUN:
|
||||
if updated < 3:
|
||||
print(f"\n [DRY RUN] Would update control {control_id}")
|
||||
print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
|
||||
print(f" article: {article}, paragraph: {paragraph}")
|
||||
print(f" text[:80]: {chunk_text[:80]}")
|
||||
updated += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
# Update the control
|
||||
conn.execute(text("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET license_rule = 1,
|
||||
source_original_text = :source_text,
|
||||
source_citation = CAST(:citation AS jsonb),
|
||||
customer_visible = true,
|
||||
release_state = CASE
|
||||
WHEN release_state = 'too_close' THEN 'draft'
|
||||
ELSE release_state
|
||||
END,
|
||||
generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
|
||||
updated_at = NOW()
|
||||
WHERE id = :control_id
|
||||
"""), {
|
||||
"control_id": control_id,
|
||||
"source_text": chunk_text,
|
||||
"citation": json.dumps(source_citation, ensure_ascii=False),
|
||||
"meta_patch": json.dumps(new_meta_patch),
|
||||
})
|
||||
|
||||
# Update the processed_chunk record too
|
||||
conn.execute(text("""
|
||||
UPDATE compliance.canonical_processed_chunks
|
||||
SET license_rule = 1,
|
||||
source_license = 'EU_LAW',
|
||||
processing_path = 'structured_batch'
|
||||
WHERE id = :chunk_id
|
||||
"""), {"chunk_id": chunk_row_id})
|
||||
|
||||
updated += 1
|
||||
except Exception as e:
|
||||
print(f"\n ERROR updating control {control_id}: {e}")
|
||||
errors += 1
|
||||
|
||||
print(f"\n\n=== BACKFILL COMPLETE ===")
|
||||
print(f" Updated: {updated}")
|
||||
print(f" Skipped: {skipped} (no Qdrant match)")
|
||||
print(f" Errors: {errors}")
|
||||
print(f" Dry run: {DRY_RUN}")
|
||||
|
||||
if DRY_RUN:
|
||||
print("\n Run without --dry-run to apply changes.")
|
||||
|
||||
# ── Step 5: Verify ────────────────────────────────────────────
|
||||
if not DRY_RUN:
|
||||
r = conn.execute(text("""
|
||||
WITH ctrl_ids AS (
|
||||
SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
|
||||
FROM compliance.canonical_processed_chunks
|
||||
WHERE job_id = :job_id
|
||||
AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
|
||||
)
|
||||
SELECT release_state, license_rule, customer_visible, count(*)
|
||||
FROM compliance.canonical_controls c
|
||||
JOIN ctrl_ids ci ON c.id = ci.ctrl_id
|
||||
GROUP BY release_state, license_rule, customer_visible
|
||||
ORDER BY release_state
|
||||
"""), {"job_id": JOB_ID})
|
||||
print("\n=== Verification ===")
|
||||
for row in r.fetchall():
|
||||
print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user