chore(qa): add PDF-based control QA scripts and results
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
261
scripts/qa/backfill_job_66228863.py
Normal file
261
scripts/qa/backfill_job_66228863.py
Normal file
@@ -0,0 +1,261 @@
|
|||||||
|
"""
|
||||||
|
Backfill script for job 66228863 — fix 216 controls that were wrongly processed as Rule 3.
|
||||||
|
|
||||||
|
eu_2023_1542 (Batterieverordnung) was missing from REGULATION_LICENSE_MAP, so all controls
|
||||||
|
were generated with Rule 3 (restricted): no source_citation, no source_original_text,
|
||||||
|
release_state=too_close, customer_visible=False.
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Finds all 216 chunk→control pairs from the job
|
||||||
|
2. Fetches original chunk text from Qdrant (via chunk_hash)
|
||||||
|
3. Extracts article/paragraph references from chunk text
|
||||||
|
4. Updates each control: license_rule=1, source_citation, source_original_text,
|
||||||
|
release_state=draft, customer_visible=True, generation_metadata
|
||||||
|
5. Updates processed_chunks to reflect the corrected license_rule
|
||||||
|
"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from sqlalchemy import create_engine, text
|
||||||
|
|
||||||
|
# Prefer httpx (present in the container image); degrade to requests when absent.
try:
    import httpx as _http

    def http_post(url, json_data, timeout=30):
        """POST a JSON body to *url* and return the decoded JSON response."""
        return _http.post(url, json=json_data, timeout=timeout).json()

except ImportError:
    import requests as _http

    def http_post(url, json_data, timeout=30):
        """POST a JSON body to *url* and return the decoded JSON response."""
        return _http.post(url, json=json_data, timeout=timeout).json()
|
||||||
|
|
||||||
|
# ── Configuration ──────────────────────────────────────────────────────────

# Required: connection URL of the compliance Postgres database.
DB_URL = os.environ['DATABASE_URL']
# Qdrant endpoint; the default is the host gateway as seen from inside Docker.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
# The ingestion job whose 216 mis-licensed controls are being corrected.
JOB_ID = '66228863-e79f-46fb-9f22-4bd8e1ec53d2'
# Pass --dry-run on the command line to preview updates without writing.
DRY_RUN = '--dry-run' in sys.argv

# License metadata applied to every corrected control (EU law => Rule 1,
# full citation and original text allowed).
LICENSE_INFO = {
    "license": "EU_LAW",
    "rule": 1,
    "source_type": "law",
    "name": "Batterieverordnung",
}

# Article/paragraph extraction patterns
# Matches "Artikel 12", "Art. 12", "Art 12a" (case-insensitive).
ARTICLE_PATTERN = re.compile(
    r'(?:Artikel|Art\.?)\s+(\d+[a-z]?)',
    re.IGNORECASE
)
# Matches "Absatz 3", "Abs. 3", "Abs 3" (case-insensitive).
PARAGRAPH_PATTERN = re.compile(
    r'(?:Absatz|Abs\.?)\s+(\d+)',
    re.IGNORECASE
)
# Also match "Artikel X Absatz Y" or "(Y)" after article
# (captures the article number plus its following title line).
ARTICLE_TITLE_PATTERN = re.compile(
    r'Artikel\s+(\d+[a-z]?)\s*\n([^\n]+)',
    re.IGNORECASE
)
|
||||||
|
|
||||||
|
def extract_article_paragraph(chunk_text: str) -> tuple[str, str]:
    """Return ("Art. <n>", "Abs. <n>") for the first references in *chunk_text*.

    Either element is the empty string when the text contains no matching
    Artikel/Absatz mention; the first occurrence is taken as the most
    prominent one.
    """
    art_match = ARTICLE_PATTERN.search(chunk_text)
    par_match = PARAGRAPH_PATTERN.search(chunk_text)
    return (
        f"Art. {art_match.group(1)}" if art_match else "",
        f"Abs. {par_match.group(1)}" if par_match else "",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Backfill the mis-licensed controls produced by job 66228863.

    Runs inside a single transaction: reads all chunk→control pairs for the
    job, recovers each chunk's original text from Qdrant (matched via the
    sha256 chunk hash), then rewrites license/citation fields on the controls
    and their processed-chunk rows. A final query reports the resulting
    release_state/license_rule distribution.
    """
    # search_path lets unqualified names resolve to the compliance schema.
    engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})

    with engine.begin() as conn:
        # ── Step 1: Get all chunk→control pairs ────────────────────────
        # generated_control_ids is a jsonb array; unnest it so each row is
        # one (chunk, control) pair.
        rows = conn.execute(text("""
            SELECT pc.chunk_hash, pc.regulation_code, pc.collection,
                   jsonb_array_elements_text(pc.generated_control_ids)::uuid as control_id,
                   pc.id as chunk_row_id
            FROM compliance.canonical_processed_chunks pc
            WHERE pc.job_id = :job_id
            AND jsonb_array_length(COALESCE(pc.generated_control_ids, '[]'::jsonb)) > 0
        """), {"job_id": JOB_ID}).fetchall()

        print(f"Found {len(rows)} chunk→control pairs")

        # ── Step 2: Collect unique chunk hashes for Qdrant lookup ──────
        chunk_hashes = set()
        for row in rows:
            chunk_hashes.add(row[0])
        print(f"Unique chunk hashes: {len(chunk_hashes)}")

        # ── Step 3: Fetch all chunks from Qdrant in batches ───────────
        # Build a hash→text+metadata map by scrolling the collection
        hash_to_qdrant = {}  # chunk_hash → {text, regulation_name_de, ...}
        collection = "bp_compliance_ce"
        offset = None
        batch_num = 0

        print(f"Fetching chunks from Qdrant ({collection})...")
        while True:
            params = {
                "filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_1542"}}]},
                "limit": 200,
                "with_payload": ["chunk_text", "regulation_name_de", "regulation_short",
                                 "source", "celex", "chunk_index"],
                "with_vectors": False,
            }
            # Qdrant scroll pagination: resume from the previous page's cursor.
            if offset:
                params["offset"] = offset

            result = http_post(
                f"{QDRANT_URL}/collections/{collection}/points/scroll",
                params,
                timeout=30,
            )
            points = result.get("result", {}).get("points", [])
            next_offset = result.get("result", {}).get("next_page_offset")
            batch_num += 1

            for p in points:
                text_content = p["payload"].get("chunk_text", "")
                # chunk_hash in the DB is the sha256 hex digest of the chunk text,
                # so hashing the payload text lets us match without stored IDs.
                h = hashlib.sha256(text_content.encode()).hexdigest()
                if h in chunk_hashes:
                    hash_to_qdrant[h] = {
                        "text": text_content,
                        "regulation_name_de": p["payload"].get("regulation_name_de", "Batterieverordnung"),
                        "regulation_short": p["payload"].get("regulation_short", "BattVO"),
                        "source": p["payload"].get("source", ""),
                        "celex": p["payload"].get("celex", ""),
                        "chunk_index": p["payload"].get("chunk_index"),
                    }

            # \r keeps the progress on a single console line.
            sys.stdout.write(f"\r Batch {batch_num}: scanned {batch_num * 200} points, matched {len(hash_to_qdrant)}/{len(chunk_hashes)}")
            sys.stdout.flush()

            # Stop when the collection is exhausted or every hash is resolved.
            if not next_offset or len(hash_to_qdrant) == len(chunk_hashes):
                break
            offset = next_offset

        print(f"\n Matched {len(hash_to_qdrant)}/{len(chunk_hashes)} chunks from Qdrant")

        # ── Step 4: Update controls ───────────────────────────────────
        updated = 0
        skipped = 0
        errors = 0

        for row in rows:
            chunk_hash = row[0]
            regulation_code = row[1]
            # row[2] is the collection name; not needed here.
            control_id = row[3]
            chunk_row_id = row[4]

            qdrant_data = hash_to_qdrant.get(chunk_hash)
            if not qdrant_data:
                print(f"\n WARN: No Qdrant match for chunk {chunk_hash[:20]}... (control {control_id})")
                skipped += 1
                continue

            chunk_text = qdrant_data["text"]
            source_name = qdrant_data["regulation_name_de"]
            article, paragraph = extract_article_paragraph(chunk_text)

            source_citation = {
                "source": source_name,
                "article": article,
                "paragraph": paragraph,
                "license": LICENSE_INFO["license"],
                "source_type": LICENSE_INFO["source_type"],
                # Link to EUR-Lex only when a CELEX number is known.
                "url": f"https://eur-lex.europa.eu/legal-content/DE/TXT/?uri=CELEX:{qdrant_data['celex']}" if qdrant_data.get("celex") else "",
            }

            # Build updated generation_metadata (preserve existing fields)
            new_meta_patch = {
                "license_rule": 1,
                "source_regulation": regulation_code,
                "source_article": article,
                "source_paragraph": paragraph,
                "backfill_reason": "LICENSE_MAP missing eu_2023_1542",
                "backfill_date": "2026-03-19",
            }

            if DRY_RUN:
                # Show a sample of the first few would-be updates only.
                if updated < 3:
                    print(f"\n [DRY RUN] Would update control {control_id}")
                    print(f" citation: {json.dumps(source_citation, ensure_ascii=False)[:120]}")
                    print(f" article: {article}, paragraph: {paragraph}")
                    print(f" text[:80]: {chunk_text[:80]}")
                updated += 1
                continue

            try:
                # Update the control: restore Rule-1 fields and flip
                # 'too_close' back to 'draft' (other states are preserved).
                conn.execute(text("""
                    UPDATE compliance.canonical_controls
                    SET license_rule = 1,
                        source_original_text = :source_text,
                        source_citation = CAST(:citation AS jsonb),
                        customer_visible = true,
                        release_state = CASE
                            WHEN release_state = 'too_close' THEN 'draft'
                            ELSE release_state
                        END,
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta_patch AS jsonb),
                        updated_at = NOW()
                    WHERE id = :control_id
                """), {
                    "control_id": control_id,
                    "source_text": chunk_text,
                    "citation": json.dumps(source_citation, ensure_ascii=False),
                    "meta_patch": json.dumps(new_meta_patch),
                })

                # Update the processed_chunk record too
                conn.execute(text("""
                    UPDATE compliance.canonical_processed_chunks
                    SET license_rule = 1,
                        source_license = 'EU_LAW',
                        processing_path = 'structured_batch'
                    WHERE id = :chunk_id
                """), {"chunk_id": chunk_row_id})

                updated += 1
            except Exception as e:
                # Keep going on per-control failures; totals are reported below.
                print(f"\n ERROR updating control {control_id}: {e}")
                errors += 1

        print(f"\n\n=== BACKFILL COMPLETE ===")
        print(f" Updated: {updated}")
        print(f" Skipped: {skipped} (no Qdrant match)")
        print(f" Errors: {errors}")
        print(f" Dry run: {DRY_RUN}")

        if DRY_RUN:
            print("\n Run without --dry-run to apply changes.")

        # ── Step 5: Verify ────────────────────────────────────────────
        # Re-aggregate the affected controls to confirm the new state mix.
        if not DRY_RUN:
            r = conn.execute(text("""
                WITH ctrl_ids AS (
                    SELECT DISTINCT jsonb_array_elements_text(generated_control_ids)::uuid as ctrl_id
                    FROM compliance.canonical_processed_chunks
                    WHERE job_id = :job_id
                    AND jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
                )
                SELECT release_state, license_rule, customer_visible, count(*)
                FROM compliance.canonical_controls c
                JOIN ctrl_ids ci ON c.id = ci.ctrl_id
                GROUP BY release_state, license_rule, customer_visible
                ORDER BY release_state
            """), {"job_id": JOB_ID})
            print("\n=== Verification ===")
            for row in r.fetchall():
                print(f" {str(row[0]):20s} rule={row[1]} visible={row[2]} count={row[3]}")


if __name__ == "__main__":
    main()
|
||||||
27
scripts/qa/delete_gpsr_prod.py
Normal file
27
scripts/qa/delete_gpsr_prod.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
"""Delete eu_2023_988 duplicate from production Qdrant."""
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
PROD_URL = "https://qdrant-dev.breakpilot.ai"
|
||||||
|
HEADERS = {"api-key": "z9cKbT74vl1aKPD1QGIlKWfET47VH93u"}
|
||||||
|
|
||||||
|
# Delete
|
||||||
|
resp = httpx.post(
|
||||||
|
f"{PROD_URL}/collections/bp_compliance_ce/points/delete",
|
||||||
|
json={"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}},
|
||||||
|
headers=HEADERS, timeout=60,
|
||||||
|
)
|
||||||
|
print(f"Delete status: {resp.json().get('status')}")
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
resp2 = httpx.post(
|
||||||
|
f"{PROD_URL}/collections/bp_compliance_ce/points/count",
|
||||||
|
json={"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}, "exact": True},
|
||||||
|
headers=HEADERS, timeout=15,
|
||||||
|
)
|
||||||
|
remaining = resp2.json().get("result", {}).get("count", 0)
|
||||||
|
print(f"Remaining: {remaining}")
|
||||||
|
|
||||||
|
# Total
|
||||||
|
resp3 = httpx.get(f"{PROD_URL}/collections/bp_compliance_ce", headers=HEADERS, timeout=10)
|
||||||
|
total = resp3.json().get("result", {}).get("points_count", "?")
|
||||||
|
print(f"Total points: {total}")
|
||||||
131
scripts/qa/pdf_article_lookup_poc.py
Normal file
131
scripts/qa/pdf_article_lookup_poc.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
"""POC v2: Find control's source text in PDF — distinguish headings from cross-refs."""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import psycopg2
|
||||||
|
import urllib.parse
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
# Locally downloaded Battery Regulation (EU) 2023/1542 PDF.
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/battery_2023_1542.pdf")

# Step 1: Extract full text from PDF
print("=== Step 1: Reading PDF ===")
doc = fitz.open(PDF_PATH)
full_text = ""
for page in doc:
    # Trailing newline keeps headings from fusing with the next page's text.
    full_text += page.get_text() + "\n"
print(f" Pages: {len(doc)}, Total chars: {len(full_text)}")
||||||
|
|
||||||
|
def normalize(s):
    """Canonicalize PDF-extracted text for substring matching.

    Strips soft hyphens (PDF line-break artifacts) and zero-width spaces,
    applies NFC so composed/decomposed forms compare equal, then collapses
    every whitespace run to a single space and trims the ends.
    """
    # NOTE: the original called .replace('\u00ad', '').replace('\xad', '') —
    # those are the same code point, so one translate pass suffices.
    s = s.translate({
        0x00AD: None,  # soft hyphen
        0x200B: None,  # zero-width space
    })
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
||||||
|
|
||||||
|
# Step 2: Build article heading index
# Article headings in EU regulations are on their own line: "Artikel 76"
# followed by a title line like: "Rücknahme"
# Cross-references look like: "gemäß Artikel 290 des Vertrags"
print("\n=== Step 2: Building article HEADING index ===")
# Pattern: "Artikel N" at start of line, NOT preceded by text on same line
heading_pattern = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', re.MULTILINE)
headings = []
for match in heading_pattern.finditer(full_text):
    # Numeric part only (suffix letters like "5a" are kept in the label).
    art_num = int(re.match(r'(\d+)', match.group(1)).group(1))
    # Filter: Batterieverordnung has articles 1-96, not 114/192/290
    if art_num <= 96:
        headings.append((match.start(), match.group(1)))

# Sort by position
headings.sort(key=lambda x: x[0])
# Deduplicate (keep first occurrence of each article)
seen = set()
unique_headings = []
for pos, num in headings:
    if num not in seen:
        seen.add(num)
        unique_headings.append((pos, num))
headings = unique_headings

print(f" Found {len(headings)} unique article headings")
for h in headings[:15]:
    # Show context
    ctx = full_text[h[0]:h[0]+60].replace('\n', '|')
    print(f" Pos {h[0]:6d}: Artikel {h[1]:3s} → '{ctx[:50]}'")
if len(headings) > 15:
    print(f" ... and {len(headings)-15} more (up to Artikel {headings[-1][1]})")

# Normalize full text for searching
full_norm = normalize(full_text)

# Precompute normalized heading positions
# (the normalized length of the prefix equals the heading's index in full_norm)
heading_norm_positions = []
for pos, num in headings:
    norm_pos = len(normalize(full_text[:pos]))
    heading_norm_positions.append((norm_pos, num))
|
||||||
|
|
||||||
|
# Step 3: Get controls from DB
print("\n=== Step 3: Looking up controls ===")
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# '%%' is a literal '%' under psycopg2's paramstyle escaping, so this matches
# any citation source containing "1542" (the Battery Regulation).
cur.execute("""
    SELECT id, control_id, title, source_original_text,
           source_citation->>'article' as existing_article
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE '%%1542%%'
    AND source_original_text IS NOT NULL
    ORDER BY control_id
""")
controls = cur.fetchall()
print(f" Got {len(controls)} controls")
|
||||||
|
|
||||||
|
# Step 4: Match
print("\n=== Step 4: Matching controls to PDF articles ===")
found = 0
not_found = 0
results = []

for ctrl in controls:
    ctrl_id, control_id, title, orig_text, existing_art = ctrl
    orig_norm = normalize(orig_text)

    matched = False
    # Probe with progressively shorter snippets taken from 1/4 into the text
    # (the start of a chunk is more likely to be a heading or boilerplate).
    for length in [80, 60, 40, 30]:
        start = max(0, len(orig_norm) // 4)
        snippet = orig_norm[start:start+length]
        if not snippet or len(snippet) < 20:
            continue
        pos = full_norm.find(snippet)
        if pos >= 0:
            # Find which article heading precedes this position
            article = "Preamble"
            for h_pos, h_num in reversed(heading_norm_positions):
                if h_pos <= pos:
                    article = h_num
                    break

            # NOTE(review): existing_art comes from the citation JSON while
            # `article` is a bare heading number — verify both use the same
            # format, otherwise DIFF may be reported for matching articles.
            status = "MATCH" if existing_art == article else ("NEW" if not existing_art else f"DIFF({existing_art}→{article})")
            print(f" {control_id:10s}: Artikel {article:3s} [{status}] {title[:55]}")
            found += 1
            matched = True
            results.append((ctrl_id, control_id, article))
            break

    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:55]}")
        print(f" Text: '{orig_norm[20:70]}...'")

print(f"\n=== Result: {found}/{len(controls)} found ({not_found} not found) ===")
if headings:
    print(f" Articles covered: {headings[0][1]} - {headings[-1][1]}")
conn.close()
|
||||||
475
scripts/qa/pdf_qa_all.py
Normal file
475
scripts/qa/pdf_qa_all.py
Normal file
@@ -0,0 +1,475 @@
|
|||||||
|
"""
|
||||||
|
PDF-based QA: Match ALL controls' source_original_text against original PDFs.
|
||||||
|
Determine exact article/section/paragraph for each control.
|
||||||
|
Handle: EU regulations (Artikel), German laws (§), NIST sections, OWASP categories,
|
||||||
|
Erwägungsgründe (preamble), Anhänge (annexes).
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import unicodedata
|
||||||
|
import psycopg2
|
||||||
|
import urllib.parse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# PyMuPDF is optional: PDF sources can only be parsed when it is installed;
# read_file() returns None for PDFs otherwise.
try:
    import fitz  # PyMuPDF
    HAS_FITZ = True
except ImportError:
    HAS_FITZ = False

# Local mirrors of the original documents: PDFs plus a plain-text fallback dir.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))
|
||||||
|
|
||||||
|
# ── Source name → file path mapping ──────────────────────────────────
|
||||||
|
SOURCE_FILE_MAP = {
|
||||||
|
# EU Regulations (PDFs)
|
||||||
|
"KI-Verordnung (EU) 2024/1689": "ai_act_2024_1689.pdf",
|
||||||
|
"Maschinenverordnung (EU) 2023/1230": "machinery_regulation_2023_1230.pdf",
|
||||||
|
"Cyber Resilience Act (CRA)": "cra_2024_2847.pdf",
|
||||||
|
"EU Blue Guide 2022": "blue_guide_2022.pdf",
|
||||||
|
"Markets in Crypto-Assets (MiCA)": "mica_2023_1114.pdf",
|
||||||
|
"DSGVO (EU) 2016/679": "dsgvo_2016_679.pdf",
|
||||||
|
"Batterieverordnung (EU) 2023/1542": "battery_2023_1542.pdf",
|
||||||
|
"NIS2-Richtlinie (EU) 2022/2555": "nis2_2022_2555.pdf",
|
||||||
|
"AML-Verordnung": "amlr_2024_1624.pdf",
|
||||||
|
"Data Governance Act (DGA)": "dga_2022_868.pdf",
|
||||||
|
"Data Act": "dataact_2023_2854.pdf",
|
||||||
|
"GPSR (EU) 2023/988": "gpsr_2023_988.pdf",
|
||||||
|
"IFRS-Übernahmeverordnung": "ifrs_regulation_2023_1803_de.pdf",
|
||||||
|
|
||||||
|
# NIST (PDFs)
|
||||||
|
"NIST SP 800-53 Rev. 5": None, # TODO: Need to find/download
|
||||||
|
"NIST SP 800-207 (Zero Trust)": None,
|
||||||
|
"NIST SP 800-63-3": None,
|
||||||
|
"NIST AI Risk Management Framework": None,
|
||||||
|
"NIST SP 800-218 (SSDF)": "nist_sp_800_218_ssdf.pdf",
|
||||||
|
"NIST Cybersecurity Framework 2.0": "nist_csf_2_0.pdf",
|
||||||
|
|
||||||
|
# OWASP (no PDFs — these are web-based)
|
||||||
|
"OWASP Top 10 (2021)": None,
|
||||||
|
"OWASP ASVS 4.0": None,
|
||||||
|
"OWASP SAMM 2.0": None,
|
||||||
|
"OWASP API Security Top 10 (2023)": None,
|
||||||
|
"OWASP MASVS 2.0": None,
|
||||||
|
|
||||||
|
# ENISA (PDFs)
|
||||||
|
"ENISA ICS/SCADA Dependencies": None,
|
||||||
|
"ENISA Supply Chain Good Practices": "enisa_supply_chain_security.pdf",
|
||||||
|
"ENISA Threat Landscape Supply Chain": "enisa_supply_chain_security.pdf",
|
||||||
|
"ENISA Cybersecurity State 2024": None,
|
||||||
|
"CISA Secure by Design": "enisa_secure_by_design.pdf",
|
||||||
|
|
||||||
|
# German laws (PDFs or TXT)
|
||||||
|
"Bundesdatenschutzgesetz (BDSG)": "bdsg.pdf",
|
||||||
|
"Gewerbeordnung (GewO)": "gewo.pdf",
|
||||||
|
"Handelsgesetzbuch (HGB)": "hgb.pdf",
|
||||||
|
"Abgabenordnung (AO)": "ao.pdf",
|
||||||
|
|
||||||
|
# Austrian DSG
|
||||||
|
"Österreichisches Datenschutzgesetz (DSG)": None, # ris HTML
|
||||||
|
|
||||||
|
# EDPB Guidelines (PDFs)
|
||||||
|
"EDPB Leitlinien 01/2022 (BCR)": "edpb_bcr_01_2022.pdf",
|
||||||
|
"EDPB Leitlinien 05/2020 - Einwilligung": None, # txt
|
||||||
|
"EDPB Leitlinien 08/2020 (Social Media)": "edpb_social_media_08_2020.pdf",
|
||||||
|
"EDPB Leitlinien 01/2019 (Zertifizierung)": "edpb_certification_01_2019.pdf",
|
||||||
|
"EDPB Leitlinien 07/2020 (Datentransfers)": "edpb_transfers_07_2020.pdf",
|
||||||
|
"EDPB Leitlinien 09/2022 (Data Breach)": "edpb_breach_09_2022.pdf",
|
||||||
|
"EDPB Leitlinien - Berechtigtes Interesse (Art. 6(1)(f))": "edpb_legitimate_interest.pdf",
|
||||||
|
"EDPB Leitlinien 01/2024 (Berechtigtes Interesse)": "edpb_legitimate_interest.pdf",
|
||||||
|
"EDPB Leitlinien 04/2019 (Data Protection by Design)": None, # txt
|
||||||
|
"EDPB Leitlinien 01/2020 (Vernetzte Fahrzeuge)": "edpb_connected_vehicles_01_2020.pdf",
|
||||||
|
"EDPB Leitlinien 01/2020 (Datentransfers)": "edpb_transfers_07_2020.pdf",
|
||||||
|
|
||||||
|
# WP (Working Party) Guidelines
|
||||||
|
"WP244 Leitlinien (Profiling)": "edpb_wp251_profiling.pdf",
|
||||||
|
"WP251 Leitlinien (Profiling)": "edpb_wp251_profiling.pdf",
|
||||||
|
"WP260 Leitlinien (Transparenz)": "edpb_wp260_transparency.pdf",
|
||||||
|
|
||||||
|
# OECD
|
||||||
|
"OECD KI-Empfehlung": "oecd_ai_principles.pdf",
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Document type classification ─────────────────────────────────────
|
||||||
|
DOC_TYPE_MAP = {
|
||||||
|
# EU regulations: "Artikel N"
|
||||||
|
"eu_regulation": [
|
||||||
|
"KI-Verordnung", "Maschinenverordnung", "Cyber Resilience",
|
||||||
|
"Blue Guide", "MiCA", "DSGVO", "Batterieverordnung", "NIS2",
|
||||||
|
"AML-Verordnung", "Data Governance", "Data Act", "GPSR",
|
||||||
|
"IFRS", "Markets in Crypto",
|
||||||
|
],
|
||||||
|
# German laws: "§ N"
|
||||||
|
"de_law": [
|
||||||
|
"BDSG", "GewO", "HGB", "Abgabenordnung",
|
||||||
|
],
|
||||||
|
# NIST: "Section X.Y" or control families "AC-1"
|
||||||
|
"nist": [
|
||||||
|
"NIST SP", "NIST Cybersecurity", "NIST AI",
|
||||||
|
],
|
||||||
|
# OWASP: "A01:2021" or "V1.1"
|
||||||
|
"owasp": [
|
||||||
|
"OWASP",
|
||||||
|
],
|
||||||
|
# EDPB: numbered paragraphs or sections
|
||||||
|
"edpb": [
|
||||||
|
"EDPB", "WP244", "WP251", "WP260",
|
||||||
|
],
|
||||||
|
# ENISA: sections
|
||||||
|
"enisa": [
|
||||||
|
"ENISA", "CISA",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def classify_doc(source_name):
    """Return the doc-type bucket for *source_name*, "unknown" when nothing hits.

    Buckets come from DOC_TYPE_MAP; the first keyword found (case-insensitive
    substring match, in map order) wins.
    """
    if not source_name:
        return "unknown"
    haystack = source_name.lower()
    return next(
        (doc_type
         for doc_type, keywords in DOC_TYPE_MAP.items()
         for kw in keywords
         if kw.lower() in haystack),
        "unknown",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(s):
    """Canonicalize PDF-extracted text for substring matching.

    Removes soft hyphens and zero-width spaces, maps non-breaking spaces to
    plain spaces, expands the fi/fl ligature code points, applies NFC so
    composed/decomposed forms compare equal, then collapses whitespace runs
    to single spaces and trims the ends.
    """
    # NOTE: the original called .replace('\u00ad', '').replace('\xad', '') —
    # those are the same code point, so one translate pass suffices.
    s = s.translate({
        0x00AD: None,   # soft hyphen
        0x200B: None,   # zero-width space
        0x00A0: ' ',    # no-break space
        0xFB01: 'fi',   # LATIN SMALL LIGATURE FI
        0xFB02: 'fl',   # LATIN SMALL LIGATURE FL
    })
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(filename):
    """Return the full text of *filename*, or None when it cannot be read.

    Resolution order:
      1. PDF_DIR/filename missing → try TEXT_DIR/<stem>.txt, else None.
      2. .pdf → per-page text via PyMuPDF (None when fitz is unavailable).
      3. .txt / .html → decoded as UTF-8 with replacement characters.
      4. any other suffix → None.
    """
    source = PDF_DIR / filename

    if not source.exists():
        fallback = TEXT_DIR / (source.stem + ".txt")
        if fallback.exists():
            return fallback.read_text(encoding='utf-8', errors='replace')
        return None

    if source.suffix == '.pdf':
        if not HAS_FITZ:
            return None
        pdf = fitz.open(str(source))
        pages = []
        for page in pdf:
            pages.append(page.get_text() + "\n")
        pdf.close()
        return "".join(pages)

    if source.suffix in ('.txt', '.html'):
        return source.read_text(encoding='utf-8', errors='replace')

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_eu_article_index(text, max_article=None):
    """Index the structural markers of a German-language EU regulation.

    Returns a position-sorted list of (char_position, label, kind) tuples,
    deduplicated to the first occurrence of each label. *kind* is one of
    'preamble' (Erwägungsgrund), 'article' (Artikel) or 'annex' (Anhang).
    *max_article* drops "Artikel N" hits above the regulation's real article
    count (filters cross-references to other instruments, e.g. Artikel 290
    TFEU)."""
    found = []

    # Recitals are the "(n)" markers that occur before the "Artikel 1" heading.
    first_art = re.search(r'\nArtikel\s+1\s*\n', text)
    preamble_end = first_art.start() if first_art else len(text)
    for hit in re.finditer(r'(?:^|\n)\s*\((\d+)\)', text[:preamble_end]):
        found.append((hit.start(), f"Erwägungsgrund ({hit.group(1)})", "preamble"))

    # Article headings: "Artikel N" alone on its own line.
    for hit in re.finditer(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        label = hit.group(1)
        number = int(re.match(r'(\d+)', label).group(1))
        if max_article and number > max_article:
            continue
        found.append((hit.start(), f"Artikel {label}", "article"))

    # Annex headings, with a Roman numeral or (single annex) without one.
    for hit in re.finditer(r'(?:^|\n)\s*ANHANG\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        found.append((hit.start(), f"Anhang {hit.group(1)}", "annex"))
    for hit in re.finditer(r'(?:^|\n)\s*ANHANG\s*\n', text, re.MULTILINE):
        found.append((hit.start(), "Anhang", "annex"))

    found.sort(key=lambda item: item[0])

    # First occurrence of each label wins.
    seen_labels = set()
    deduped = []
    for entry in found:
        if entry[1] not in seen_labels:
            seen_labels.add(entry[1])
            deduped.append(entry)
    return deduped
|
||||||
|
|
||||||
|
|
||||||
|
def build_de_law_index(text):
    """Index German statute sections ("§ N") in *text*.

    Returns position-sorted (char_position, label, kind) tuples with
    kind='section', keeping only the first occurrence of each label."""
    hits = [(m.start(), f"§ {m.group(1)}", "section")
            for m in re.finditer(r'(?:^|\n)\s*§\s+(\d+[a-z]?)\b', text, re.MULTILINE)]
    hits.sort(key=lambda entry: entry[0])

    seen_labels = set()
    deduped = []
    for entry in hits:
        if entry[1] in seen_labels:
            continue
        seen_labels.add(entry[1])
        deduped.append(entry)
    return deduped
|
||||||
|
|
||||||
|
|
||||||
|
def build_nist_index(text):
    """Index NIST document structure: dotted section headings and control IDs.

    Matches "N.N[.N] Title" headings (kind='section') and two-letter control
    family identifiers like "AC-1" (kind='control'); returns position-sorted
    (char_position, label, kind) tuples, first occurrence of each label only."""
    hits = []
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        hits.append((m.start(), f"Section {m.group(1)}", "section"))
    for m in re.finditer(r'(?:^|\n)\s*([A-Z]{2}-\d+)\b', text, re.MULTILINE):
        hits.append((m.start(), f"{m.group(1)}", "control"))

    hits.sort(key=lambda entry: entry[0])

    seen_labels = set()
    deduped = []
    for entry in hits:
        if entry[1] in seen_labels:
            continue
        seen_labels.add(entry[1])
        deduped.append(entry)
    return deduped
|
||||||
|
|
||||||
|
|
||||||
|
def build_generic_index(text):
    """Fallback index: numbered headings like "1.", "1.1.", "1.1.1." at line start.

    Returns position-sorted (char_position, label, kind) tuples with
    kind='section', first occurrence of each label only."""
    hits = [(m.start(), f"Section {m.group(1)}", "section")
            for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE)]
    hits.sort(key=lambda entry: entry[0])

    seen_labels = set()
    deduped = []
    for entry in hits:
        if entry[1] in seen_labels:
            continue
        seen_labels.add(entry[1])
        deduped.append(entry)
    return deduped
|
||||||
|
|
||||||
|
|
||||||
|
# Known max article numbers for EU regulations
|
||||||
|
MAX_ARTICLES = {
|
||||||
|
"Batterieverordnung (EU) 2023/1542": 96,
|
||||||
|
"KI-Verordnung (EU) 2024/1689": 113,
|
||||||
|
"Maschinenverordnung (EU) 2023/1230": 54,
|
||||||
|
"Cyber Resilience Act (CRA)": 71,
|
||||||
|
"NIS2-Richtlinie (EU) 2022/2555": 46,
|
||||||
|
"DSGVO (EU) 2016/679": 99,
|
||||||
|
"Markets in Crypto-Assets (MiCA)": 149,
|
||||||
|
"AML-Verordnung": 95,
|
||||||
|
"Data Governance Act (DGA)": 38,
|
||||||
|
"Data Act": 50,
|
||||||
|
"GPSR (EU) 2023/988": 52,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def find_text_in_doc(orig_text, full_norm, index, index_norm_positions):
    """Find control text in document and return (article_label, article_type) or None.

    Normalizes *orig_text* and probes *full_norm* (the normalized document)
    with progressively shorter snippets taken from several relative start
    positions. On the first hit, the closest preceding heading from
    *index_norm_positions* (normalized-offset, label, type triples) determines
    the result. *index* is accepted for signature compatibility but unused.
    Texts normalizing to fewer than 30 chars are rejected outright.
    """
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 30:
        return None

    # Probe mid-text first (0.25), since starts/ends of extracted passages
    # are the most likely to be mangled.
    for start_frac in (0.25, 0.1, 0.5, 0.0):
        anchor = max(0, int(len(orig_norm) * start_frac))
        for snippet_len in (80, 60, 40, 30):
            snippet = orig_norm[anchor:anchor + snippet_len]
            if len(snippet) < 25:
                continue
            hit_pos = full_norm.find(snippet)
            if hit_pos < 0:
                continue
            # Walk headings from the end; the first one at or before the hit
            # is the section that contains the matched text.
            label, typ = "Unknown", "unknown"
            for h_pos, h_label, h_type in reversed(index_norm_positions):
                if h_pos <= hit_pos:
                    label, typ = h_label, h_type
                    break
            return (label, typ)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Match every control's source_original_text against its source document
    and report/record the article each control actually belongs to.

    Reads DATABASE_URL from the environment, writes a JSON result file to
    /tmp/pdf_qa_results.json, and prints a per-source and overall report.
    """
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()

    # Get all controls with source_original_text (short texts are skipped:
    # they are too ambiguous to locate in the document).
    cur.execute("""
        SELECT id, control_id, title, source_original_text,
               source_citation->>'source' as source_name,
               source_citation->>'article' as existing_article,
               source_citation as citation_json,
               release_state
        FROM compliance.canonical_controls
        WHERE source_original_text IS NOT NULL
          AND length(source_original_text) > 50
        ORDER BY source_citation->>'source', control_id
    """)
    controls = cur.fetchall()
    print(f"Total controls with source text: {len(controls)}")

    # Group controls by their cited source document.
    by_source = {}
    for ctrl in controls:
        src = ctrl[4] or "(null)"
        by_source.setdefault(src, []).append(ctrl)

    # Process each source.
    total_found = 0
    total_not_found = 0
    total_updated = 0  # NOTE(review): never incremented below — dead counter?
    total_new_article = 0
    total_changed = 0
    total_skipped_no_file = 0
    updates = []  # (ctrl_id, new_article_label, article_type, control_id, source_name)

    # Largest sources first, so the slow part of the run front-loads progress.
    for source_name in sorted(by_source.keys(), key=lambda s: -len(by_source[s])):
        ctrls = by_source[source_name]
        filename = SOURCE_FILE_MAP.get(source_name)
        doc_type = classify_doc(source_name)

        # No PDF/text file known for this source — skip the whole group.
        if filename is None:
            total_skipped_no_file += len(ctrls)
            active = sum(1 for c in ctrls if c[7] not in ('duplicate', 'too_close'))
            print(f"\n{'='*60}")
            print(f"SKIP: {source_name} ({len(ctrls)} controls, {active} active) — no PDF")
            continue

        # Read file; read_file returns None when extraction fails.
        text = read_file(filename)
        if text is None:
            total_skipped_no_file += len(ctrls)
            print(f"\n{'='*60}")
            print(f"SKIP: {source_name} — file not readable: (unknown)")
            continue

        text_norm = normalize(text)

        # Build the section index appropriate for this document type.
        max_art = MAX_ARTICLES.get(source_name)
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        else:
            index = build_generic_index(text)

        # Precompute heading positions in *normalized* coordinates so they are
        # comparable with find() offsets into text_norm.
        index_norm = []
        for pos, label, typ in index:
            norm_pos = len(normalize(text[:pos]))
            index_norm.append((norm_pos, label, typ))

        active = sum(1 for c in ctrls if c[7] not in ('duplicate', 'too_close'))
        print(f"\n{'='*60}")
        print(f"{source_name} ({len(ctrls)} controls, {active} active)")
        print(f"  File: (unknown) ({len(text):,} chars)")
        print(f"  Index: {len(index)} sections ({doc_type})")

        src_found = 0
        src_not_found = 0

        for ctrl in ctrls:
            ctrl_id, control_id, title, orig_text, _, existing_art, citation_json, state = ctrl

            result = find_text_in_doc(orig_text, text_norm, index, index_norm)

            if result:
                new_label, art_type = result
                src_found += 1
                total_found += 1

                # Compare the located article with the currently stored one.
                existing_clean = (existing_art or "").strip()
                if not existing_clean:
                    status = "NEW"
                    total_new_article += 1
                elif existing_clean == new_label:
                    status = "OK"
                else:
                    status = f"CHANGED({existing_clean}→{new_label})"
                    total_changed += 1

                updates.append((ctrl_id, new_label, art_type, control_id, source_name))

                # Only print rows that need attention (new or changed).
                if status != "OK":
                    is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
                    print(f"  {control_id:10s}: {new_label:25s} [{art_type:8s}] {status}{is_active}")
            else:
                src_not_found += 1
                total_not_found += 1
                print(f"  {control_id:10s}: NOT FOUND {title[:50]}")

        pct = src_found / len(ctrls) * 100 if ctrls else 0
        print(f"  → {src_found}/{len(ctrls)} matched ({pct:.0f}%)")

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"  Total controls with text: {len(controls)}")
    print(f"  Matched to PDF: {total_found}")
    print(f"  Not found in PDF: {total_not_found}")
    print(f"  Skipped (no PDF file): {total_skipped_no_file}")
    print(f"  New articles assigned: {total_new_article}")
    print(f"  Articles changed: {total_changed}")

    # Save results for later application (applied by a separate script).
    results = []
    for ctrl_id, label, art_type, control_id, source in updates:
        results.append({
            "ctrl_id": str(ctrl_id),
            "control_id": control_id,
            "source": source,
            "article_label": label,
            "article_type": art_type,
        })

    out_path = "/tmp/pdf_qa_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n  Results saved to {out_path} ({len(results)} entries)")

    # Type distribution (article vs. recital vs. annex etc.), most common first.
    type_counts = {}
    for r in results:
        t = r["article_type"]
        type_counts[t] = type_counts.get(t, 0) + 1
    print(f"\n  Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"    {t:12s}: {c:5d}")

    conn.close()


if __name__ == "__main__":
    main()
|
||||||
95
scripts/qa/pdf_qa_inventory.py
Normal file
95
scripts/qa/pdf_qa_inventory.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""Inventory: Which regulations have controls, how many, and do we have PDFs?"""
import os
import re  # NOTE(review): appears unused in this script
import json  # NOTE(review): appears unused in this script
import psycopg2
import urllib.parse
from pathlib import Path

# Local document stores from the RAG ingestion pipeline.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
TEXT_DIR = Path(os.path.expanduser("~/rag-ingestion/texts"))

# DB connection (DATABASE_URL must be set in the environment).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Get all regulations with controls, with per-source counts of active controls
# (not duplicate/too_close), controls carrying an article, and controls with
# usable source text (>50 chars).
cur.execute("""
    SELECT
        source_citation->>'source' as source_name,
        count(*) as total,
        count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) as active,
        count(*) FILTER (WHERE source_citation->>'article' IS NOT NULL AND source_citation->>'article' != '') as has_article,
        count(*) FILTER (WHERE source_original_text IS NOT NULL AND length(source_original_text) > 50) as has_text
    FROM compliance.canonical_controls
    WHERE source_citation IS NOT NULL
    GROUP BY 1
    ORDER BY active DESC
""")
regs = cur.fetchall()

# List available PDF/text/HTML files by stem for matching against source names.
pdf_files = {f.stem: f for f in PDF_DIR.glob("*.pdf")} if PDF_DIR.exists() else {}
txt_files = {f.stem: f for f in TEXT_DIR.glob("*.txt")} if TEXT_DIR.exists() else {}
html_files = {f.stem: f for f in PDF_DIR.glob("*.html")} if PDF_DIR.exists() else {}

# Also collect every file in PDF_DIR regardless of extension (XML/zip etc.).
all_files = {}
if PDF_DIR.exists():
    for f in PDF_DIR.iterdir():
        all_files[f.stem] = f

print(f"{'Source':55s} {'Total':>6s} {'Active':>7s} {'w/Art':>6s} {'w/Text':>7s} {'PDF?':>5s}")
print("-" * 92)

total_controls = 0
total_active = 0
total_with_text = 0
total_with_pdf = 0  # NOTE(review): initialized but never incremented — dead counter?
no_pdf = []

for row in regs:
    source, total, active, has_art, has_text = row
    if not source:
        source = "(null)"
    total_controls += total
    total_active += active
    total_with_text += has_text if active > 0 else 0

    # Try to find a matching source file via loose substring matching in
    # either direction; "?" marks no obvious match.
    has_pdf = "?"
    name_lower = source.lower()
    for stem, path in all_files.items():
        if stem.lower() in name_lower or name_lower[:20] in stem.lower():
            has_pdf = path.suffix
            break

    if active > 0:
        if has_pdf == "?":
            no_pdf.append((source, active, has_text))
    print(f"{source[:55]:55s} {total:6d} {active:7d} {has_art:6d} {has_text:7d} {has_pdf:>5s}")

print("-" * 92)
print(f"{'TOTAL':55s} {total_controls:6d} {total_active:7d}")
print(f"\nAvailable files in {PDF_DIR}: {len(all_files)}")
print(f"  PDFs: {len(pdf_files)}, TXT: {len(txt_files)}, HTML: {len(html_files)}")

# Regulations whose controls have no matching local document.
if no_pdf:
    print(f"\n=== Regulations WITHOUT obvious PDF match ({len(no_pdf)}) ===")
    for source, active, has_text in no_pdf:
        print(f"  {source[:60]:60s} {active:4d} controls, {has_text:4d} with text")

# Also list all available files for manual matching.
print(f"\n=== Available source files ({len(all_files)}) ===")
for stem in sorted(all_files.keys()):
    print(f"  {stem}{all_files[stem].suffix}")

conn.close()
|
||||||
28772
scripts/qa/pdf_qa_results_2026-03-20.json
Normal file
28772
scripts/qa/pdf_qa_results_2026-03-20.json
Normal file
File diff suppressed because it is too large
Load Diff
190
scripts/qa/qa_apply_and_dedup.py
Normal file
190
scripts/qa/qa_apply_and_dedup.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
"""
Step 3: Apply article mappings to all controls + detect duplicates.
1. Update source_citation article/paragraph for controls that have a better mapping
2. Identify duplicate controls (same regulation + article + paragraph)

Requires DATABASE_URL; reads /tmp/all_article_mappings.json (produced by the
chunk-mapping step) and writes /tmp/dedup_plan.json. Pass --dry-run to skip
the UPDATE statements while still printing what would change.
"""
import json
import os
import sys
from collections import defaultdict  # NOTE(review): appears unused in this script

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv

# Load chunk_hash -> {article, paragraph} mappings from the previous step.
with open("/tmp/all_article_mappings.json") as f:
    article_mapping = json.load(f)
print(f"Loaded {len(article_mapping)} article mappings")

print(f"\n{'=' * 70}")
print("STEP 3a: UPDATE CONTROLS WITH IMPROVED ARTICLE MAPPINGS")
print(f"{'=' * 70}")

with engine.begin() as conn:
    # Fast approach: load all chunk→control mappings at once rather than
    # querying per control.
    print("  Loading chunk→control mappings...")
    chunk_rows = conn.execute(sql_text("""
        SELECT chunk_hash, jsonb_array_elements_text(generated_control_ids) as control_id
        FROM compliance.canonical_processed_chunks
        WHERE jsonb_array_length(COALESCE(generated_control_ids, '[]'::jsonb)) > 0
    """)).fetchall()

    # Invert to control_id -> chunk_hash (last chunk wins on collisions).
    control_to_hash = {}
    for row in chunk_rows:
        control_to_hash[row[1]] = row[0]
    print(f"  Unique controls with chunk: {len(control_to_hash)}")

    # Get current article info for controls with citations (skip v1/v2 without citation).
    print("  Loading control article data...")
    ctrl_rows = conn.execute(sql_text("""
        SELECT id,
               source_citation->>'article' as current_article,
               source_citation->>'paragraph' as current_paragraph
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND release_state NOT IN ('rejected')
    """)).fetchall()
    print(f"  Controls with citation: {len(ctrl_rows)}")

    updated = 0   # counted even in dry-run mode (would-be updates)
    improved = 0  # article filled where previously empty
    changed = 0   # article replaced with a different value

    for row in ctrl_rows:
        ctrl_id = str(row[0])
        current_art = row[1] or ""
        current_para = row[2] or ""
        chunk_hash = control_to_hash.get(ctrl_id)

        if not chunk_hash:
            continue

        mapping = article_mapping.get(chunk_hash)
        if not mapping or not mapping["article"]:
            continue

        new_art = mapping["article"]
        new_para = mapping["paragraph"]

        # No change needed.
        if current_art == new_art and current_para == new_para:
            continue

        if not current_art and new_art:
            improved += 1
        elif current_art != new_art:
            changed += 1

        if not DRY_RUN:
            # Merge patches into the existing JSONB rather than replacing it.
            citation_patch = json.dumps({"article": new_art, "paragraph": new_para})
            meta_patch = json.dumps({"source_article": new_art, "source_paragraph": new_para})
            conn.execute(sql_text("""
                UPDATE compliance.canonical_controls
                SET source_citation = COALESCE(source_citation, '{}'::jsonb) || CAST(:citation AS jsonb),
                    generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) || CAST(:meta AS jsonb)
                WHERE id = :id
            """), {"id": row[0], "citation": citation_patch, "meta": meta_patch})

        updated += 1

    print(f"\n  Updated: {updated}")
    print(f"  New article (was empty): {improved}")
    print(f"  Changed article: {changed}")
    print(f"  Dry run: {DRY_RUN}")

    # ── Step 3b: Verification — article coverage after update ─────────
    print(f"\n{'=' * 70}")
    print("STEP 3b: ARTICLE COVERAGE AFTER UPDATE")
    print(f"{'=' * 70}")

    r = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            count(*) as total,
            count(*) FILTER (WHERE source_citation->>'article' != '' AND source_citation->>'article' IS NOT NULL) as with_art,
            count(*) FILTER (WHERE source_citation IS NULL) as no_cit
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected')
        GROUP BY generation_metadata->>'source_regulation'
        HAVING count(*) >= 3
        ORDER BY count(*) DESC
    """))
    print(f"\n  {'Regulation':35s} {'Total':>6s} {'WithArt':>7s} {'%':>5s}")
    print(f"  {'-' * 60}")
    grand_total = 0
    grand_art = 0
    for row in r.fetchall():
        reg = str(row[0])[:35] if row[0] else "(none/v1v2)"
        pct = f"{row[2]/row[1]*100:.0f}%" if row[1] > 0 else ""
        print(f"  {reg:35s} {row[1]:6d} {row[2]:7d} {pct:>5s}")
        grand_total += row[1]
        grand_art += row[2]
    print(f"\n  TOTAL: {grand_total} controls, {grand_art} with article ({grand_art/grand_total*100:.0f}%)")

    # ── Step 3c: Duplicate analysis ──────────────────────────────────
    print(f"\n{'=' * 70}")
    print("STEP 3c: DUPLICATE CONTROLS (same reg + article + paragraph, >1)")
    print(f"{'=' * 70}")

    r2 = conn.execute(sql_text("""
        SELECT
            generation_metadata->>'source_regulation' as reg,
            source_citation->>'article' as article,
            source_citation->>'paragraph' as paragraph,
            count(*) as cnt,
            array_agg(id ORDER BY created_at) as ids,
            array_agg(title ORDER BY created_at) as titles,
            array_agg(release_state ORDER BY created_at) as states
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('rejected', 'too_close')
          AND source_citation->>'article' IS NOT NULL
          AND source_citation->>'article' != ''
        GROUP BY
            generation_metadata->>'source_regulation',
            source_citation->>'article',
            source_citation->>'paragraph'
        HAVING count(*) > 1
        ORDER BY count(*) DESC
    """))

    dup_groups = []
    total_dup_controls = 0
    total_removable = 0

    for row in r2.fetchall():
        group = {
            "reg": row[0],
            "article": row[1],
            "paragraph": row[2],
            "count": row[3],
            "ids": [str(i) for i in row[4]],
            "titles": row[5],
            "states": row[6],
        }
        dup_groups.append(group)
        total_dup_controls += row[3]
        total_removable += row[3] - 1  # Keep the oldest of each group

    print(f"\n  Duplicate groups: {len(dup_groups)}")
    print(f"  Controls in groups: {total_dup_controls}")
    print(f"  Removable (keep oldest): {total_removable}")

    # Show the 30 largest duplicate groups with up to 3 sample titles each.
    print(f"\n  {'Reg':25s} {'Article':15s} {'Para':10s} {'Count':>5s}")
    print(f"  {'-' * 60}")
    for g in dup_groups[:30]:
        print(f"  {str(g['reg']):25s} {str(g['article']):15s} {str(g['paragraph']):10s} {g['count']:5d}")
        for i, title in enumerate(g['titles'][:3]):
            state = g['states'][i] if i < len(g['states']) else '?'
            marker = "KEEP" if i == 0 else "DUP "
            print(f"    [{marker}][{state:6s}] {title[:70]}")
        if g['count'] > 3:
            print(f"    ... +{g['count'] - 3} more")

    # Save dedup plan for the actual deletion/marking script.
    with open("/tmp/dedup_plan.json", "w") as f:
        json.dump(dup_groups, f, indent=2, default=str)
    print(f"\n  Saved dedup plan to /tmp/dedup_plan.json")
|
||||||
306
scripts/qa/qa_article_map_all_chunks.py
Normal file
306
scripts/qa/qa_article_map_all_chunks.py
Normal file
@@ -0,0 +1,306 @@
|
|||||||
|
"""
Step 2: Build article/paragraph mapping for ALL regulations that have controls.
Scan chunks sequentially by chunk_index, track current article heading.

Handles both EU regulations (Artikel X) and German laws (§ X).

Requires DATABASE_URL; QDRANT_URL optional (defaults to the Docker host).
Writes /tmp/all_article_mappings.json (chunk_hash -> {article, paragraph}).
"""
import hashlib
import json
import os
import re
import sys
from collections import defaultdict  # NOTE(review): appears unused in this script

# Prefer httpx; fall back to requests — both expose the same tiny POST helper.
try:
    import httpx
    def http_post(url, data, timeout=30):
        """POST *data* as JSON to *url* and return the decoded JSON response."""
        return httpx.post(url, json=data, timeout=timeout).json()
except ImportError:
    import requests
    def http_post(url, data, timeout=30):
        """POST *data* as JSON to *url* and return the decoded JSON response."""
        return requests.post(url, json=data, timeout=timeout).json()

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
|
||||||
|
|
||||||
|
# ── Patterns for different document types ─────────────────────────────

# EU Regulations: "Artikel 26\n" heading (optionally with a letter suffix, "3a").
EU_ARTICLE = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\b', re.IGNORECASE)
# German laws: "§ 26" or "§26".
DE_PARAGRAPH = re.compile(r'(?:^|\n)\s*§\s*(\d+[a-z]?)\b')
# NIST control IDs: "AC-1", "PR.AC-1", etc.
NIST_CONTROL = re.compile(r'(?:^|\n)\s*([A-Z]{2}(?:\.[A-Z]{2})?-\d+)', re.MULTILINE)
# OWASP Top-10 section markers: "A01:2021" with optional dash + title.
OWASP_SECTION = re.compile(r'(A\d{2}:\d{4}(?:\s*[–—-]\s*[^\n]+)?)')
# Absatz/paragraph numbering at line start: "(1)", "(2)", ...
ABSATZ = re.compile(r'(?:^|\n)\s*\((\d+)\)')
# ENISA/CISA numbered sections: "2.1 Title", "3.2.1 Title".
SECTION_NUM = re.compile(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]')

# Regulation-ID sets — these decide which mapper function handles a
# regulation in map_regulation() below.
EU_REGS = {
    'eu_2016_679', 'eu_2024_1689', 'eu_2022_2555', 'eu_2024_2847',
    'eu_2023_1230', 'eu_2023_1542', 'eu_2022_2065', 'eu_2022_1925',
    'eu_2022_868', 'eu_2019_770', 'eu_2021_914', 'eu_2002_58',
    'eu_2000_31', 'eu_2023_1803', 'eu_2023_988', 'gpsr', 'eucsa',
    'dataact', 'dora', 'ehds', 'mica', 'psd2', 'dpf', 'dsm', 'amlr',
    'eaa', 'eu_blue_guide_2022',
}
DE_LAWS = {
    'bdsg', 'bdsg_2018_komplett', 'gewo', 'elektrog', 'verpackg',
    'battdg', 'bfsg', 'ddg', 'uwg', 'de_tkg', 'prodhaftg',
    'tmg_komplett', 'urhg_komplett', 'bgb_komplett', 'hgb_komplett',
    'ao_komplett', 'egbgb_komplett', 'de_betrvg', 'de_geschgehg',
    'vsbg', 'pangv', 'mstv', 'de_dlinfov', 'de_ustg_ret',
}
OWASP = {
    'owasp_top10_2021', 'owasp_asvs', 'owasp_samm', 'owasp_api_top10_2023',
    'owasp_masvs', 'owasp_mobile_top10',
}
NIST = {
    'nist_sp800_53r5', 'nist_sp_800_53', 'nist_sp_800_218', 'nist_sp800_218',
    'nist_sp_800_63b', 'nist_sp800_63_3', 'nist_csf_2_0', 'nist_sp800_207',
    'nist_ai_rmf', 'nist_privacy_1_0', 'nistir_8259a',
}
|
||||||
|
|
||||||
|
|
||||||
|
def scan_regulation(collection, regulation_id):
    """Scroll all chunks for a regulation out of Qdrant, sorted by chunk_index.

    Pages through ``/collections/<collection>/points/scroll`` with a
    regulation_id payload filter, 250 points per page, fetching only the
    chunk_text and chunk_index payload fields (no vectors).

    Returns a list of dicts ``{"hash", "idx", "text"}`` where ``hash`` is the
    SHA-256 of the chunk text (UTF-8) — the same key used to join chunks back
    to controls elsewhere in the pipeline — sorted ascending by chunk index.

    Fix vs. the original: pagination offsets are compared with ``is not None``
    instead of truthiness. Qdrant's ``next_page_offset`` is a point ID, which
    may legitimately be the integer ``0``; a truthiness check would have
    silently truncated the scroll at that page.
    """
    chunks = []
    offset = None
    while True:
        params = {
            "filter": {"must": [{"key": "regulation_id", "match": {"value": regulation_id}}]},
            "limit": 250,
            "with_payload": ["chunk_text", "chunk_index"],
            "with_vectors": False,
        }
        if offset is not None:  # 0 is a valid point-ID offset
            params["offset"] = offset
        result = http_post(f"{QDRANT_URL}/collections/{collection}/points/scroll", params, timeout=30)
        points = result.get("result", {}).get("points", [])
        next_offset = result.get("result", {}).get("next_page_offset")
        for p in points:
            t = p["payload"].get("chunk_text", "")
            chunks.append({
                "hash": hashlib.sha256(t.encode()).hexdigest(),
                "idx": p["payload"].get("chunk_index", 0),
                "text": t,
            })
        if next_offset is None:  # last page reached
            break
        offset = next_offset
    chunks.sort(key=lambda c: c["idx"])
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def map_eu_articles(chunks):
    """Assign each chunk hash to the EU Artikel/Absatz heading in effect there.

    Walks the chunks in the given order, carrying the most recent
    "Artikel N" heading forward. The Absatz resets on a new article and is
    refreshed by the first "(n)" marker found in a chunk. Chunks seen before
    the first article heading are left unmapped.
    """
    article = ""
    paragraph = ""
    result = {}
    for chunk in chunks:
        body = chunk["text"]
        heading = EU_ARTICLE.search(body)
        if heading is not None:
            article = f"Art. {heading.group(1)}"
            paragraph = ""
        absatz_hits = ABSATZ.findall(body)
        if absatz_hits:
            paragraph = f"Abs. {absatz_hits[0]}"
        if article:
            result[chunk["hash"]] = {"article": article, "paragraph": paragraph}
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def map_de_paragraphs(chunks):
    """Assign each chunk hash to the German-law §/Absatz heading in effect there.

    Same carry-forward logic as map_eu_articles, but keyed on "§ N" headings:
    the Absatz resets on a new § and is refreshed by the first "(n)" marker in
    a chunk. Chunks before the first § heading are left unmapped.
    """
    section = ""
    absatz = ""
    result = {}
    for chunk in chunks:
        body = chunk["text"]
        heading = DE_PARAGRAPH.search(body)
        if heading is not None:
            section = f"§ {heading.group(1)}"
            absatz = ""
        absatz_hits = ABSATZ.findall(body)
        if absatz_hits:
            absatz = f"Abs. {absatz_hits[0]}"
        if section:
            result[chunk["hash"]] = {"article": section, "paragraph": absatz}
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def map_owasp(chunks):
    """Map OWASP chunks to the most recent section marker (e.g. "A01:2021").

    Carries the last seen marker forward; when a marker includes a trailing
    title ("A01:2021 – Broken Access Control"), only the bare code is kept.
    The paragraph field is always empty for OWASP documents.
    """
    section = ""
    result = {}
    for chunk in chunks:
        hit = OWASP_SECTION.search(chunk["text"])
        if hit is not None:
            section = hit.group(1).strip()
            # Normalize: keep just the code part, dropping any title suffix.
            code = re.match(r'(A\d{2}:\d{4})', section)
            if code is not None:
                section = code.group(1)
        if section:
            result[chunk["hash"]] = {"article": section, "paragraph": ""}
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def map_nist(chunks):
    """Map NIST chunks to control IDs (AC-1, PR.AC-1, ...) or numbered sections.

    Control-ID headings take priority and carry forward. Numbered sections
    (2.1, 3.2.1) are only consulted while no heading of either kind has been
    seen yet — once a section is established it is never replaced by a
    section-number match.
    """
    section = ""
    result = {}
    for chunk in chunks:
        body = chunk["text"]
        ctrl_hit = NIST_CONTROL.search(body)
        if ctrl_hit is not None:
            section = ctrl_hit.group(1)
        elif not section:
            num_hit = SECTION_NUM.search(body)
            if num_hit is not None:
                section = num_hit.group(1)
        if section:
            result[chunk["hash"]] = {"article": section, "paragraph": ""}
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def map_generic(chunks):
    """Fallback mapper: EU "Artikel N" headings first, then numbered sections.

    The section label carries forward across chunks; unlike map_eu_articles
    the Absatz value is computed per chunk and is NOT carried forward.
    """
    section = ""
    result = {}
    for chunk in chunks:
        body = chunk["text"]
        if (art := EU_ARTICLE.search(body)) is not None:
            section = f"Art. {art.group(1)}"
        elif (num := SECTION_NUM.search(body)) is not None:
            section = num.group(1)
        absatz_hits = ABSATZ.findall(body)
        absatz = f"Abs. {absatz_hits[0]}" if absatz_hits else ""
        if section:
            result[chunk["hash"]] = {"article": section, "paragraph": absatz}
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def map_regulation(collection, regulation_id):
    """Map one regulation's chunks to article labels, picking the mapper by type.

    Fetches the regulation's chunks from Qdrant and dispatches to the
    EU / German-law / OWASP / NIST mapper based on membership in the
    module-level ID sets, falling back to the generic mapper.

    Returns ``(mapping, chunk_count)``; ``({}, 0)`` when no chunks exist.
    """
    chunks = scan_regulation(collection, regulation_id)
    if not chunks:
        return {}, 0

    if regulation_id in EU_REGS:
        mapper = map_eu_articles
    elif regulation_id in DE_LAWS:
        mapper = map_de_paragraphs
    elif regulation_id in OWASP:
        mapper = map_owasp
    elif regulation_id in NIST:
        mapper = map_nist
    else:
        mapper = map_generic

    return mapper(chunks), len(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main: Get all regulations that have controls ─────────────────────
with engine.connect() as conn:
    # Get regulations with controls (skip v1/v2 without citation).
    r = conn.execute(sql_text("""
        SELECT DISTINCT
            generation_metadata->>'source_regulation' as reg,
            source_citation->>'source' as source_name
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND generation_metadata->>'source_regulation' IS NOT NULL
          AND release_state NOT IN ('rejected')
        ORDER BY 1
    """))
    regulations = [(row[0], row[1]) for row in r.fetchall()]

print(f"Regulations with controls: {len(regulations)}")

# Determine which Qdrant collection each regulation is in.
# (Most are in bp_compliance_ce, some in bp_compliance_datenschutz.)
CE_REGS = EU_REGS | {'enisa_ics_scada_dependencies', 'enisa_supply_chain_good_practices',
                     'enisa_threat_landscape_supply_chain', 'enisa_cybersecurity_state_2024',
                     'cisa_secure_by_design', 'oecd_ai_principles', 'nistir_8259a'}
DS_REGS = {'owasp_top10_2021', 'owasp_asvs', 'owasp_samm', 'owasp_api_top10_2023',
           'owasp_masvs', 'owasp_mobile_top10', 'nist_sp800_53r5', 'nist_sp_800_218',
           'nist_sp800_218', 'nist_sp800_63_3', 'nist_sp800_207', 'nist_csf_2_0',
           'nist_ai_rmf', 'nist_privacy_1_0', 'nistir_8259a',
           'edpb_bcr_01_2022', 'edpb_05_2020', 'edpb_09_2022',
           'edpb_certification_01_2019', 'edpb_connected_vehicles_01_2020',
           'edpb_dpbd_04_2019', 'edpb_legitimate_interest', 'edpb_legitimate_interest_01_2024',
           'edpb_social_media_08_2020', 'edpb_transfers_01_2020', 'edpb_transfers_07_2020',
           'edpb_breach_09_2022', 'edpb_01_2020',
           'wp244_profiling', 'wp251_profiling', 'wp260_transparency',
           'hleg_trustworthy_ai', 'edpb_guidelines_7_2020'}
GE_REGS = DE_LAWS | {'at_dsg', 'at_tkg', 'es_lopdgdd', 'fr_loi_informatique',
                     'hu_info_tv', 'bsi_200_1', 'bsi_200_2', 'bsi_200_3', 'bsi_200_4',
                     'bsi_c5_2020'}

# Build all mappings.
all_mappings = {}  # chunk_hash -> {article, paragraph}
stats = []  # (reg_id, source_name, total_chunks, mapped_chunks, collection)

for reg_id, source_name in regulations:
    # Skip eu_2023_988 (duplicate of gpsr).
    if reg_id == 'eu_2023_988':
        continue

    # Determine collection — explicit set membership first, then prefix rules.
    if reg_id in CE_REGS or reg_id.startswith('eu_') or reg_id.startswith('enisa_') or reg_id.startswith('cisa_') or reg_id.startswith('oecd_'):
        collection = 'bp_compliance_ce'
    elif reg_id in DS_REGS or reg_id.startswith('owasp_') or reg_id.startswith('nist_') or reg_id.startswith('edpb_') or reg_id.startswith('wp') or reg_id.startswith('hleg_'):
        collection = 'bp_compliance_datenschutz'
    elif reg_id in GE_REGS or reg_id.startswith('bsi_') or reg_id.startswith('at_') or reg_id.startswith('ch_'):
        collection = 'bp_compliance_gesetze'
    else:
        collection = 'bp_compliance_ce'  # default

    # Progress line (carriage return keeps it on one terminal row).
    sys.stdout.write(f"\r  Mapping {reg_id:40s} ({collection})...")
    sys.stdout.flush()

    mapping, total = map_regulation(collection, reg_id)

    # If not found in the first collection, try the others.
    if total == 0:
        for alt_coll in ['bp_compliance_ce', 'bp_compliance_datenschutz', 'bp_compliance_gesetze']:
            if alt_coll != collection:
                mapping, total = map_regulation(alt_coll, reg_id)
                if total > 0:
                    collection = alt_coll
                    break

    all_mappings.update(mapping)
    stats.append((reg_id, source_name, total, len(mapping), collection))

# Report, largest regulations first.
print(f"\r{'=' * 70}")
print(f"ARTICLE MAPPING RESULTS")
print(f"{'=' * 70}")
print(f"\n  {'Regulation':35s} {'Source':35s} {'Chunks':>6s} {'Mapped':>7s} {'%':>5s}")
print(f"  {'-' * 90}")

total_chunks = 0
total_mapped = 0
for reg_id, source_name, chunks, mapped, coll in sorted(stats, key=lambda x: -x[2]):
    pct = f"{mapped/chunks*100:.0f}%" if chunks > 0 else "N/A"
    name = (source_name or "")[:35]
    print(f"  {reg_id:35s} {name:35s} {chunks:6d} {mapped:7d} {pct:>5s}")
    total_chunks += chunks
    total_mapped += mapped

print(f"\n  TOTAL: {total_chunks} chunks, {total_mapped} mapped ({total_mapped/total_chunks*100:.0f}%)")

# Save mapping for the apply step (qa_apply_and_dedup.py).
with open("/tmp/all_article_mappings.json", "w") as f:
    json.dump(all_mappings, f)
print(f"\n  Saved to /tmp/all_article_mappings.json ({len(all_mappings)} entries)")
|
||||||
154
scripts/qa/qa_dedup_controls.py
Normal file
154
scripts/qa/qa_dedup_controls.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
"""
Task 1: Remove obvious duplicate controls.

Strategy: Within each (regulation, article, paragraph) group,
compare titles using word overlap (Jaccard). If similarity >= the
JACCARD_THRESHOLD below (0.45), the control is treated as a duplicate.
Keep the oldest control (first created); mark the others with
release_state='duplicate' and hide them from customers.

Usage: requires DATABASE_URL; pass --dry-run to report without writing.
"""
import json
import os
import re
import sys
from collections import defaultdict

from sqlalchemy import create_engine, text as sql_text

# All queries run against the `compliance` schema by default.
DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv

JACCARD_THRESHOLD = 0.45  # Title word overlap threshold for dedup
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize(text):
    """Extract the set of significant words from German/English text.

    Lowercases the input, keeps alphabetic tokens of three or more
    letters (including German umlauts/ß), and strips a small built-in
    German/English stopword list. ``None`` or empty input yields an
    empty set.
    """
    if not text:
        return set()
    stopwords = frozenset({
        'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'zur', 'zum',
        'den', 'des', 'dem', 'ein', 'eine', 'einer', 'eines', 'the', 'and',
        'for', 'with', 'nicht', 'oder', 'auf', 'als', 'nach', 'über', 'aus',
        'ist', 'sind', 'werden', 'wird', 'durch', 'unter', 'vor', 'dass',
    })
    tokens = set(re.findall(r'\b[a-zA-ZäöüÄÖÜß]{3,}\b', text.lower()))
    return tokens - stopwords
|
||||||
|
|
||||||
|
|
||||||
|
def jaccard(set_a, set_b):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Yields 0.0 whenever either set is empty (or falsy), so the ratio
    is always well-defined.
    """
    if not (set_a and set_b):
        return 0.0
    # Both operands are non-empty here, so the union can never be empty.
    return len(set_a & set_b) / len(set_a | set_b)
|
||||||
|
|
||||||
|
|
||||||
|
print("=" * 60)
print("TASK 1: DEDUPLICATE CONTROLS (Jaccard title similarity)")
print(f" Threshold: {JACCARD_THRESHOLD}")
print("=" * 60)

# Single transaction: either every group's updates commit, or none do.
with engine.begin() as conn:
    # Load all duplicate groups (precomputed candidate groups keyed by
    # regulation/article/paragraph, produced by an earlier QA step).
    with open("/tmp/dedup_plan.json") as f:
        dup_groups = json.load(f)

    print(f" Duplicate groups from plan: {len(dup_groups)}")

    # For each group, load full control data and compare titles.
    total_rejected = 0
    total_kept = 0
    groups_with_dupes = 0

    for group in dup_groups:
        reg = group["reg"]
        article = group["article"]
        paragraph = group["paragraph"]
        ids = group["ids"]

        # A group of one cannot contain duplicates.
        if len(ids) < 2:
            continue

        # Load controls, oldest first — the keeper is always the oldest.
        rows = conn.execute(sql_text("""
            SELECT id, title, objective, created_at, release_state, control_id
            FROM compliance.canonical_controls
            WHERE id = ANY(CAST(:ids AS uuid[]))
            ORDER BY created_at ASC
        """), {"ids": ids}).fetchall()

        if len(rows) < 2:
            continue

        # Greedy clustering: keep the oldest control, then check each
        # newer candidate against every control already kept.
        kept = [rows[0]]
        to_reject = []

        for candidate in rows[1:]:
            cand_tokens = tokenize(candidate[1])  # column 1 = title
            is_dup = False

            # Check against all kept controls.
            for keeper in kept:
                keep_tokens = tokenize(keeper[1])
                sim = jaccard(cand_tokens, keep_tokens)
                if sim >= JACCARD_THRESHOLD:
                    is_dup = True
                    break

            if is_dup:
                to_reject.append(candidate)
            else:
                kept.append(candidate)

        if to_reject:
            groups_with_dupes += 1
            total_rejected += len(to_reject)
            total_kept += len(kept)

            # Sample output: only the first 5 affected groups are printed.
            if groups_with_dupes <= 5:
                print(f"\n {reg} {article} {paragraph}: {len(rows)} controls → keep {len(kept)}, reject {len(to_reject)}")
                for k in kept[:2]:
                    print(f" [KEEP] {k[1][:70]}")
                for r in to_reject[:3]:
                    print(f" [REJ ] {r[1][:70]}")
                if len(to_reject) > 3:
                    print(f" ... +{len(to_reject) - 3} more rejected")

            if not DRY_RUN:
                # Soft-delete: flip state + visibility and record why/when
                # in generation_metadata so the decision is auditable.
                reject_ids = [r[0] for r in to_reject]
                conn.execute(sql_text("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'duplicate',
                        customer_visible = false,
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"dedup_reason": "title_jaccard_qa", "dedup_date": "2026-03-19"}'::jsonb,
                        updated_at = NOW()
                    WHERE id = ANY(CAST(:ids AS uuid[]))
                """), {"ids": reject_ids})

    print(f"\n{'=' * 60}")
    print(f"DEDUP RESULTS")
    print(f"{'=' * 60}")
    print(f" Groups processed: {len(dup_groups)}")
    print(f" Groups with dupes: {groups_with_dupes}")
    print(f" Controls rejected: {total_rejected}")
    print(f" Controls kept: {total_kept}")
    print(f" Dry run: {DRY_RUN}")

    # Verify final counts (reads its own uncommitted writes inside the txn).
    if not DRY_RUN:
        r = conn.execute(sql_text("""
            SELECT release_state, count(*)
            FROM compliance.canonical_controls
            GROUP BY release_state
            ORDER BY count(*) DESC
        """))
        print(f"\n === Final control state distribution ===")
        for row in r.fetchall():
            print(f" {str(row[0]):20s} {row[1]:6d}")

        # Active controls (not rejected/too_close).
        r2 = conn.execute(sql_text("""
            SELECT count(*) FROM compliance.canonical_controls
            WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
        """))
        active = r2.scalar()
        print(f"\n Active controls (draft/verified/needs_review): {active}")
||||||
101
scripts/qa/qa_delete_gpsr_dupe.py
Normal file
101
scripts/qa/qa_delete_gpsr_dupe.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
"""
Task 2: Delete duplicate GPSR document (eu_2023_988) from Qdrant.
gpsr and eu_2023_988 are 100% identical (509/509 chunks).
Keep gpsr, delete eu_2023_988.
Also update any controls that reference eu_2023_988 to use gpsr instead.

Usage: requires DATABASE_URL (and optionally QDRANT_URL); pass --dry-run
to report without deleting or updating anything.
"""
import json
import os
import sys

# Thin HTTP POST wrapper: prefer httpx, fall back to requests, so the
# script runs in environments that have only one of the two installed.
# Both variants return the parsed JSON body (no status-code checking).
try:
    import httpx

    def http_post(url, data, timeout=30):
        return httpx.post(url, json=data, timeout=timeout).json()
except ImportError:
    import requests

    def http_post(url, data, timeout=30):
        return requests.post(url, json=data, timeout=timeout).json()

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
# Default targets the host's Qdrant when running inside a container.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv
|
||||||
|
|
||||||
|
# ── Step 1: Count eu_2023_988 points in Qdrant ──────────────────────
print("=" * 60)
print("TASK 2: DELETE DUPLICATE GPSR (eu_2023_988) FROM QDRANT")
print("=" * 60)

# Exact count of vector points tagged with the duplicate regulation id.
count_resp = http_post(
    f"{QDRANT_URL}/collections/bp_compliance_ce/points/count",
    {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}, "exact": True},
)
count = count_resp.get("result", {}).get("count", 0)
print(f" eu_2023_988 chunks in Qdrant: {count}")

# ── Step 2: Delete from Qdrant ───────────────────────────────────────
if not DRY_RUN and count > 0:
    # Filter-based delete: removes every point matching the regulation id.
    del_resp = http_post(
        f"{QDRANT_URL}/collections/bp_compliance_ce/points/delete",
        {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}},
        timeout=60,
    )
    status = del_resp.get("status")
    print(f" Qdrant delete: {status}")

    # Verify: re-count; expected to be 0 after the delete settles.
    count_after = http_post(
        f"{QDRANT_URL}/collections/bp_compliance_ce/points/count",
        {"filter": {"must": [{"key": "regulation_id", "match": {"value": "eu_2023_988"}}]}, "exact": True},
    )
    remaining = count_after.get("result", {}).get("count", 0)
    print(f" Remaining after delete: {remaining}")
else:
    print(f" [DRY RUN] Would delete {count} points")

# ── Step 3: Update DB references ─────────────────────────────────────
# Repoint controls and processed chunks from the duplicate id to 'gpsr'.
print(f"\n Updating DB references eu_2023_988 → gpsr...")

with engine.begin() as conn:
    # Check controls referencing eu_2023_988.
    r = conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
        WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
    """))
    ctrl_count = r.scalar()
    print(f" Controls with eu_2023_988: {ctrl_count}")

    if ctrl_count > 0 and not DRY_RUN:
        # Update generation_metadata.source_regulation in place.
        conn.execute(sql_text("""
            UPDATE compliance.canonical_controls
            SET generation_metadata = jsonb_set(
                COALESCE(generation_metadata, '{}'::jsonb),
                '{source_regulation}',
                '"gpsr"'
            )
            WHERE generation_metadata->>'source_regulation' = 'eu_2023_988'
        """))
        print(f" Updated {ctrl_count} controls: source_regulation → gpsr")

    # Check processed_chunks referencing the duplicate id.
    r2 = conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_processed_chunks
        WHERE regulation_code = 'eu_2023_988'
    """))
    chunk_count = r2.scalar()
    print(f" Processed chunks with eu_2023_988: {chunk_count}")

    if chunk_count > 0 and not DRY_RUN:
        conn.execute(sql_text("""
            UPDATE compliance.canonical_processed_chunks
            SET regulation_code = 'gpsr'
            WHERE regulation_code = 'eu_2023_988'
        """))
        print(f" Updated {chunk_count} processed_chunks: regulation_code → gpsr")

print(f"\n DRY RUN: {DRY_RUN}")
print(" DONE.")
|
||||||
121
scripts/qa/qa_normalize_sources.py
Normal file
121
scripts/qa/qa_normalize_sources.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
"""
Task 3: Normalize source_citation.source names.

Same regulation has different source names from different pipeline runs.
Standardize to one canonical name per regulation (see SOURCE_NAMES).

Usage: requires DATABASE_URL; pass --dry-run to report without writing.
"""
import json
import os
import sys

from sqlalchemy import create_engine, text as sql_text

DB_URL = os.environ['DATABASE_URL']
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
DRY_RUN = '--dry-run' in sys.argv
|
||||||
|
|
||||||
|
# Canonical source names per regulation.
# Keys are internal regulation ids (as stored in
# generation_metadata->>'source_regulation'); values are the single
# display name every control of that regulation should carry.
SOURCE_NAMES = {
    # EU regulations / directives
    "eu_2023_1230": "Maschinenverordnung (EU) 2023/1230",
    "eu_2024_2847": "Cyber Resilience Act (CRA)",
    "eu_2024_1689": "KI-Verordnung (EU) 2024/1689",
    "eu_2022_2555": "NIS2-Richtlinie (EU) 2022/2555",
    "eu_2016_679": "DSGVO (EU) 2016/679",
    "eu_blue_guide_2022": "EU Blue Guide 2022",
    # NIST publications
    "nist_sp800_53r5": "NIST SP 800-53 Rev. 5",
    "nist_sp_800_218": "NIST SP 800-218 (SSDF)",
    "nist_csf_2_0": "NIST Cybersecurity Framework 2.0",
    "nist_sp800_63_3": "NIST SP 800-63-3",
    "nist_sp800_207": "NIST SP 800-207 (Zero Trust)",
    "nist_ai_rmf": "NIST AI Risk Management Framework",
    # OWASP projects
    "owasp_top10_2021": "OWASP Top 10 (2021)",
    "owasp_asvs": "OWASP ASVS 4.0",
    "owasp_samm": "OWASP SAMM 2.0",
    "owasp_api_top10_2023": "OWASP API Security Top 10 (2023)",
    "owasp_masvs": "OWASP MASVS 2.0",
    # Agency guidance (CISA / ENISA / OECD)
    "cisa_secure_by_design": "CISA Secure by Design",
    "enisa_ics_scada_dependencies": "ENISA ICS/SCADA Dependencies",
    "enisa_supply_chain_good_practices": "ENISA Supply Chain Good Practices",
    "enisa_threat_landscape_supply_chain": "ENISA Threat Landscape Supply Chain",
    "enisa_cybersecurity_state_2024": "ENISA Cybersecurity State 2024",
    "oecd_ai_principles": "OECD KI-Empfehlung",
    # Further EU acts and national law
    "gpsr": "Allgemeine Produktsicherheitsverordnung (GPSR)",
    "eu_2023_1542": "Batterieverordnung (EU) 2023/1542",
    "mica": "Markets in Crypto-Assets (MiCA)",
    "eu_2022_868": "Data Governance Act (DGA)",
    "dataact": "Data Act",
    "eucsa": "EU Cybersecurity Act (EUCSA)",
    "eaa": "European Accessibility Act (EAA)",
    "eu_2023_1803": "IFRS-Übernahmeverordnung",
    "amlr": "AML-Verordnung",
    # Both bdsg ids intentionally map to the same canonical name.
    "bdsg_2018_komplett": "Bundesdatenschutzgesetz (BDSG)",
    "bdsg": "Bundesdatenschutzgesetz (BDSG)",
}
|
||||||
|
|
||||||
|
print("=" * 60)
print("TASK 3: NORMALIZE SOURCE NAMES")
print("=" * 60)

# Single transaction for all renames.
with engine.begin() as conn:
    # Find all current source_name variants, grouped per regulation.
    r = conn.execute(sql_text("""
        SELECT generation_metadata->>'source_regulation' as reg,
               source_citation->>'source' as current_name,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND generation_metadata->>'source_regulation' IS NOT NULL
        GROUP BY 1, 2
        ORDER BY 1, cnt DESC
    """))

    # Collect every (regulation, variant) pair that deviates from the
    # canonical name; regulations without a SOURCE_NAMES entry are skipped.
    updates = []
    for row in r.fetchall():
        reg = row[0]
        current = row[1]
        count = row[2]
        canonical = SOURCE_NAMES.get(reg)

        if canonical and current != canonical:
            updates.append((reg, current, canonical, count))

    print(f"\n Source names to normalize: {len(updates)}")
    print(f"\n {'Regulation':30s} {'From':45s} → {'To':45s} {'Count':>5s}")
    print(f" {'-' * 130}")

    total_updated = 0
    for reg, old_name, new_name, count in updates:
        print(f" {reg:30s} {old_name[:45]:45s} → {new_name[:45]:45s} {count:5d}")
        total_updated += count

        if not DRY_RUN:
            # jsonb_set needs a JSON value, i.e. the name quoted as a
            # JSON string — json.dumps produces exactly that.
            name_json = json.dumps(new_name)  # "name" with quotes for jsonb
            conn.execute(sql_text("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation,
                    '{source}',
                    CAST(:name_json AS jsonb)
                )
                WHERE generation_metadata->>'source_regulation' = :reg
                  AND source_citation->>'source' = :old_name
            """), {"reg": reg, "old_name": old_name, "name_json": name_json})

    print(f"\n Total controls updated: {total_updated}")
    print(f" Dry run: {DRY_RUN}")

    # Verify: re-list the (regulation, name) pairs that remain after the
    # renames; only pairs backing at least 5 controls are shown.
    if not DRY_RUN:
        r2 = conn.execute(sql_text("""
            SELECT generation_metadata->>'source_regulation' as reg,
                   source_citation->>'source' as name,
                   count(*)
            FROM compliance.canonical_controls
            WHERE source_citation IS NOT NULL
              AND generation_metadata->>'source_regulation' IS NOT NULL
            GROUP BY 1, 2
            HAVING count(*) >= 5
            ORDER BY count(*) DESC
        """))
        print(f"\n === Verified source names (>= 5 controls) ===")
        for row in r2.fetchall():
            print(f" {str(row[0]):30s} {str(row[1]):50s} {row[2]:5d}")
||||||
206
scripts/qa/sync_controls_to_prod.py
Normal file
206
scripts/qa/sync_controls_to_prod.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
"""
Sync controls from Mac Mini (local) to Production (Hetzner).
Both have PostgreSQL. Mac Mini has 6,373 active controls, Production ~3,159.

Strategy:
1. Export all non-duplicate/non-too_close controls from Mac Mini
2. Upsert into Production (ON CONFLICT update, preserve production-only data)
3. Mark controls on Production that don't exist on Mac Mini as deprecated

Usage: requires DATABASE_URL (local) and PROD_DATABASE_URL (production);
pass --dry-run to report without writing to production.
"""
import json
import os
import sys
from datetime import datetime

from sqlalchemy import create_engine, text as sql_text

# Mac Mini DB (local)
LOCAL_DB = os.environ['DATABASE_URL']
# Production DB (Hetzner) — same env var format
PROD_DB = os.environ.get('PROD_DATABASE_URL', '')

# Fail fast before opening any connection if production URL is missing.
if not PROD_DB:
    print("ERROR: PROD_DATABASE_URL not set")
    print("Please provide the production database URL")
    sys.exit(1)

DRY_RUN = '--dry-run' in sys.argv

local_engine = create_engine(LOCAL_DB, connect_args={"options": "-c search_path=compliance,public"})
prod_engine = create_engine(PROD_DB, connect_args={"options": "-c search_path=compliance,public"})
|
||||||
|
|
||||||
|
# ── Step 1: Export from Mac Mini ──────────────────────────────────────
print("=" * 60)
print("SYNC CONTROLS: Mac Mini → Production")
print("=" * 60)

with local_engine.connect() as local_conn:
    # Get all controls (include duplicates/too_close so prod knows about them).
    # Column order here must stay in lockstep with the positional row[N]
    # accesses used by the upsert in Step 3.
    rows = local_conn.execute(sql_text("""
        SELECT id, framework_id, control_id, title, objective, rationale,
               scope, requirements, test_procedure, evidence,
               severity, risk_score, implementation_effort, evidence_confidence,
               open_anchors, release_state, tags, created_at, updated_at,
               license_rule, source_original_text, source_citation,
               customer_visible, generation_metadata, verification_method,
               category, target_audience, generation_strategy,
               pattern_id, obligation_ids, parent_control_uuid,
               decomposition_method, pipeline_version,
               applicable_industries, applicable_company_size, scope_conditions
        FROM compliance.canonical_controls
    """)).fetchall()

print(f" Local controls: {len(rows)}")

# Count by state for a quick sanity overview of what will be synced.
states = {}
for r in rows:
    s = r[15]  # release_state
    states[s] = states.get(s, 0) + 1
for s, c in sorted(states.items(), key=lambda x: -x[1]):
    print(f" {s}: {c}")
|
||||||
|
|
||||||
|
# ── Step 2: Check Production state ───────────────────────────────────
with prod_engine.connect() as prod_conn:
    r = prod_conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
    """))
    prod_count = r.scalar()
    print(f"\n Production controls before sync: {prod_count}")

    # Check if the target framework exists; the upsert assumes the
    # framework row is already present on production.
    fw = prod_conn.execute(sql_text("""
        SELECT id FROM compliance.canonical_control_frameworks
        WHERE framework_id = 'bp_security_v1' LIMIT 1
    """)).fetchone()
    if fw:
        print(f" Framework bp_security_v1: {fw[0]}")
    else:
        print(" WARNING: Framework bp_security_v1 not found on production!")
|
||||||
|
|
||||||
|
# ── Step 3: Upsert to Production ─────────────────────────────────────
print(f"\n Syncing {len(rows)} controls to production...")


def _jsonify(value):
    """Serialize dict/list column values to JSON strings for the driver.

    JSONB columns come back from SQLAlchemy as Python dicts/lists; the
    parameterized INSERT needs them re-encoded as JSON text. Scalars
    (str/None/bool/numbers) pass through unchanged.
    """
    return json.dumps(value) if isinstance(value, (dict, list)) else value


# BUG FIX: DRY_RUN was parsed in the prologue but never honored here, so
# `--dry-run` still wrote every control to production. Now dry runs skip
# the write phase entirely.
if DRY_RUN:
    print(f" [DRY RUN] Would upsert {len(rows)} controls — no changes written")
else:
    with prod_engine.begin() as prod_conn:
        synced = 0
        errors = 0

        for i, row in enumerate(rows):
            try:
                # Upsert keyed on id. The ON CONFLICT SET list deliberately
                # omits framework_id, control_id, created_at and lineage
                # columns (pattern_id, obligation_ids, parent_control_uuid,
                # decomposition_method, evidence_confidence) so existing
                # production data in those fields is preserved.
                prod_conn.execute(sql_text("""
                    INSERT INTO compliance.canonical_controls (
                        id, framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort, evidence_confidence,
                        open_anchors, release_state, tags, created_at, updated_at,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata, verification_method,
                        category, target_audience, generation_strategy,
                        pattern_id, obligation_ids, parent_control_uuid,
                        decomposition_method, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        :id, :framework_id, :control_id, :title, :objective, :rationale,
                        :scope, :requirements, :test_procedure, :evidence,
                        :severity, :risk_score, :implementation_effort, :evidence_confidence,
                        :open_anchors, :release_state, :tags, :created_at, :updated_at,
                        :license_rule, :source_original_text, :source_citation,
                        :customer_visible, :generation_metadata, :verification_method,
                        :category, :target_audience, :generation_strategy,
                        :pattern_id, :obligation_ids, :parent_control_uuid,
                        :decomposition_method, :pipeline_version,
                        :applicable_industries, :applicable_company_size, :scope_conditions
                    )
                    ON CONFLICT (id) DO UPDATE SET
                        title = EXCLUDED.title,
                        objective = EXCLUDED.objective,
                        rationale = EXCLUDED.rationale,
                        scope = EXCLUDED.scope,
                        requirements = EXCLUDED.requirements,
                        test_procedure = EXCLUDED.test_procedure,
                        evidence = EXCLUDED.evidence,
                        severity = EXCLUDED.severity,
                        risk_score = EXCLUDED.risk_score,
                        implementation_effort = EXCLUDED.implementation_effort,
                        open_anchors = EXCLUDED.open_anchors,
                        release_state = EXCLUDED.release_state,
                        tags = EXCLUDED.tags,
                        updated_at = EXCLUDED.updated_at,
                        license_rule = EXCLUDED.license_rule,
                        source_original_text = EXCLUDED.source_original_text,
                        source_citation = EXCLUDED.source_citation,
                        customer_visible = EXCLUDED.customer_visible,
                        generation_metadata = EXCLUDED.generation_metadata,
                        verification_method = EXCLUDED.verification_method,
                        category = EXCLUDED.category,
                        target_audience = EXCLUDED.target_audience,
                        generation_strategy = EXCLUDED.generation_strategy,
                        pipeline_version = EXCLUDED.pipeline_version,
                        applicable_industries = EXCLUDED.applicable_industries,
                        applicable_company_size = EXCLUDED.applicable_company_size,
                        scope_conditions = EXCLUDED.scope_conditions
                """), {
                    # Positional mapping must match the SELECT order in Step 1.
                    "id": row[0], "framework_id": row[1], "control_id": row[2],
                    "title": row[3], "objective": row[4], "rationale": row[5],
                    "scope": _jsonify(row[6]),
                    "requirements": _jsonify(row[7]),
                    "test_procedure": _jsonify(row[8]),
                    "evidence": _jsonify(row[9]),
                    "severity": row[10], "risk_score": row[11],
                    "implementation_effort": row[12], "evidence_confidence": row[13],
                    "open_anchors": _jsonify(row[14]),
                    "release_state": row[15],
                    "tags": _jsonify(row[16]),
                    "created_at": row[17], "updated_at": row[18],
                    "license_rule": row[19], "source_original_text": row[20],
                    "source_citation": _jsonify(row[21]),
                    "customer_visible": row[22],
                    "generation_metadata": _jsonify(row[23]),
                    "verification_method": row[24], "category": row[25],
                    "target_audience": _jsonify(row[26]),
                    "generation_strategy": row[27],
                    "pattern_id": row[28],
                    "obligation_ids": _jsonify(row[29]),
                    "parent_control_uuid": row[30], "decomposition_method": row[31],
                    "pipeline_version": row[32],
                    "applicable_industries": _jsonify(row[33]),
                    "applicable_company_size": _jsonify(row[34]),
                    "scope_conditions": _jsonify(row[35]),
                })
                synced += 1

            except Exception as e:
                # Best-effort sync: record the failure but keep going.
                # Only the first 5 errors are printed to keep output usable.
                errors += 1
                if errors <= 5:
                    print(f" ERROR on {row[2]}: {str(e)[:100]}")

            if (i + 1) % 1000 == 0:
                sys.stdout.write(f"\r Progress: {i+1}/{len(rows)} (errors: {errors})")
                sys.stdout.flush()

        # BUG FIX: previously printed len(rows) as "Synced" even when some
        # rows failed; report the actual success count instead.
        print(f"\r Synced: {synced}/{len(rows)} controls (errors: {errors})")
|
||||||
|
|
||||||
|
# ── Step 4: Verify ───────────────────────────────────────────────────
# Re-read production and print the post-sync state distribution.
with prod_engine.connect() as prod_conn:
    r = prod_conn.execute(sql_text("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        GROUP BY release_state
        ORDER BY count(*) DESC
    """))
    print(f"\n === Production control states after sync ===")
    total = 0
    for row in r.fetchall():
        print(f" {str(row[0]):20s} {row[1]:6d}")
        total += row[1]
    print(f" {'TOTAL':20s} {total:6d}")

    # Controls still considered active (not soft-deleted by QA).
    r2 = prod_conn.execute(sql_text("""
        SELECT count(*) FROM compliance.canonical_controls
        WHERE release_state NOT IN ('duplicate', 'too_close', 'deprecated')
    """))
    active = r2.scalar()
    print(f"\n Active controls on production: {active}")
|
||||||
Reference in New Issue
Block a user