Files
breakpilot-compliance/scripts/qa/qa_article_map_all_chunks.py
Benjamin Admin 9b0f25c105
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
chore(qa): add PDF-based control QA scripts and results
QA pipeline that matches control source_original_text directly against
original PDF documents to verify article/paragraph assignments. Covers
backfill, dedup, source normalization, Qdrant cleanup, and prod sync.

Key results (2026-03-20):
- 4,110/7,943 controls matched to PDF (100% for major EU regs)
- 3,366 article corrections, 705 new assignments
- 1,290 controls from Erwägungsgründe (recitals) identified
- 779 controls from Anhänge (annexes) identified

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:56:13 +01:00

307 lines
12 KiB
Python

"""
Step 2: Build article/paragraph mapping for ALL regulations that have controls.
Scan chunks sequentially by chunk_index, track current article heading.
Handles both EU regulations (Artikel X) and German laws (§ X).
"""
import hashlib
import json
import os
import re
import sys
from collections import defaultdict
# HTTP client shim: prefer httpx if installed, otherwise fall back to
# requests. Both variants expose the same signature and return parsed JSON.
try:
    import httpx

    def http_post(url, data, timeout=30):
        """POST *data* as a JSON body to *url*; return the decoded JSON response."""
        return httpx.post(url, json=data, timeout=timeout).json()
except ImportError:
    import requests

    def http_post(url, data, timeout=30):
        """POST *data* as a JSON body to *url*; return the decoded JSON response."""
        return requests.post(url, json=data, timeout=timeout).json()
from sqlalchemy import create_engine, text as sql_text
# Required: PostgreSQL connection string for the compliance database.
DB_URL = os.environ['DATABASE_URL']
# Qdrant endpoint; default assumes the script runs inside Docker and
# reaches Qdrant on the host machine.
QDRANT_URL = os.environ.get('QDRANT_URL', 'http://host.docker.internal:6333')
# search_path puts the `compliance` schema first so unqualified table
# names resolve there before `public`.
engine = create_engine(DB_URL, connect_args={"options": "-c search_path=compliance,public"})
# ── Patterns for different document types ─────────────────────────────
# EU regulations: "Artikel 26" heading at the start of a line (case-insensitive).
EU_ARTICLE = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\b', re.IGNORECASE)
# German laws: "§ 26" or "§26" at the start of a line.
DE_PARAGRAPH = re.compile(r'(?:^|\n)\s*§\s*(\d+[a-z]?)\b')
# NIST control IDs: "AC-1", "SC-7", dotted families like "PR.AC-1".
NIST_CONTROL = re.compile(r'(?:^|\n)\s*([A-Z]{2}(?:\.[A-Z]{2})?-\d+)', re.MULTILINE)
# OWASP section markers: "A01:2021" optionally followed by a dash and title.
OWASP_SECTION = re.compile(r'(A\d{2}:\d{4}(?:\s*[–—-]\s*[^\n]+)?)')
# Absatz/paragraph markers: "(1)", "(2)", ... at the start of a line.
ABSATZ = re.compile(r'(?:^|\n)\s*\((\d+)\)')
# ENISA/CISA numbered section headings, e.g. "2.1 Title" / "3.2.1 Title".
SECTION_NUM = re.compile(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]')
# Regulation types — each set selects a dedicated mapper in map_regulation().
# EU regulations and EU-style acts: mapped via "Artikel N" headings.
EU_REGS = {
    'eu_2016_679', 'eu_2024_1689', 'eu_2022_2555', 'eu_2024_2847',
    'eu_2023_1230', 'eu_2023_1542', 'eu_2022_2065', 'eu_2022_1925',
    'eu_2022_868', 'eu_2019_770', 'eu_2021_914', 'eu_2002_58',
    'eu_2000_31', 'eu_2023_1803', 'eu_2023_988', 'gpsr', 'eucsa',
    'dataact', 'dora', 'ehds', 'mica', 'psd2', 'dpf', 'dsm', 'amlr',
    'eaa', 'eu_blue_guide_2022',
}
# German federal laws: mapped via "§ N" headings.
DE_LAWS = {
    'bdsg', 'bdsg_2018_komplett', 'gewo', 'elektrog', 'verpackg',
    'battdg', 'bfsg', 'ddg', 'uwg', 'de_tkg', 'prodhaftg',
    'tmg_komplett', 'urhg_komplett', 'bgb_komplett', 'hgb_komplett',
    'ao_komplett', 'egbgb_komplett', 'de_betrvg', 'de_geschgehg',
    'vsbg', 'pangv', 'mstv', 'de_dlinfov', 'de_ustg_ret',
}
# OWASP documents: mapped via "A01:2021"-style section codes.
OWASP = {
    'owasp_top10_2021', 'owasp_asvs', 'owasp_samm', 'owasp_api_top10_2023',
    'owasp_masvs', 'owasp_mobile_top10',
}
# NIST publications: mapped via control IDs (AC-1) or numbered sections.
NIST = {
    'nist_sp800_53r5', 'nist_sp_800_53', 'nist_sp_800_218', 'nist_sp800_218',
    'nist_sp_800_63b', 'nist_sp800_63_3', 'nist_csf_2_0', 'nist_sp800_207',
    'nist_ai_rmf', 'nist_privacy_1_0', 'nistir_8259a',
}
def scan_regulation(collection, regulation_id):
    """Scroll all chunks for a regulation, sorted by chunk_index.

    Pages through Qdrant's scroll API filtered on ``regulation_id`` and
    returns a list of ``{"hash", "idx", "text"}`` dicts, where "hash" is
    the sha256 hex digest of the chunk text. Returns an empty list when
    the regulation has no points in *collection*.
    """
    chunks = []
    offset = None
    while True:
        params = {
            "filter": {"must": [{"key": "regulation_id", "match": {"value": regulation_id}}]},
            "limit": 250,
            "with_payload": ["chunk_text", "chunk_index"],
            "with_vectors": False,
        }
        # BUGFIX: Qdrant offsets are point IDs and may be a falsy value
        # such as integer 0; compare against None instead of truthiness,
        # otherwise pagination silently restarts/stops at such an ID.
        if offset is not None:
            params["offset"] = offset
        result = http_post(f"{QDRANT_URL}/collections/{collection}/points/scroll", params, timeout=30)
        points = result.get("result", {}).get("points", [])
        next_offset = result.get("result", {}).get("next_page_offset")
        for p in points:
            t = p["payload"].get("chunk_text", "")
            chunks.append({
                "hash": hashlib.sha256(t.encode()).hexdigest(),
                "idx": p["payload"].get("chunk_index", 0),
                "text": t,
            })
        # next_page_offset is null on the last page.
        if next_offset is None:
            break
        offset = next_offset
    chunks.sort(key=lambda c: c["idx"])
    return chunks
def map_eu_articles(chunks):
    """Assign each EU-regulation chunk the Artikel/Absatz it falls under.

    Chunks are walked in order. An "Artikel N" heading opens a new
    article scope (and clears the Absatz); the first "(n)" marker in a
    chunk updates the current Absatz. Both values persist across chunks
    until superseded. Chunks seen before any article heading are left
    out of the returned mapping.
    """
    mapping = {}
    article = ""
    absatz = ""
    for chunk in chunks:
        text = chunk["text"]
        if (heading := EU_ARTICLE.search(text)) is not None:
            # New article scope; the previous Absatz no longer applies.
            article = f"Art. {heading.group(1)}"
            absatz = ""
        if (found := ABSATZ.findall(text)):
            absatz = f"Abs. {found[0]}"
        if article:
            mapping[chunk["hash"]] = {"article": article, "paragraph": absatz}
    return mapping
def map_de_paragraphs(chunks):
    """Assign each German-law chunk the §/Absatz it falls under.

    Mirrors map_eu_articles() but keys off "§ N" headings: a heading
    opens a new paragraph scope (clearing the Absatz), and the first
    "(n)" marker in a chunk updates the current Absatz. Chunks before
    the first § heading stay unmapped.
    """
    mapping = {}
    paragraph = ""
    absatz = ""
    for chunk in chunks:
        text = chunk["text"]
        if (heading := DE_PARAGRAPH.search(text)) is not None:
            # Entering a new §; drop the Absatz carried from the old one.
            paragraph = f"§ {heading.group(1)}"
            absatz = ""
        if (found := ABSATZ.findall(text)):
            absatz = f"Abs. {found[0]}"
        if paragraph:
            mapping[chunk["hash"]] = {"article": paragraph, "paragraph": absatz}
    return mapping
def map_owasp(chunks):
    """Assign each OWASP chunk the most recent section code (e.g. "A01:2021").

    A matched marker may carry a trailing title ("A01:2021 – Broken
    Access Control"); it is normalized down to the bare code. The code
    persists across chunks until the next marker appears.
    """
    mapping = {}
    section = ""
    for chunk in chunks:
        if (hit := OWASP_SECTION.search(chunk["text"])) is not None:
            raw = hit.group(1).strip()
            # Keep only the "AXX:YYYY" code, discarding any title text.
            code = re.match(r'(A\d{2}:\d{4})', raw)
            section = code.group(1) if code else raw
        if section:
            mapping[chunk["hash"]] = {"article": section, "paragraph": ""}
    return mapping
def map_nist(chunks):
    """Assign each NIST chunk a control ID or numbered section heading.

    An explicit control ID (AC-1, SC-7, PR.AC-1, ...) in a chunk takes
    precedence; otherwise a numbered section heading ("2.1", "3.2.1")
    updates the current section. The current value carries over chunks
    that contain neither marker.
    """
    current_section = ""
    mapping = {}
    for c in chunks:
        # Prefer an explicit NIST control ID (AC-1, SC-7, etc.)
        m = NIST_CONTROL.search(c["text"])
        if m:
            current_section = m.group(1)
        else:
            # Fall back to numbered section headings (2.1, 3.2.1, etc.).
            # BUGFIX: this used to run only while current_section was
            # empty, so the first heading ever found stuck forever and
            # later headings never advanced the section. Now every chunk
            # can update it, matching map_generic()'s behavior.
            m2 = SECTION_NUM.search(c["text"])
            if m2:
                current_section = m2.group(1)
        if current_section:
            mapping[c["hash"]] = {"article": current_section, "paragraph": ""}
    return mapping
def map_generic(chunks):
    """Fallback mapper: EU-style "Artikel N" first, then numbered sections.

    The section sticks across chunks until superseded; the Absatz, in
    contrast, is recomputed per chunk (empty when the chunk has no
    "(n)" marker). Chunks before the first recognizable heading are
    left unmapped.
    """
    mapping = {}
    section = ""
    for chunk in chunks:
        text = chunk["text"]
        if (art := EU_ARTICLE.search(text)) is not None:
            section = f"Art. {art.group(1)}"
        elif (num := SECTION_NUM.search(text)) is not None:
            section = num.group(1)
        nums = ABSATZ.findall(text)
        absatz = f"Abs. {nums[0]}" if nums else ""
        if section:
            mapping[chunk["hash"]] = {"article": section, "paragraph": absatz}
    return mapping
def map_regulation(collection, regulation_id):
    """Scan a regulation's chunks and map them with the type-appropriate mapper.

    Returns a ``(mapping, chunk_count)`` tuple; ``({}, 0)`` when the
    regulation has no chunks in *collection*.
    """
    chunks = scan_regulation(collection, regulation_id)
    if not chunks:
        return {}, 0
    # Dispatch on document family; unknown IDs fall through to the
    # generic section-number mapper.
    dispatch = (
        (EU_REGS, map_eu_articles),
        (DE_LAWS, map_de_paragraphs),
        (OWASP, map_owasp),
        (NIST, map_nist),
    )
    for id_set, mapper in dispatch:
        if regulation_id in id_set:
            return mapper(chunks), len(chunks)
    return map_generic(chunks), len(chunks)
# ── Main: Get all regulations that have controls ─────────────────────
with engine.connect() as conn:
    # Get regulations with controls (skip v1/v2 without citation)
    r = conn.execute(sql_text("""
        SELECT DISTINCT
            generation_metadata->>'source_regulation' as reg,
            source_citation->>'source' as source_name
        FROM compliance.canonical_controls
        WHERE source_citation IS NOT NULL
          AND generation_metadata->>'source_regulation' IS NOT NULL
          AND release_state NOT IN ('rejected')
        ORDER BY 1
    """))
    regulations = [(row[0], row[1]) for row in r.fetchall()]

print(f"Regulations with controls: {len(regulations)}")

# Determine which collection each regulation is in
# (Most are in bp_compliance_ce, some in bp_compliance_datenschutz)
CE_REGS = EU_REGS | {'enisa_ics_scada_dependencies', 'enisa_supply_chain_good_practices',
                     'enisa_threat_landscape_supply_chain', 'enisa_cybersecurity_state_2024',
                     'cisa_secure_by_design', 'oecd_ai_principles', 'nistir_8259a'}
DS_REGS = {'owasp_top10_2021', 'owasp_asvs', 'owasp_samm', 'owasp_api_top10_2023',
           'owasp_masvs', 'owasp_mobile_top10', 'nist_sp800_53r5', 'nist_sp_800_218',
           'nist_sp800_218', 'nist_sp800_63_3', 'nist_sp800_207', 'nist_csf_2_0',
           'nist_ai_rmf', 'nist_privacy_1_0', 'nistir_8259a',
           'edpb_bcr_01_2022', 'edpb_05_2020', 'edpb_09_2022',
           'edpb_certification_01_2019', 'edpb_connected_vehicles_01_2020',
           'edpb_dpbd_04_2019', 'edpb_legitimate_interest', 'edpb_legitimate_interest_01_2024',
           'edpb_social_media_08_2020', 'edpb_transfers_01_2020', 'edpb_transfers_07_2020',
           'edpb_breach_09_2022', 'edpb_01_2020',
           'wp244_profiling', 'wp251_profiling', 'wp260_transparency',
           'hleg_trustworthy_ai', 'edpb_guidelines_7_2020'}
GE_REGS = DE_LAWS | {'at_dsg', 'at_tkg', 'es_lopdgdd', 'fr_loi_informatique',
                     'hu_info_tv', 'bsi_200_1', 'bsi_200_2', 'bsi_200_3', 'bsi_200_4',
                     'bsi_c5_2020'}

# Build all mappings
all_mappings = {}  # chunk_hash -> {article, paragraph}
stats = []  # (reg_id, source_name, total_chunks, mapped_chunks, collection)
for reg_id, source_name in regulations:
    # Skip eu_2023_988 (duplicate of gpsr)
    if reg_id == 'eu_2023_988':
        continue
    # Determine collection: explicit set membership first, then prefix
    # heuristics, defaulting to the CE collection.
    if reg_id in CE_REGS or reg_id.startswith('eu_') or reg_id.startswith('enisa_') or reg_id.startswith('cisa_') or reg_id.startswith('oecd_'):
        collection = 'bp_compliance_ce'
    elif reg_id in DS_REGS or reg_id.startswith('owasp_') or reg_id.startswith('nist_') or reg_id.startswith('edpb_') or reg_id.startswith('wp') or reg_id.startswith('hleg_'):
        collection = 'bp_compliance_datenschutz'
    elif reg_id in GE_REGS or reg_id.startswith('bsi_') or reg_id.startswith('at_') or reg_id.startswith('ch_'):
        collection = 'bp_compliance_gesetze'
    else:
        collection = 'bp_compliance_ce'  # default
    # \r keeps the progress display on a single terminal line.
    sys.stdout.write(f"\r Mapping {reg_id:40s} ({collection})...")
    sys.stdout.flush()
    mapping, total = map_regulation(collection, reg_id)
    # If not found in first collection, try the other collections.
    if total == 0:
        for alt_coll in ['bp_compliance_ce', 'bp_compliance_datenschutz', 'bp_compliance_gesetze']:
            if alt_coll != collection:
                mapping, total = map_regulation(alt_coll, reg_id)
                if total > 0:
                    collection = alt_coll
                    break
    all_mappings.update(mapping)
    stats.append((reg_id, source_name, total, len(mapping), collection))

# Summary table, sorted by chunk count descending.
print(f"\r{'=' * 70}")
print(f"ARTICLE MAPPING RESULTS")
print(f"{'=' * 70}")
print(f"\n {'Regulation':35s} {'Source':35s} {'Chunks':>6s} {'Mapped':>7s} {'%':>5s}")
print(f" {'-' * 90}")
total_chunks = 0
total_mapped = 0
for reg_id, source_name, chunks, mapped, coll in sorted(stats, key=lambda x: -x[2]):
    pct = f"{mapped/chunks*100:.0f}%" if chunks > 0 else "N/A"
    name = (source_name or "")[:35]
    print(f" {reg_id:35s} {name:35s} {chunks:6d} {mapped:7d} {pct:>5s}")
    total_chunks += chunks
    total_mapped += mapped
# NOTE(review): divides by total_chunks — raises ZeroDivisionError if no
# regulation produced any chunks at all.
print(f"\n TOTAL: {total_chunks} chunks, {total_mapped} mapped ({total_mapped/total_chunks*100:.0f}%)")

# Save mapping for the downstream QA/backfill steps.
with open("/tmp/all_article_mappings.json", "w") as f:
    json.dump(all_mappings, f)
print(f"\n Saved to /tmp/all_article_mappings.json ({len(all_mappings)} entries)")