breakpilot-compliance/backend-compliance/compliance/services/control_generator.py

"""
Control Generator Pipeline — RAG → License → Structure/Reform → Harmonize → Anchor → Store.

7-stage pipeline that generates canonical security controls from RAG chunks:
  1. RAG SCAN        — Load unprocessed chunks (or new document versions)
  2. LICENSE CLASSIFY — Determine which of 3 license rules applies
  3a. STRUCTURE       — Rule 1+2: Structure original text into control format
  3b. LLM REFORM     — Rule 3: Fully reformulate (no original text, no source names)
  4. HARMONIZE       — Check against existing controls for duplicates
  5. ANCHOR SEARCH   — Find open-source references (OWASP, NIST, ENISA)
  6. STORE           — Persist to DB with correct visibility flags
  7. MARK PROCESSED  — Mark RAG chunks as processed (with version tracking)

Three License Rules:
  Rule 1 (free_use):         Laws, Public Domain — original text allowed
  Rule 2 (citation_required): CC-BY, CC-BY-SA — original text with citation
  Rule 3 (restricted):       BSI, ISO — full reformulation, no source names
"""

import hashlib
import json
import logging
import os
import re
import uuid
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Dict, List, Optional, Set

import httpx
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session

from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .similarity_detector import check_similarity, SimilarityReport

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

HARMONIZATION_THRESHOLD = 0.85  # Cosine similarity above this = duplicate

# Pipeline version — increment when generation rules change materially.
# v1: Original (local LLM prefilter, old prompt)
# v2: Anthropic decides relevance, null for non-requirement chunks, annexes protected
PIPELINE_VERSION = 2

ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

# ---------------------------------------------------------------------------
# License Mapping (3-Rule System)
# ---------------------------------------------------------------------------

REGULATION_LICENSE_MAP: dict[str, dict] = {
    # RULE 1: FREE USE — Laws, Public Domain
    # EU Regulations
    "eu_2016_679":           {"license": "EU_LAW",              "rule": 1, "name": "DSGVO"},
    "eu_2024_1689":          {"license": "EU_LAW",              "rule": 1, "name": "AI Act (KI-Verordnung)"},
    "eu_2022_2555":          {"license": "EU_LAW",              "rule": 1, "name": "NIS2"},
    "eu_2024_2847":          {"license": "EU_LAW",              "rule": 1, "name": "Cyber Resilience Act (CRA)"},
    "eu_2023_1230":          {"license": "EU_LAW",              "rule": 1, "name": "Maschinenverordnung"},
    "eu_2022_2065":          {"license": "EU_LAW",              "rule": 1, "name": "Digital Services Act (DSA)"},
    "eu_2022_1925":          {"license": "EU_LAW",              "rule": 1, "name": "Digital Markets Act (DMA)"},
    "eu_2022_868":           {"license": "EU_LAW",              "rule": 1, "name": "Data Governance Act (DGA)"},
    "eu_2019_770":           {"license": "EU_LAW",              "rule": 1, "name": "Digitale-Inhalte-Richtlinie"},
    "eu_2021_914":           {"license": "EU_LAW",              "rule": 1, "name": "Standardvertragsklauseln (SCC)"},
    "eu_2002_58":            {"license": "EU_LAW",              "rule": 1, "name": "ePrivacy-Richtlinie"},
    "eu_2000_31":            {"license": "EU_LAW",              "rule": 1, "name": "E-Commerce-Richtlinie"},
    "eu_2023_1803":          {"license": "EU_LAW",              "rule": 1, "name": "IFRS-Uebernahmeverordnung"},
    "eucsa":                 {"license": "EU_LAW",              "rule": 1, "name": "EU Cybersecurity Act"},
    "dataact":               {"license": "EU_LAW",              "rule": 1, "name": "Data Act"},
    "dora":                  {"license": "EU_LAW",              "rule": 1, "name": "Digital Operational Resilience Act"},
    "ehds":                  {"license": "EU_LAW",              "rule": 1, "name": "European Health Data Space"},
    "gpsr":                  {"license": "EU_LAW",              "rule": 1, "name": "Allgemeine Produktsicherheitsverordnung"},
    "mica":                  {"license": "EU_LAW",              "rule": 1, "name": "Markets in Crypto-Assets"},
    "psd2":                  {"license": "EU_LAW",              "rule": 1, "name": "Zahlungsdiensterichtlinie 2"},
    "dpf":                   {"license": "EU_LAW",              "rule": 1, "name": "EU-US Data Privacy Framework"},
    "dsm":                   {"license": "EU_LAW",              "rule": 1, "name": "DSM-Urheberrechtsrichtlinie"},
    "amlr":                  {"license": "EU_LAW",              "rule": 1, "name": "AML-Verordnung"},
    "eu_blue_guide_2022":    {"license": "EU_PUBLIC",           "rule": 1, "name": "Blue Guide 2022"},
    # NIST (Public Domain — all variants)
    "nist_sp_800_53":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SP 800-53"},
    "nist_sp800_53r5":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SP 800-53 Rev.5"},
    "nist_sp_800_63b":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SP 800-63B"},
    "nist_sp800_63_3":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SP 800-63-3"},
    "nist_csf_2_0":          {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST CSF 2.0"},
    "nist_sp_800_218":       {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SSDF"},
    "nist_sp800_207":        {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST SP 800-207 Zero Trust"},
    "nist_ai_rmf":           {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NIST AI Risk Management Framework"},
    "nistir_8259a":          {"license": "NIST_PUBLIC_DOMAIN",  "rule": 1, "name": "NISTIR 8259A IoT Security"},
    "cisa_secure_by_design": {"license": "US_GOV_PUBLIC",       "rule": 1, "name": "CISA Secure by Design"},
    # German Laws
    "bdsg":                  {"license": "DE_LAW",              "rule": 1, "name": "BDSG"},
    "bdsg_2018_komplett":    {"license": "DE_LAW",              "rule": 1, "name": "BDSG 2018"},
    "ttdsg":                 {"license": "DE_LAW",              "rule": 1, "name": "TTDSG"},
    "tdddg_25":              {"license": "DE_LAW",              "rule": 1, "name": "TDDDG"},
    "tkg":                   {"license": "DE_LAW",              "rule": 1, "name": "TKG"},
    "de_tkg":                {"license": "DE_LAW",              "rule": 1, "name": "TKG"},
    "bgb_komplett":          {"license": "DE_LAW",              "rule": 1, "name": "BGB"},
    "hgb":                   {"license": "DE_LAW",              "rule": 1, "name": "HGB"},
    "hgb_komplett":          {"license": "DE_LAW",              "rule": 1, "name": "HGB"},
    "urhg_komplett":         {"license": "DE_LAW",              "rule": 1, "name": "UrhG"},
    "uwg":                   {"license": "DE_LAW",              "rule": 1, "name": "UWG"},
    "tmg_komplett":          {"license": "DE_LAW",              "rule": 1, "name": "TMG"},
    "gewo":                  {"license": "DE_LAW",              "rule": 1, "name": "GewO"},
    "ao":                    {"license": "DE_LAW",              "rule": 1, "name": "Abgabenordnung"},
    "ao_komplett":           {"license": "DE_LAW",              "rule": 1, "name": "Abgabenordnung"},
    "battdg":                {"license": "DE_LAW",              "rule": 1, "name": "Batteriegesetz"},
    # Austrian Laws
    "at_dsg":                {"license": "AT_LAW",              "rule": 1, "name": "AT DSG"},
    "at_abgb":               {"license": "AT_LAW",              "rule": 1, "name": "AT ABGB"},
    "at_abgb_agb":           {"license": "AT_LAW",              "rule": 1, "name": "AT ABGB AGB-Recht"},
    "at_bao":                {"license": "AT_LAW",              "rule": 1, "name": "AT BAO"},
    "at_bao_ret":            {"license": "AT_LAW",              "rule": 1, "name": "AT BAO Retention"},
    "at_ecg":                {"license": "AT_LAW",              "rule": 1, "name": "AT E-Commerce-Gesetz"},
    "at_kschg":              {"license": "AT_LAW",              "rule": 1, "name": "AT Konsumentenschutzgesetz"},
    "at_medieng":            {"license": "AT_LAW",              "rule": 1, "name": "AT Mediengesetz"},
    "at_tkg":                {"license": "AT_LAW",              "rule": 1, "name": "AT TKG"},
    "at_ugb":                {"license": "AT_LAW",              "rule": 1, "name": "AT UGB"},
    "at_ugb_ret":            {"license": "AT_LAW",              "rule": 1, "name": "AT UGB Retention"},
    "at_uwg":                {"license": "AT_LAW",              "rule": 1, "name": "AT UWG"},
    # Other EU Member State Laws
    "fr_loi_informatique":   {"license": "FR_LAW",              "rule": 1, "name": "FR Loi Informatique"},
    "es_lopdgdd":            {"license": "ES_LAW",              "rule": 1, "name": "ES LOPDGDD"},
    "nl_uavg":               {"license": "NL_LAW",              "rule": 1, "name": "NL UAVG"},
    "it_codice_privacy":     {"license": "IT_LAW",              "rule": 1, "name": "IT Codice Privacy"},
    "hu_info_tv":            {"license": "HU_LAW",              "rule": 1, "name": "HU Információs törvény"},
    # EDPB Guidelines (EU Public Authority)
    "edpb_01_2020":          {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB 01/2020 Ergaenzende Massnahmen"},
    "edpb_02_2023":          {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB 02/2023 Technischer Anwendungsbereich"},
    "edpb_05_2020":          {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB 05/2020 Einwilligung"},
    "edpb_09_2022":          {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB 09/2022 Datenschutzverletzungen"},
    "edpb_bcr_01_2022":      {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB BCR Leitlinien"},
    "edpb_breach_09_2022":   {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Breach Notification"},
    "edpb_connected_vehicles_01_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Connected Vehicles"},
    "edpb_dpbd_04_2019":     {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Data Protection by Design"},
    "edpb_eprivacy_02_2023": {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB ePrivacy"},
    "edpb_facial_recognition_05_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Facial Recognition"},
    "edpb_fines_04_2022":    {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Fines Calculation"},
    "edpb_legitimate_interest": {"license": "EU_PUBLIC",        "rule": 1, "name": "EDPB Legitimate Interest"},
    "edpb_legitimate_interest_01_2024": {"license": "EU_PUBLIC","rule": 1, "name": "EDPB Legitimate Interest 2024"},
    "edpb_social_media_08_2020": {"license": "EU_PUBLIC",       "rule": 1, "name": "EDPB Social Media"},
    "edpb_transfers_01_2020":{"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Transfers 01/2020"},
    "edpb_transfers_07_2020":{"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Transfers 07/2020"},
    "edpb_video_03_2019":    {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPB Video Surveillance"},
    "edps_dpia_list":        {"license": "EU_PUBLIC",           "rule": 1, "name": "EDPS DPIA Liste"},
    "edpb_certification_01_2018": {"license": "EU_PUBLIC",     "rule": 1, "name": "EDPB Certification 01/2018"},
    "edpb_certification_01_2019": {"license": "EU_PUBLIC",     "rule": 1, "name": "EDPB Certification 01/2019"},
    "eaa":                   {"license": "EU_LAW",              "rule": 1, "name": "European Accessibility Act"},
    # WP29 (pre-EDPB) Guidelines
    "wp244_profiling":       {"license": "EU_PUBLIC",           "rule": 1, "name": "WP29 Profiling"},
    "wp251_profiling":       {"license": "EU_PUBLIC",           "rule": 1, "name": "WP29 Data Portability"},
    "wp260_transparency":    {"license": "EU_PUBLIC",           "rule": 1, "name": "WP29 Transparency"},

    # RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA
    "owasp_asvs":            {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP ASVS",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_masvs":           {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP MASVS",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_top10":           {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP Top 10",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_top10_2021":      {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP Top 10 2021",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_api_top10_2023":  {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP API Top 10 2023",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_samm":            {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP SAMM",
                              "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "oecd_ai_principles":    {"license": "OECD_PUBLIC",  "rule": 2, "name": "OECD AI Principles",
                              "attribution": "OECD"},

    # RULE 3: RESTRICTED — Full reformulation required
    # Names stored as INTERNAL_ONLY — never exposed to customers
}

# Prefix-based matching for wildcard entries
_RULE3_PREFIXES = ["bsi_", "iso_", "etsi_"]
_RULE2_PREFIXES = ["enisa_"]


def _classify_regulation(regulation_code: str) -> dict:
    """Determine license rule for a regulation_code."""
    code = regulation_code.lower().strip()

    # Exact match first
    if code in REGULATION_LICENSE_MAP:
        return REGULATION_LICENSE_MAP[code]

    # Prefix match for Rule 2
    for prefix in _RULE2_PREFIXES:
        if code.startswith(prefix):
            return {"license": "CC-BY-4.0", "rule": 2, "name": "ENISA",
                    "attribution": "ENISA, CC BY 4.0"}

    # Prefix match for Rule 3
    for prefix in _RULE3_PREFIXES:
        if code.startswith(prefix):
            return {"license": f"{prefix.rstrip('_').upper()}_RESTRICTED", "rule": 3,
                    "name": "INTERNAL_ONLY"}

    # Unknown → treat as restricted (safe default)
    logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code)
    return {"license": "UNKNOWN", "rule": 3, "name": "INTERNAL_ONLY"}


# ---------------------------------------------------------------------------
# Domain detection from content
# ---------------------------------------------------------------------------

DOMAIN_KEYWORDS = {
    "AUTH": ["authentication", "login", "password", "credential", "mfa", "2fa",
             "session", "token", "oauth", "identity", "authentifizierung", "anmeldung"],
    "CRYP": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
              "aes", "rsa", "verschlüsselung", "kryptographie", "cipher", "schlüssel"],
    "NET": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
            "netzwerk", "routing", "port", "intrusion"],
    "DATA": ["data protection", "privacy", "personal data", "datenschutz",
             "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung"],
    "LOG": ["logging", "monitoring", "audit trail", "siem", "alert", "anomaly",
            "protokollierung", "überwachung"],
    "ACC": ["access control", "authorization", "rbac", "permission", "privilege",
            "zugriffskontrolle", "berechtigung", "autorisierung"],
    "SEC": ["vulnerability", "patch", "update", "hardening", "configuration",
            "schwachstelle", "härtung", "konfiguration"],
    "INC": ["incident", "response", "breach", "recovery", "backup",
            "vorfall", "wiederherstellung", "notfall"],
    "AI": ["artificial intelligence", "machine learning", "model", "bias",
           "ki", "künstliche intelligenz", "algorithmus", "training"],
    "COMP": ["compliance", "audit", "regulation", "standard", "certification",
             "konformität", "prüfung", "zertifizierung"],
    "GOV": ["behörde", "verwaltung", "öffentlich", "register", "gewerberegister",
            "handelsregister", "meldepflicht", "aufsicht", "genehmigung", "bescheid",
            "verwaltungsakt", "ordnungswidrig", "bußgeld", "staat", "ministerium",
            "bundesamt", "landesamt", "kommune", "gebietskörperschaft"],
    "LAB": ["arbeitnehmer", "arbeitgeber", "arbeitsschutz", "arbeitszeit", "betriebsrat",
            "kündigung", "beschäftigung", "mindestlohn", "arbeitsvertrag", "betriebsverfassung",
            "arbeitsrecht", "arbeitsstätte", "gefährdungsbeurteilung", "unterweisung"],
    "FIN": ["finanz", "bankwesen", "zahlungsverkehr", "geldwäsche", "bilanz", "rechnungslegung",
            "buchführung", "jahresabschluss", "steuererklärung", "kapitalmarkt", "wertpapier",
            "kreditinstitut", "finanzdienstleistung", "bankenaufsicht", "bafin"],
    "TRD": ["handelsrecht", "gewerbeordnung", "gewerbe", "handwerk", "gewerbeuntersagung",
            "gewerbebetrieb", "handelsgesetzbuch", "handelsregister", "kaufmann",
            "unternehmer", "wettbewerb", "verbraucherschutz", "produktsicherheit"],
    "ENV": ["umweltschutz", "emission", "abfall", "immission", "gewässerschutz",
            "naturschutz", "umweltverträglichkeit", "klimaschutz", "nachhaltigkeit",
            "entsorgung", "recycling", "umweltrecht"],
    "HLT": ["gesundheit", "medizinprodukt", "arzneimittel", "patient", "krankenhaus",
            "hygiene", "infektionsschutz", "medizin", "pflege", "therapie"],
}


CATEGORY_KEYWORDS = {
    "encryption": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
                    "aes", "rsa", "verschlüsselung", "kryptographie", "cipher", "schlüssel"],
    "authentication": ["authentication", "login", "password", "credential", "mfa", "2fa",
                       "session", "oauth", "authentifizierung", "anmeldung", "passwort"],
    "network": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
                "netzwerk", "routing", "port", "intrusion", "ids", "ips"],
    "data_protection": ["data protection", "privacy", "personal data", "datenschutz",
                        "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung", "einwilligung"],
    "logging": ["logging", "monitoring", "audit trail", "siem", "alert", "anomaly",
                "protokollierung", "überwachung", "nachvollziehbar"],
    "incident": ["incident", "response", "breach", "recovery", "vorfall", "sicherheitsvorfall"],
    "continuity": ["backup", "disaster recovery", "notfall", "wiederherstellung", "notfallplan",
                   "business continuity", "ausfallsicherheit"],
    "compliance": ["compliance", "audit", "regulation", "certification", "konformität",
                   "prüfung", "zertifizierung", "nachweis"],
    "supply_chain": ["supplier", "vendor", "third party", "lieferant", "auftragnehmer",
                     "unterauftragnehmer", "supply chain", "dienstleister"],
    "physical": ["physical", "building", "access zone", "physisch", "gebäude", "zutritt",
                 "schließsystem", "rechenzentrum"],
    "personnel": ["training", "awareness", "employee", "schulung", "mitarbeiter",
                  "sensibilisierung", "personal", "unterweisung"],
    "application": ["application", "software", "code review", "sdlc", "secure coding",
                    "anwendung", "entwicklung", "software-entwicklung", "api"],
    "system": ["hardening", "patch", "configuration", "update", "härtung", "konfiguration",
               "betriebssystem", "system", "server"],
    "risk": ["risk assessment", "risk management", "risiko", "bewertung", "risikobewertung",
             "risikoanalyse", "bedrohung", "threat"],
    "governance": ["governance", "policy", "organization", "isms", "sicherheitsorganisation",
                   "richtlinie", "verantwortlichkeit", "rolle"],
    "hardware": ["hardware", "platform", "firmware", "bios", "tpm", "chip",
                 "plattform", "geräte"],
    "identity": ["identity", "iam", "directory", "ldap", "sso", "provisioning",
                 "identität", "identitätsmanagement", "benutzerverzeichnis"],
    "public_administration": ["behörde", "verwaltung", "öffentlich", "register", "gewerberegister",
                              "handelsregister", "meldepflicht", "aufsicht", "genehmigung", "bescheid",
                              "verwaltungsakt", "ordnungswidrig", "bußgeld", "amt"],
    "labor_law": ["arbeitnehmer", "arbeitgeber", "arbeitsschutz", "arbeitszeit", "betriebsrat",
                  "kündigung", "beschäftigung", "mindestlohn", "arbeitsvertrag", "betriebsverfassung"],
    "finance": ["finanz", "bankwesen", "zahlungsverkehr", "geldwäsche", "bilanz", "rechnungslegung",
                "buchführung", "jahresabschluss", "kapitalmarkt", "wertpapier", "bafin"],
    "trade_regulation": ["gewerbeordnung", "gewerbe", "handwerk", "gewerbeuntersagung",
                         "gewerbebetrieb", "handelsrecht", "kaufmann", "wettbewerb",
                         "verbraucherschutz", "produktsicherheit"],
    "environmental": ["umweltschutz", "emission", "abfall", "immission", "gewässerschutz",
                      "naturschutz", "klimaschutz", "nachhaltigkeit", "entsorgung"],
    "health": ["gesundheit", "medizinprodukt", "arzneimittel", "patient", "krankenhaus",
               "hygiene", "infektionsschutz", "pflege"],
}

VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
                 "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}

# ---------------------------------------------------------------------------
# Recital (Erwägungsgrund) detection in source text
# ---------------------------------------------------------------------------

# Pattern: standalone recital number like (125)\n or (126) at line start
_RECITAL_RE = re.compile(r'\((\d{1,3})\)\s*\n')

# Recital-typical phrasing (German EU law Erwägungsgründe)
_RECITAL_PHRASES = [
    "in erwägung nachstehender gründe",
    "erwägungsgrund",
    "in anbetracht",
    "daher sollte",
    "aus diesem grund",
    "es ist daher",
    "folglich sollte",
    "es sollte daher",
    "in diesem zusammenhang",
]


def _detect_recital(text: str) -> Optional[dict]:
    """Detect if source text is a recital (Erwägungsgrund) rather than an article.

    Returns a dict with detection details if recital markers are found,
    or None if the text appears to be genuine article text.

    Detection criteria:
    1. Standalone recital numbers like (126)\\n in the text
    2. Recital-typical phrasing ("daher sollte", "erwägungsgrund", etc.)
    """
    if not text:
        return None

    # Check 1: Recital number markers
    recital_matches = _RECITAL_RE.findall(text)

    # Check 2: Recital phrasing
    text_lower = text.lower()
    phrase_hits = [p for p in _RECITAL_PHRASES if p in text_lower]

    if not recital_matches and not phrase_hits:
        return None

    # Require at least recital numbers OR >=2 phrase hits to be a suspect
    if not recital_matches and len(phrase_hits) < 2:
        return None

    return {
        "recital_suspect": True,
        "recital_numbers": recital_matches[:10],
        "recital_phrases": phrase_hits[:5],
        "detection_method": "regex+phrases" if recital_matches and phrase_hits
                           else "regex" if recital_matches else "phrases",
    }

CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))

VERIFICATION_KEYWORDS = {
    "code_review": ["source code", "code review", "static analysis", "sast", "dast",
                    "dependency check", "quellcode", "codeanalyse", "secure coding",
                    "software development", "api", "input validation", "output encoding"],
    "document": ["policy", "procedure", "documentation", "training", "awareness",
                 "richtlinie", "dokumentation", "schulung", "nachweis", "vertrag",
                 "organizational", "process", "role", "responsibility"],
    "tool": ["scanner", "monitoring", "siem", "ids", "ips", "firewall", "antivirus",
             "vulnerability scan", "penetration test", "tool", "automated"],
    "hybrid": [],  # Assigned when multiple methods match equally
}


def _detect_category(text: str) -> Optional[str]:
    """Detect the most likely category from text content."""
    text_lower = text.lower()
    scores: dict[str, int] = {}
    for cat, keywords in CATEGORY_KEYWORDS.items():
        scores[cat] = sum(1 for kw in keywords if kw in text_lower)
    if not scores or max(scores.values()) == 0:
        return None
    return max(scores, key=scores.get)


def _detect_verification_method(text: str) -> Optional[str]:
    """Detect verification method from text content."""
    text_lower = text.lower()
    scores: dict[str, int] = {}
    for method, keywords in VERIFICATION_KEYWORDS.items():
        if method == "hybrid":
            continue
        scores[method] = sum(1 for kw in keywords if kw in text_lower)
    if not scores or max(scores.values()) == 0:
        return None
    top = sorted(scores.items(), key=lambda x: -x[1])
    # If top two are close, it's hybrid
    if len(top) >= 2 and top[0][1] > 0 and top[1][1] > 0 and top[1][1] >= top[0][1] * 0.7:
        return "hybrid"
    return top[0][0] if top[0][1] > 0 else None


def _detect_domain(text: str) -> str:
    """Detect the most likely domain from text content."""
    text_lower = text.lower()
    scores: dict[str, int] = {}
    for domain, keywords in DOMAIN_KEYWORDS.items():
        scores[domain] = sum(1 for kw in keywords if kw in text_lower)
    if not scores or max(scores.values()) == 0:
        return "SEC"  # Default
    return max(scores, key=scores.get)


# ---------------------------------------------------------------------------
# Data Models
# ---------------------------------------------------------------------------

class GeneratorConfig(BaseModel):
    collections: Optional[List[str]] = None
    domain: Optional[str] = None
    batch_size: int = 5
    max_controls: int = 0  # 0 = unlimited (process ALL chunks)
    max_chunks: int = 0  # 0 = unlimited; >0 = stop after N chunks (respects document boundaries)
    skip_processed: bool = True
    skip_web_search: bool = False
    dry_run: bool = False
    existing_job_id: Optional[str] = None  # If set, reuse this job instead of creating a new one
    regulation_filter: Optional[List[str]] = None  # Only process chunks matching these regulation_code prefixes
    skip_prefilter: bool = False  # If True, skip local LLM pre-filter (send all chunks to API)


@dataclass
class GeneratedControl:
    control_id: str = ""
    title: str = ""
    objective: str = ""
    rationale: str = ""
    scope: dict = field(default_factory=dict)
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    risk_score: float = 5.0
    implementation_effort: str = "m"
    open_anchors: list = field(default_factory=list)
    release_state: str = "draft"
    tags: list = field(default_factory=list)
    # 3-rule fields
    license_rule: Optional[int] = None
    source_original_text: Optional[str] = None
    source_citation: Optional[dict] = None
    customer_visible: bool = True
    generation_metadata: dict = field(default_factory=dict)
    generation_strategy: str = "ungrouped"  # ungrouped | document_grouped
    # Classification fields
    verification_method: Optional[str] = None  # code_review, document, tool, hybrid
    category: Optional[str] = None  # one of 22 categories
    target_audience: Optional[list] = None  # e.g. ["unternehmen", "behoerden", "entwickler"]


@dataclass
class GeneratorResult:
    job_id: str = ""
    status: str = "completed"
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
    controls_qa_fixed: int = 0
    chunks_skipped_prefilter: int = 0
    errors: list = field(default_factory=list)
    controls: list = field(default_factory=list)


# ---------------------------------------------------------------------------
# LLM Client (via Go SDK)
# ---------------------------------------------------------------------------

async def _llm_chat(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call LLM — Anthropic Claude (primary) or Ollama (fallback)."""
    if ANTHROPIC_API_KEY:
        logger.info("Calling Anthropic API (model=%s)...", ANTHROPIC_MODEL)
        result = await _llm_anthropic(prompt, system_prompt)
        if result:
            logger.info("Anthropic API success (%d chars)", len(result))
            return result
        logger.warning("Anthropic failed, falling back to Ollama")

    logger.info("Calling Ollama (model=%s)...", OLLAMA_MODEL)
    return await _llm_ollama(prompt, system_prompt)


async def _llm_local(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call local Ollama LLM only (for pre-filtering and classification tasks)."""
    return await _llm_ollama(prompt, system_prompt)


PREFILTER_SYSTEM_PROMPT = """Du bist ein Compliance-Analyst. Deine Aufgabe: Prüfe ob ein Textabschnitt eine konkrete Sicherheitsanforderung, Datenschutzpflicht, oder technische/organisatorische Maßnahme enthält.

Antworte NUR mit einem JSON-Objekt: {"relevant": true/false, "reason": "kurze Begründung"}

Relevant = true wenn der Text mindestens EINE der folgenden enthält:
- Konkrete Pflicht/Anforderung ("muss", "soll", "ist sicherzustellen")
- Technische Sicherheitsmaßnahme (Verschlüsselung, Zugriffskontrolle, Logging)
- Organisatorische Maßnahme (Schulung, Dokumentation, Audit)
- Datenschutz-Vorgabe (Löschpflicht, Einwilligung, Zweckbindung)
- Risikomanagement-Anforderung

Relevant = false wenn der Text NUR enthält:
- Definitionen ohne Pflichten
- Inhaltsverzeichnisse oder Verweise
- Reine Begriffsbestimmungen
- Übergangsvorschriften ohne Substanz
- Adressaten/Geltungsbereich ohne Anforderung"""


async def _prefilter_chunk(chunk_text: str) -> tuple[bool, str]:
    """Use local LLM to check if a chunk contains an actionable requirement.

    Returns (is_relevant, reason).
    Much cheaper than sending every chunk to Anthropic.
    """
    prompt = f"""Prüfe ob dieser Textabschnitt eine konkrete Sicherheitsanforderung oder Compliance-Pflicht enthält.

Text:
---
{chunk_text[:1500]}
---

Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}"""

    try:
        raw = await _llm_local(prompt, PREFILTER_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if data:
            return data.get("relevant", True), data.get("reason", "")
        # If parsing fails, assume relevant (don't skip)
        return True, "parse_failed"
    except Exception as e:
        logger.warning("Prefilter failed: %s — treating as relevant", e)
        return True, f"error: {e}"


async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Anthropic Messages API."""
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system_prompt:
        payload["system"] = system_prompt

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(
                "https://api.anthropic.com/v1/messages",
                headers=headers,
                json=payload,
            )
            if resp.status_code != 200:
                logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            content = data.get("content", [])
            if content and isinstance(content, list):
                return content[0].get("text", "")
            return ""
    except Exception as e:
        logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
        return ""


async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API (fallback)."""
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "options": {"num_predict": 512},  # Limit response length for speed
        "think": False,  # Disable thinking for faster responses
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if resp.status_code != 200:
                logger.error("Ollama chat failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            if isinstance(msg, dict):
                return msg.get("content", "")
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama request failed: %s", e)
        return ""


async def _get_embedding(text: str) -> list[float]:
    """Get embedding vector for text via embedding service."""
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception:
        return []


async def _get_embeddings_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
    """Get embedding vectors for multiple texts in batches."""
    all_embeddings: list[list[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.post(
                    f"{EMBEDDING_URL}/embed",
                    json={"texts": batch},
                )
                resp.raise_for_status()
                embeddings = resp.json().get("embeddings", [])
                all_embeddings.extend(embeddings)
        except Exception as e:
            logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
            all_embeddings.extend([[] for _ in batch])
    return all_embeddings


def _cosine_sim(a: list[float], b: list[float]) -> float:
    """Compute cosine similarity between two vectors."""
    if not a or not b or len(a) != len(b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


# ---------------------------------------------------------------------------
# JSON Parsing Helper
# ---------------------------------------------------------------------------

def _parse_llm_json(raw: str) -> dict:
    """Extract JSON from LLM response (handles markdown fences)."""
    # Try extracting from ```json ... ``` blocks
    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
    text = match.group(1) if match else raw

    # Try parsing directly
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Try finding first { ... } block
    brace_match = re.search(r"\{.*\}", text, re.DOTALL)
    if brace_match:
        try:
            return json.loads(brace_match.group(0))
        except json.JSONDecodeError:
            pass

    logger.warning("Failed to parse LLM JSON response")
    return {}


def _parse_llm_json_array(raw: str) -> list[dict]:
    """Extract a JSON array from LLM response — returns list of dicts."""
    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
    text = match.group(1) if match else raw

    # Try parsing as array directly
    try:
        parsed = json.loads(text)
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict):
            # Check if it wraps an array (e.g. {"controls": [...]})
            for key in ("controls", "results", "items", "data"):
                if key in parsed and isinstance(parsed[key], list):
                    return parsed[key]
            return [parsed]
    except json.JSONDecodeError:
        pass

    # Try finding [ ... ] block
    bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
    if bracket_match:
        try:
            parsed = json.loads(bracket_match.group(0))
            if isinstance(parsed, list):
                return parsed
        except json.JSONDecodeError:
            pass

    # Try finding multiple { ... } blocks (LLM sometimes returns separate objects)
    objects = []
    for obj_match in re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL):
        try:
            obj = json.loads(obj_match.group(0))
            if isinstance(obj, dict) and obj.get("title"):
                objects.append(obj)
        except json.JSONDecodeError:
            continue
    if objects:
        logger.info("Parsed %d individual JSON objects from batch response", len(objects))
        return objects

    # Fallback: try single object
    single = _parse_llm_json(raw)
    if single:
        logger.info("Batch parse fallback: extracted single object")
    else:
        logger.warning("Batch parse failed — logging first 500 chars: %s", raw[:500])
    return [single] if single else []


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, eigenständige
Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""


class ControlGeneratorPipeline:
    """Orchestrates the 7-stage control generation pipeline."""

    def __init__(self, db: Session, rag_client: Optional[ComplianceRAGClient] = None):
        self.db = db
        self.rag = rag_client or get_rag_client()
        self._existing_controls: Optional[List[dict]] = None
        self._existing_embeddings: Dict[str, List[float]] = {}

    # ── Stage 1: RAG Scan ──────────────────────────────────────────────

    async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
        """Scroll through ALL chunks in RAG collections.

        Uses DIRECT Qdrant scroll API (bypasses Go SDK which has offset cycling bugs).
        Filters out already-processed chunks by hash.
        """
        collections = config.collections or ALL_COLLECTIONS
        all_results: list[RAGSearchResult] = []

        # Pre-load all processed hashes for fast filtering
        processed_hashes: set[str] = set()
        if config.skip_processed:
            try:
                result = self.db.execute(
                    text("SELECT chunk_hash FROM canonical_processed_chunks")
                )
                processed_hashes = {row[0] for row in result}
                logger.info("Loaded %d processed chunk hashes", len(processed_hashes))
            except Exception as e:
                logger.warning("Error loading processed hashes: %s", e)

        seen_hashes: set[str] = set()

        for collection in collections:
            page = 0
            collection_total = 0
            collection_new = 0
            qdrant_offset = None  # Qdrant uses point ID as offset

            while True:
                # Direct Qdrant scroll API — bypasses Go SDK offset cycling bug
                try:
                    scroll_body: dict = {
                        "limit": 250,
                        "with_payload": True,
                        "with_vector": False,
                    }
                    if qdrant_offset is not None:
                        scroll_body["offset"] = qdrant_offset

                    async with httpx.AsyncClient(timeout=30.0) as client:
                        resp = await client.post(
                            f"{QDRANT_URL}/collections/{collection}/points/scroll",
                            json=scroll_body,
                        )
                        if resp.status_code != 200:
                            logger.error("Qdrant scroll %s failed: %d %s", collection, resp.status_code, resp.text[:200])
                            break
                        data = resp.json().get("result", {})
                        points = data.get("points", [])
                        next_page_offset = data.get("next_page_offset")
                except Exception as e:
                    logger.error("Qdrant scroll error for %s: %s", collection, e)
                    break

                if not points:
                    break

                collection_total += len(points)

                for point in points:
                    payload = point.get("payload", {})
                    # Different collections use different field names for text
                    chunk_text = (payload.get("chunk_text", "")
                                  or payload.get("content", "")
                                  or payload.get("text", "")
                                  or payload.get("page_content", ""))
                    if not chunk_text or len(chunk_text.strip()) < 50:
                        continue

                    h = hashlib.sha256(chunk_text.encode()).hexdigest()

                    if h in seen_hashes:
                        continue
                    seen_hashes.add(h)

                    if h in processed_hashes:
                        continue

                    # Convert Qdrant point to RAGSearchResult
                    # Handle varying payload schemas across collections
                    reg_code = (payload.get("regulation_id", "")
                                or payload.get("regulation_code", "")
                                or payload.get("source_id", "")
                                or payload.get("source_code", ""))

                    # Filter by regulation_code if configured
                    if config.regulation_filter:
                        if not reg_code:
                            continue  # Skip chunks without regulation code
                        code_lower = reg_code.lower()
                        if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
                            continue

                    reg_name = (payload.get("regulation_name_de", "")
                                or payload.get("regulation_name", "")
                                or payload.get("source_name", "")
                                or payload.get("guideline_name", "")
                                or payload.get("document_title", "")
                                or payload.get("filename", ""))
                    reg_short = (payload.get("regulation_short", "")
                                 or reg_code)
                    chunk = RAGSearchResult(
                        text=chunk_text,
                        regulation_code=reg_code,
                        regulation_name=reg_name,
                        regulation_short=reg_short,
                        category=payload.get("category", "") or payload.get("data_type", ""),
                        article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
                        paragraph=payload.get("paragraph", ""),
                        source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
                        score=0.0,
                        collection=collection,
                    )
                    all_results.append(chunk)
                    collection_new += 1

                page += 1
                if page % 100 == 0:
                    logger.info(
                        "Scrolling %s (direct Qdrant): page %d, %d total chunks, %d new unprocessed",
                        collection, page, collection_total, collection_new,
                    )

                # Stop conditions
                if next_page_offset is None:
                    break  # Qdrant returns null when no more pages

                qdrant_offset = next_page_offset

            logger.info(
                "Collection %s: %d total chunks scrolled (direct Qdrant), %d new unprocessed",
                collection, collection_total, collection_new,
            )

        if config.regulation_filter:
            logger.info(
                "RAG scroll complete: %d total unique seen, %d passed regulation_filter %s",
                len(seen_hashes), len(all_results), config.regulation_filter,
            )
        else:
            logger.info(
                "RAG scroll complete: %d total unique seen, %d new unprocessed to process",
                len(seen_hashes), len(all_results),
            )
        return all_results

    def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
        """Check which chunk hashes are already processed."""
        if not hashes:
            return set()
        try:
            result = self.db.execute(
                text("SELECT chunk_hash FROM canonical_processed_chunks WHERE chunk_hash = ANY(:hashes)"),
                {"hashes": hashes},
            )
            return {row[0] for row in result}
        except Exception as e:
            logger.warning("Error checking processed chunks: %s", e)
            return set()

    # ── Stage 2: License Classification ────────────────────────────────

    def _classify_license(self, chunk: RAGSearchResult) -> dict:
        """Determine which license rule applies to this chunk."""
        return _classify_regulation(chunk.regulation_code)

    # ── Stage 3a: Structure (Rule 1 — Free Use) ───────────────────────

    async def _structure_free_use(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
        """Structure a freely usable text into control format."""
        source_name = license_info.get("name", chunk.regulation_name)
        prompt = f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_info.get('license', '')}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""

        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            return self._fallback_control(chunk)

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        llm_article = str(data.get("source_article", "")).strip()
        llm_paragraph = str(data.get("source_paragraph", "")).strip()
        effective_article = llm_article or chunk.article or ""
        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 1
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": chunk.regulation_name,
            "article": effective_article,
            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        if not control.category:
            control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 1,
            "source_regulation": chunk.regulation_code,
            "source_article": effective_article,
            "source_paragraph": effective_paragraph,
        }
        return control

    # ── Stage 3b: Structure with Citation (Rule 2) ────────────────────

    async def _structure_with_citation(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
        """Structure text that requires citation."""
        source_name = license_info.get("name", chunk.regulation_name)
        attribution = license_info.get("attribution", "")
        prompt = f"""Strukturiere den folgenden Text als Security Control.
Quelle: {source_name} ({license_info.get('license', '')}) — Zitation erforderlich.

Du darfst den Text übernehmen oder verständlicher umformulieren.
Die Quelle wird automatisch zitiert — fokussiere dich auf Klarheit.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name}, {chunk.article}"""

        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            return self._fallback_control(chunk)

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        llm_article = str(data.get("source_article", "")).strip()
        llm_paragraph = str(data.get("source_paragraph", "")).strip()
        effective_article = llm_article or chunk.article or ""
        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 2
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": chunk.regulation_name,
            "article": effective_article,
            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
            "license_notice": attribution,
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        if not control.category:
            control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 2,
            "source_regulation": chunk.regulation_code,
            "source_article": effective_article,
            "source_paragraph": effective_paragraph,
        }
        return control

    # ── Stage 3c: LLM Reformulation (Rule 3 — Restricted) ─────────────

    async def _llm_reformulate(self, chunk: RAGSearchResult, config: GeneratorConfig) -> GeneratedControl:
        """Fully reformulate — NO original text, NO source names."""
        domain = config.domain or _detect_domain(chunk.text)
        prompt = f"""Analysiere den folgenden Prüfaspekt und formuliere ein EIGENSTÄNDIGES Security Control.
KOPIERE KEINE Sätze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).

Aspekt (nur zur Analyse, NICHT kopieren, NICHT referenzieren):
---
{chunk.text[:1500]}
---

Domain: {domain}

Gib JSON zurück mit diesen Feldern:
- title: Kurzer eigenständiger Titel (max 100 Zeichen)
- objective: Eigenständige Formulierung des Ziels (1-3 Sätze)
- rationale: Eigenständige Begründung (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "oeffentlicher_dienst")"""

        raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            return self._fallback_control(chunk)

        control = self._build_control_from_json(data, domain)
        control.license_rule = 3
        control.source_original_text = None   # NEVER store original
        control.source_citation = None        # NEVER cite source
        control.customer_visible = False      # Only our formulation
        control.verification_method = _detect_verification_method(chunk.text)
        if not control.category:
            control.category = _detect_category(chunk.text)
        # generation_metadata: NO source names, NO original texts
        control.generation_metadata = {
            "processing_path": "llm_reform",
            "license_rule": 3,
        }
        return control

    # ── Stage 3 BATCH: Multiple chunks in one API call ─────────────────

    async def _structure_batch(
        self,
        chunks: list[RAGSearchResult],
        license_infos: list[dict],
    ) -> list[Optional[GeneratedControl]]:
        """Structure multiple free-use/citation chunks in a single Anthropic call."""
        # Build document context header if chunks share a regulation
        regulations_in_batch = set(c.regulation_name for c in chunks)
        doc_context = ""
        if len(regulations_in_batch) == 1:
            reg_name = next(iter(regulations_in_batch))
            articles = sorted(set(c.article or "?" for c in chunks))
            doc_context = (
                f"\nDOKUMENTKONTEXT: Alle {len(chunks)} Chunks stammen aus demselben Gesetz: {reg_name}.\n"
                f"Betroffene Artikel/Abschnitte: {', '.join(articles)}.\n"
                f"Nutze diesen Zusammenhang fuer eine kohaerente, aufeinander abgestimmte Formulierung der Controls.\n"
                f"Vermeide Redundanzen zwischen den Controls — jedes soll einen eigenen Aspekt abdecken.\n"
            )
        elif len(regulations_in_batch) <= 3:
            doc_context = (
                f"\nDOKUMENTKONTEXT: Die Chunks stammen aus {len(regulations_in_batch)} Gesetzen: "
                f"{', '.join(regulations_in_batch)}.\n"
            )

        chunk_entries = []
        for idx, (chunk, lic) in enumerate(zip(chunks, license_infos)):
            source_name = lic.get("name", chunk.regulation_name)
            chunk_entries.append(
                f"--- CHUNK {idx + 1} ---\n"
                f"Text: {chunk.text[:2000]}\n"
                f"Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}\n"
                f"Lizenz: {source_name} ({lic.get('license', '')})"
            )
        joined = "\n\n".join(chunk_entries)
        prompt = f"""Strukturiere die folgenden {len(chunks)} Gesetzestexte jeweils als eigenstaendiges Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quellen sind jeweils angegeben).
{doc_context}
WICHTIG:
- Pruefe JEDEN Chunk: Enthaelt er eine konkrete Pflicht, Anforderung oder Massnahme?
- Wenn JA: Erstelle ein vollstaendiges, eigenstaendiges Control mit praxisorientierter Formulierung.
- Wenn NEIN (reines Inhaltsverzeichnis, Begriffsbestimmung ohne Pflicht, Geltungsbereich ohne Anforderung, reine Verweiskette): Gib null fuer diesen Chunk zurueck.
- BEACHTE: Anhaenge/Annexe enthalten oft KONKRETE technische Anforderungen — diese MUESSEN als Control erfasst werden!
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit.
- Antworte IMMER auf Deutsch.

Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne Anforderung gib null zurueck. Fuer Chunks mit Anforderung ein Objekt mit diesen Feldern:
- chunk_index: 1-basierter Index des Chunks (1, 2, 3, ...)
- title: Kurzer praegnanter Titel auf Deutsch (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Saetze, Deutsch)
- rationale: Warum ist das wichtig? (1-2 Saetze, Deutsch)
- requirements: Liste von konkreten Anforderungen (Strings, Deutsch)
- test_procedure: Liste von Pruefschritten (Strings, Deutsch)
- evidence: Liste von Nachweisdokumenten (Strings, Deutsch)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen fuer die dieses Control relevant ist. Moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst"
- source_article: Artikel-/Paragraphen-Referenz aus dem Text extrahieren (z.B. "Artikel 10", "Art. 5", "§ 42", "Section 3"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text extrahieren (z.B. "Absatz 5", "Abs. 3", "Nr. 2", "(1)"). Leer lassen wenn nicht erkennbar.

{joined}"""

        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
        results = _parse_llm_json_array(raw)
        logger.info("Batch structure: parsed %d results from API response", len(results))

        # Map results back to chunks by chunk_index (or by position if no index)
        controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
        skipped_by_api = 0
        for pos, data in enumerate(results):
            # API returns null for chunks without actionable requirements
            if data is None:
                skipped_by_api += 1
                continue
            # Try chunk_index first, fall back to position
            idx = data.get("chunk_index")
            if idx is not None:
                idx = int(idx) - 1  # Convert to 0-based
            else:
                idx = pos  # Use position as fallback
            if idx < 0 or idx >= len(chunks):
                logger.warning("Batch: chunk_index %d out of range (0-%d), using position %d", idx, len(chunks)-1, pos)
                idx = min(pos, len(chunks) - 1)
            chunk = chunks[idx]
            lic = license_infos[idx]
            domain = _detect_domain(chunk.text)
            control = self._build_control_from_json(data, domain)
            control.license_rule = lic["rule"]
            # Use LLM-extracted article/paragraph, fall back to chunk metadata
            llm_article = str(data.get("source_article", "")).strip()
            llm_paragraph = str(data.get("source_paragraph", "")).strip()
            effective_article = llm_article or chunk.article or ""
            effective_paragraph = llm_paragraph or chunk.paragraph or ""
            if lic["rule"] in (1, 2):
                control.source_original_text = chunk.text
                control.source_citation = {
                    "source": chunk.regulation_name,
                    "article": effective_article,
                    "paragraph": effective_paragraph,
                    "license": lic.get("license", ""),
                    "license_notice": lic.get("attribution", ""),
                    "url": chunk.source_url or "",
                }
                control.customer_visible = True
            control.verification_method = _detect_verification_method(chunk.text)
            if not control.category:
                control.category = _detect_category(chunk.text)
            same_doc = len(set(c.regulation_code for c in chunks)) == 1
            control.generation_metadata = {
                "processing_path": "structured_batch",
                "license_rule": lic["rule"],
                "source_regulation": chunk.regulation_code,
                "source_article": effective_article,
                "source_paragraph": effective_paragraph,
                "batch_size": len(chunks),
                "document_grouped": same_doc,
            }
            control.generation_strategy = "document_grouped" if same_doc else "ungrouped"
            controls[idx] = control

        return controls

    async def _reformulate_batch(
        self,
        chunks: list[RAGSearchResult],
        config: GeneratorConfig,
    ) -> list[Optional[GeneratedControl]]:
        """Reformulate multiple restricted chunks in a single Anthropic call."""
        chunk_entries = []
        for idx, chunk in enumerate(chunks):
            domain = config.domain or _detect_domain(chunk.text)
            chunk_entries.append(
                f"--- ASPEKT {idx + 1} ---\n"
                f"Domain: {domain}\n"
                f"Text (nur zur Analyse, NICHT kopieren, NICHT referenzieren):\n{chunk.text[:1500]}"
            )
        joined = "\n\n".join(chunk_entries)
        prompt = f"""Analysiere die folgenden {len(chunks)} Pruefaspekte und formuliere fuer JEDEN mit konkreter Anforderung ein EIGENSTAENDIGES Security Control.
KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quellen. Keine proprietaeren Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).

WICHTIG:
- Pruefe JEDEN Aspekt: Enthaelt er eine konkrete Pflicht, Anforderung oder Massnahme?
- Wenn JA: Erstelle ein vollstaendiges, eigenstaendiges Control.
- Wenn NEIN (reines Inhaltsverzeichnis, Begriffsbestimmung ohne Pflicht, Geltungsbereich ohne Anforderung): Gib null fuer diesen Aspekt zurueck.
- BEACHTE: Anhaenge/Annexe enthalten oft KONKRETE technische Anforderungen — diese MUESSEN erfasst werden!
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit.

Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne Anforderung gib null zurueck. Fuer Aspekte mit Anforderung ein Objekt mit diesen Feldern:
- chunk_index: 1-basierter Index des Aspekts (1, 2, 3, ...)
- title: Kurzer eigenstaendiger Titel (max 100 Zeichen)
- objective: Eigenstaendige Formulierung des Ziels (1-3 Saetze)
- rationale: Eigenstaendige Begruendung (1-2 Saetze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Pruefschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")

{joined}"""

        raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
        results = _parse_llm_json_array(raw)
        logger.info("Batch reform: parsed %d results from API response", len(results))

        controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
        for pos, data in enumerate(results):
            if data is None:
                continue
            idx = data.get("chunk_index")
            if idx is not None:
                idx = int(idx) - 1
            else:
                idx = pos
            if idx < 0 or idx >= len(chunks):
                logger.warning("Batch reform: chunk_index %d out of range, using position %d", idx, pos)
                idx = min(pos, len(chunks) - 1)
            chunk = chunks[idx]
            domain = config.domain or _detect_domain(chunk.text)
            control = self._build_control_from_json(data, domain)
            control.license_rule = 3
            control.source_original_text = None
            control.source_citation = None
            control.customer_visible = False
            control.verification_method = _detect_verification_method(chunk.text)
            if not control.category:
                control.category = _detect_category(chunk.text)
            control.generation_metadata = {
                "processing_path": "llm_reform_batch",
                "license_rule": 3,
                "batch_size": len(chunks),
            }
            controls[idx] = control

        return controls

    async def _process_batch(
        self,
        batch_items: list[tuple[RAGSearchResult, dict]],
        config: GeneratorConfig,
        job_id: str,
    ) -> list[Optional[GeneratedControl]]:
        """Process a batch of (chunk, license_info) through stages 3-5."""
        # Split by license rule: Rule 1+2 → structure, Rule 3 → reform
        structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
        reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]

        all_controls: dict[int, Optional[GeneratedControl]] = {}

        if structure_items:
            s_chunks = [c for c, _ in structure_items]
            s_lics = [l for _, l in structure_items]
            s_controls = await self._structure_batch(s_chunks, s_lics)
            for (chunk, _), ctrl in zip(structure_items, s_controls):
                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
                all_controls[orig_idx] = ctrl

        if reform_items:
            r_chunks = [c for c, _ in reform_items]
            r_controls = await self._reformulate_batch(r_chunks, config)
            for (chunk, _), ctrl in zip(reform_items, r_controls):
                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
                if ctrl:
                    # Too-Close-Check for Rule 3
                    similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
                    if similarity.status == "FAIL":
                        ctrl.release_state = "too_close"
                        ctrl.generation_metadata["similarity_status"] = "FAIL"
                        ctrl.generation_metadata["similarity_scores"] = {
                            "token_overlap": similarity.token_overlap,
                            "ngram_jaccard": similarity.ngram_jaccard,
                            "lcs_ratio": similarity.lcs_ratio,
                        }
                all_controls[orig_idx] = ctrl

        # Post-process all controls: harmonization + anchor search
        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
        # to avoid competing with Ollama prefilter for resources.
        qa_fixed_count = 0
        final: list[Optional[GeneratedControl]] = []
        for i in range(len(batch_items)):
            control = all_controls.get(i)
            if not control or (not control.title and not control.objective):
                final.append(None)
                continue

            if control.release_state == "too_close":
                final.append(control)
                continue

            # Harmonization
            duplicates = await self._check_harmonization(control)
            if duplicates:
                control.release_state = "duplicate"
                control.generation_metadata["similar_controls"] = duplicates
                final.append(control)
                continue

            # Anchor search
            try:
                from .anchor_finder import AnchorFinder
                finder = AnchorFinder(self.rag)
                anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
                control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
            except Exception as e:
                logger.warning("Anchor search failed: %s", e)

            # Release state
            if control.license_rule in (1, 2):
                control.release_state = "draft"
            elif control.open_anchors:
                control.release_state = "draft"
            else:
                control.release_state = "needs_review"

            # Control ID — prefer QA-corrected or LLM-assigned domain over keyword detection
            domain = (control.generation_metadata.get("_effective_domain")
                      or config.domain
                      or _detect_domain(control.objective))
            control.control_id = self._generate_control_id(domain, self.db)
            control.generation_metadata["job_id"] = job_id

            final.append(control)

        if qa_fixed_count:
            logger.info("QA validation: fixed %d/%d controls in batch", qa_fixed_count, len(final))
        return final, qa_fixed_count

    # ── Stage 4: Harmonization ─────────────────────────────────────────

    async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
        """Check if a new control duplicates existing ones via embedding similarity."""
        existing = self._load_existing_controls()
        if not existing:
            return None

        # Pre-load all existing embeddings in batch (once per pipeline run)
        if not self._existing_embeddings:
            await self._preload_embeddings(existing)

        new_text = f"{new_control.title} {new_control.objective}"
        new_emb = await _get_embedding(new_text)
        if not new_emb:
            return None

        similar = []
        for ex in existing:
            ex_key = ex.get("control_id", "")
            ex_emb = self._existing_embeddings.get(ex_key, [])
            if not ex_emb:
                continue

            cosine = _cosine_sim(new_emb, ex_emb)
            if cosine > HARMONIZATION_THRESHOLD:
                similar.append({
                    "control_id": ex.get("control_id", ""),
                    "title": ex.get("title", ""),
                    "similarity": round(cosine, 3),
                })

        return similar if similar else None

    async def _preload_embeddings(self, existing: list[dict]):
        """Pre-load embeddings for all existing controls in batches."""
        texts = [f"{ex.get('title', '')} {ex.get('objective', '')}" for ex in existing]
        keys = [ex.get("control_id", "") for ex in existing]

        logger.info("Pre-loading embeddings for %d existing controls...", len(texts))
        embeddings = await _get_embeddings_batch(texts)

        for key, emb in zip(keys, embeddings):
            self._existing_embeddings[key] = emb

        loaded = sum(1 for emb in embeddings if emb)
        logger.info("Pre-loaded %d/%d embeddings", loaded, len(texts))

        # Reset DB session after long-running embedding operation to avoid stale connections
        try:
            self.db.rollback()
        except Exception:
            pass

    def _load_existing_controls(self) -> list[dict]:
        """Load existing controls from DB (cached per pipeline run)."""
        if self._existing_controls is not None:
            return self._existing_controls

        try:
            result = self.db.execute(
                text("SELECT control_id, title, objective FROM canonical_controls WHERE release_state != 'deprecated'")
            )
            self._existing_controls = [
                {"control_id": r[0], "title": r[1], "objective": r[2]}
                for r in result
            ]
        except Exception as e:
            logger.warning("Error loading existing controls: %s", e)
            self._existing_controls = []

        return self._existing_controls

    # ── Helpers ────────────────────────────────────────────────────────

    def _build_control_from_json(self, data: dict, domain: str) -> GeneratedControl:
        """Build a GeneratedControl from parsed LLM JSON."""
        severity = data.get("severity", "medium")
        if severity not in ("low", "medium", "high", "critical"):
            severity = "medium"

        tags = data.get("tags", [])
        if isinstance(tags, str):
            tags = [t.strip() for t in tags.split(",")]

        # Use LLM-provided domain if available, fallback to keyword-detected domain
        llm_domain = data.get("domain")
        if llm_domain and llm_domain.upper() in VALID_DOMAINS:
            domain = llm_domain.upper()

        # Use LLM-provided category if available
        llm_category = data.get("category")
        category = None
        if llm_category and llm_category in VALID_CATEGORIES:
            category = llm_category

        # Parse target_audience from LLM response
        target_audience = data.get("target_audience")
        if isinstance(target_audience, str):
            target_audience = [t.strip() for t in target_audience.split(",")]
        if not isinstance(target_audience, list):
            target_audience = None

        control = GeneratedControl(
            title=str(data.get("title", "Untitled Control"))[:255],
            objective=str(data.get("objective", "")),
            rationale=str(data.get("rationale", "")),
            scope=data.get("scope", {}),
            requirements=data.get("requirements", []) if isinstance(data.get("requirements"), list) else [],
            test_procedure=data.get("test_procedure", []) if isinstance(data.get("test_procedure"), list) else [],
            evidence=data.get("evidence", []) if isinstance(data.get("evidence"), list) else [],
            severity=severity,
            risk_score=min(10.0, max(0.0, float(data.get("risk_score", 5.0)))),
            implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
            tags=tags[:20],
            target_audience=target_audience,
            category=category,
        )
        # Store effective domain for later control_id generation
        control.generation_metadata["_effective_domain"] = domain
        return control

    def _fallback_control(self, chunk: RAGSearchResult) -> GeneratedControl:
        """Create a minimal control when LLM parsing fails."""
        domain = _detect_domain(chunk.text)
        return GeneratedControl(
            title=f"Control from {chunk.regulation_code} {chunk.article or ''}".strip()[:255],
            objective=chunk.text[:500] if chunk.text else "Needs manual review",
            rationale="Auto-generated — LLM parsing failed, manual review required.",
            severity="medium",
            release_state="needs_review",
            tags=[domain.lower()],
        )

    def _generate_control_id(self, domain: str, db: Session) -> str:
        """Generate next sequential control ID like AUTH-011."""
        prefix = domain.upper()[:4]
        try:
            result = db.execute(
                text("SELECT control_id FROM canonical_controls WHERE control_id LIKE :prefix ORDER BY control_id DESC LIMIT 1"),
                {"prefix": f"{prefix}-%"},
            )
            row = result.fetchone()
            if row:
                last_num = int(row[0].split("-")[-1])
                return f"{prefix}-{last_num + 1:03d}"
        except Exception:
            pass
        return f"{prefix}-001"

    # ── Stage QA: Automated Quality Validation ───────────────────────

    async def _qa_validate_control(
        self, control: GeneratedControl, chunk_text: str
    ) -> tuple[GeneratedControl, bool]:
        """Cross-validate category/domain using keyword detection + local LLM.

        Also checks for recital (Erwägungsgrund) contamination in source text.
        Returns (control, was_fixed). Only triggers Ollama QA when the LLM
        classification disagrees with keyword detection — keeps it fast.
        """
        # ── Recital detection ──────────────────────────────────────────
        source_text = control.source_original_text or ""
        recital_info = _detect_recital(source_text)
        if recital_info:
            control.generation_metadata["recital_suspect"] = True
            control.generation_metadata["recital_detection"] = recital_info
            control.release_state = "needs_review"
            logger.warning(
                "Recital suspect: '%s' — recitals %s detected in source text",
                control.title[:40],
                recital_info.get("recital_numbers", []),
            )

        kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
        kw_domain = _detect_domain(chunk_text)
        llm_domain = control.generation_metadata.get("_effective_domain", "")

        # If keyword and LLM agree → no QA needed
        if control.category == kw_category and llm_domain == kw_domain:
            return control, False

        # Disagreement detected → ask local LLM to arbitrate
        title = control.title[:100]
        objective = control.objective[:200]
        reqs = ", ".join(control.requirements[:3]) if control.requirements else "keine"
        prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.

Titel: {title}
Ziel: {objective}
Anforderungen: {reqs}

Aktuelle Zuordnung: domain={llm_domain}, category={control.category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}

Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}

Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

        try:
            raw = await _llm_local(prompt)
            data = _parse_llm_json(raw)
            if not data:
                return control, False

            fixed = False
            qa_domain = data.get("domain", "").upper()
            qa_category = data.get("category", "")
            reason = data.get("reason", "")

            if qa_category and qa_category in VALID_CATEGORIES and qa_category != control.category:
                old_cat = control.category
                control.category = qa_category
                control.generation_metadata["qa_category_fix"] = {
                    "from": old_cat, "to": qa_category, "reason": reason,
                }
                logger.info("QA fix: '%s' category '%s' -> '%s' (%s)",
                            title[:40], old_cat, qa_category, reason)
                fixed = True

            if qa_domain and qa_domain in VALID_DOMAINS and qa_domain != llm_domain:
                control.generation_metadata["qa_domain_fix"] = {
                    "from": llm_domain, "to": qa_domain, "reason": reason,
                }
                control.generation_metadata["_effective_domain"] = qa_domain
                logger.info("QA fix: '%s' domain '%s' -> '%s' (%s)",
                            title[:40], llm_domain, qa_domain, reason)
                fixed = True

            return control, fixed

        except Exception as e:
            logger.warning("QA validation failed for '%s': %s", title[:40], e)
            return control, False

    # ── Pipeline Orchestration ─────────────────────────────────────────

    def _create_job(self, config: GeneratorConfig) -> str:
        """Create a generation job record."""
        try:
            result = self.db.execute(
                text("""
                    INSERT INTO canonical_generation_jobs (status, config)
                    VALUES ('running', :config)
                    RETURNING id
                """),
                {"config": json.dumps(config.model_dump())},
            )
            self.db.commit()
            row = result.fetchone()
            return str(row[0]) if row else str(uuid.uuid4())
        except Exception as e:
            logger.error("Failed to create job: %s", e)
            return str(uuid.uuid4())

    def _update_job(self, job_id: str, result: GeneratorResult):
        """Update job with current stats. Sets completed_at only when status is final."""
        is_final = result.status in ("completed", "failed")
        try:
            self.db.execute(
                text(f"""
                    UPDATE canonical_generation_jobs
                    SET status = :status,
                        total_chunks_scanned = :scanned,
                        controls_generated = :generated,
                        controls_verified = :verified,
                        controls_needs_review = :needs_review,
                        controls_too_close = :too_close,
                        controls_duplicates_found = :duplicates,
                        errors = :errors
                        {"" if not is_final else ", completed_at = NOW()"}
                    WHERE id = CAST(:job_id AS uuid)
                """),
                {
                    "job_id": job_id,
                    "status": result.status,
                    "scanned": result.total_chunks_scanned,
                    "generated": result.controls_generated,
                    "verified": result.controls_verified,
                    "needs_review": result.controls_needs_review,
                    "too_close": result.controls_too_close,
                    "duplicates": result.controls_duplicates_found,
                    "errors": json.dumps(result.errors[-50:]),
                },
            )
            self.db.commit()
        except Exception as e:
            logger.error("Failed to update job: %s", e)

    def _store_control(self, control: GeneratedControl, job_id: str) -> Optional[str]:
        """Persist a generated control to DB. Returns the control UUID or None."""
        try:
            # Get framework UUID
            fw_result = self.db.execute(
                text("SELECT id FROM canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
            )
            fw_row = fw_result.fetchone()
            if not fw_row:
                logger.error("Framework bp_security_v1 not found")
                return None
            framework_uuid = fw_row[0]

            # Generate control_id if not set
            if not control.control_id:
                domain = _detect_domain(control.objective) if control.objective else "SEC"
                control.control_id = self._generate_control_id(domain, self.db)

            result = self.db.execute(
                text("""
                    INSERT INTO canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version
                    ) VALUES (
                        :framework_id, :control_id, :title, :objective, :rationale,
                        :scope, :requirements, :test_procedure, :evidence,
                        :severity, :risk_score, :implementation_effort,
                        :open_anchors, :release_state, :tags,
                        :license_rule, :source_original_text, :source_citation,
                        :customer_visible, :generation_metadata,
                        :verification_method, :category, :generation_strategy,
                        :target_audience, :pipeline_version
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """),
                {
                    "framework_id": framework_uuid,
                    "control_id": control.control_id,
                    "title": control.title,
                    "objective": control.objective,
                    "rationale": control.rationale,
                    "scope": json.dumps(control.scope),
                    "requirements": json.dumps(control.requirements),
                    "test_procedure": json.dumps(control.test_procedure),
                    "evidence": json.dumps(control.evidence),
                    "severity": control.severity,
                    "risk_score": control.risk_score,
                    "implementation_effort": control.implementation_effort,
                    "open_anchors": json.dumps(control.open_anchors),
                    "release_state": control.release_state,
                    "tags": json.dumps(control.tags),
                    "license_rule": control.license_rule,
                    "source_original_text": control.source_original_text,
                    "source_citation": json.dumps(control.source_citation) if control.source_citation else None,
                    "customer_visible": control.customer_visible,
                    "generation_metadata": json.dumps(control.generation_metadata) if control.generation_metadata else None,
                    "verification_method": control.verification_method,
                    "category": control.category,
                    "generation_strategy": control.generation_strategy,
                    "target_audience": json.dumps(control.target_audience) if control.target_audience else None,
                    "pipeline_version": PIPELINE_VERSION,
                },
            )
            self.db.commit()
            row = result.fetchone()
            return str(row[0]) if row else None
        except Exception as e:
            logger.error("Failed to store control %s: %s", control.control_id, e)
            self.db.rollback()
            return None

    def _mark_chunk_processed(
        self,
        chunk: RAGSearchResult,
        license_info: dict,
        processing_path: str,
        control_ids: list[str],
        job_id: str,
    ):
        """Mark a RAG chunk as processed (Stage 7)."""
        chunk_hash = hashlib.sha256(chunk.text.encode()).hexdigest()
        try:
            self.db.execute(
                text("""
                    INSERT INTO canonical_processed_chunks (
                        chunk_hash, collection, regulation_code,
                        document_version, source_license, license_rule,
                        processing_path, generated_control_ids, job_id,
                        pipeline_version
                    ) VALUES (
                        :hash, :collection, :regulation_code,
                        :doc_version, :license, :rule,
                        :path, :control_ids, CAST(:job_id AS uuid),
                        :pipeline_version
                    )
                    ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
                """),
                {
                    "hash": chunk_hash,
                    "collection": chunk.collection or "bp_compliance_ce",
                    "regulation_code": chunk.regulation_code,
                    "doc_version": "1.0",
                    "license": license_info.get("license", ""),
                    "rule": license_info.get("rule", 3),
                    "path": processing_path,
                    "control_ids": json.dumps(control_ids),
                    "job_id": job_id,
                    "pipeline_version": PIPELINE_VERSION,
                },
            )
            self.db.commit()
        except Exception as e:
            logger.warning("Failed to mark chunk processed: %s", e)
            self.db.rollback()

    # ── Main Pipeline ──────────────────────────────────────────────────

    async def run(self, config: GeneratorConfig) -> GeneratorResult:
        """Execute the full 7-stage pipeline."""
        result = GeneratorResult()

        # Create or reuse job
        if config.existing_job_id:
            job_id = config.existing_job_id
        else:
            job_id = self._create_job(config)
        result.job_id = job_id

        try:
            # Stage 1: RAG Scan
            chunks = await self._scan_rag(config)
            result.total_chunks_scanned = len(chunks)

            if not chunks:
                result.status = "completed"
                self._update_job(job_id, result)
                return result

            # ── Group chunks by document (regulation_code) for coherent batching ──
            doc_groups: dict[str, list[RAGSearchResult]] = defaultdict(list)
            for chunk in chunks:
                group_key = chunk.regulation_code or "unknown"
                doc_groups[group_key].append(chunk)

            # Sort chunks within each group by article for sequential context
            for key in doc_groups:
                doc_groups[key].sort(key=lambda c: (c.article or "", c.paragraph or ""))

            logger.info(
                "Grouped %d chunks into %d document groups for coherent batching",
                len(chunks), len(doc_groups),
            )

            # ── Apply max_chunks limit respecting document boundaries ──
            # Process complete documents until we exceed the limit.
            # Never split a document across jobs.
            chunks = []
            if config.max_chunks and config.max_chunks > 0:
                for group_key, group_list in doc_groups.items():
                    if chunks and len(chunks) + len(group_list) > config.max_chunks:
                        # Adding this document would exceed the limit — stop here
                        break
                    chunks.extend(group_list)
                logger.info(
                    "max_chunks=%d: selected %d chunks from %d complete documents (of %d total groups)",
                    config.max_chunks, len(chunks),
                    len(set(c.regulation_code for c in chunks)),
                    len(doc_groups),
                )
            else:
                # No limit: flatten all groups
                for group_list in doc_groups.values():
                    chunks.extend(group_list)

            result.total_chunks_scanned = len(chunks)

            # Process chunks — batch mode (N chunks per Anthropic API call)
            BATCH_SIZE = config.batch_size or 5
            controls_count = 0
            chunks_skipped_prefilter = 0
            pending_batch: list[tuple[RAGSearchResult, dict]] = []  # (chunk, license_info)
            current_batch_regulation: Optional[str] = None  # Track regulation for group-aware flushing

            async def _flush_batch():
                """Send pending batch to Anthropic and process results."""
                nonlocal controls_count, current_batch_regulation
                if not pending_batch:
                    return
                batch = pending_batch.copy()
                pending_batch.clear()
                current_batch_regulation = None

                # Log which document this batch belongs to
                regs_in_batch = set(c.regulation_code for c, _ in batch)
                logger.info(
                    "Processing batch of %d chunks (docs: %s) via single API call...",
                    len(batch), ", ".join(regs_in_batch),
                )
                try:
                    batch_controls, batch_qa_fixes = await self._process_batch(batch, config, job_id)
                    result.controls_qa_fixed += batch_qa_fixes
                except Exception as e:
                    logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
                    # Fallback: process each chunk individually
                    batch_controls = []
                    for chunk, _lic in batch:
                        try:
                            ctrl = await self._process_single_chunk(chunk, config, job_id)
                            batch_controls.append(ctrl)
                        except Exception as e2:
                            logger.error("Single-chunk fallback also failed: %s", e2)
                            batch_controls.append(None)

                for (chunk, lic_info), control in zip(batch, batch_controls):
                    if control is None:
                        if not config.dry_run:
                            self._mark_chunk_processed(chunk, lic_info, "no_control", [], job_id)
                        continue

                    # Mark as document_grouped strategy
                    control.generation_strategy = "document_grouped"

                    # Count by state
                    if control.release_state == "too_close":
                        result.controls_too_close += 1
                    elif control.release_state == "duplicate":
                        result.controls_duplicates_found += 1
                    elif control.release_state == "needs_review":
                        result.controls_needs_review += 1
                    else:
                        result.controls_verified += 1

                    # Store
                    if not config.dry_run:
                        ctrl_uuid = self._store_control(control, job_id)
                        if ctrl_uuid:
                            path = control.generation_metadata.get("processing_path", "structured_batch")
                            self._mark_chunk_processed(chunk, lic_info, path, [ctrl_uuid], job_id)
                        else:
                            self._mark_chunk_processed(chunk, lic_info, "store_failed", [], job_id)

                    result.controls_generated += 1
                    result.controls.append(asdict(control))
                    controls_count += 1

                    if self._existing_controls is not None:
                        self._existing_controls.append({
                            "control_id": control.control_id,
                            "title": control.title,
                            "objective": control.objective,
                        })

            for i, chunk in enumerate(chunks):
                try:
                    # Progress logging every 50 chunks
                    if i > 0 and i % 50 == 0:
                        logger.info(
                            "Progress: %d/%d chunks processed, %d controls generated, %d skipped by prefilter",
                            i, len(chunks), controls_count, chunks_skipped_prefilter,
                        )
                        self._update_job(job_id, result)

                    # Stage 1.5: Local LLM pre-filter — skip chunks without requirements
                    if not config.dry_run and not config.skip_prefilter:
                        is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
                        if not is_relevant:
                            chunks_skipped_prefilter += 1
                            license_info = self._classify_license(chunk)
                            self._mark_chunk_processed(
                                chunk, license_info, "prefilter_skip", [], job_id
                            )
                            continue

                    # Classify license and add to batch
                    license_info = self._classify_license(chunk)
                    chunk_regulation = chunk.regulation_code or "unknown"

                    # Flush when: batch is full OR regulation changes (group boundary)
                    if pending_batch and (
                        len(pending_batch) >= BATCH_SIZE
                        or chunk_regulation != current_batch_regulation
                    ):
                        await _flush_batch()

                    pending_batch.append((chunk, license_info))
                    current_batch_regulation = chunk_regulation

                except Exception as e:
                    error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
                    logger.error(error_msg)
                    result.errors.append(error_msg)
                    try:
                        if not config.dry_run:
                            license_info = self._classify_license(chunk)
                            self._mark_chunk_processed(
                                chunk, license_info, "error", [], job_id
                            )
                    except Exception:
                        pass

            # Flush remaining chunks
            await _flush_batch()

            result.chunks_skipped_prefilter = chunks_skipped_prefilter
            logger.info(
                "Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",
                controls_count, chunks_skipped_prefilter, len(chunks),
            )

            result.status = "completed"

        except Exception as e:
            result.status = "failed"
            result.errors.append(str(e))
            logger.error("Pipeline failed: %s", e)

        self._update_job(job_id, result)
        return result

    async def _process_single_chunk(
        self,
        chunk: RAGSearchResult,
        config: GeneratorConfig,
        job_id: str,
    ) -> Optional[GeneratedControl]:
        """Process a single chunk through stages 2-5."""
        # Stage 2: License classification
        license_info = self._classify_license(chunk)

        # Stage 3: Structure or Reform based on rule
        if license_info["rule"] == 1:
            control = await self._structure_free_use(chunk, license_info)
        elif license_info["rule"] == 2:
            control = await self._structure_with_citation(chunk, license_info)
        else:
            control = await self._llm_reformulate(chunk, config)

            # Too-Close-Check for Rule 3
            similarity = await check_similarity(chunk.text, f"{control.objective} {control.rationale}")
            if similarity.status == "FAIL":
                control.release_state = "too_close"
                control.generation_metadata["similarity_status"] = "FAIL"
                control.generation_metadata["similarity_scores"] = {
                    "token_overlap": similarity.token_overlap,
                    "ngram_jaccard": similarity.ngram_jaccard,
                    "lcs_ratio": similarity.lcs_ratio,
                }
                return control

        if not control.title or not control.objective:
            return None

        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)

        # Stage 4: Harmonization
        duplicates = await self._check_harmonization(control)
        if duplicates:
            control.release_state = "duplicate"
            control.generation_metadata["similar_controls"] = duplicates
            return control

        # Stage 5: Anchor Search (imported from anchor_finder)
        try:
            from .anchor_finder import AnchorFinder
            finder = AnchorFinder(self.rag)
            anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
            control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
        except Exception as e:
            logger.warning("Anchor search failed: %s", e)

        # Determine release state
        if control.license_rule in (1, 2):
            control.release_state = "draft"
        elif control.open_anchors:
            control.release_state = "draft"
        else:
            control.release_state = "needs_review"

        # Generate control_id — prefer QA-corrected or LLM-assigned domain
        domain = (control.generation_metadata.get("_effective_domain")
                  or config.domain
                  or _detect_domain(control.objective))
        control.control_id = self._generate_control_id(domain, self.db)

        # Store job_id in metadata
        control.generation_metadata["job_id"] = job_id

        return control