All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 31s
CI/CD / test-python-backend-compliance (push) Successful in 31s
CI/CD / test-python-document-crawler (push) Successful in 21s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Successful in 2s
- Group chunks by regulation_code before batching for better LLM context - Add generation_strategy column (ungrouped=v1, document_grouped=v2) - Add v1/v2 badge to control cards in frontend - Add sort-by-source option with visual group headers - Add frontend page tests (18 tests) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1734 lines
81 KiB
Python
1734 lines
81 KiB
Python
"""
|
|
Control Generator Pipeline — RAG → License → Structure/Reform → Harmonize → Anchor → Store.
|
|
|
|
7-stage pipeline that generates canonical security controls from RAG chunks:
|
|
1. RAG SCAN — Load unprocessed chunks (or new document versions)
|
|
2. LICENSE CLASSIFY — Determine which of 3 license rules applies
|
|
3a. STRUCTURE — Rule 1+2: Structure original text into control format
|
|
3b. LLM REFORM — Rule 3: Fully reformulate (no original text, no source names)
|
|
4. HARMONIZE — Check against existing controls for duplicates
|
|
5. ANCHOR SEARCH — Find open-source references (OWASP, NIST, ENISA)
|
|
6. STORE — Persist to DB with correct visibility flags
|
|
7. MARK PROCESSED — Mark RAG chunks as processed (with version tracking)
|
|
|
|
Three License Rules:
|
|
Rule 1 (free_use): Laws, Public Domain — original text allowed
|
|
Rule 2 (citation_required): CC-BY, CC-BY-SA — original text with citation
|
|
Rule 3 (restricted): BSI, ISO — full reformulation, no source names
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import uuid
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Set
|
|
|
|
import httpx
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
|
|
from .similarity_detector import check_similarity, SimilarityReport
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Configuration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Service endpoints and LLM settings — every value is overridable via env vars.
SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")  # empty → Ollama-only operation (see _llm_chat)
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))  # seconds, per LLM HTTP request

HARMONIZATION_THRESHOLD = 0.85  # Cosine similarity above this = duplicate

# RAG collections scanned when GeneratorConfig.collections is not set.
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_recht",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# License Mapping (3-Rule System)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Static license classification for known regulation codes.
# Each entry carries: license label, rule number (1/2/3), display name, and —
# for Rule-2 sources — the attribution string the license requires.
REGULATION_LICENSE_MAP: dict[str, dict] = {
    # RULE 1: FREE USE — Laws, Public Domain
    # EU Regulations
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "name": "DSGVO"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "name": "AI Act (KI-Verordnung)"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "name": "NIS2"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "name": "Cyber Resilience Act (CRA)"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "name": "Maschinenverordnung"},
    "eu_2022_2065": {"license": "EU_LAW", "rule": 1, "name": "Digital Services Act (DSA)"},
    "eu_2022_1925": {"license": "EU_LAW", "rule": 1, "name": "Digital Markets Act (DMA)"},
    "eu_2022_868": {"license": "EU_LAW", "rule": 1, "name": "Data Governance Act (DGA)"},
    "eu_2019_770": {"license": "EU_LAW", "rule": 1, "name": "Digitale-Inhalte-Richtlinie"},
    "eu_2021_914": {"license": "EU_LAW", "rule": 1, "name": "Standardvertragsklauseln (SCC)"},
    "eu_2002_58": {"license": "EU_LAW", "rule": 1, "name": "ePrivacy-Richtlinie"},
    "eu_2000_31": {"license": "EU_LAW", "rule": 1, "name": "E-Commerce-Richtlinie"},
    "eu_2023_1803": {"license": "EU_LAW", "rule": 1, "name": "IFRS-Uebernahmeverordnung"},
    "eucsa": {"license": "EU_LAW", "rule": 1, "name": "EU Cybersecurity Act"},
    "dataact": {"license": "EU_LAW", "rule": 1, "name": "Data Act"},
    "dora": {"license": "EU_LAW", "rule": 1, "name": "Digital Operational Resilience Act"},
    "ehds": {"license": "EU_LAW", "rule": 1, "name": "European Health Data Space"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "name": "Allgemeine Produktsicherheitsverordnung"},
    "mica": {"license": "EU_LAW", "rule": 1, "name": "Markets in Crypto-Assets"},
    "psd2": {"license": "EU_LAW", "rule": 1, "name": "Zahlungsdiensterichtlinie 2"},
    "dpf": {"license": "EU_LAW", "rule": 1, "name": "EU-US Data Privacy Framework"},
    "dsm": {"license": "EU_LAW", "rule": 1, "name": "DSM-Urheberrechtsrichtlinie"},
    "amlr": {"license": "EU_LAW", "rule": 1, "name": "AML-Verordnung"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "Blue Guide 2022"},
    # NIST (Public Domain — all variants)
    "nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-53"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-53 Rev.5"},
    "nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-63B"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-63-3"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST CSF 2.0"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SSDF"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-207 Zero Trust"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST AI Risk Management Framework"},
    "nistir_8259a": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NISTIR 8259A IoT Security"},
    "cisa_secure_by_design": {"license": "US_GOV_PUBLIC", "rule": 1, "name": "CISA Secure by Design"},
    # German Laws
    "bdsg": {"license": "DE_LAW", "rule": 1, "name": "BDSG"},
    "bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "name": "BDSG 2018"},
    "ttdsg": {"license": "DE_LAW", "rule": 1, "name": "TTDSG"},
    "tdddg_25": {"license": "DE_LAW", "rule": 1, "name": "TDDDG"},
    "tkg": {"license": "DE_LAW", "rule": 1, "name": "TKG"},
    "de_tkg": {"license": "DE_LAW", "rule": 1, "name": "TKG"},
    "bgb_komplett": {"license": "DE_LAW", "rule": 1, "name": "BGB"},
    "hgb": {"license": "DE_LAW", "rule": 1, "name": "HGB"},
    "hgb_komplett": {"license": "DE_LAW", "rule": 1, "name": "HGB"},
    "urhg_komplett": {"license": "DE_LAW", "rule": 1, "name": "UrhG"},
    "uwg": {"license": "DE_LAW", "rule": 1, "name": "UWG"},
    "tmg_komplett": {"license": "DE_LAW", "rule": 1, "name": "TMG"},
    "gewo": {"license": "DE_LAW", "rule": 1, "name": "GewO"},
    "ao": {"license": "DE_LAW", "rule": 1, "name": "Abgabenordnung"},
    "ao_komplett": {"license": "DE_LAW", "rule": 1, "name": "Abgabenordnung"},
    "battdg": {"license": "DE_LAW", "rule": 1, "name": "Batteriegesetz"},
    # Austrian Laws
    "at_dsg": {"license": "AT_LAW", "rule": 1, "name": "AT DSG"},
    "at_abgb": {"license": "AT_LAW", "rule": 1, "name": "AT ABGB"},
    "at_abgb_agb": {"license": "AT_LAW", "rule": 1, "name": "AT ABGB AGB-Recht"},
    "at_bao": {"license": "AT_LAW", "rule": 1, "name": "AT BAO"},
    "at_bao_ret": {"license": "AT_LAW", "rule": 1, "name": "AT BAO Retention"},
    "at_ecg": {"license": "AT_LAW", "rule": 1, "name": "AT E-Commerce-Gesetz"},
    "at_kschg": {"license": "AT_LAW", "rule": 1, "name": "AT Konsumentenschutzgesetz"},
    "at_medieng": {"license": "AT_LAW", "rule": 1, "name": "AT Mediengesetz"},
    "at_tkg": {"license": "AT_LAW", "rule": 1, "name": "AT TKG"},
    "at_ugb": {"license": "AT_LAW", "rule": 1, "name": "AT UGB"},
    "at_ugb_ret": {"license": "AT_LAW", "rule": 1, "name": "AT UGB Retention"},
    "at_uwg": {"license": "AT_LAW", "rule": 1, "name": "AT UWG"},
    # Other EU Member State Laws
    "fr_loi_informatique": {"license": "FR_LAW", "rule": 1, "name": "FR Loi Informatique"},
    "es_lopdgdd": {"license": "ES_LAW", "rule": 1, "name": "ES LOPDGDD"},
    "nl_uavg": {"license": "NL_LAW", "rule": 1, "name": "NL UAVG"},
    "it_codice_privacy": {"license": "IT_LAW", "rule": 1, "name": "IT Codice Privacy"},
    "hu_info_tv": {"license": "HU_LAW", "rule": 1, "name": "HU Információs törvény"},
    # EDPB Guidelines (EU Public Authority)
    "edpb_01_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB 01/2020 Ergaenzende Massnahmen"},
    "edpb_02_2023": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB 02/2023 Technischer Anwendungsbereich"},
    "edpb_05_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB 05/2020 Einwilligung"},
    "edpb_09_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB 09/2022 Datenschutzverletzungen"},
    "edpb_bcr_01_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB BCR Leitlinien"},
    "edpb_breach_09_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Breach Notification"},
    "edpb_connected_vehicles_01_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Connected Vehicles"},
    "edpb_dpbd_04_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Data Protection by Design"},
    "edpb_eprivacy_02_2023": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB ePrivacy"},
    "edpb_facial_recognition_05_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Facial Recognition"},
    "edpb_fines_04_2022": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Fines Calculation"},
    "edpb_legitimate_interest": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Legitimate Interest"},
    "edpb_legitimate_interest_01_2024": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Legitimate Interest 2024"},
    "edpb_social_media_08_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Social Media"},
    "edpb_transfers_01_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Transfers 01/2020"},
    "edpb_transfers_07_2020": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Transfers 07/2020"},
    "edpb_video_03_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Video Surveillance"},
    "edps_dpia_list": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPS DPIA Liste"},
    "edpb_certification_01_2018": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2018"},
    "edpb_certification_01_2019": {"license": "EU_PUBLIC", "rule": 1, "name": "EDPB Certification 01/2019"},
    "eaa": {"license": "EU_LAW", "rule": 1, "name": "European Accessibility Act"},
    # WP29 (pre-EDPB) Guidelines
    "wp244_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Profiling"},
    "wp251_profiling": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Data Portability"},
    "wp260_transparency": {"license": "EU_PUBLIC", "rule": 1, "name": "WP29 Transparency"},

    # RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP ASVS",
                   "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP MASVS",
                    "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP Top 10",
                    "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP Top 10 2021",
                         "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP API Top 10 2023",
                             "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP SAMM",
                   "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "name": "OECD AI Principles",
                           "attribution": "OECD"},

    # RULE 3: RESTRICTED — Full reformulation required
    # Names stored as INTERNAL_ONLY — never exposed to customers
}

# Prefix-based matching for wildcard entries
_RULE3_PREFIXES = ["bsi_", "iso_", "etsi_"]  # bsi_* / iso_* / etsi_* → Rule 3 (restricted)
_RULE2_PREFIXES = ["enisa_"]  # enisa_* → Rule 2 (CC BY 4.0, attribution required)
|
|
|
|
|
|
def _classify_regulation(regulation_code: str) -> dict:
    """Determine which of the three license rules applies to *regulation_code*.

    Resolution order: exact map entry, Rule-2 prefix (ENISA), Rule-3 prefix
    (BSI/ISO/ETSI). Anything unrecognized deliberately falls back to the most
    restrictive Rule 3 so no proprietary text can leak by accident.
    """
    normalized = regulation_code.lower().strip()

    # Exact match first
    exact = REGULATION_LICENSE_MAP.get(normalized)
    if exact is not None:
        return exact

    # Prefix match for Rule 2
    if any(normalized.startswith(p) for p in _RULE2_PREFIXES):
        return {"license": "CC-BY-4.0", "rule": 2, "name": "ENISA",
                "attribution": "ENISA, CC BY 4.0"}

    # Prefix match for Rule 3
    matched = next((p for p in _RULE3_PREFIXES if normalized.startswith(p)), None)
    if matched is not None:
        return {"license": f"{matched.rstrip('_').upper()}_RESTRICTED", "rule": 3,
                "name": "INTERNAL_ONLY"}

    # Unknown → treat as restricted (safe default)
    logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", normalized)
    return {"license": "UNKNOWN", "rule": 3, "name": "INTERNAL_ONLY"}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Domain detection from content
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Keyword heuristics (English + German) mapping chunk text to a short domain
# code; consumed by _detect_domain, which defaults to "SEC" on no match.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentication", "login", "password", "credential", "mfa", "2fa",
             "session", "token", "oauth", "identity", "authentifizierung", "anmeldung"],
    "CRYPT": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
              "aes", "rsa", "verschlüsselung", "kryptographie", "zertifikat"],
    "NET": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
            "netzwerk", "routing", "port", "intrusion"],
    "DATA": ["data protection", "privacy", "personal data", "datenschutz",
             "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung"],
    "LOG": ["logging", "monitoring", "audit", "siem", "alert", "anomaly",
            "protokollierung", "überwachung"],
    "ACC": ["access control", "authorization", "rbac", "permission", "privilege",
            "zugriffskontrolle", "berechtigung", "autorisierung"],
    "SEC": ["vulnerability", "patch", "update", "hardening", "configuration",
            "schwachstelle", "härtung", "konfiguration"],
    "INC": ["incident", "response", "breach", "recovery", "backup",
            "vorfall", "wiederherstellung", "notfall"],
    "AI": ["artificial intelligence", "machine learning", "model", "bias",
           "ki", "künstliche intelligenz", "algorithmus", "training"],
    "COMP": ["compliance", "audit", "regulation", "standard", "certification",
             "konformität", "prüfung", "zertifizierung"],
}
|
|
|
|
|
|
# Keyword heuristics (English + German) mapping chunk text to a control
# category; consumed by _detect_category, which returns None on no match.
CATEGORY_KEYWORDS = {
    "encryption": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
                   "aes", "rsa", "verschlüsselung", "kryptographie", "zertifikat", "cipher"],
    "authentication": ["authentication", "login", "password", "credential", "mfa", "2fa",
                       "session", "oauth", "authentifizierung", "anmeldung", "passwort"],
    "network": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
                "netzwerk", "routing", "port", "intrusion", "ids", "ips"],
    "data_protection": ["data protection", "privacy", "personal data", "datenschutz",
                        "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung", "einwilligung"],
    "logging": ["logging", "monitoring", "audit trail", "siem", "alert", "anomaly",
                "protokollierung", "überwachung", "nachvollziehbar"],
    "incident": ["incident", "response", "breach", "recovery", "vorfall", "sicherheitsvorfall"],
    "continuity": ["backup", "disaster recovery", "notfall", "wiederherstellung", "notfallplan",
                   "business continuity", "ausfallsicherheit"],
    "compliance": ["compliance", "audit", "regulation", "certification", "konformität",
                   "prüfung", "zertifizierung", "nachweis"],
    "supply_chain": ["supplier", "vendor", "third party", "lieferant", "auftragnehmer",
                     "unterauftragnehmer", "supply chain", "dienstleister"],
    "physical": ["physical", "building", "access zone", "physisch", "gebäude", "zutritt",
                 "schließsystem", "rechenzentrum"],
    "personnel": ["training", "awareness", "employee", "schulung", "mitarbeiter",
                  "sensibilisierung", "personal", "unterweisung"],
    "application": ["application", "software", "code review", "sdlc", "secure coding",
                    "anwendung", "entwicklung", "software-entwicklung", "api"],
    "system": ["hardening", "patch", "configuration", "update", "härtung", "konfiguration",
               "betriebssystem", "system", "server"],
    "risk": ["risk assessment", "risk management", "risiko", "bewertung", "risikobewertung",
             "risikoanalyse", "bedrohung", "threat"],
    "governance": ["governance", "policy", "organization", "isms", "sicherheitsorganisation",
                   "richtlinie", "verantwortlichkeit", "rolle"],
    "hardware": ["hardware", "platform", "firmware", "bios", "tpm", "chip",
                 "plattform", "geräte"],
    "identity": ["identity", "iam", "directory", "ldap", "sso", "provisioning",
                 "identität", "identitätsmanagement", "benutzerverzeichnis"],
}
|
|
|
|
# Keyword heuristics mapping chunk text to how a control is verified;
# consumed by _detect_verification_method.
VERIFICATION_KEYWORDS = {
    "code_review": ["source code", "code review", "static analysis", "sast", "dast",
                    "dependency check", "quellcode", "codeanalyse", "secure coding",
                    "software development", "api", "input validation", "output encoding"],
    "document": ["policy", "procedure", "documentation", "training", "awareness",
                 "richtlinie", "dokumentation", "schulung", "nachweis", "vertrag",
                 "organizational", "process", "role", "responsibility"],
    "tool": ["scanner", "monitoring", "siem", "ids", "ips", "firewall", "antivirus",
             "vulnerability scan", "penetration test", "tool", "automated"],
    "hybrid": [],  # Assigned when multiple methods match equally
}
|
|
|
|
|
|
def _detect_category(text: str) -> Optional[str]:
    """Return the category whose keywords best match *text*, or None if nothing hits."""
    haystack = text.lower()
    tallies = {
        category: sum(kw in haystack for kw in keywords)
        for category, keywords in CATEGORY_KEYWORDS.items()
    }
    best = max(tallies, key=tallies.get, default=None)
    if best is None or tallies[best] == 0:
        return None
    return best
|
|
|
|
|
|
def _detect_verification_method(text: str) -> Optional[str]:
    """Detect how a control should be verified from its text.

    Returns "hybrid" when the two strongest methods score within ~30% of each
    other, the single best method otherwise, or None when nothing matches.
    """
    haystack = text.lower()
    tallies = {
        method: sum(kw in haystack for kw in keywords)
        for method, keywords in VERIFICATION_KEYWORDS.items()
        if method != "hybrid"  # "hybrid" has no keywords — it is derived below
    }
    if not tallies or max(tallies.values()) == 0:
        return None

    ranked = sorted(tallies.items(), key=lambda item: -item[1])
    best_name, best_score = ranked[0]
    if len(ranked) >= 2:
        runner_score = ranked[1][1]
        # Two comparably-strong signals → hybrid verification
        if best_score > 0 and runner_score > 0 and runner_score >= best_score * 0.7:
            return "hybrid"
    return best_name if best_score > 0 else None
|
|
|
|
|
|
def _detect_domain(text: str) -> str:
    """Return the domain code whose keywords best match *text* ("SEC" when none do)."""
    haystack = text.lower()
    tallies = {
        domain: sum(kw in haystack for kw in keywords)
        for domain, keywords in DOMAIN_KEYWORDS.items()
    }
    if not tallies or max(tallies.values()) == 0:
        return "SEC"  # Default
    return max(tallies, key=tallies.get)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data Models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class GeneratorConfig(BaseModel):
    """Caller-supplied options for one control-generation run."""

    collections: Optional[List[str]] = None  # RAG collections to scan; None → ALL_COLLECTIONS
    domain: Optional[str] = None  # optional domain filter — presumably a DOMAIN_KEYWORDS code; confirm at call site
    batch_size: int = 5  # NOTE(review): looks like chunks per LLM batch — confirm in generation stage
    max_controls: int = 0  # 0 = unlimited (process ALL chunks)
    skip_processed: bool = True  # filter chunks already recorded in canonical_processed_chunks
    skip_web_search: bool = False  # NOTE(review): presumably skips the anchor web search — confirm
    dry_run: bool = False  # NOTE(review): presumably generate without persisting — confirm in STORE stage
    existing_job_id: Optional[str] = None  # If set, reuse this job instead of creating a new one
|
|
|
|
|
|
@dataclass
class GeneratedControl:
    """One generated security control, as produced by the pipeline before storage."""

    control_id: str = ""
    title: str = ""
    objective: str = ""
    rationale: str = ""
    scope: dict = field(default_factory=dict)
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    risk_score: float = 5.0
    implementation_effort: str = "m"  # NOTE(review): presumably s/m/l scale — confirm
    open_anchors: list = field(default_factory=list)  # open-source references (ANCHOR stage)
    release_state: str = "draft"
    tags: list = field(default_factory=list)
    # 3-rule fields
    license_rule: Optional[int] = None  # 1=free_use, 2=citation_required, 3=restricted
    source_original_text: Optional[str] = None  # original source text (Rules 1+2; Rule 3 forbids it)
    source_citation: Optional[dict] = None  # attribution details for Rule-2 sources
    customer_visible: bool = True  # False keeps Rule-3 source details internal-only
    generation_metadata: dict = field(default_factory=dict)
    generation_strategy: str = "ungrouped"  # ungrouped | document_grouped
    # Classification fields
    verification_method: Optional[str] = None  # code_review, document, tool, hybrid
    category: Optional[str] = None  # one of 17 categories
|
|
|
|
|
|
@dataclass
class GeneratorResult:
    """Aggregate counters and outputs for one generator run."""

    job_id: str = ""
    status: str = "completed"
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0  # NOTE(review): presumably rejected for similarity — confirm
    controls_duplicates_found: int = 0  # NOTE(review): presumably harmonization hits — confirm
    chunks_skipped_prefilter: int = 0  # chunks dropped by the local-LLM pre-filter
    errors: list = field(default_factory=list)
    controls: list = field(default_factory=list)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LLM Client (via Go SDK)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def _llm_chat(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Dispatch a chat request to the best available LLM.

    Anthropic is tried first whenever an API key is configured; on a missing
    key or an empty Anthropic response, the local Ollama model answers instead.
    """
    if ANTHROPIC_API_KEY:
        logger.info("Calling Anthropic API (model=%s)...", ANTHROPIC_MODEL)
        answer = await _llm_anthropic(prompt, system_prompt)
        if answer:
            logger.info("Anthropic API success (%d chars)", len(answer))
            return answer
        logger.warning("Anthropic failed, falling back to Ollama")

    logger.info("Calling Ollama (model=%s)...", OLLAMA_MODEL)
    return await _llm_ollama(prompt, system_prompt)
|
|
|
|
|
|
async def _llm_local(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Route a request straight to the local Ollama model.

    Used for cheap tasks (pre-filtering, classification) where the hosted
    Anthropic API would be wasteful.
    """
    return await _llm_ollama(prompt, system_prompt)
|
|
|
|
|
|
PREFILTER_SYSTEM_PROMPT = """Du bist ein Compliance-Analyst. Deine Aufgabe: Prüfe ob ein Textabschnitt eine konkrete Sicherheitsanforderung, Datenschutzpflicht, oder technische/organisatorische Maßnahme enthält.
|
|
|
|
Antworte NUR mit einem JSON-Objekt: {"relevant": true/false, "reason": "kurze Begründung"}
|
|
|
|
Relevant = true wenn der Text mindestens EINE der folgenden enthält:
|
|
- Konkrete Pflicht/Anforderung ("muss", "soll", "ist sicherzustellen")
|
|
- Technische Sicherheitsmaßnahme (Verschlüsselung, Zugriffskontrolle, Logging)
|
|
- Organisatorische Maßnahme (Schulung, Dokumentation, Audit)
|
|
- Datenschutz-Vorgabe (Löschpflicht, Einwilligung, Zweckbindung)
|
|
- Risikomanagement-Anforderung
|
|
|
|
Relevant = false wenn der Text NUR enthält:
|
|
- Definitionen ohne Pflichten
|
|
- Inhaltsverzeichnisse oder Verweise
|
|
- Reine Begriffsbestimmungen
|
|
- Übergangsvorschriften ohne Substanz
|
|
- Adressaten/Geltungsbereich ohne Anforderung"""
|
|
|
|
|
|
async def _prefilter_chunk(chunk_text: str) -> tuple[bool, str]:
    """Ask the local LLM whether a chunk contains an actionable requirement.

    Returns (is_relevant, reason). Runs on the cheap local model so irrelevant
    chunks never reach the expensive Anthropic API; every failure mode
    conservatively reports the chunk as relevant.
    """
    prompt = f"""Prüfe ob dieser Textabschnitt eine konkrete Sicherheitsanforderung oder Compliance-Pflicht enthält.

Text:
---
{chunk_text[:1500]}
---

Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}"""

    try:
        raw_answer = await _llm_local(prompt, PREFILTER_SYSTEM_PROMPT)
        verdict = _parse_llm_json(raw_answer)
        if verdict:
            return verdict.get("relevant", True), verdict.get("reason", "")
        # If parsing fails, assume relevant (don't skip)
        return True, "parse_failed"
    except Exception as e:
        logger.warning("Prefilter failed: %s — treating as relevant", e)
        return True, f"error: {e}"
|
|
|
|
|
|
async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send *prompt* to the Anthropic Messages API and return the reply text.

    Returns "" on any HTTP error or transport failure so the caller
    (_llm_chat) can fall back to Ollama.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body: dict = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system_prompt:
        body["system"] = system_prompt

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(
                "https://api.anthropic.com/v1/messages",
                headers=request_headers,
                json=body,
            )
            if resp.status_code != 200:
                logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
                return ""
            # Reply text lives in the first content block of the response.
            blocks = resp.json().get("content", [])
            if blocks and isinstance(blocks, list):
                return blocks[0].get("text", "")
            return ""
    except Exception as e:
        logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
        return ""
|
|
|
|
|
|
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send a chat request to the local Ollama instance (fallback LLM).

    Returns the assistant message content, or "" on any failure.
    """
    chat_messages = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    chat_messages.append({"role": "user", "content": prompt})

    body = {
        "model": OLLAMA_MODEL,
        "messages": chat_messages,
        "stream": False,
        "options": {"num_predict": 512},  # Limit response length for speed
        "think": False,  # Disable thinking for faster responses
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=body)
            if resp.status_code != 200:
                logger.error("Ollama chat failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            reply = resp.json()
            message_obj = reply.get("message", {})
            if isinstance(message_obj, dict):
                return message_obj.get("content", "")
            # Unexpected shape — fall back to the generate-style field.
            return reply.get("response", str(message_obj))
    except Exception as e:
        logger.error("Ollama request failed: %s", e)
        return ""
|
|
|
|
|
|
async def _get_embedding(text: str) -> list[float]:
    """Get the embedding vector for *text* via the embedding service.

    Returns [] on any failure so callers can treat "no embedding" uniformly.
    Failures are logged instead of silently swallowed, matching the error
    handling in _get_embeddings_batch.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception as e:
        # Previously a bare `except Exception: return []` — log so embedding
        # outages are visible in the pipeline logs.
        logger.warning("Embedding request failed: %s", e)
        return []
|
|
|
|
|
|
async def _get_embeddings_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
    """Embed *texts* in chunks of *batch_size* via the embedding service.

    A failed batch contributes one empty list per input text, so the result
    always aligns index-for-index with *texts*.
    """
    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.post(
                    f"{EMBEDDING_URL}/embed",
                    json={"texts": chunk},
                )
                resp.raise_for_status()
                vectors.extend(resp.json().get("embeddings", []))
        except Exception as e:
            logger.warning("Batch embedding failed for %d texts: %s", len(chunk), e)
            vectors.extend([] for _ in chunk)  # keep positional alignment
    return vectors
|
|
|
|
|
|
def _cosine_sim(a: list[float], b: list[float]) -> float:
|
|
"""Compute cosine similarity between two vectors."""
|
|
if not a or not b or len(a) != len(b):
|
|
return 0.0
|
|
dot = sum(x * y for x, y in zip(a, b))
|
|
norm_a = sum(x * x for x in a) ** 0.5
|
|
norm_b = sum(x * x for x in b) ** 0.5
|
|
if norm_a == 0 or norm_b == 0:
|
|
return 0.0
|
|
return dot / (norm_a * norm_b)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# JSON Parsing Helper
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_llm_json(raw: str) -> dict:
|
|
"""Extract JSON from LLM response (handles markdown fences)."""
|
|
# Try extracting from ```json ... ``` blocks
|
|
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
|
|
text = match.group(1) if match else raw
|
|
|
|
# Try parsing directly
|
|
try:
|
|
return json.loads(text)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try finding first { ... } block
|
|
brace_match = re.search(r"\{.*\}", text, re.DOTALL)
|
|
if brace_match:
|
|
try:
|
|
return json.loads(brace_match.group(0))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
logger.warning("Failed to parse LLM JSON response")
|
|
return {}
|
|
|
|
|
|
def _parse_llm_json_array(raw: str) -> list[dict]:
|
|
"""Extract a JSON array from LLM response — returns list of dicts."""
|
|
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
|
|
text = match.group(1) if match else raw
|
|
|
|
# Try parsing as array directly
|
|
try:
|
|
parsed = json.loads(text)
|
|
if isinstance(parsed, list):
|
|
return parsed
|
|
if isinstance(parsed, dict):
|
|
# Check if it wraps an array (e.g. {"controls": [...]})
|
|
for key in ("controls", "results", "items", "data"):
|
|
if key in parsed and isinstance(parsed[key], list):
|
|
return parsed[key]
|
|
return [parsed]
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try finding [ ... ] block
|
|
bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
|
|
if bracket_match:
|
|
try:
|
|
parsed = json.loads(bracket_match.group(0))
|
|
if isinstance(parsed, list):
|
|
return parsed
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try finding multiple { ... } blocks (LLM sometimes returns separate objects)
|
|
objects = []
|
|
for obj_match in re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL):
|
|
try:
|
|
obj = json.loads(obj_match.group(0))
|
|
if isinstance(obj, dict) and obj.get("title"):
|
|
objects.append(obj)
|
|
except json.JSONDecodeError:
|
|
continue
|
|
if objects:
|
|
logger.info("Parsed %d individual JSON objects from batch response", len(objects))
|
|
return objects
|
|
|
|
# Fallback: try single object
|
|
single = _parse_llm_json(raw)
|
|
if single:
|
|
logger.info("Batch parse fallback: extracted single object")
|
|
else:
|
|
logger.warning("Batch parse failed — logging first 500 chars: %s", raw[:500])
|
|
return [single] if single else []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pipeline
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# System prompt (German) for Rule-3 sources: the LLM must fully reformulate —
# no copied sentences, no source names. Runtime string — do not translate.
REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, eigenständige
Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# System prompt (German) for Rule-1/2 sources: original text may be kept;
# the LLM only structures it into the control format. Runtime string — do
# not translate.
STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
|
|
|
|
|
class ControlGeneratorPipeline:
|
|
"""Orchestrates the 7-stage control generation pipeline."""
|
|
|
|
def __init__(self, db: Session, rag_client: Optional[ComplianceRAGClient] = None):
|
|
self.db = db
|
|
self.rag = rag_client or get_rag_client()
|
|
self._existing_controls: Optional[List[dict]] = None
|
|
self._existing_embeddings: Dict[str, List[float]] = {}
|
|
|
|
# ── Stage 1: RAG Scan ──────────────────────────────────────────────
|
|
|
|
    async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
        """Scroll through ALL chunks in RAG collections.

        Uses the scroll endpoint to iterate over every chunk (not just top-K search).
        Filters out already-processed chunks by hash.

        Filtering layers, in order:
        1. chunks shorter than 50 chars are dropped,
        2. SHA-256 text dedupe across all collections in this run,
        3. hashes already recorded in canonical_processed_chunks
           (only when config.skip_processed is set).
        """
        collections = config.collections or ALL_COLLECTIONS
        all_results: list[RAGSearchResult] = []

        # Pre-load all processed hashes for fast filtering
        processed_hashes: set[str] = set()
        if config.skip_processed:
            try:
                result = self.db.execute(
                    text("SELECT chunk_hash FROM canonical_processed_chunks")
                )
                processed_hashes = {row[0] for row in result}
                logger.info("Loaded %d processed chunk hashes", len(processed_hashes))
            except Exception as e:
                # Best effort: on DB failure we scan everything rather than abort.
                logger.warning("Error loading processed hashes: %s", e)

        # Text hashes seen during THIS run — dedupes across collections.
        seen_hashes: set[str] = set()

        for collection in collections:
            offset = None  # None = start of collection for the scroll API
            page = 0
            collection_total = 0
            collection_new = 0
            seen_offsets: set[str] = set()  # Detect scroll loops

            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection,
                    offset=offset,
                    limit=200,
                )

                if not chunks:
                    break

                collection_total += len(chunks)

                for chunk in chunks:
                    if not chunk.text or len(chunk.text.strip()) < 50:
                        continue  # Skip empty/tiny chunks

                    h = hashlib.sha256(chunk.text.encode()).hexdigest()

                    # Skip duplicates (same text in multiple collections)
                    if h in seen_hashes:
                        continue
                    seen_hashes.add(h)

                    # Skip already-processed
                    if h in processed_hashes:
                        continue

                    all_results.append(chunk)
                    collection_new += 1

                page += 1
                # Periodic progress log for very large collections.
                if page % 50 == 0:
                    logger.info(
                        "Scrolling %s: page %d, %d total chunks, %d new unprocessed",
                        collection, page, collection_total, collection_new,
                    )

                # Stop conditions
                if not next_offset:
                    break
                # Detect infinite scroll loops (Qdrant mixed ID types)
                if next_offset in seen_offsets:
                    logger.warning(
                        "Scroll loop detected in %s at offset %s (page %d) — stopping",
                        collection, next_offset, page,
                    )
                    break
                seen_offsets.add(next_offset)

                offset = next_offset

            logger.info(
                "Collection %s: %d total chunks scrolled, %d new unprocessed",
                collection, collection_total, collection_new,
            )

        logger.info(
            "RAG scroll complete: %d total unique seen, %d new unprocessed to process",
            len(seen_hashes), len(all_results),
        )
        return all_results
|
|
|
|
def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
|
|
"""Check which chunk hashes are already processed."""
|
|
if not hashes:
|
|
return set()
|
|
try:
|
|
result = self.db.execute(
|
|
text("SELECT chunk_hash FROM canonical_processed_chunks WHERE chunk_hash = ANY(:hashes)"),
|
|
{"hashes": hashes},
|
|
)
|
|
return {row[0] for row in result}
|
|
except Exception as e:
|
|
logger.warning("Error checking processed chunks: %s", e)
|
|
return set()
|
|
|
|
# ── Stage 2: License Classification ────────────────────────────────
|
|
|
|
def _classify_license(self, chunk: RAGSearchResult) -> dict:
|
|
"""Determine which license rule applies to this chunk."""
|
|
return _classify_regulation(chunk.regulation_code)
|
|
|
|
# ── Stage 3a: Structure (Rule 1 — Free Use) ───────────────────────
|
|
|
|
    async def _structure_free_use(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
        """Structure a freely usable text into control format.

        Rule 1 path (laws / public domain): the LLM may keep the original
        wording. The original text and a citation are stored alongside the
        generated control, and the result is customer-visible.
        Falls back to a minimal needs-review control if the LLM response
        cannot be parsed as JSON.
        """
        source_name = license_info.get("name", chunk.regulation_name)
        # Prompt is German by design (German legal source texts); chunk text is
        # truncated to 2000 chars to bound token usage.
        prompt = f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_info.get('license', '')}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags

Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""

        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            # LLM output unparseable — emit a minimal placeholder instead.
            return self._fallback_control(chunk)

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        control.license_rule = 1
        # Rule 1: original text may be stored and shown verbatim.
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
            "license": license_info.get("license", ""),
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 1,
            "source_regulation": chunk.regulation_code,
            "source_article": chunk.article,
        }
        return control
|
|
|
|
# ── Stage 3b: Structure with Citation (Rule 2) ────────────────────
|
|
|
|
    async def _structure_with_citation(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
        """Structure text that requires citation.

        Rule 2 path (CC-BY / CC-BY-SA etc.): the original wording may be kept
        or lightly rephrased, but the stored citation additionally carries the
        license's attribution notice. Result is customer-visible.
        Falls back to a minimal needs-review control if the LLM response
        cannot be parsed as JSON.
        """
        source_name = license_info.get("name", chunk.regulation_name)
        attribution = license_info.get("attribution", "")
        prompt = f"""Strukturiere den folgenden Text als Security Control.
Quelle: {source_name} ({license_info.get('license', '')}) — Zitation erforderlich.

Du darfst den Text übernehmen oder verständlicher umformulieren.
Die Quelle wird automatisch zitiert — fokussiere dich auf Klarheit.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags

Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name}, {chunk.article}"""

        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            # LLM output unparseable — emit a minimal placeholder instead.
            return self._fallback_control(chunk)

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        control.license_rule = 2
        # Rule 2: original text kept, but attribution must travel with it.
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
            "license": license_info.get("license", ""),
            "license_notice": attribution,
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 2,
            "source_regulation": chunk.regulation_code,
            "source_article": chunk.article,
        }
        return control
|
|
|
|
# ── Stage 3c: LLM Reformulation (Rule 3 — Restricted) ─────────────
|
|
|
|
    async def _llm_reformulate(self, chunk: RAGSearchResult, config: GeneratorConfig) -> GeneratedControl:
        """Fully reformulate — NO original text, NO source names.

        Rule 3 path (restricted sources): the chunk is only an analysis input.
        Neither the original text nor any citation is stored, the control is
        NOT customer-visible, and generation_metadata deliberately omits the
        source regulation/article.
        """
        domain = config.domain or _detect_domain(chunk.text)
        # Chunk text truncated to 1500 chars; it is framed as analysis-only input.
        prompt = f"""Analysiere den folgenden Prüfaspekt und formuliere ein EIGENSTÄNDIGES Security Control.
KOPIERE KEINE Sätze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).

Aspekt (nur zur Analyse, NICHT kopieren, NICHT referenzieren):
---
{chunk.text[:1500]}
---

Domain: {domain}

Gib JSON zurück mit diesen Feldern:
- title: Kurzer eigenständiger Titel (max 100 Zeichen)
- objective: Eigenständige Formulierung des Ziels (1-3 Sätze)
- rationale: Eigenständige Begründung (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)"""

        raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
        data = _parse_llm_json(raw)
        if not data:
            # LLM output unparseable — emit a minimal placeholder instead.
            return self._fallback_control(chunk)

        control = self._build_control_from_json(data, domain)
        control.license_rule = 3
        control.source_original_text = None  # NEVER store original
        control.source_citation = None  # NEVER cite source
        control.customer_visible = False  # Only our formulation
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        # generation_metadata: NO source names, NO original texts
        control.generation_metadata = {
            "processing_path": "llm_reform",
            "license_rule": 3,
        }
        return control
|
|
|
|
# ── Stage 3 BATCH: Multiple chunks in one API call ─────────────────
|
|
|
|
async def _structure_batch(
|
|
self,
|
|
chunks: list[RAGSearchResult],
|
|
license_infos: list[dict],
|
|
) -> list[Optional[GeneratedControl]]:
|
|
"""Structure multiple free-use/citation chunks in a single Anthropic call."""
|
|
# Build document context header if chunks share a regulation
|
|
regulations_in_batch = set(c.regulation_name for c in chunks)
|
|
doc_context = ""
|
|
if len(regulations_in_batch) == 1:
|
|
reg_name = next(iter(regulations_in_batch))
|
|
articles = sorted(set(c.article or "?" for c in chunks))
|
|
doc_context = (
|
|
f"\nDOKUMENTKONTEXT: Alle {len(chunks)} Chunks stammen aus demselben Gesetz: {reg_name}.\n"
|
|
f"Betroffene Artikel/Abschnitte: {', '.join(articles)}.\n"
|
|
f"Nutze diesen Zusammenhang fuer eine kohaerente, aufeinander abgestimmte Formulierung der Controls.\n"
|
|
f"Vermeide Redundanzen zwischen den Controls — jedes soll einen eigenen Aspekt abdecken.\n"
|
|
)
|
|
elif len(regulations_in_batch) <= 3:
|
|
doc_context = (
|
|
f"\nDOKUMENTKONTEXT: Die Chunks stammen aus {len(regulations_in_batch)} Gesetzen: "
|
|
f"{', '.join(regulations_in_batch)}.\n"
|
|
)
|
|
|
|
chunk_entries = []
|
|
for idx, (chunk, lic) in enumerate(zip(chunks, license_infos)):
|
|
source_name = lic.get("name", chunk.regulation_name)
|
|
chunk_entries.append(
|
|
f"--- CHUNK {idx + 1} ---\n"
|
|
f"Text: {chunk.text[:2000]}\n"
|
|
f"Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}\n"
|
|
f"Lizenz: {source_name} ({lic.get('license', '')})"
|
|
)
|
|
joined = "\n\n".join(chunk_entries)
|
|
prompt = f"""Strukturiere die folgenden {len(chunks)} Gesetzestexte jeweils als eigenstaendiges Security/Compliance Control.
|
|
Du DARFST den Originaltext verwenden (Quellen sind jeweils angegeben).
|
|
{doc_context}
|
|
WICHTIG:
|
|
- Erstelle fuer JEDEN Chunk ein separates Control mit verstaendlicher, praxisorientierter Formulierung.
|
|
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
|
|
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
|
|
- Antworte IMMER auf Deutsch.
|
|
|
|
Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
|
|
- chunk_index: 1-basierter Index des Chunks (1, 2, 3, ...)
|
|
- title: Kurzer praegnanter Titel auf Deutsch (max 100 Zeichen)
|
|
- objective: Was soll erreicht werden? (1-3 Saetze, Deutsch)
|
|
- rationale: Warum ist das wichtig? (1-2 Saetze, Deutsch)
|
|
- requirements: Liste von konkreten Anforderungen (Strings, Deutsch)
|
|
- test_procedure: Liste von Pruefschritten (Strings, Deutsch)
|
|
- evidence: Liste von Nachweisdokumenten (Strings, Deutsch)
|
|
- severity: low/medium/high/critical
|
|
- tags: Liste von Tags
|
|
|
|
{joined}"""
|
|
|
|
raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
|
|
results = _parse_llm_json_array(raw)
|
|
logger.info("Batch structure: parsed %d results from API response", len(results))
|
|
|
|
# Map results back to chunks by chunk_index (or by position if no index)
|
|
controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
|
|
for pos, data in enumerate(results):
|
|
# Try chunk_index first, fall back to position
|
|
idx = data.get("chunk_index")
|
|
if idx is not None:
|
|
idx = int(idx) - 1 # Convert to 0-based
|
|
else:
|
|
idx = pos # Use position as fallback
|
|
if idx < 0 or idx >= len(chunks):
|
|
logger.warning("Batch: chunk_index %d out of range (0-%d), using position %d", idx, len(chunks)-1, pos)
|
|
idx = min(pos, len(chunks) - 1)
|
|
chunk = chunks[idx]
|
|
lic = license_infos[idx]
|
|
domain = _detect_domain(chunk.text)
|
|
control = self._build_control_from_json(data, domain)
|
|
control.license_rule = lic["rule"]
|
|
if lic["rule"] in (1, 2):
|
|
control.source_original_text = chunk.text
|
|
control.source_citation = {
|
|
"source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
|
|
"license": lic.get("license", ""),
|
|
"license_notice": lic.get("attribution", ""),
|
|
"url": chunk.source_url or "",
|
|
}
|
|
control.customer_visible = True
|
|
control.verification_method = _detect_verification_method(chunk.text)
|
|
control.category = _detect_category(chunk.text)
|
|
same_doc = len(set(c.regulation_code for c in chunks)) == 1
|
|
control.generation_metadata = {
|
|
"processing_path": "structured_batch",
|
|
"license_rule": lic["rule"],
|
|
"source_regulation": chunk.regulation_code,
|
|
"source_article": chunk.article,
|
|
"batch_size": len(chunks),
|
|
"document_grouped": same_doc,
|
|
}
|
|
control.generation_strategy = "document_grouped" if same_doc else "ungrouped"
|
|
controls[idx] = control
|
|
|
|
return controls
|
|
|
|
async def _reformulate_batch(
|
|
self,
|
|
chunks: list[RAGSearchResult],
|
|
config: GeneratorConfig,
|
|
) -> list[Optional[GeneratedControl]]:
|
|
"""Reformulate multiple restricted chunks in a single Anthropic call."""
|
|
chunk_entries = []
|
|
for idx, chunk in enumerate(chunks):
|
|
domain = config.domain or _detect_domain(chunk.text)
|
|
chunk_entries.append(
|
|
f"--- ASPEKT {idx + 1} ---\n"
|
|
f"Domain: {domain}\n"
|
|
f"Text (nur zur Analyse, NICHT kopieren, NICHT referenzieren):\n{chunk.text[:1500]}"
|
|
)
|
|
joined = "\n\n".join(chunk_entries)
|
|
prompt = f"""Analysiere die folgenden {len(chunks)} Pruefaspekte und formuliere fuer JEDEN ein EIGENSTAENDIGES Security Control.
|
|
KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur.
|
|
NENNE NICHT die Quellen. Keine proprietaeren Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).
|
|
|
|
WICHTIG:
|
|
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
|
|
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
|
|
|
|
Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
|
|
- chunk_index: 1-basierter Index des Aspekts (1, 2, 3, ...)
|
|
- title: Kurzer eigenstaendiger Titel (max 100 Zeichen)
|
|
- objective: Eigenstaendige Formulierung des Ziels (1-3 Saetze)
|
|
- rationale: Eigenstaendige Begruendung (1-2 Saetze)
|
|
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
|
|
- test_procedure: Liste von Pruefschritten (Strings)
|
|
- evidence: Liste von Nachweisdokumenten (Strings)
|
|
- severity: low/medium/high/critical
|
|
- tags: Liste von Tags (eigene Begriffe)
|
|
|
|
{joined}"""
|
|
|
|
raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
|
|
results = _parse_llm_json_array(raw)
|
|
logger.info("Batch reform: parsed %d results from API response", len(results))
|
|
|
|
controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
|
|
for pos, data in enumerate(results):
|
|
idx = data.get("chunk_index")
|
|
if idx is not None:
|
|
idx = int(idx) - 1
|
|
else:
|
|
idx = pos
|
|
if idx < 0 or idx >= len(chunks):
|
|
logger.warning("Batch reform: chunk_index %d out of range, using position %d", idx, pos)
|
|
idx = min(pos, len(chunks) - 1)
|
|
chunk = chunks[idx]
|
|
domain = config.domain or _detect_domain(chunk.text)
|
|
control = self._build_control_from_json(data, domain)
|
|
control.license_rule = 3
|
|
control.source_original_text = None
|
|
control.source_citation = None
|
|
control.customer_visible = False
|
|
control.verification_method = _detect_verification_method(chunk.text)
|
|
control.category = _detect_category(chunk.text)
|
|
control.generation_metadata = {
|
|
"processing_path": "llm_reform_batch",
|
|
"license_rule": 3,
|
|
"batch_size": len(chunks),
|
|
}
|
|
controls[idx] = control
|
|
|
|
return controls
|
|
|
|
async def _process_batch(
|
|
self,
|
|
batch_items: list[tuple[RAGSearchResult, dict]],
|
|
config: GeneratorConfig,
|
|
job_id: str,
|
|
) -> list[Optional[GeneratedControl]]:
|
|
"""Process a batch of (chunk, license_info) through stages 3-5."""
|
|
# Split by license rule: Rule 1+2 → structure, Rule 3 → reform
|
|
structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
|
|
reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
|
|
|
|
all_controls: dict[int, Optional[GeneratedControl]] = {}
|
|
|
|
if structure_items:
|
|
s_chunks = [c for c, _ in structure_items]
|
|
s_lics = [l for _, l in structure_items]
|
|
s_controls = await self._structure_batch(s_chunks, s_lics)
|
|
for (chunk, _), ctrl in zip(structure_items, s_controls):
|
|
orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
|
|
all_controls[orig_idx] = ctrl
|
|
|
|
if reform_items:
|
|
r_chunks = [c for c, _ in reform_items]
|
|
r_controls = await self._reformulate_batch(r_chunks, config)
|
|
for (chunk, _), ctrl in zip(reform_items, r_controls):
|
|
orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
|
|
if ctrl:
|
|
# Too-Close-Check for Rule 3
|
|
similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
|
|
if similarity.status == "FAIL":
|
|
ctrl.release_state = "too_close"
|
|
ctrl.generation_metadata["similarity_status"] = "FAIL"
|
|
ctrl.generation_metadata["similarity_scores"] = {
|
|
"token_overlap": similarity.token_overlap,
|
|
"ngram_jaccard": similarity.ngram_jaccard,
|
|
"lcs_ratio": similarity.lcs_ratio,
|
|
}
|
|
all_controls[orig_idx] = ctrl
|
|
|
|
# Post-process all controls: harmonization + anchor search
|
|
final: list[Optional[GeneratedControl]] = []
|
|
for i in range(len(batch_items)):
|
|
control = all_controls.get(i)
|
|
if not control or (not control.title and not control.objective):
|
|
final.append(None)
|
|
continue
|
|
|
|
if control.release_state == "too_close":
|
|
final.append(control)
|
|
continue
|
|
|
|
# Harmonization
|
|
duplicates = await self._check_harmonization(control)
|
|
if duplicates:
|
|
control.release_state = "duplicate"
|
|
control.generation_metadata["similar_controls"] = duplicates
|
|
final.append(control)
|
|
continue
|
|
|
|
# Anchor search
|
|
try:
|
|
from .anchor_finder import AnchorFinder
|
|
finder = AnchorFinder(self.rag)
|
|
anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
|
|
control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
|
|
except Exception as e:
|
|
logger.warning("Anchor search failed: %s", e)
|
|
|
|
# Release state
|
|
if control.license_rule in (1, 2):
|
|
control.release_state = "draft"
|
|
elif control.open_anchors:
|
|
control.release_state = "draft"
|
|
else:
|
|
control.release_state = "needs_review"
|
|
|
|
# Control ID
|
|
domain = config.domain or _detect_domain(control.objective)
|
|
control.control_id = self._generate_control_id(domain, self.db)
|
|
control.generation_metadata["job_id"] = job_id
|
|
|
|
final.append(control)
|
|
|
|
return final
|
|
|
|
# ── Stage 4: Harmonization ─────────────────────────────────────────
|
|
|
|
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
|
|
"""Check if a new control duplicates existing ones via embedding similarity."""
|
|
existing = self._load_existing_controls()
|
|
if not existing:
|
|
return None
|
|
|
|
# Pre-load all existing embeddings in batch (once per pipeline run)
|
|
if not self._existing_embeddings:
|
|
await self._preload_embeddings(existing)
|
|
|
|
new_text = f"{new_control.title} {new_control.objective}"
|
|
new_emb = await _get_embedding(new_text)
|
|
if not new_emb:
|
|
return None
|
|
|
|
similar = []
|
|
for ex in existing:
|
|
ex_key = ex.get("control_id", "")
|
|
ex_emb = self._existing_embeddings.get(ex_key, [])
|
|
if not ex_emb:
|
|
continue
|
|
|
|
cosine = _cosine_sim(new_emb, ex_emb)
|
|
if cosine > HARMONIZATION_THRESHOLD:
|
|
similar.append({
|
|
"control_id": ex.get("control_id", ""),
|
|
"title": ex.get("title", ""),
|
|
"similarity": round(cosine, 3),
|
|
})
|
|
|
|
return similar if similar else None
|
|
|
|
async def _preload_embeddings(self, existing: list[dict]):
|
|
"""Pre-load embeddings for all existing controls in batches."""
|
|
texts = [f"{ex.get('title', '')} {ex.get('objective', '')}" for ex in existing]
|
|
keys = [ex.get("control_id", "") for ex in existing]
|
|
|
|
logger.info("Pre-loading embeddings for %d existing controls...", len(texts))
|
|
embeddings = await _get_embeddings_batch(texts)
|
|
|
|
for key, emb in zip(keys, embeddings):
|
|
self._existing_embeddings[key] = emb
|
|
|
|
loaded = sum(1 for emb in embeddings if emb)
|
|
logger.info("Pre-loaded %d/%d embeddings", loaded, len(texts))
|
|
|
|
def _load_existing_controls(self) -> list[dict]:
|
|
"""Load existing controls from DB (cached per pipeline run)."""
|
|
if self._existing_controls is not None:
|
|
return self._existing_controls
|
|
|
|
try:
|
|
result = self.db.execute(
|
|
text("SELECT control_id, title, objective FROM canonical_controls WHERE release_state != 'deprecated'")
|
|
)
|
|
self._existing_controls = [
|
|
{"control_id": r[0], "title": r[1], "objective": r[2]}
|
|
for r in result
|
|
]
|
|
except Exception as e:
|
|
logger.warning("Error loading existing controls: %s", e)
|
|
self._existing_controls = []
|
|
|
|
return self._existing_controls
|
|
|
|
# ── Helpers ────────────────────────────────────────────────────────
|
|
|
|
def _build_control_from_json(self, data: dict, domain: str) -> GeneratedControl:
|
|
"""Build a GeneratedControl from parsed LLM JSON."""
|
|
severity = data.get("severity", "medium")
|
|
if severity not in ("low", "medium", "high", "critical"):
|
|
severity = "medium"
|
|
|
|
tags = data.get("tags", [])
|
|
if isinstance(tags, str):
|
|
tags = [t.strip() for t in tags.split(",")]
|
|
|
|
return GeneratedControl(
|
|
title=str(data.get("title", "Untitled Control"))[:255],
|
|
objective=str(data.get("objective", "")),
|
|
rationale=str(data.get("rationale", "")),
|
|
scope=data.get("scope", {}),
|
|
requirements=data.get("requirements", []) if isinstance(data.get("requirements"), list) else [],
|
|
test_procedure=data.get("test_procedure", []) if isinstance(data.get("test_procedure"), list) else [],
|
|
evidence=data.get("evidence", []) if isinstance(data.get("evidence"), list) else [],
|
|
severity=severity,
|
|
risk_score=min(10.0, max(0.0, float(data.get("risk_score", 5.0)))),
|
|
implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
|
|
tags=tags[:20],
|
|
)
|
|
|
|
def _fallback_control(self, chunk: RAGSearchResult) -> GeneratedControl:
|
|
"""Create a minimal control when LLM parsing fails."""
|
|
domain = _detect_domain(chunk.text)
|
|
return GeneratedControl(
|
|
title=f"Control from {chunk.regulation_code} {chunk.article or ''}".strip()[:255],
|
|
objective=chunk.text[:500] if chunk.text else "Needs manual review",
|
|
rationale="Auto-generated — LLM parsing failed, manual review required.",
|
|
severity="medium",
|
|
release_state="needs_review",
|
|
tags=[domain.lower()],
|
|
)
|
|
|
|
def _generate_control_id(self, domain: str, db: Session) -> str:
|
|
"""Generate next sequential control ID like AUTH-011."""
|
|
prefix = domain.upper()[:4]
|
|
try:
|
|
result = db.execute(
|
|
text("SELECT control_id FROM canonical_controls WHERE control_id LIKE :prefix ORDER BY control_id DESC LIMIT 1"),
|
|
{"prefix": f"{prefix}-%"},
|
|
)
|
|
row = result.fetchone()
|
|
if row:
|
|
last_num = int(row[0].split("-")[-1])
|
|
return f"{prefix}-{last_num + 1:03d}"
|
|
except Exception:
|
|
pass
|
|
return f"{prefix}-001"
|
|
|
|
# ── Pipeline Orchestration ─────────────────────────────────────────
|
|
|
|
    def _create_job(self, config: GeneratorConfig) -> str:
        """Create a generation job record.

        Returns the DB-assigned job UUID as a string. If the insert fails
        (e.g. DB unavailable), a freshly generated random UUID is returned
        instead so the run can still tag its output — note that such an id
        will not exist in canonical_generation_jobs.
        """
        try:
            result = self.db.execute(
                text("""
                    INSERT INTO canonical_generation_jobs (status, config)
                    VALUES ('running', :config)
                    RETURNING id
                """),
                {"config": json.dumps(config.model_dump())},
            )
            self.db.commit()
            row = result.fetchone()
            # RETURNING should always yield a row; random UUID is a defensive fallback.
            return str(row[0]) if row else str(uuid.uuid4())
        except Exception as e:
            logger.error("Failed to create job: %s", e)
            return str(uuid.uuid4())
|
|
|
|
def _update_job(self, job_id: str, result: GeneratorResult):
|
|
"""Update job with current stats. Sets completed_at only when status is final."""
|
|
is_final = result.status in ("completed", "failed")
|
|
try:
|
|
self.db.execute(
|
|
text(f"""
|
|
UPDATE canonical_generation_jobs
|
|
SET status = :status,
|
|
total_chunks_scanned = :scanned,
|
|
controls_generated = :generated,
|
|
controls_verified = :verified,
|
|
controls_needs_review = :needs_review,
|
|
controls_too_close = :too_close,
|
|
controls_duplicates_found = :duplicates,
|
|
errors = :errors
|
|
{"" if not is_final else ", completed_at = NOW()"}
|
|
WHERE id = CAST(:job_id AS uuid)
|
|
"""),
|
|
{
|
|
"job_id": job_id,
|
|
"status": result.status,
|
|
"scanned": result.total_chunks_scanned,
|
|
"generated": result.controls_generated,
|
|
"verified": result.controls_verified,
|
|
"needs_review": result.controls_needs_review,
|
|
"too_close": result.controls_too_close,
|
|
"duplicates": result.controls_duplicates_found,
|
|
"errors": json.dumps(result.errors[-50:]),
|
|
},
|
|
)
|
|
self.db.commit()
|
|
except Exception as e:
|
|
logger.error("Failed to update job: %s", e)
|
|
|
|
    def _store_control(self, control: GeneratedControl, job_id: str) -> Optional[str]:
        """Persist a generated control to DB. Returns the control UUID or None.

        None is returned when: the bp_security_v1 framework row is missing,
        the (framework_id, control_id) pair already exists (ON CONFLICT DO
        NOTHING yields no row), or any DB error occurs (rolled back and
        logged). JSON-typed columns are serialized here via json.dumps.
        """
        try:
            # Get framework UUID
            fw_result = self.db.execute(
                text("SELECT id FROM canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
            )
            fw_row = fw_result.fetchone()
            if not fw_row:
                logger.error("Framework bp_security_v1 not found")
                return None
            framework_uuid = fw_row[0]

            # Generate control_id if not set
            if not control.control_id:
                # "SEC" is the fallback domain when there is no objective to classify.
                domain = _detect_domain(control.objective) if control.objective else "SEC"
                control.control_id = self._generate_control_id(domain, self.db)

            result = self.db.execute(
                text("""
                    INSERT INTO canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy
                    ) VALUES (
                        :framework_id, :control_id, :title, :objective, :rationale,
                        :scope, :requirements, :test_procedure, :evidence,
                        :severity, :risk_score, :implementation_effort,
                        :open_anchors, :release_state, :tags,
                        :license_rule, :source_original_text, :source_citation,
                        :customer_visible, :generation_metadata,
                        :verification_method, :category, :generation_strategy
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """),
                {
                    "framework_id": framework_uuid,
                    "control_id": control.control_id,
                    "title": control.title,
                    "objective": control.objective,
                    "rationale": control.rationale,
                    "scope": json.dumps(control.scope),
                    "requirements": json.dumps(control.requirements),
                    "test_procedure": json.dumps(control.test_procedure),
                    "evidence": json.dumps(control.evidence),
                    "severity": control.severity,
                    "risk_score": control.risk_score,
                    "implementation_effort": control.implementation_effort,
                    "open_anchors": json.dumps(control.open_anchors),
                    "release_state": control.release_state,
                    "tags": json.dumps(control.tags),
                    "license_rule": control.license_rule,
                    "source_original_text": control.source_original_text,
                    "source_citation": json.dumps(control.source_citation) if control.source_citation else None,
                    "customer_visible": control.customer_visible,
                    "generation_metadata": json.dumps(control.generation_metadata) if control.generation_metadata else None,
                    "verification_method": control.verification_method,
                    "category": control.category,
                    "generation_strategy": control.generation_strategy,
                },
            )
            self.db.commit()
            row = result.fetchone()
            # No row after ON CONFLICT DO NOTHING means the id already existed.
            return str(row[0]) if row else None
        except Exception as e:
            logger.error("Failed to store control %s: %s", control.control_id, e)
            self.db.rollback()
            return None
|
|
|
|
    def _mark_chunk_processed(
        self,
        chunk: RAGSearchResult,
        license_info: dict,
        processing_path: str,
        control_ids: list[str],
        job_id: str,
    ):
        """Mark a RAG chunk as processed (Stage 7).

        Records the SHA-256 of the chunk text together with license and
        processing metadata plus the IDs of the controls generated from it.
        The insert is idempotent via ON CONFLICT DO NOTHING on
        (chunk_hash, collection, document_version); failures are logged and
        rolled back without interrupting the pipeline.
        """
        chunk_hash = hashlib.sha256(chunk.text.encode()).hexdigest()
        try:
            self.db.execute(
                text("""
                    INSERT INTO canonical_processed_chunks (
                        chunk_hash, collection, regulation_code,
                        document_version, source_license, license_rule,
                        processing_path, generated_control_ids, job_id
                    ) VALUES (
                        :hash, :collection, :regulation_code,
                        :doc_version, :license, :rule,
                        :path, :control_ids, CAST(:job_id AS uuid)
                    )
                    ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
                """),
                {
                    "hash": chunk_hash,
                    # Default collection when the chunk carries none.
                    "collection": chunk.collection or "bp_compliance_ce",
                    "regulation_code": chunk.regulation_code,
                    # NOTE(review): document_version is hard-coded to "1.0" here
                    # despite the module's "version tracking" claim — confirm
                    # whether real per-document versions should be threaded through.
                    "doc_version": "1.0",
                    "license": license_info.get("license", ""),
                    # Rule 3 (restricted) is the conservative default.
                    "rule": license_info.get("rule", 3),
                    "path": processing_path,
                    "control_ids": json.dumps(control_ids),
                    "job_id": job_id,
                },
            )
            self.db.commit()
        except Exception as e:
            logger.warning("Failed to mark chunk processed: %s", e)
            self.db.rollback()
|
|
|
|
# ── Main Pipeline ──────────────────────────────────────────────────
|
|
|
|
async def run(self, config: GeneratorConfig) -> GeneratorResult:
    """Execute the full 7-stage pipeline.

    Flow: scan RAG chunks (Stage 1), group them by regulation_code so each
    Anthropic batch sees chunks from a single document, pre-filter locally
    (Stage 1.5), then batch-process (Stages 2-5 inside `_process_batch`),
    store controls and mark chunks processed (Stages 6-7).

    Returns:
        GeneratorResult with per-state counters, generated controls, and
        accumulated errors; status is "completed" or "failed".
    """
    result = GeneratorResult()

    # Create or reuse job (re-use supports resuming an existing run).
    if config.existing_job_id:
        job_id = config.existing_job_id
    else:
        job_id = self._create_job(config)
    result.job_id = job_id

    try:
        # Stage 1: RAG Scan
        chunks = await self._scan_rag(config)
        result.total_chunks_scanned = len(chunks)

        if not chunks:
            # Nothing to do — still finalize the job record.
            result.status = "completed"
            self._update_job(job_id, result)
            return result

        # ── Group chunks by document (regulation_code) for coherent batching ──
        doc_groups: dict[str, list[RAGSearchResult]] = defaultdict(list)
        for chunk in chunks:
            group_key = chunk.regulation_code or "unknown"
            doc_groups[group_key].append(chunk)

        # Sort chunks within each group by article for sequential context
        # (lexicographic on (article, paragraph); None sorts as "").
        for key in doc_groups:
            doc_groups[key].sort(key=lambda c: (c.article or "", c.paragraph or ""))

        logger.info(
            "Grouped %d chunks into %d document groups for coherent batching",
            len(chunks), len(doc_groups),
        )

        # Flatten back: chunks from same document are now adjacent
        chunks = []
        for group_list in doc_groups.values():
            chunks.extend(group_list)

        # Process chunks — batch mode (N chunks per Anthropic API call)
        BATCH_SIZE = config.batch_size or 5
        controls_count = 0
        chunks_skipped_prefilter = 0
        pending_batch: list[tuple[RAGSearchResult, dict]] = []  # (chunk, license_info)
        current_batch_regulation: Optional[str] = None  # Track regulation for group-aware flushing

        async def _flush_batch():
            """Send pending batch to Anthropic and process results."""
            # Closure mutates the enclosing batch state; counters on
            # `result` are updated in place.
            nonlocal controls_count, current_batch_regulation
            if not pending_batch:
                return
            # Snapshot then clear, so the outer loop can start a new batch.
            batch = pending_batch.copy()
            pending_batch.clear()
            current_batch_regulation = None

            # Log which document this batch belongs to
            regs_in_batch = set(c.regulation_code for c, _ in batch)
            logger.info(
                "Processing batch of %d chunks (docs: %s) via single API call...",
                len(batch), ", ".join(regs_in_batch),
            )
            try:
                batch_controls = await self._process_batch(batch, config, job_id)
            except Exception as e:
                logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
                # Fallback: process each chunk individually
                batch_controls = []
                for chunk, _lic in batch:
                    try:
                        ctrl = await self._process_single_chunk(chunk, config, job_id)
                        batch_controls.append(ctrl)
                    except Exception as e2:
                        # Keep positions aligned with `batch` via None sentinel.
                        logger.error("Single-chunk fallback also failed: %s", e2)
                        batch_controls.append(None)

            # batch_controls is positionally aligned with batch (zip below).
            for (chunk, lic_info), control in zip(batch, batch_controls):
                if control is None:
                    # No control produced — still mark the chunk so it is
                    # not re-scanned next run (skipped in dry-run mode).
                    if not config.dry_run:
                        self._mark_chunk_processed(chunk, lic_info, "no_control", [], job_id)
                    continue

                # Mark as document_grouped strategy
                control.generation_strategy = "document_grouped"

                # Count by state
                if control.release_state == "too_close":
                    result.controls_too_close += 1
                elif control.release_state == "duplicate":
                    result.controls_duplicates_found += 1
                elif control.release_state == "needs_review":
                    result.controls_needs_review += 1
                else:
                    result.controls_verified += 1

                # Store (Stage 6) — then mark the chunk (Stage 7) with the
                # stored control's UUID, or "store_failed" on DB failure.
                if not config.dry_run:
                    ctrl_uuid = self._store_control(control, job_id)
                    if ctrl_uuid:
                        path = control.generation_metadata.get("processing_path", "structured_batch")
                        self._mark_chunk_processed(chunk, lic_info, path, [ctrl_uuid], job_id)
                    else:
                        self._mark_chunk_processed(chunk, lic_info, "store_failed", [], job_id)

                result.controls_generated += 1
                result.controls.append(asdict(control))
                controls_count += 1

                # Feed the harmonization cache so later batches can detect
                # duplicates against controls generated in this same run.
                if self._existing_controls is not None:
                    self._existing_controls.append({
                        "control_id": control.control_id,
                        "title": control.title,
                        "objective": control.objective,
                    })

        for i, chunk in enumerate(chunks):
            try:
                # Progress logging every 50 chunks
                if i > 0 and i % 50 == 0:
                    logger.info(
                        "Progress: %d/%d chunks processed, %d controls generated, %d skipped by prefilter",
                        i, len(chunks), controls_count, chunks_skipped_prefilter,
                    )
                    self._update_job(job_id, result)

                # Stage 1.5: Local LLM pre-filter — skip chunks without requirements
                # NOTE: the pre-filter only runs outside dry-run; in dry-run
                # every chunk proceeds to batching.
                if not config.dry_run:
                    is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
                    if not is_relevant:
                        chunks_skipped_prefilter += 1
                        license_info = self._classify_license(chunk)
                        self._mark_chunk_processed(
                            chunk, license_info, "prefilter_skip", [], job_id
                        )
                        continue

                # Classify license and add to batch
                license_info = self._classify_license(chunk)
                chunk_regulation = chunk.regulation_code or "unknown"

                # Flush when: batch is full OR regulation changes (group boundary)
                if pending_batch and (
                    len(pending_batch) >= BATCH_SIZE
                    or chunk_regulation != current_batch_regulation
                ):
                    await _flush_batch()

                pending_batch.append((chunk, license_info))
                current_batch_regulation = chunk_regulation

            except Exception as e:
                error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)
                # Best-effort: mark the failed chunk so it is not retried
                # forever; swallow any secondary failure.
                try:
                    if not config.dry_run:
                        license_info = self._classify_license(chunk)
                        self._mark_chunk_processed(
                            chunk, license_info, "error", [], job_id
                        )
                except Exception:
                    pass

        # Flush remaining chunks
        await _flush_batch()

        result.chunks_skipped_prefilter = chunks_skipped_prefilter
        logger.info(
            "Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",
            controls_count, chunks_skipped_prefilter, len(chunks),
        )

        result.status = "completed"

    except Exception as e:
        result.status = "failed"
        result.errors.append(str(e))
        logger.error("Pipeline failed: %s", e)

    self._update_job(job_id, result)
    return result
|
|
|
|
async def _process_single_chunk(
    self,
    chunk: RAGSearchResult,
    config: GeneratorConfig,
    job_id: str,
) -> Optional[GeneratedControl]:
    """Process a single chunk through stages 2-5.

    Stage 2 classifies the chunk's license, Stage 3 structures (Rules 1/2)
    or fully reformulates (Rule 3) it, Stage 4 checks harmonization against
    existing controls, Stage 5 searches for open-source anchors. Returns the
    control (possibly with release_state "too_close"/"duplicate"/
    "needs_review"), or None when structuring produced no usable title or
    objective.
    """
    # Stage 2: License classification
    license_info = self._classify_license(chunk)

    # Stage 3: Structure or Reform based on rule
    if license_info["rule"] == 1:
        control = await self._structure_free_use(chunk, license_info)
    elif license_info["rule"] == 2:
        control = await self._structure_with_citation(chunk, license_info)
    else:
        control = await self._llm_reformulate(chunk, config)

    # Too-Close-Check for Rule 3 only: Rules 1/2 legitimately carry original
    # text, so running the similarity gate on them would wrongly flag
    # license-compliant controls as "too_close". (Fix: previously this check
    # ran unconditionally for every rule.)
    if license_info["rule"] == 3:
        similarity = await check_similarity(chunk.text, f"{control.objective} {control.rationale}")
        if similarity.status == "FAIL":
            control.release_state = "too_close"
            control.generation_metadata["similarity_status"] = "FAIL"
            control.generation_metadata["similarity_scores"] = {
                "token_overlap": similarity.token_overlap,
                "ngram_jaccard": similarity.ngram_jaccard,
                "lcs_ratio": similarity.lcs_ratio,
            }
            return control

    # Structuring/reformulation produced nothing usable — drop the chunk.
    if not control.title or not control.objective:
        return None

    # Stage 4: Harmonization — stop early on a duplicate of an existing control.
    duplicates = await self._check_harmonization(control)
    if duplicates:
        control.release_state = "duplicate"
        control.generation_metadata["similar_controls"] = duplicates
        return control

    # Stage 5: Anchor Search (imported from anchor_finder).
    # Best-effort: a failed anchor search only downgrades the release state
    # below, it never aborts the control.
    try:
        from .anchor_finder import AnchorFinder
        finder = AnchorFinder(self.rag)
        anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
        # Anchors may be dataclasses or plain dicts; normalize to dicts.
        control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
    except Exception as e:
        logger.warning("Anchor search failed: %s", e)

    # Determine release state: Rule 1/2 controls and anchored Rule 3
    # controls go straight to draft; unanchored Rule 3 needs human review.
    if control.license_rule in (1, 2):
        control.release_state = "draft"
    elif control.open_anchors:
        control.release_state = "draft"
    else:
        control.release_state = "needs_review"

    # Generate control_id (domain from config, else detected from objective).
    domain = config.domain or _detect_domain(control.objective)
    control.control_id = self._generate_control_id(domain, self.db)

    # Store job_id in metadata for traceability.
    control.generation_metadata["job_id"] = job_id

    return control
|