fix: Pipeline-Skalierung — 6 Optimierungen für 80k+ Controls
1. control_generator: GeneratorResult.status Default "completed" → "running" (Bug) 2. control_generator: Anthropic API mit Phase-Timeouts + Retry bei Disconnect 3. control_generator: regulation_exclude Filter + Harmonization via Qdrant statt In-Memory 4. decomposition_pass: Enrich Pass Batch-UPDATEs (400k → ~400 DB-Calls) 5. decomposition_pass: Merge Pass single Query statt N+1 6. batch_dedup_runner: Cross-Group Dedup parallelisiert (asyncio.gather) 7. canonical_control_routes: Framework Controls API Pagination (limit/offset) 8. DB-Indizes: idx_oc_parent_release, idx_oc_trigger_null, idx_cc_framework Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -92,6 +92,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"eucsa": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
|
||||
"dataact": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Act"},
|
||||
"dora": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
|
||||
"eu_2017_745": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Medizinprodukteverordnung (EU) 2017/745 (MDR)"},
|
||||
"ehds": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "European Health Data Space"},
|
||||
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
|
||||
"eu_2023_988": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
|
||||
@@ -132,6 +133,39 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
|
||||
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
|
||||
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
|
||||
"battdg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
|
||||
# New German Laws (2026-04-10 ingestion)
|
||||
"de_bsig_2025": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BSI-Gesetz (BSIG 2025, NIS2-Umsetzung)"},
|
||||
"de_tdddg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TDDDG"},
|
||||
"de_gwg_2017": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Geldwaeschegesetz (GwG)"},
|
||||
"de_agg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Allgemeines Gleichbehandlungsgesetz (AGG)"},
|
||||
"de_hinschg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Hinweisgeberschutzgesetz (HinSchG)"},
|
||||
"de_lksg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Lieferkettensorgfaltspflichtengesetz (LkSG)"},
|
||||
"de_kritisdachg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "KRITIS-Dachgesetz (KRITISDachG)"},
|
||||
"de_bsi_kritisv": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BSI-Kritisverordnung (BSI-KritisV)"},
|
||||
# DSK/BfDI Guidance (amtliche Orientierungshilfen)
|
||||
"dsk_oh_ki_datenschutz": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH KI und Datenschutz"},
|
||||
"dsk_oh_ki_systeme_tom": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH TOM bei KI-Systemen"},
|
||||
"dsk_oh_ki_rag": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Generative KI mit RAG"},
|
||||
"dsk_oh_telemedien": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Telemedien"},
|
||||
"dsk_oh_digitale_dienste": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Digitale Dienste"},
|
||||
"dsk_oh_direktwerbung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Direktwerbung"},
|
||||
"dsk_oh_videokonferenz": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Videokonferenzsysteme"},
|
||||
"dsk_oh_videoueberwachung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Videoueberwachung"},
|
||||
"dsk_oh_email_verschluesselung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH E-Mail-Verschluesselung"},
|
||||
"dsk_oh_whistleblowing": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Whistleblowing"},
|
||||
"dsk_oh_onlinedienste_zugang": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Onlinedienste Zugang"},
|
||||
"dsk_oh_datenuebermittlung_drittlaender": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Datenuebermittlung Drittlaender"},
|
||||
"dsk_sdm_methode": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Standard-Datenschutzmodell SDM V3.1a"},
|
||||
"dsk_ah_eu_us_dpf": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK AH EU-US Data Privacy Framework"},
|
||||
"dsk_ah_bussgeldkonzept": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Bussgeldkonzept"},
|
||||
"dsk_ah_dsfa_mussliste": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK DSFA Muss-Liste"},
|
||||
"dsk_ah_verzeichnis_vvt": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Hinweise VVT"},
|
||||
"dsk_ah_zertifizierung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Zertifizierungskriterien"},
|
||||
"dsk_beschluss_ms365": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Festlegung Microsoft 365"},
|
||||
"dsk_pos_ki_verordnung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Positionspapier KI-Verordnung"},
|
||||
"dsk_entschl_beschaeftigtendatenschutz": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Entschliessung Beschaeftigtendatenschutz"},
|
||||
"bfdi_handreichung_ki_behoerden": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "BfDI Handreichung KI in Behoerden"},
|
||||
"bfdi_handreichung_ki_sicherheit": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "BfDI Handreichung KI Sicherheitsbehoerden"},
|
||||
# Austrian Laws
|
||||
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
|
||||
"at_abgb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB"},
|
||||
@@ -459,6 +493,7 @@ class GeneratorConfig(BaseModel):
|
||||
dry_run: bool = False
|
||||
existing_job_id: Optional[str] = None # If set, reuse this job instead of creating a new one
|
||||
regulation_filter: Optional[List[str]] = None # Only process chunks matching these regulation_code prefixes
|
||||
regulation_exclude: Optional[List[str]] = None # Skip chunks matching these regulation_code prefixes
|
||||
skip_prefilter: bool = False # If True, skip local LLM pre-filter (send all chunks to API)
|
||||
|
||||
|
||||
@@ -501,7 +536,7 @@ class GeneratedControl:
|
||||
@dataclass
|
||||
class GeneratorResult:
|
||||
job_id: str = ""
|
||||
status: str = "completed"
|
||||
status: str = "running"
|
||||
total_chunks_scanned: int = 0
|
||||
controls_generated: int = 0
|
||||
controls_verified: int = 0
|
||||
@@ -583,8 +618,8 @@ Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}
|
||||
return True, f"error: {e}"
|
||||
|
||||
|
||||
async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
"""Call Anthropic Messages API."""
|
||||
async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None, max_retries: int = 2) -> str:
|
||||
"""Call Anthropic Messages API with retry on timeout."""
|
||||
headers = {
|
||||
"x-api-key": ANTHROPIC_API_KEY,
|
||||
"anthropic-version": "2023-06-01",
|
||||
@@ -598,24 +633,36 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
|
||||
if system_prompt:
|
||||
payload["system"] = system_prompt
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
|
||||
resp = await client.post(
|
||||
"https://api.anthropic.com/v1/messages",
|
||||
headers=headers,
|
||||
json=payload,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
|
||||
# Use explicit per-phase timeouts to prevent indefinite hangs
|
||||
timeout = httpx.Timeout(connect=30.0, read=LLM_TIMEOUT, write=30.0, pool=30.0)
|
||||
|
||||
for attempt in range(1 + max_retries):
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
resp = await client.post(
|
||||
"https://api.anthropic.com/v1/messages",
|
||||
headers=headers,
|
||||
json=payload,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
|
||||
return ""
|
||||
data = resp.json()
|
||||
content = data.get("content", [])
|
||||
if content and isinstance(content, list):
|
||||
return content[0].get("text", "")
|
||||
return ""
|
||||
data = resp.json()
|
||||
content = data.get("content", [])
|
||||
if content and isinstance(content, list):
|
||||
return content[0].get("text", "")
|
||||
except (httpx.TimeoutException, httpx.RemoteProtocolError) as e:
|
||||
if attempt < max_retries:
|
||||
logger.warning("Anthropic request attempt %d/%d failed: %s — retrying...", attempt + 1, max_retries + 1, e)
|
||||
import asyncio
|
||||
await asyncio.sleep(2 ** attempt)
|
||||
continue
|
||||
logger.error("Anthropic request failed after %d attempts: %s (type: %s)", max_retries + 1, e, type(e).__name__)
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
|
||||
return ""
|
||||
|
||||
|
||||
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
|
||||
@@ -941,6 +988,12 @@ class ControlGeneratorPipeline:
|
||||
if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
|
||||
continue
|
||||
|
||||
# Exclude specific regulation_codes
|
||||
if config.regulation_exclude and reg_code:
|
||||
code_lower = reg_code.lower()
|
||||
if any(code_lower.startswith(f.lower()) for f in config.regulation_exclude):
|
||||
continue
|
||||
|
||||
reg_name = (payload.get("regulation_name_de", "")
|
||||
or payload.get("regulation_name", "")
|
||||
or payload.get("source_name", "")
|
||||
@@ -1423,20 +1476,31 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
||||
if structure_items:
|
||||
s_chunks = [c for c, _ in structure_items]
|
||||
s_lics = [l for _, l in structure_items]
|
||||
s_controls = await self._structure_batch(s_chunks, s_lics)
|
||||
try:
|
||||
s_controls = await self._structure_batch(s_chunks, s_lics)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.warning("Batch structure failed: %s — creating fallback controls\n%s", e, traceback.format_exc())
|
||||
s_controls = [self._fallback_control(c) for c in s_chunks]
|
||||
for (chunk, _), ctrl in zip(structure_items, s_controls):
|
||||
orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
|
||||
all_controls[orig_idx] = ctrl
|
||||
|
||||
if reform_items:
|
||||
r_chunks = [c for c, _ in reform_items]
|
||||
r_controls = await self._reformulate_batch(r_chunks, config)
|
||||
try:
|
||||
r_controls = await self._reformulate_batch(r_chunks, config)
|
||||
except Exception as e:
|
||||
logger.warning("Batch reform failed: %s — creating fallback controls", e)
|
||||
r_controls = [self._fallback_control(c) for c in r_chunks]
|
||||
for (chunk, _), ctrl in zip(reform_items, r_controls):
|
||||
orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
|
||||
if ctrl:
|
||||
# Too-Close-Check for Rule 3
|
||||
similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
|
||||
if similarity.status == "FAIL":
|
||||
if similarity is None:
|
||||
logger.warning("Similarity check returned None — skipping too-close check")
|
||||
elif similarity.status == "FAIL":
|
||||
ctrl.release_state = "too_close"
|
||||
ctrl.generation_metadata["similarity_status"] = "FAIL"
|
||||
ctrl.generation_metadata["similarity_scores"] = {
|
||||
@@ -1502,36 +1566,42 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
||||
# ── Stage 4: Harmonization ─────────────────────────────────────────
|
||||
|
||||
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
|
||||
"""Check if a new control duplicates existing ones via embedding similarity."""
|
||||
existing = self._load_existing_controls()
|
||||
if not existing:
|
||||
return None
|
||||
|
||||
# Pre-load all existing embeddings in batch (once per pipeline run)
|
||||
if not self._existing_embeddings:
|
||||
await self._preload_embeddings(existing)
|
||||
"""Check if a new control duplicates existing ones via Qdrant vector search.
|
||||
|
||||
Uses the atomic_controls_dedup collection for fast nearest-neighbor lookup
|
||||
instead of pre-loading all embeddings into memory.
|
||||
"""
|
||||
new_text = f"{new_control.title} {new_control.objective}"
|
||||
new_emb = await _get_embedding(new_text)
|
||||
if not new_emb:
|
||||
return None
|
||||
|
||||
similar = []
|
||||
for ex in existing:
|
||||
ex_key = ex.get("control_id", "")
|
||||
ex_emb = self._existing_embeddings.get(ex_key, [])
|
||||
if not ex_emb:
|
||||
continue
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
resp = await client.post(
|
||||
f"{QDRANT_URL}/collections/atomic_controls_dedup/points/search",
|
||||
json={
|
||||
"vector": new_emb,
|
||||
"limit": 5,
|
||||
"score_threshold": HARMONIZATION_THRESHOLD,
|
||||
"with_payload": {"include": ["control_id", "title"]},
|
||||
},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
results = resp.json().get("result", [])
|
||||
if results:
|
||||
return [
|
||||
{
|
||||
"control_id": r["payload"].get("control_id", ""),
|
||||
"title": r["payload"].get("title", ""),
|
||||
"similarity": round(r["score"], 3),
|
||||
}
|
||||
for r in results
|
||||
]
|
||||
except Exception as e:
|
||||
logger.warning("Qdrant dedup search failed: %s — skipping harmonization", e)
|
||||
|
||||
cosine = _cosine_sim(new_emb, ex_emb)
|
||||
if cosine > HARMONIZATION_THRESHOLD:
|
||||
similar.append({
|
||||
"control_id": ex.get("control_id", ""),
|
||||
"title": ex.get("title", ""),
|
||||
"similarity": round(cosine, 3),
|
||||
})
|
||||
|
||||
return similar if similar else None
|
||||
return None
|
||||
|
||||
async def _preload_embeddings(self, existing: list[dict]):
|
||||
"""Pre-load embeddings for all existing controls in batches."""
|
||||
@@ -1580,9 +1650,11 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
|
||||
if severity not in ("low", "medium", "high", "critical"):
|
||||
severity = "medium"
|
||||
|
||||
tags = data.get("tags", [])
|
||||
tags = data.get("tags") or []
|
||||
if isinstance(tags, str):
|
||||
tags = [t.strip() for t in tags.split(",")]
|
||||
if not isinstance(tags, list):
|
||||
tags = []
|
||||
|
||||
# Use LLM-provided domain if available, fallback to keyword-detected domain
|
||||
llm_domain = data.get("domain")
|
||||
|
||||
Reference in New Issue
Block a user