fix: Pipeline-Skalierung — 6 Optimierungen für 80k+ Controls

1. control_generator: GeneratorResult.status Default "completed" → "running" (Bug) 2. control_generator: Anthropic API mit Phase-Timeouts + Retry bei Disconnect 3. control_generator: regulation_exclude Filter + Harmonization via Qdrant statt In-Memory 4. decomposition_pass: Enrich Pass Batch-UPDATEs (400k → ~400 DB-Calls) 5. decomposition_pass: Merge Pass single Query statt N+1 6. batch_dedup_runner: Cross-Group Dedup parallelisiert (asyncio.gather) 7. canonical_control_routes: Framework Controls API Pagination (limit/offset) 8. DB-Indizes: idx_oc_parent_release, idx_oc_trigger_null, idx_cc_framework Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 14:09:32 +02:00
parent fc71117bf2
commit f89ce46631
5 changed files with 291 additions and 141 deletions
@@ -92,6 +92,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    "eucsa":                 {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
    "dataact":               {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Data Act"},
    "dora":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
+    "eu_2017_745":            {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Medizinprodukteverordnung (EU) 2017/745 (MDR)"},
    "ehds":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "European Health Data Space"},
    "gpsr":                  {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
    "eu_2023_988":           {"license": "EU_LAW",              "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
@@ -132,6 +133,39 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
    "ao":                    {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
    "ao_komplett":           {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
    "battdg":                {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
+    # New German Laws (2026-04-10 ingestion)
+    "de_bsig_2025":          {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "BSI-Gesetz (BSIG 2025, NIS2-Umsetzung)"},
+    "de_tdddg":              {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "TDDDG"},
+    "de_gwg_2017":           {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Geldwaeschegesetz (GwG)"},
+    "de_agg":                {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Allgemeines Gleichbehandlungsgesetz (AGG)"},
+    "de_hinschg":            {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Hinweisgeberschutzgesetz (HinSchG)"},
+    "de_lksg":               {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "Lieferkettensorgfaltspflichtengesetz (LkSG)"},
+    "de_kritisdachg":        {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "KRITIS-Dachgesetz (KRITISDachG)"},
+    "de_bsi_kritisv":        {"license": "DE_LAW",              "rule": 1, "source_type": "law", "name": "BSI-Kritisverordnung (BSI-KritisV)"},
+    # DSK/BfDI Guidance (amtliche Orientierungshilfen)
+    "dsk_oh_ki_datenschutz":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH KI und Datenschutz"},
+    "dsk_oh_ki_systeme_tom":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH TOM bei KI-Systemen"},
+    "dsk_oh_ki_rag":             {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Generative KI mit RAG"},
+    "dsk_oh_telemedien":         {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Telemedien"},
+    "dsk_oh_digitale_dienste":   {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Digitale Dienste"},
+    "dsk_oh_direktwerbung":      {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Direktwerbung"},
+    "dsk_oh_videokonferenz":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Videokonferenzsysteme"},
+    "dsk_oh_videoueberwachung":  {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Videoueberwachung"},
+    "dsk_oh_email_verschluesselung": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH E-Mail-Verschluesselung"},
+    "dsk_oh_whistleblowing":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Whistleblowing"},
+    "dsk_oh_onlinedienste_zugang": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Onlinedienste Zugang"},
+    "dsk_oh_datenuebermittlung_drittlaender": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK OH Datenuebermittlung Drittlaender"},
+    "dsk_sdm_methode":           {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Standard-Datenschutzmodell SDM V3.1a"},
+    "dsk_ah_eu_us_dpf":          {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK AH EU-US Data Privacy Framework"},
+    "dsk_ah_bussgeldkonzept":    {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Bussgeldkonzept"},
+    "dsk_ah_dsfa_mussliste":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK DSFA Muss-Liste"},
+    "dsk_ah_verzeichnis_vvt":    {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Hinweise VVT"},
+    "dsk_ah_zertifizierung":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Zertifizierungskriterien"},
+    "dsk_beschluss_ms365":       {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Festlegung Microsoft 365"},
+    "dsk_pos_ki_verordnung":     {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Positionspapier KI-Verordnung"},
+    "dsk_entschl_beschaeftigtendatenschutz": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "DSK Entschliessung Beschaeftigtendatenschutz"},
+    "bfdi_handreichung_ki_behoerden": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "BfDI Handreichung KI in Behoerden"},
+    "bfdi_handreichung_ki_sicherheit": {"license": "DE_PUBLIC", "rule": 1, "source_type": "guideline", "name": "BfDI Handreichung KI Sicherheitsbehoerden"},
    # Austrian Laws
    "at_dsg":                {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
    "at_abgb":               {"license": "AT_LAW",              "rule": 1, "source_type": "law", "name": "AT ABGB"},
@@ -459,6 +493,7 @@ class GeneratorConfig(BaseModel):
    dry_run: bool = False
    existing_job_id: Optional[str] = None  # If set, reuse this job instead of creating a new one
    regulation_filter: Optional[List[str]] = None  # Only process chunks matching these regulation_code prefixes
+    regulation_exclude: Optional[List[str]] = None  # Skip chunks matching these regulation_code prefixes
    skip_prefilter: bool = False  # If True, skip local LLM pre-filter (send all chunks to API)


@@ -501,7 +536,7 @@ class GeneratedControl:
@dataclass
 class GeneratorResult:
    job_id: str = ""
-    status: str = "completed"
+    status: str = "running"
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
@@ -583,8 +618,8 @@ Antworte NUR mit JSON: {{"relevant": true/false, "reason": "kurze Begründung"}}
        return True, f"error: {e}"


-async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> str:
-    """Call Anthropic Messages API."""
+async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None, max_retries: int = 2) -> str:
+    """Call Anthropic Messages API with retry on timeout."""
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
@@ -598,24 +633,36 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
    if system_prompt:
        payload["system"] = system_prompt

-    try:
-        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
-            resp = await client.post(
-                "https://api.anthropic.com/v1/messages",
-                headers=headers,
-                json=payload,
-            )
-            if resp.status_code != 200:
-                logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
+    # Use explicit per-phase timeouts to prevent indefinite hangs
+    timeout = httpx.Timeout(connect=30.0, read=LLM_TIMEOUT, write=30.0, pool=30.0)
+
+    for attempt in range(1 + max_retries):
+        try:
+            async with httpx.AsyncClient(timeout=timeout) as client:
+                resp = await client.post(
+                    "https://api.anthropic.com/v1/messages",
+                    headers=headers,
+                    json=payload,
+                )
+                if resp.status_code != 200:
+                    logger.error("Anthropic API %d: %s", resp.status_code, resp.text[:300])
+                    return ""
+                data = resp.json()
+                content = data.get("content", [])
+                if content and isinstance(content, list):
+                    return content[0].get("text", "")
                return ""
-            data = resp.json()
-            content = data.get("content", [])
-            if content and isinstance(content, list):
-                return content[0].get("text", "")
+        except (httpx.TimeoutException, httpx.RemoteProtocolError) as e:
+            if attempt < max_retries:
+                logger.warning("Anthropic request attempt %d/%d failed: %s — retrying...", attempt + 1, max_retries + 1, e)
+                import asyncio
+                await asyncio.sleep(2 ** attempt)
+                continue
+            logger.error("Anthropic request failed after %d attempts: %s (type: %s)", max_retries + 1, e, type(e).__name__)
+            return ""
+        except Exception as e:
+            logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
            return ""
-    except Exception as e:
-        logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
-        return ""


 async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
@@ -941,6 +988,12 @@ class ControlGeneratorPipeline:
                        if not any(code_lower.startswith(f.lower()) for f in config.regulation_filter):
                            continue

+                    # Exclude specific regulation_codes
+                    if config.regulation_exclude and reg_code:
+                        code_lower = reg_code.lower()
+                        if any(code_lower.startswith(f.lower()) for f in config.regulation_exclude):
+                            continue
+
                    reg_name = (payload.get("regulation_name_de", "")
                                or payload.get("regulation_name", "")
                                or payload.get("source_name", "")
@@ -1423,20 +1476,31 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
        if structure_items:
            s_chunks = [c for c, _ in structure_items]
            s_lics = [l for _, l in structure_items]
-            s_controls = await self._structure_batch(s_chunks, s_lics)
+            try:
+                s_controls = await self._structure_batch(s_chunks, s_lics)
+            except Exception as e:
+                import traceback
+                logger.warning("Batch structure failed: %s — creating fallback controls\n%s", e, traceback.format_exc())
+                s_controls = [self._fallback_control(c) for c in s_chunks]
            for (chunk, _), ctrl in zip(structure_items, s_controls):
                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
                all_controls[orig_idx] = ctrl

        if reform_items:
            r_chunks = [c for c, _ in reform_items]
-            r_controls = await self._reformulate_batch(r_chunks, config)
+            try:
+                r_controls = await self._reformulate_batch(r_chunks, config)
+            except Exception as e:
+                logger.warning("Batch reform failed: %s — creating fallback controls", e)
+                r_controls = [self._fallback_control(c) for c in r_chunks]
            for (chunk, _), ctrl in zip(reform_items, r_controls):
                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
                if ctrl:
                    # Too-Close-Check for Rule 3
                    similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
-                    if similarity.status == "FAIL":
+                    if similarity is None:
+                        logger.warning("Similarity check returned None — skipping too-close check")
+                    elif similarity.status == "FAIL":
                        ctrl.release_state = "too_close"
                        ctrl.generation_metadata["similarity_status"] = "FAIL"
                        ctrl.generation_metadata["similarity_scores"] = {
@@ -1502,36 +1566,42 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
    # ── Stage 4: Harmonization ─────────────────────────────────────────

    async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
-        """Check if a new control duplicates existing ones via embedding similarity."""
-        existing = self._load_existing_controls()
-        if not existing:
-            return None
-
-        # Pre-load all existing embeddings in batch (once per pipeline run)
-        if not self._existing_embeddings:
-            await self._preload_embeddings(existing)
+        """Check if a new control duplicates existing ones via Qdrant vector search.

+        Uses the atomic_controls_dedup collection for fast nearest-neighbor lookup
+        instead of pre-loading all embeddings into memory.
+        """
        new_text = f"{new_control.title} {new_control.objective}"
        new_emb = await _get_embedding(new_text)
        if not new_emb:
            return None

-        similar = []
-        for ex in existing:
-            ex_key = ex.get("control_id", "")
-            ex_emb = self._existing_embeddings.get(ex_key, [])
-            if not ex_emb:
-                continue
+        try:
+            async with httpx.AsyncClient(timeout=10.0) as client:
+                resp = await client.post(
+                    f"{QDRANT_URL}/collections/atomic_controls_dedup/points/search",
+                    json={
+                        "vector": new_emb,
+                        "limit": 5,
+                        "score_threshold": HARMONIZATION_THRESHOLD,
+                        "with_payload": {"include": ["control_id", "title"]},
+                    },
+                )
+                if resp.status_code == 200:
+                    results = resp.json().get("result", [])
+                    if results:
+                        return [
+                            {
+                                "control_id": r["payload"].get("control_id", ""),
+                                "title": r["payload"].get("title", ""),
+                                "similarity": round(r["score"], 3),
+                            }
+                            for r in results
+                        ]
+        except Exception as e:
+            logger.warning("Qdrant dedup search failed: %s — skipping harmonization", e)

-            cosine = _cosine_sim(new_emb, ex_emb)
-            if cosine > HARMONIZATION_THRESHOLD:
-                similar.append({
-                    "control_id": ex.get("control_id", ""),
-                    "title": ex.get("title", ""),
-                    "similarity": round(cosine, 3),
-                })
-
-        return similar if similar else None
+        return None

    async def _preload_embeddings(self, existing: list[dict]):
        """Pre-load embeddings for all existing controls in batches."""
@@ -1580,9 +1650,11 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Aspekte ohne
        if severity not in ("low", "medium", "high", "critical"):
            severity = "medium"

-        tags = data.get("tags", [])
+        tags = data.get("tags") or []
        if isinstance(tags, str):
            tags = [t.strip() for t in tags.split(",")]
+        if not isinstance(tags, list):
+            tags = []

        # Use LLM-provided domain if available, fallback to keyword-detected domain
        llm_domain = data.get("domain")