feat: add policy library with 29 German policy templates

Add 29 new document types (IT security, data, personnel, vendor, BCM policies) to VALID_DOCUMENT_TYPES and 5 category pills to the document generator UI. Include seed script for production DB population. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 22:37:33 +01:00
parent 637fab6fdb
commit 0171d611f6
4 changed files with 1842 additions and 54 deletions
@@ -32,19 +32,26 @@ import {

 const CATEGORIES: { key: string; label: string; types: string[] | null }[] = [
  { key: 'all', label: 'Alle', types: null },
+  // Legal / Vertragsvorlagen
  { key: 'privacy_policy', label: 'Datenschutz', types: ['privacy_policy'] },
  { key: 'terms', label: 'AGB', types: ['terms_of_service', 'agb', 'clause'] },
  { key: 'impressum', label: 'Impressum', types: ['impressum'] },
  { key: 'dpa', label: 'AVV/DPA', types: ['dpa'] },
  { key: 'nda', label: 'NDA', types: ['nda'] },
  { key: 'sla', label: 'SLA', types: ['sla'] },
-  { key: 'acceptable_use', label: 'AUP', types: ['acceptable_use'] },
  { key: 'widerruf', label: 'Widerruf', types: ['widerruf'] },
  { key: 'cookie', label: 'Cookie', types: ['cookie_policy', 'cookie_banner'] },
  { key: 'cloud', label: 'Cloud', types: ['cloud_service_agreement'] },
  { key: 'misc', label: 'Weitere', types: ['community_guidelines', 'copyright_policy', 'data_usage_clause'] },
  { key: 'dsfa', label: 'DSFA', types: ['dsfa'] },
+  // Sicherheitskonzepte (Migration 051)
  { key: 'security', label: 'Sicherheitskonzepte', types: ['it_security_concept', 'data_protection_concept', 'backup_recovery_concept', 'logging_concept', 'incident_response_plan', 'access_control_concept', 'risk_management_concept'] },
+  // Policy-Bibliothek (Migration 054)
+  { key: 'it_security_policies', label: 'IT-Sicherheit Policies', types: ['information_security_policy', 'access_control_policy', 'password_policy', 'encryption_policy', 'logging_policy', 'backup_policy', 'incident_response_policy', 'change_management_policy', 'patch_management_policy', 'asset_management_policy', 'cloud_security_policy', 'devsecops_policy', 'secrets_management_policy', 'vulnerability_management_policy'] },
+  { key: 'data_policies', label: 'Daten-Policies', types: ['data_protection_policy', 'data_classification_policy', 'data_retention_policy', 'data_transfer_policy', 'privacy_incident_policy'] },
+  { key: 'hr_policies', label: 'Personal-Policies', types: ['employee_security_policy', 'security_awareness_policy', 'acceptable_use', 'remote_work_policy', 'offboarding_policy'] },
+  { key: 'vendor_policies', label: 'Lieferanten-Policies', types: ['vendor_risk_management_policy', 'third_party_security_policy', 'supplier_security_policy'] },
+  { key: 'bcm_policies', label: 'BCM/Notfall', types: ['business_continuity_policy', 'disaster_recovery_policy', 'crisis_management_policy'] },
 ]

 // =============================================================================
@@ -58,6 +58,40 @@ VALID_DOCUMENT_TYPES = {
    "incident_response_plan",
    "access_control_concept",
    "risk_management_concept",
+    # Policy templates — IT Security (Migration 054)
+    "information_security_policy",
+    "access_control_policy",
+    "password_policy",
+    "encryption_policy",
+    "logging_policy",
+    "backup_policy",
+    "incident_response_policy",
+    "change_management_policy",
+    "patch_management_policy",
+    "asset_management_policy",
+    "cloud_security_policy",
+    "devsecops_policy",
+    "secrets_management_policy",
+    "vulnerability_management_policy",
+    # Policy templates — Data (Migration 054)
+    "data_protection_policy",
+    "data_classification_policy",
+    "data_retention_policy",
+    "data_transfer_policy",
+    "privacy_incident_policy",
+    # Policy templates — Personnel (Migration 054)
+    "employee_security_policy",
+    "security_awareness_policy",
+    "remote_work_policy",
+    "offboarding_policy",
+    # Policy templates — Vendor/Supply Chain (Migration 054)
+    "vendor_risk_management_policy",
+    "third_party_security_policy",
+    "supplier_security_policy",
+    # Policy templates — BCM (Migration 054)
+    "business_continuity_policy",
+    "disaster_recovery_policy",
+    "crisis_management_policy",
 }
 VALID_STATUSES = {"published", "draft", "archived"}

@@ -47,7 +47,7 @@ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
 ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
-LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
+LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

 HARMONIZATION_THRESHOLD = 0.85  # Cosine similarity above this = duplicate

@@ -466,7 +466,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
    }
    payload = {
        "model": ANTHROPIC_MODEL,
-        "max_tokens": 4096,
+        "max_tokens": 8192,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system_prompt:
@@ -488,7 +488,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
                return content[0].get("text", "")
            return ""
    except Exception as e:
-        logger.error("Anthropic request failed: %s", e)
+        logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
        return ""


@@ -598,6 +598,57 @@ def _parse_llm_json(raw: str) -> dict:
    return {}


+def _parse_llm_json_array(raw: str) -> list[dict]:
+    """Extract a JSON array from LLM response — returns list of dicts."""
+    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
+    text = match.group(1) if match else raw
+
+    # Try parsing as array directly
+    try:
+        parsed = json.loads(text)
+        if isinstance(parsed, list):
+            return parsed
+        if isinstance(parsed, dict):
+            # Check if it wraps an array (e.g. {"controls": [...]})
+            for key in ("controls", "results", "items", "data"):
+                if key in parsed and isinstance(parsed[key], list):
+                    return parsed[key]
+            return [parsed]
+    except json.JSONDecodeError:
+        pass
+
+    # Try finding [ ... ] block
+    bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
+    if bracket_match:
+        try:
+            parsed = json.loads(bracket_match.group(0))
+            if isinstance(parsed, list):
+                return parsed
+        except json.JSONDecodeError:
+            pass
+
+    # Try finding multiple { ... } blocks (LLM sometimes returns separate objects)
+    objects = []
+    for obj_match in re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL):
+        try:
+            obj = json.loads(obj_match.group(0))
+            if isinstance(obj, dict) and obj.get("title"):
+                objects.append(obj)
+        except json.JSONDecodeError:
+            continue
+    if objects:
+        logger.info("Parsed %d individual JSON objects from batch response", len(objects))
+        return objects
+
+    # Fallback: try single object
+    single = _parse_llm_json(raw)
+    if single:
+        logger.info("Batch parse fallback: extracted single object")
+    else:
+        logger.warning("Batch parse failed — logging first 500 chars: %s", raw[:500])
+    return [single] if single else []
+
+
 # ---------------------------------------------------------------------------
 # Pipeline
 # ---------------------------------------------------------------------------
@@ -606,11 +657,11 @@ REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe
 Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
 KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
 NENNE NICHT die Quelle. Keine proprietären Bezeichner.
-Antworte NUR mit validem JSON."""
+Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

 STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
 als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
-Antworte NUR mit validem JSON."""
+Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""


 class ControlGeneratorPipeline:
@@ -881,6 +932,241 @@ Gib JSON zurück mit diesen Feldern:
        }
        return control

+    # ── Stage 3 BATCH: Multiple chunks in one API call ─────────────────
+
+    async def _structure_batch(
+        self,
+        chunks: list[RAGSearchResult],
+        license_infos: list[dict],
+    ) -> list[Optional[GeneratedControl]]:
+        """Structure multiple free-use/citation chunks in a single Anthropic call."""
+        chunk_entries = []
+        for idx, (chunk, lic) in enumerate(zip(chunks, license_infos)):
+            source_name = lic.get("name", chunk.regulation_name)
+            chunk_entries.append(
+                f"--- CHUNK {idx + 1} ---\n"
+                f"Text: {chunk.text[:2000]}\n"
+                f"Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}\n"
+                f"Lizenz: {source_name} ({lic.get('license', '')})"
+            )
+        joined = "\n\n".join(chunk_entries)
+        prompt = f"""Strukturiere die folgenden {len(chunks)} Gesetzestexte jeweils als eigenstaendiges Security/Compliance Control.
+Du DARFST den Originaltext verwenden (Quellen sind jeweils angegeben).
+
+WICHTIG:
+- Erstelle fuer JEDEN Chunk ein separates Control mit verstaendlicher, praxisorientierter Formulierung.
+- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
+- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
+
+Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
+- chunk_index: 1-basierter Index des Chunks (1, 2, 3, ...)
+- title: Kurzer praegnanter Titel (max 100 Zeichen)
+- objective: Was soll erreicht werden? (1-3 Saetze)
+- rationale: Warum ist das wichtig? (1-2 Saetze)
+- requirements: Liste von konkreten Anforderungen (Strings)
+- test_procedure: Liste von Pruefschritten (Strings)
+- evidence: Liste von Nachweisdokumenten (Strings)
+- severity: low/medium/high/critical
+- tags: Liste von Tags
+
+{joined}"""
+
+        raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
+        results = _parse_llm_json_array(raw)
+        logger.info("Batch structure: parsed %d results from API response", len(results))
+
+        # Map results back to chunks by chunk_index (or by position if no index)
+        controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
+        for pos, data in enumerate(results):
+            # Try chunk_index first, fall back to position
+            idx = data.get("chunk_index")
+            if idx is not None:
+                idx = int(idx) - 1  # Convert to 0-based
+            else:
+                idx = pos  # Use position as fallback
+            if idx < 0 or idx >= len(chunks):
+                logger.warning("Batch: chunk_index %d out of range (0-%d), using position %d", idx, len(chunks)-1, pos)
+                idx = min(pos, len(chunks) - 1)
+            chunk = chunks[idx]
+            lic = license_infos[idx]
+            domain = _detect_domain(chunk.text)
+            control = self._build_control_from_json(data, domain)
+            control.license_rule = lic["rule"]
+            if lic["rule"] in (1, 2):
+                control.source_original_text = chunk.text
+                control.source_citation = {
+                    "source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
+                    "license": lic.get("license", ""),
+                    "license_notice": lic.get("attribution", ""),
+                    "url": chunk.source_url or "",
+                }
+                control.customer_visible = True
+            control.verification_method = _detect_verification_method(chunk.text)
+            control.category = _detect_category(chunk.text)
+            control.generation_metadata = {
+                "processing_path": "structured_batch",
+                "license_rule": lic["rule"],
+                "source_regulation": chunk.regulation_code,
+                "source_article": chunk.article,
+                "batch_size": len(chunks),
+            }
+            controls[idx] = control
+
+        return controls
+
+    async def _reformulate_batch(
+        self,
+        chunks: list[RAGSearchResult],
+        config: GeneratorConfig,
+    ) -> list[Optional[GeneratedControl]]:
+        """Reformulate multiple restricted chunks in a single Anthropic call."""
+        chunk_entries = []
+        for idx, chunk in enumerate(chunks):
+            domain = config.domain or _detect_domain(chunk.text)
+            chunk_entries.append(
+                f"--- ASPEKT {idx + 1} ---\n"
+                f"Domain: {domain}\n"
+                f"Text (nur zur Analyse, NICHT kopieren, NICHT referenzieren):\n{chunk.text[:1500]}"
+            )
+        joined = "\n\n".join(chunk_entries)
+        prompt = f"""Analysiere die folgenden {len(chunks)} Pruefaspekte und formuliere fuer JEDEN ein EIGENSTAENDIGES Security Control.
+KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur.
+NENNE NICHT die Quellen. Keine proprietaeren Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).
+
+WICHTIG:
+- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
+- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
+
+Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
+- chunk_index: 1-basierter Index des Aspekts (1, 2, 3, ...)
+- title: Kurzer eigenstaendiger Titel (max 100 Zeichen)
+- objective: Eigenstaendige Formulierung des Ziels (1-3 Saetze)
+- rationale: Eigenstaendige Begruendung (1-2 Saetze)
+- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
+- test_procedure: Liste von Pruefschritten (Strings)
+- evidence: Liste von Nachweisdokumenten (Strings)
+- severity: low/medium/high/critical
+- tags: Liste von Tags (eigene Begriffe)
+
+{joined}"""
+
+        raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
+        results = _parse_llm_json_array(raw)
+        logger.info("Batch reform: parsed %d results from API response", len(results))
+
+        controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
+        for pos, data in enumerate(results):
+            idx = data.get("chunk_index")
+            if idx is not None:
+                idx = int(idx) - 1
+            else:
+                idx = pos
+            if idx < 0 or idx >= len(chunks):
+                logger.warning("Batch reform: chunk_index %d out of range, using position %d", idx, pos)
+                idx = min(pos, len(chunks) - 1)
+            chunk = chunks[idx]
+            domain = config.domain or _detect_domain(chunk.text)
+            control = self._build_control_from_json(data, domain)
+            control.license_rule = 3
+            control.source_original_text = None
+            control.source_citation = None
+            control.customer_visible = False
+            control.verification_method = _detect_verification_method(chunk.text)
+            control.category = _detect_category(chunk.text)
+            control.generation_metadata = {
+                "processing_path": "llm_reform_batch",
+                "license_rule": 3,
+                "batch_size": len(chunks),
+            }
+            controls[idx] = control
+
+        return controls
+
+    async def _process_batch(
+        self,
+        batch_items: list[tuple[RAGSearchResult, dict]],
+        config: GeneratorConfig,
+        job_id: str,
+    ) -> list[Optional[GeneratedControl]]:
+        """Process a batch of (chunk, license_info) through stages 3-5."""
+        # Split by license rule: Rule 1+2 → structure, Rule 3 → reform
+        structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
+        reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
+
+        all_controls: dict[int, Optional[GeneratedControl]] = {}
+
+        if structure_items:
+            s_chunks = [c for c, _ in structure_items]
+            s_lics = [l for _, l in structure_items]
+            s_controls = await self._structure_batch(s_chunks, s_lics)
+            for (chunk, _), ctrl in zip(structure_items, s_controls):
+                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
+                all_controls[orig_idx] = ctrl
+
+        if reform_items:
+            r_chunks = [c for c, _ in reform_items]
+            r_controls = await self._reformulate_batch(r_chunks, config)
+            for (chunk, _), ctrl in zip(reform_items, r_controls):
+                orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
+                if ctrl:
+                    # Too-Close-Check for Rule 3
+                    similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
+                    if similarity.status == "FAIL":
+                        ctrl.release_state = "too_close"
+                        ctrl.generation_metadata["similarity_status"] = "FAIL"
+                        ctrl.generation_metadata["similarity_scores"] = {
+                            "token_overlap": similarity.token_overlap,
+                            "ngram_jaccard": similarity.ngram_jaccard,
+                            "lcs_ratio": similarity.lcs_ratio,
+                        }
+                all_controls[orig_idx] = ctrl
+
+        # Post-process all controls: harmonization + anchor search
+        final: list[Optional[GeneratedControl]] = []
+        for i in range(len(batch_items)):
+            control = all_controls.get(i)
+            if not control or (not control.title and not control.objective):
+                final.append(None)
+                continue
+
+            if control.release_state == "too_close":
+                final.append(control)
+                continue
+
+            # Harmonization
+            duplicates = await self._check_harmonization(control)
+            if duplicates:
+                control.release_state = "duplicate"
+                control.generation_metadata["similar_controls"] = duplicates
+                final.append(control)
+                continue
+
+            # Anchor search
+            try:
+                from .anchor_finder import AnchorFinder
+                finder = AnchorFinder(self.rag)
+                anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
+                control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
+            except Exception as e:
+                logger.warning("Anchor search failed: %s", e)
+
+            # Release state
+            if control.license_rule in (1, 2):
+                control.release_state = "draft"
+            elif control.open_anchors:
+                control.release_state = "draft"
+            else:
+                control.release_state = "needs_review"
+
+            # Control ID
+            domain = config.domain or _detect_domain(control.objective)
+            control.control_id = self._generate_control_id(domain, self.db)
+            control.generation_metadata["job_id"] = job_id
+
+            final.append(control)
+
+        return final
+
    # ── Stage 4: Harmonization ─────────────────────────────────────────

    async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
@@ -1168,6 +1454,7 @@ Gib JSON zurück mit diesen Feldern:
            self.db.commit()
        except Exception as e:
            logger.warning("Failed to mark chunk processed: %s", e)
+            self.db.rollback()

    # ── Main Pipeline ──────────────────────────────────────────────────

@@ -1192,9 +1479,71 @@ Gib JSON zurück mit diesen Feldern:
                self._update_job(job_id, result)
                return result

-            # Process chunks
+            # Process chunks — batch mode (N chunks per Anthropic API call)
+            BATCH_SIZE = config.batch_size or 5
            controls_count = 0
            chunks_skipped_prefilter = 0
+            pending_batch: list[tuple[RAGSearchResult, dict]] = []  # (chunk, license_info)
+
+            async def _flush_batch():
+                """Process all chunks queued in ``pending_batch`` and record the results.
+
+                The queue is emptied first, then handed to ``_process_batch``.
+                If that raises, every chunk is retried individually through
+                ``_process_single_chunk``; a chunk whose retry also raises
+                counts as having produced no control.
+
+                For each produced control the matching counter on ``result``
+                is incremented, the control is stored and its chunk marked as
+                processed (both skipped in dry-run mode), and the control is
+                appended to ``result.controls`` and, when present, to
+                ``self._existing_controls``.
+
+                NOTE(review): an exception raised in the per-item loop below
+                propagates to the caller and abandons the rest of the batch.
+                The in-loop call site reports it against the chunk that
+                triggered the flush, which is not necessarily the chunk that
+                failed — confirm this is acceptable.
+                """
+                nonlocal controls_count
+                if not pending_batch:
+                    return
+                # Work on a snapshot so the shared queue can refill right away
+                batch = pending_batch.copy()
+                pending_batch.clear()
+
+                logger.info("Processing batch of %d chunks via single API call...", len(batch))
+                try:
+                    batch_controls = await self._process_batch(batch, config, job_id)
+                except Exception as e:
+                    logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
+                    # Fallback: process each chunk individually
+                    batch_controls = []
+                    for chunk, _lic in batch:
+                        try:
+                            ctrl = await self._process_single_chunk(chunk, config, job_id)
+                            batch_controls.append(ctrl)
+                        except Exception as e2:
+                            logger.error("Single-chunk fallback also failed: %s", e2)
+                            # Keep the list aligned with ``batch`` for the zip below
+                            batch_controls.append(None)
+
+                for (chunk, lic_info), control in zip(batch, batch_controls):
+                    if control is None:
+                        # No control for this chunk — still mark it as processed
+                        if not config.dry_run:
+                            self._mark_chunk_processed(chunk, lic_info, "no_control", [], job_id)
+                        continue
+
+                    # Count by state
+                    if control.release_state == "too_close":
+                        result.controls_too_close += 1
+                    elif control.release_state == "duplicate":
+                        result.controls_duplicates_found += 1
+                    elif control.release_state == "needs_review":
+                        result.controls_needs_review += 1
+                    else:
+                        result.controls_verified += 1
+
+                    # Store (unless dry run)
+                    if not config.dry_run:
+                        ctrl_uuid = self._store_control(control, job_id)
+                        if ctrl_uuid:
+                            # Record which processing path produced the control
+                            path = control.generation_metadata.get("processing_path", "structured_batch")
+                            self._mark_chunk_processed(chunk, lic_info, path, [ctrl_uuid], job_id)
+                        else:
+                            # Store failed — still mark the chunk as processed
+                            self._mark_chunk_processed(chunk, lic_info, "store_failed", [], job_id)
+
+                    result.controls_generated += 1
+                    result.controls.append(asdict(control))
+                    controls_count += 1
+
+                    # Make the new control visible to later harmonization checks
+                    if self._existing_controls is not None:
+                        self._existing_controls.append({
+                            "control_id": control.control_id,
+                            "title": control.title,
+                            "objective": control.objective,
+                        })
+
+
            for i, chunk in enumerate(chunks):
                try:
                    # Progress logging every 50 chunks
@@ -1210,65 +1559,24 @@ Gib JSON zurück mit diesen Feldern:
                        is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
                        if not is_relevant:
                            chunks_skipped_prefilter += 1
-                            # Mark as processed so we don't re-check next time
                            license_info = self._classify_license(chunk)
                            self._mark_chunk_processed(
                                chunk, license_info, "prefilter_skip", [], job_id
                            )
                            continue

-                    control = await self._process_single_chunk(chunk, config, job_id)
-                    if control is None:
-                        # No control generated — still mark as processed
-                        if not config.dry_run:
+                    # Classify license and add to batch
                    license_info = self._classify_license(chunk)
-                            self._mark_chunk_processed(
-                                chunk, license_info, "no_control", [], job_id
-                            )
-                        continue
+                    pending_batch.append((chunk, license_info))

-                    # Count by state
-                    if control.release_state == "too_close":
-                        result.controls_too_close += 1
-                    elif control.release_state == "duplicate":
-                        result.controls_duplicates_found += 1
-                    elif control.release_state == "needs_review":
-                        result.controls_needs_review += 1
-                    else:
-                        result.controls_verified += 1
-
-                    # Store (unless dry run)
-                    if not config.dry_run:
-                        ctrl_uuid = self._store_control(control, job_id)
-                        if ctrl_uuid:
-                            # Stage 7: Mark chunk processed
-                            license_info = self._classify_license(chunk)
-                            path = "llm_reform" if license_info["rule"] == 3 else "structured"
-                            self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
-                        else:
-                            # Store failed — still mark as processed
-                            license_info = self._classify_license(chunk)
-                            self._mark_chunk_processed(
-                                chunk, license_info, "store_failed", [], job_id
-                            )
-
-                    result.controls_generated += 1
-                    result.controls.append(asdict(control))
-                    controls_count += 1
-
-                    # Add to existing controls for harmonization of next chunks
-                    if self._existing_controls is not None:
-                        self._existing_controls.append({
-                            "control_id": control.control_id,
-                            "title": control.title,
-                            "objective": control.objective,
-                        })
+                    # Flush when batch is full
+                    if len(pending_batch) >= BATCH_SIZE:
+                        await _flush_batch()

                except Exception as e:
                    error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
                    logger.error(error_msg)
                    result.errors.append(error_msg)
-                    # Mark failed chunks as processed too (so we don't retry endlessly)
                    try:
                        if not config.dry_run:
                            license_info = self._classify_license(chunk)
@@ -1278,6 +1586,9 @@ Gib JSON zurück mit diesen Feldern:
                    except Exception:
                        pass

+            # Flush remaining chunks
+            await _flush_batch()
+
            result.chunks_skipped_prefilter = chunks_skipped_prefilter
            logger.info(
                "Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",