- {ctrl.source_citation.source && (
+ {ctrl.source_citation.source ? (
{ctrl.source_citation.source}
- )}
+ ) : ctrl.generation_metadata?.source_regulation ? (
+
{String(ctrl.generation_metadata.source_regulation)}
+ ) : null}
{ctrl.source_citation.license && (
Lizenz: {ctrl.source_citation.license}
)}
@@ -211,15 +213,18 @@ export function ControlDetail({
)}
- {/* Impliziter Gesetzesbezug (Rule 3 — kein Originaltext, aber ggf. Gesetzesbezug ueber Anchors) */}
+ {/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */}
{!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
-
- Dieser Control setzt implizit gesetzliche Anforderungen um (z.B. DSGVO Art. 32, NIS2 Art. 21).
- Die konkreten Massnahmen leiten sich aus den Open-Source-Referenzen unten ab.
-
+
+
Abgeleitet aus regulatorischen Anforderungen
+
+ Dieser Control wurde aus geschuetzten Quellen reformuliert (z.B. BSI Grundschutz, ISO 27001).
+ Die konkreten Massnahmen leiten sich aus den Open-Source-Referenzen unten ab.
+
+
)}
diff --git a/backend-compliance/tests/test_control_generator.py b/backend-compliance/tests/test_control_generator.py
index 08406c3..6482a6b 100644
--- a/backend-compliance/tests/test_control_generator.py
+++ b/backend-compliance/tests/test_control_generator.py
@@ -305,7 +305,7 @@ class TestPipelineMocked:
def test_config_defaults(self):
config = GeneratorConfig()
- assert config.max_controls == 50
+ assert config.max_controls == 0
assert config.batch_size == 5
assert config.skip_processed is True
assert config.dry_run is False
@@ -340,3 +340,610 @@ class TestPipelineMocked:
assert control.source_citation is not None
assert "DSGVO" in control.source_citation["source"]
assert control.customer_visible is True
+
+
+# =============================================================================
+# JSON Array Parsing Tests (_parse_llm_json_array)
+# =============================================================================
+
+from compliance.services.control_generator import _parse_llm_json_array
+
+
+class TestParseJsonArray:
+ """Tests for _parse_llm_json_array — batch LLM response parsing."""
+
+ def test_clean_json_array(self):
+ """A well-formed JSON array should be returned directly."""
+ raw = json.dumps([
+ {"title": "Control A", "objective": "Obj A"},
+ {"title": "Control B", "objective": "Obj B"},
+ ])
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 2
+ assert result[0]["title"] == "Control A"
+ assert result[1]["title"] == "Control B"
+
+ def test_json_array_in_markdown_code_block(self):
+ """JSON array wrapped in ```json ... ``` markdown fence."""
+ inner = json.dumps([
+ {"title": "Fenced A", "chunk_index": 1},
+ {"title": "Fenced B", "chunk_index": 2},
+ ])
+ raw = f"Here is the result:\n```json\n{inner}\n```\nDone."
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 2
+ assert result[0]["title"] == "Fenced A"
+ assert result[1]["title"] == "Fenced B"
+
+ def test_markdown_code_block_without_json_tag(self):
+ """Markdown fence without explicit 'json' language tag."""
+ inner = json.dumps([{"title": "NoTag", "objective": "test"}])
+ raw = f"```\n{inner}\n```"
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["title"] == "NoTag"
+
+ def test_wrapper_object_controls_key(self):
+ """LLM wraps array in {"controls": [...]} — should unwrap."""
+ raw = json.dumps({
+ "controls": [
+ {"title": "Wrapped A", "objective": "O1"},
+ {"title": "Wrapped B", "objective": "O2"},
+ ]
+ })
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 2
+ assert result[0]["title"] == "Wrapped A"
+
+ def test_wrapper_object_results_key(self):
+ """LLM wraps array in {"results": [...]} — should unwrap."""
+ raw = json.dumps({
+ "results": [
+ {"title": "R1"},
+ {"title": "R2"},
+ {"title": "R3"},
+ ]
+ })
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 3
+
+ def test_wrapper_object_items_key(self):
+ """LLM wraps array in {"items": [...]} — should unwrap."""
+ raw = json.dumps({
+ "items": [{"title": "Item1"}]
+ })
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["title"] == "Item1"
+
+ def test_wrapper_object_data_key(self):
+ """LLM wraps array in {"data": [...]} — should unwrap."""
+ raw = json.dumps({
+ "data": [{"title": "D1"}, {"title": "D2"}]
+ })
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 2
+
+ def test_single_dict_returned_as_list(self):
+ """A single JSON object (no array) is wrapped in a list."""
+ raw = json.dumps({"title": "SingleControl", "objective": "Obj"})
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["title"] == "SingleControl"
+
+ def test_individual_json_objects_fallback(self):
+ """Multiple separate JSON objects (not in array) are collected."""
+ raw = (
+ 'Here are the controls:\n'
+ '{"title": "Ctrl1", "objective": "A"}\n'
+ '{"title": "Ctrl2", "objective": "B"}\n'
+ )
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 2
+ titles = {r["title"] for r in result}
+ assert "Ctrl1" in titles
+ assert "Ctrl2" in titles
+
+ def test_individual_objects_require_title(self):
+ """Fallback individual-object parsing only includes objects with 'title'."""
+ raw = (
+ '{"title": "HasTitle", "objective": "Yes"}\n'
+ '{"no_title_field": "skip_me"}\n'
+ )
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["title"] == "HasTitle"
+
+ def test_empty_string_returns_empty_list(self):
+ """Empty input returns empty list."""
+ result = _parse_llm_json_array("")
+ assert result == []
+
+ def test_invalid_input_returns_empty_list(self):
+ """Completely invalid input returns empty list."""
+ result = _parse_llm_json_array("this is not json at all, no braces anywhere")
+ assert result == []
+
+ def test_garbage_with_no_json_returns_empty(self):
+ """Random non-JSON text should return empty list."""
+ result = _parse_llm_json_array("Hier ist meine Antwort: leider kann ich das nicht.")
+ assert result == []
+
+ def test_bracket_block_extraction(self):
+ """Array embedded in preamble text is extracted via bracket matching."""
+ raw = 'Some preamble text...\n[{"title": "Extracted", "objective": "X"}]\nSome trailing text.'
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["title"] == "Extracted"
+
+ def test_nested_objects_in_array(self):
+ """Array elements with nested objects (like scope) are parsed correctly."""
+ raw = json.dumps([
+ {
+ "title": "Nested",
+ "objective": "Test",
+ "scope": {"regions": ["EU", "DE"]},
+ "requirements": ["Req1"],
+ }
+ ])
+ result = _parse_llm_json_array(raw)
+ assert len(result) == 1
+ assert result[0]["scope"]["regions"] == ["EU", "DE"]
+
+
+# =============================================================================
+# Batch Size Configuration Tests
+# =============================================================================
+
+class TestBatchSizeConfig:
+ """Tests for batch_size configuration on GeneratorConfig."""
+
+ def test_default_batch_size(self):
+ config = GeneratorConfig()
+ assert config.batch_size == 5
+
+ def test_custom_batch_size(self):
+ config = GeneratorConfig(batch_size=10)
+ assert config.batch_size == 10
+
+ def test_batch_size_of_one(self):
+ config = GeneratorConfig(batch_size=1)
+ assert config.batch_size == 1
+
+ def test_batch_size_used_in_pipeline_constant(self):
+ """Verify that pipeline uses config.batch_size (not a hardcoded value)."""
+ config = GeneratorConfig(batch_size=3)
+ # BATCH_SIZE = config.batch_size or 5 — with batch_size=3 it should be 3
+ batch_size = config.batch_size or 5
+ assert batch_size == 3
+
+ def test_batch_size_zero_falls_back_to_five(self):
+ """batch_size=0 triggers `or 5` fallback in the pipeline loop."""
+ config = GeneratorConfig(batch_size=0)
+ # Mimics the pipeline logic: BATCH_SIZE = config.batch_size or 5
+ batch_size = config.batch_size or 5
+ assert batch_size == 5
+
+
+# =============================================================================
+# Batch Processing Loop Tests (Mocked)
+# =============================================================================
+
+class TestBatchProcessingLoop:
+ """Tests for _process_batch, _structure_batch, _reformulate_batch with mocked LLM."""
+
+ def _make_chunk(self, regulation_code="owasp_asvs", article="V2.1.1", text="Test requirement"):
+ return RAGSearchResult(
+ text=text,
+ regulation_code=regulation_code,
+ regulation_name="OWASP ASVS" if "owasp" in regulation_code else "Test Source",
+ regulation_short="OWASP" if "owasp" in regulation_code else "TEST",
+ category="requirement",
+ article=article,
+ paragraph="",
+ source_url="https://example.com",
+ score=0.9,
+ )
+
+ @pytest.mark.asyncio
+ async def test_process_batch_splits_by_license_rule(self):
+ """_process_batch routes Rule 1+2 to _structure_batch and Rule 3 to _reformulate_batch."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ chunk_r1 = self._make_chunk("eu_2016_679", "Art. 35", "DSGVO text")
+ chunk_r3 = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI text")
+
+ batch_items = [
+ (chunk_r1, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}),
+ (chunk_r3, {"rule": 3, "name": "INTERNAL_ONLY"}),
+ ]
+
+ # Mock _structure_batch and _reformulate_batch
+ structure_ctrl = GeneratedControl(title="Structured", license_rule=1, release_state="draft")
+ reform_ctrl = GeneratedControl(title="Reformed", license_rule=3, release_state="draft")
+
+ mock_finder_instance = AsyncMock()
+ mock_finder_instance.find_anchors = AsyncMock(return_value=[])
+ mock_finder_cls = MagicMock(return_value=mock_finder_instance)
+
+ with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=[structure_ctrl]) as mock_struct, \
+ patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[reform_ctrl]) as mock_reform, \
+ patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
+ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \
+ patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim:
+ mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1)
+ config = GeneratorConfig(batch_size=5)
+ result = await pipeline._process_batch(batch_items, config, "test-job-123")
+
+ mock_struct.assert_called_once()
+ mock_reform.assert_called_once()
+ # structure_batch received only the Rule 1 chunk
+ assert len(mock_struct.call_args[0][0]) == 1
+ # reformulate_batch received only the Rule 3 chunk
+ assert len(mock_reform.call_args[0][0]) == 1
+
+ @pytest.mark.asyncio
+ async def test_structure_batch_calls_llm_and_parses(self):
+ """_structure_batch sends prompt to LLM and parses array response."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+
+ chunks = [
+ self._make_chunk("eu_2016_679", "Art. 5", "Datensparsamkeit und Zweckbindung"),
+ self._make_chunk("eu_2016_679", "Art. 35", "DSFA bei hohem Risiko"),
+ ]
+ license_infos = [
+ {"rule": 1, "name": "DSGVO", "license": "EU_LAW"},
+ {"rule": 1, "name": "DSGVO", "license": "EU_LAW"},
+ ]
+
+ llm_response = json.dumps([
+ {
+ "chunk_index": 1,
+ "title": "Datensparsamkeit",
+ "objective": "Nur notwendige Daten erheben.",
+ "rationale": "DSGVO Grundprinzip.",
+ "requirements": ["Datenminimierung"],
+ "test_procedure": ["Datenbestand pruefen"],
+ "evidence": ["Verarbeitungsverzeichnis"],
+ "severity": "high",
+ "tags": ["dsgvo", "datenschutz"],
+ },
+ {
+ "chunk_index": 2,
+ "title": "DSFA Pflicht",
+ "objective": "DSFA bei hohem Risiko durchfuehren.",
+ "rationale": "Gesetzliche Pflicht.",
+ "requirements": ["DSFA erstellen"],
+ "test_procedure": ["DSFA Bericht pruefen"],
+ "evidence": ["DSFA Dokumentation"],
+ "severity": "high",
+ "tags": ["dsfa"],
+ },
+ ])
+
+ with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response):
+ controls = await pipeline._structure_batch(chunks, license_infos)
+
+ assert len(controls) == 2
+ assert controls[0] is not None
+ assert controls[0].title == "Datensparsamkeit"
+ assert controls[0].license_rule == 1
+ assert controls[0].source_original_text is not None
+ assert controls[0].customer_visible is True
+ assert controls[0].generation_metadata["processing_path"] == "structured_batch"
+ assert controls[0].generation_metadata["batch_size"] == 2
+
+ assert controls[1] is not None
+ assert controls[1].title == "DSFA Pflicht"
+ assert controls[1].license_rule == 1
+
+ @pytest.mark.asyncio
+ async def test_reformulate_batch_calls_llm_and_strips_source(self):
+ """_reformulate_batch produces Rule 3 controls without source info."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+
+ chunks = [
+ self._make_chunk("bsi_tr03161", "O.Auth_1", "Multi-factor authentication for apps"),
+ ]
+ config = GeneratorConfig(batch_size=5)
+
+ llm_response = json.dumps([
+ {
+ "chunk_index": 1,
+ "title": "Starke Authentifizierung",
+ "objective": "Mehrstufige Anmeldung implementieren.",
+ "rationale": "Schutz vor unbefugtem Zugriff.",
+ "requirements": ["MFA einrichten"],
+ "test_procedure": ["MFA Funktionstest"],
+ "evidence": ["MFA Konfiguration"],
+ "severity": "critical",
+ "tags": ["auth", "mfa"],
+ }
+ ])
+
+ with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response):
+ controls = await pipeline._reformulate_batch(chunks, config)
+
+ assert len(controls) == 1
+ ctrl = controls[0]
+ assert ctrl is not None
+ assert ctrl.title == "Starke Authentifizierung"
+ assert ctrl.license_rule == 3
+ assert ctrl.source_original_text is None
+ assert ctrl.source_citation is None
+ assert ctrl.customer_visible is False
+ assert ctrl.generation_metadata["processing_path"] == "llm_reform_batch"
+ # Must not contain BSI references
+ metadata_str = json.dumps(ctrl.generation_metadata)
+ assert "bsi" not in metadata_str.lower()
+ assert "TR-03161" not in metadata_str
+
+ @pytest.mark.asyncio
+ async def test_structure_batch_maps_by_chunk_index(self):
+ """Controls are mapped back to the correct chunk via chunk_index."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+
+ chunks = [
+ self._make_chunk("eu_2016_679", "Art. 5", "First chunk"),
+ self._make_chunk("eu_2016_679", "Art. 6", "Second chunk"),
+ self._make_chunk("eu_2016_679", "Art. 7", "Third chunk"),
+ ]
+ license_infos = [{"rule": 1, "name": "DSGVO", "license": "EU_LAW"}] * 3
+
+ # LLM returns them in reversed order
+ llm_response = json.dumps([
+ {
+ "chunk_index": 3,
+ "title": "Third Control",
+ "objective": "Obj3",
+ "rationale": "Rat3",
+ "requirements": [],
+ "test_procedure": [],
+ "evidence": [],
+ "severity": "low",
+ "tags": [],
+ },
+ {
+ "chunk_index": 1,
+ "title": "First Control",
+ "objective": "Obj1",
+ "rationale": "Rat1",
+ "requirements": [],
+ "test_procedure": [],
+ "evidence": [],
+ "severity": "high",
+ "tags": [],
+ },
+ {
+ "chunk_index": 2,
+ "title": "Second Control",
+ "objective": "Obj2",
+ "rationale": "Rat2",
+ "requirements": [],
+ "test_procedure": [],
+ "evidence": [],
+ "severity": "medium",
+ "tags": [],
+ },
+ ])
+
+ with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response):
+ controls = await pipeline._structure_batch(chunks, license_infos)
+
+ assert len(controls) == 3
+ # Verify correct mapping despite shuffled chunk_index
+ assert controls[0].title == "First Control"
+ assert controls[1].title == "Second Control"
+ assert controls[2].title == "Third Control"
+
+ @pytest.mark.asyncio
+ async def test_structure_batch_falls_back_to_position(self):
+ """If chunk_index is missing, controls are assigned by position."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+
+ chunks = [
+ self._make_chunk("eu_2016_679", "Art. 5", "Chunk A"),
+ self._make_chunk("eu_2016_679", "Art. 6", "Chunk B"),
+ ]
+ license_infos = [{"rule": 1, "name": "DSGVO", "license": "EU_LAW"}] * 2
+
+ # No chunk_index in response
+ llm_response = json.dumps([
+ {
+ "title": "PositionA",
+ "objective": "ObjA",
+ "rationale": "RatA",
+ "requirements": [],
+ "test_procedure": [],
+ "evidence": [],
+ "severity": "medium",
+ "tags": [],
+ },
+ {
+ "title": "PositionB",
+ "objective": "ObjB",
+ "rationale": "RatB",
+ "requirements": [],
+ "test_procedure": [],
+ "evidence": [],
+ "severity": "medium",
+ "tags": [],
+ },
+ ])
+
+ with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response):
+ controls = await pipeline._structure_batch(chunks, license_infos)
+
+ assert len(controls) == 2
+ assert controls[0].title == "PositionA"
+ assert controls[1].title == "PositionB"
+
+ @pytest.mark.asyncio
+ async def test_process_batch_only_structure_no_reform(self):
+ """Batch with only Rule 1+2 items skips _reformulate_batch."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ chunk = self._make_chunk("eu_2016_679", "Art. 5", "DSGVO text")
+ batch_items = [
+ (chunk, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}),
+ ]
+
+ ctrl = GeneratedControl(title="OnlyStructure", license_rule=1, release_state="draft")
+
+ mock_finder_instance = AsyncMock()
+ mock_finder_instance.find_anchors = AsyncMock(return_value=[])
+ mock_finder_cls = MagicMock(return_value=mock_finder_instance)
+
+ with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=[ctrl]) as mock_struct, \
+ patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock) as mock_reform, \
+ patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
+ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls):
+ config = GeneratorConfig()
+ result = await pipeline._process_batch(batch_items, config, "job-1")
+
+ mock_struct.assert_called_once()
+ mock_reform.assert_not_called()
+ assert len(result) == 1
+ assert result[0].title == "OnlyStructure"
+
+ @pytest.mark.asyncio
+ async def test_process_batch_only_reform_no_structure(self):
+ """Batch with only Rule 3 items skips _structure_batch."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ chunk = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI text")
+ batch_items = [
+ (chunk, {"rule": 3, "name": "INTERNAL_ONLY"}),
+ ]
+
+ ctrl = GeneratedControl(title="OnlyReform", license_rule=3, release_state="draft")
+
+ mock_finder_instance = AsyncMock()
+ mock_finder_instance.find_anchors = AsyncMock(return_value=[])
+ mock_finder_cls = MagicMock(return_value=mock_finder_instance)
+
+ with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock) as mock_struct, \
+ patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[ctrl]) as mock_reform, \
+ patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
+ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \
+ patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim:
+ mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1)
+ config = GeneratorConfig()
+ result = await pipeline._process_batch(batch_items, config, "job-2")
+
+ mock_struct.assert_not_called()
+ mock_reform.assert_called_once()
+ assert len(result) == 1
+ assert result[0].title == "OnlyReform"
+
+ @pytest.mark.asyncio
+ async def test_process_batch_mixed_rules(self):
+ """Batch with mixed Rule 1 and Rule 3 items calls both sub-methods."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ chunk_r1 = self._make_chunk("eu_2016_679", "Art. 5", "DSGVO")
+ chunk_r2 = self._make_chunk("owasp_asvs", "V2.1", "OWASP")
+ chunk_r3a = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI A")
+ chunk_r3b = self._make_chunk("iso_27001", "A.9.1", "ISO B")
+
+ batch_items = [
+ (chunk_r1, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}),
+ (chunk_r3a, {"rule": 3, "name": "INTERNAL_ONLY"}),
+ (chunk_r2, {"rule": 2, "name": "OWASP ASVS", "license": "CC-BY-SA-4.0", "attribution": "OWASP Foundation"}),
+ (chunk_r3b, {"rule": 3, "name": "INTERNAL_ONLY"}),
+ ]
+
+ struct_ctrls = [
+ GeneratedControl(title="DSGVO Ctrl", license_rule=1, release_state="draft"),
+ GeneratedControl(title="OWASP Ctrl", license_rule=2, release_state="draft"),
+ ]
+ reform_ctrls = [
+ GeneratedControl(title="BSI Ctrl", license_rule=3, release_state="draft"),
+ GeneratedControl(title="ISO Ctrl", license_rule=3, release_state="draft"),
+ ]
+
+ mock_finder_instance = AsyncMock()
+ mock_finder_instance.find_anchors = AsyncMock(return_value=[])
+ mock_finder_cls = MagicMock(return_value=mock_finder_instance)
+
+ with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=struct_ctrls) as mock_struct, \
+ patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=reform_ctrls) as mock_reform, \
+ patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
+ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \
+ patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim:
+ mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.05, ngram_jaccard=0.05, lcs_ratio=0.05)
+ config = GeneratorConfig()
+ result = await pipeline._process_batch(batch_items, config, "job-mixed")
+
+ # Both methods called
+ mock_struct.assert_called_once()
+ mock_reform.assert_called_once()
+ # structure_batch gets 2 items (Rule 1 + Rule 2)
+ assert len(mock_struct.call_args[0][0]) == 2
+ # reformulate_batch gets 2 items (Rule 3 + Rule 3)
+ assert len(mock_reform.call_args[0][0]) == 2
+ # Result has 4 controls total
+ assert len(result) == 4
+
+ @pytest.mark.asyncio
+ async def test_process_batch_empty_batch(self):
+ """Empty batch returns empty list."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ config = GeneratorConfig()
+ result = await pipeline._process_batch([], config, "job-empty")
+ assert result == []
+
+ @pytest.mark.asyncio
+ async def test_reformulate_batch_too_close_flagged(self):
+ """Rule 3 controls that are too similar to source get flagged."""
+ mock_db = MagicMock()
+ pipeline = ControlGeneratorPipeline(db=mock_db)
+ pipeline._existing_controls = []
+
+ chunk = self._make_chunk("bsi_tr03161", "O.Auth_1", "Authentication must use MFA")
+ batch_items = [
+ (chunk, {"rule": 3, "name": "INTERNAL_ONLY"}),
+ ]
+
+ ctrl = GeneratedControl(
+ title="Auth MFA",
+ objective="Authentication must use MFA",
+ rationale="Security",
+ license_rule=3,
+ release_state="draft",
+ generation_metadata={},
+ )
+
+ # Simulate similarity FAIL (too close to source)
+ fail_report = MagicMock(status="FAIL", token_overlap=0.85, ngram_jaccard=0.9, lcs_ratio=0.88)
+
+ mock_finder_instance = AsyncMock()
+ mock_finder_instance.find_anchors = AsyncMock(return_value=[])
+ mock_finder_cls = MagicMock(return_value=mock_finder_instance)
+
+ with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock), \
+ patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[ctrl]), \
+ patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
+ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \
+ patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock, return_value=fail_report):
+ config = GeneratorConfig()
+ result = await pipeline._process_batch(batch_items, config, "job-tooclose")
+
+ assert len(result) == 1
+ assert result[0].release_state == "too_close"
+ assert result[0].generation_metadata["similarity_status"] == "FAIL"
diff --git a/docs-src/development/testing.md b/docs-src/development/testing.md
index 83d5e53..2e6bab4 100644
--- a/docs-src/development/testing.md
+++ b/docs-src/development/testing.md
@@ -209,3 +209,38 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
```
3. **Tests ausfuehren**: `go test -v ./internal/services/...`
4. **Dokumentation aktualisieren** (siehe [Dokumentation](./documentation.md))
+
+---
+
+## Modul-spezifische Tests
+
+### Canonical Control Generator (82 Tests)
+
+Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
+Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) fuer Details.
+
+```bash
+# Alle Generator-Tests
+cd backend-compliance && pytest -v tests/test_control_generator.py
+
+# Similarity Detector Tests
+cd backend-compliance && pytest -v compliance/tests/test_similarity_detector.py
+
+# API Route Tests
+cd backend-compliance && pytest -v tests/test_canonical_control_routes.py
+
+# License Gate Tests
+cd backend-compliance && pytest -v tests/test_license_gate.py
+
+# CI/CD Validator Tests
+cd backend-compliance && pytest -v tests/test_validate_controls.py
+```
+
+**Wichtig:** Die Generator-Tests nutzen Mocks fuer Anthropic-API und Qdrant — sie laufen ohne externe Abhaengigkeiten.
+Die `TestPipelineMocked`-Klasse prueft insbesondere:
+
+- Korrekte Lizenz-Klassifikation (Rule 1/2/3 Verhalten)
+- Rule 3 exponiert **keine** Quellennamen in `generation_metadata`
+- SHA-256 Hash-Deduplizierung fuer Chunks
+- Config-Defaults (`batch_size: 5`, `skip_processed: true`)
+- Rule 1 Citation wird korrekt mit Gesetzesreferenz generiert
diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md
index 5177a81..48c0160 100644
--- a/docs-src/services/sdk-modules/canonical-control-library.md
+++ b/docs-src/services/sdk-modules/canonical-control-library.md
@@ -118,6 +118,13 @@ erDiagram
| `GET` | `/v1/canonical/sources` | Quellenregister mit Berechtigungen |
| `GET` | `/v1/canonical/licenses` | Lizenz-Matrix |
| `POST` | `/v1/canonical/controls/{id}/similarity-check` | Too-Close-Pruefung |
+| `POST` | `/v1/canonical/generate` | Generator-Job starten |
+| `GET` | `/v1/canonical/generate/jobs` | Alle Generator-Jobs |
+| `GET` | `/v1/canonical/generate/processed-stats` | Verarbeitungsstatistik pro Collection |
+| `GET` | `/v1/canonical/generate/review-queue` | Controls zur Pruefung |
+| `POST` | `/v1/canonical/generate/review/{control_id}` | Review abschliessen |
+| `GET` | `/v1/canonical/blocked-sources` | Gesperrte Quellen (Rule 3) |
+| `POST` | `/v1/canonical/blocked-sources/cleanup` | Cleanup-Workflow starten |
### Beispiel: Control abrufen
@@ -224,7 +231,8 @@ Der Validator (`scripts/validate-controls.py`) prueft bei jedem Commit:
## Control Generator Pipeline
-Automatische Generierung von Controls aus dem gesamten RAG-Korpus (170.000+ Chunks aus Gesetzen, Verordnungen und Standards).
+Automatische Generierung von Controls aus dem gesamten RAG-Korpus (~183.000 Chunks aus Gesetzen, Verordnungen und Standards).
+Aktueller Stand: **~2.120 Controls** generiert.
### 8-Stufen-Pipeline
@@ -233,14 +241,15 @@ flowchart TD
A[1. RAG Scroll] -->|Alle Chunks| B[2. Prefilter - Lokales LLM]
B -->|Irrelevant| C[Als processed markieren]
B -->|Relevant| D[3. License Classify]
- D -->|Rule 1/2| E[4a. Structure - Anthropic]
- D -->|Rule 3| F[4b. LLM Reform - Anthropic]
- E --> G[5. Harmonization - Embeddings]
- F --> G
- G -->|Duplikat| H[Als Duplikat speichern]
- G -->|Neu| I[6. Anchor Search]
- I --> J[7. Store Control]
- J --> K[8. Mark Processed]
+ D -->|Batch sammeln| E[4. Batch Processing - 5 Chunks/API-Call]
+ E -->|Rule 1/2| F[4a. Structure Batch - Anthropic]
+ E -->|Rule 3| G[4b. Reform Batch - Anthropic]
+ F --> H[5. Harmonization - Embeddings]
+ G --> H
+ H -->|Duplikat| I[Als Duplikat speichern]
+ H -->|Neu| J[6. Anchor Search]
+ J --> K[7. Store Control]
+ K --> L[8. Mark Processed]
```
### Stufe 1: RAG Scroll (Vollstaendig)
@@ -273,6 +282,67 @@ Dies spart >50% der Anthropic-API-Kosten.
- **Rule 1+2:** Anthropic strukturiert den Originaltext in Control-Format (Titel, Ziel, Anforderungen)
- **Rule 3:** Anthropic reformuliert vollstaendig — kein Originaltext, keine Quellennamen
+### Batch Processing (Stufe 4 — Optimierung)
+
+Die Pipeline verarbeitet Chunks **nicht einzeln**, sondern sammelt sie in Batches von **5 Chunks pro API-Call**.
+Das reduziert die Anzahl der Anthropic-API-Aufrufe um ~80% und beschleunigt die Generierung erheblich.
+
+#### Ablauf
+
+1. **Chunks sammeln:** Nach dem Prefilter werden relevante Chunks mit ihrer Lizenz-Info in `pending_batch` gesammelt
+2. **Batch voll?** Sobald `batch_size` (Default: 5) erreicht ist, wird `_flush_batch()` aufgerufen
+3. **`_process_batch()`** trennt den Batch nach Lizenzregel:
+ - **Rule 1+2 Chunks** → `_structure_batch()` — ein einziger Anthropic-Call fuer alle
+ - **Rule 3 Chunks** → `_reformulate_batch()` — ein einziger Anthropic-Call fuer alle
+4. **Ergebnis:** JSON-Array mit genau N Controls, zurueck-gemappt per `chunk_index`
+
+#### `_structure_batch()` (Rule 1+2)
+
+Sendet alle freien/CC-BY Chunks in einem einzigen Prompt an Anthropic. Der Originaltext darf verwendet werden.
+Jeder Chunk wird als `--- CHUNK N ---` Block formatiert, das LLM gibt ein JSON-Array mit `chunk_index` zurueck.
+
+```python
+# Prompt-Auszug:
+"Strukturiere die folgenden 5 Gesetzestexte jeweils als eigenstaendiges Control."
+"Gib ein JSON-Array zurueck mit GENAU 5 Objekten."
+```
+
+**Processing Path:** `structured_batch` (in `generation_metadata`)
+
+#### `_reformulate_batch()` (Rule 3)
+
+Sendet alle eingeschraenkten Chunks in einem Prompt. Der Originaltext darf **nicht kopiert** werden.
+Quellennamen und proprietaere Bezeichner werden im Prompt explizit verboten.
+
+```python
+# Prompt-Auszug:
+"KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur."
+"NENNE NICHT die Quellen. Keine proprietaeren Bezeichner."
+```
+
+**Processing Path:** `llm_reform_batch` (in `generation_metadata`)
+
+#### Fallback bei Batch-Fehler
+
+Falls ein Batch-Call fehlschlaegt (z.B. Timeout, Parsing-Error), faellt die Pipeline automatisch auf **Einzelverarbeitung** zurueck:
+
+```python
+except Exception as e:
+ logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
+ for chunk, _lic in batch:
+ ctrl = await self._process_single_chunk(chunk, config, job_id)
+```
+
+!!! info "Batch-Konfiguration"
+ | Parameter | Wert | Beschreibung |
+ |-----------|------|-------------|
+ | `batch_size` | 5 (Default) | Chunks pro API-Call |
+ | `max_tokens` | 8192 | Maximale Token-Laenge der LLM-Antwort |
+ | `LLM_TIMEOUT` | 180s | Timeout pro Anthropic-Call |
+
+ Die `batch_size` ist ueber `GeneratorConfig` konfigurierbar.
+ Bei grosser Batch-Size steigt die Wahrscheinlichkeit fuer Parsing-Fehler.
+
### Stufe 5: Harmonisierung (Embedding-basiert)
Prueft per bge-m3 Embeddings (Cosine Similarity > 0.85), ob ein aehnliches Control existiert.
@@ -310,7 +380,17 @@ system, risk, governance, hardware, identity
| `CONTROL_GEN_ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-Modell fuer Formulierung |
| `OLLAMA_URL` | `http://host.docker.internal:11434` | Lokaler Ollama-Server (Vorfilter) |
| `CONTROL_GEN_OLLAMA_MODEL` | `qwen3:30b-a3b` | Lokales LLM fuer Vorfilter |
-| `CONTROL_GEN_LLM_TIMEOUT` | `120` | Timeout in Sekunden |
+| `CONTROL_GEN_LLM_TIMEOUT` | `180` | Timeout in Sekunden (erhoeht fuer Batch-Calls) |
+
+**Pipeline-Konfiguration (via `GeneratorConfig`):**
+
+| Parameter | Default | Beschreibung |
+|-----------|---------|-------------|
+| `batch_size` | `5` | Chunks pro Anthropic-API-Call |
+| `max_controls` | `0` | Limit (0 = alle Chunks verarbeiten) |
+| `skip_processed` | `true` | Bereits verarbeitete Chunks ueberspringen |
+| `dry_run` | `false` | Trockenlauf ohne DB-Schreibzugriffe |
+| `skip_web_search` | `false` | Web-Suche fuer Anchor-Finder ueberspringen |
### Architektur-Entscheidung: Gesetzesverweise
@@ -351,15 +431,145 @@ curl https://macmini:8002/api/compliance/v1/canonical/generate/jobs \
---
+## Processed Chunks Tracking
+
+Die Tabelle `canonical_processed_chunks` trackt **JEDEN** verarbeiteten RAG-Chunk per SHA-256-Hash.
+Dadurch werden Chunks bei erneutem Pipeline-Lauf automatisch uebersprungen (`skip_processed: true`).
+
+### Tabelle: `canonical_processed_chunks` (Migration 046 + 048)
+
+| Spalte | Typ | Beschreibung |
+|--------|-----|-------------|
+| `id` | UUID | Primary Key |
+| `chunk_hash` | VARCHAR(64) | SHA-256 Hash des Chunk-Textes |
+| `collection` | VARCHAR(100) | Qdrant-Collection (z.B. `bp_compliance_gesetze`) |
+| `regulation_code` | VARCHAR(100) | Quell-Regulation (z.B. `bdsg`, `eu_2016_679`) |
+| `document_version` | VARCHAR(50) | Versions-Tracking |
+| `source_license` | VARCHAR(50) | Lizenz der Quelle |
+| `license_rule` | INTEGER | 1, 2 oder 3 |
+| `processing_path` | VARCHAR(20) | Verarbeitungspfad (siehe unten) |
+| `generated_control_ids` | JSONB | UUIDs der generierten Controls |
+| `job_id` | UUID | Referenz auf `canonical_generation_jobs` |
+| `processed_at` | TIMESTAMPTZ | Zeitstempel |
+
+**UNIQUE Constraint:** `(chunk_hash, collection, document_version)` — verhindert Doppelverarbeitung.
+
+### Processing Paths
+
+| Wert | Stufe | Bedeutung |
+|------|-------|-----------|
+| `prefilter_skip` | 2 | Lokaler LLM-Vorfilter: Chunk nicht sicherheitsrelevant |
+| `structured` | 4a | Einzelner Chunk strukturiert (Rule 1/2) |
+| `llm_reform` | 4b | Einzelner Chunk reformuliert (Rule 3) |
+| `structured_batch` | 4a | Batch-Strukturierung (Rule 1/2, in `generation_metadata`) |
+| `llm_reform_batch` | 4b | Batch-Reformulierung (Rule 3, in `generation_metadata`) |
+| `no_control` | 4 | LLM konnte kein Control ableiten |
+| `store_failed` | 7 | DB-Speichern fehlgeschlagen |
+| `error` | — | Unerwarteter Fehler bei der Verarbeitung |
+
+!!! note "Batch-Pfade in generation_metadata"
+ Die Werte `structured_batch` und `llm_reform_batch` werden im `processing_path` der Datenbank gespeichert
+ **und** im `generation_metadata` JSON-Feld des Controls. So ist nachvollziehbar, ob ein Control
+ einzeln oder im Batch generiert wurde.
+
+### Beispiel-Query: Verarbeitungsstatistik
+
+```sql
+SELECT
+ processing_path,
+ COUNT(*) as count
+FROM canonical_processed_chunks
+GROUP BY processing_path
+ORDER BY count DESC;
+```
+
+---
+
+## Statistiken (processed-stats Endpoint)
+
+Der Endpoint `GET /v1/canonical/generate/processed-stats` liefert Verarbeitungsstatistiken pro RAG-Collection.
+
+```bash
+curl -s https://macmini:8002/api/compliance/v1/canonical/generate/processed-stats | jq
+```
+
+**Response:**
+```json
+{
+ "stats": [
+ {
+ "collection": "bp_compliance_gesetze",
+ "processed_chunks": 45200,
+ "direct_adopted": 1850,
+ "llm_reformed": 120,
+ "skipped": 43230,
+ "total_chunks_estimated": 0,
+ "pending_chunks": 0
+ }
+ ]
+}
+```
+
+### Aktuelle Groessenordnung
+
+| Metrik | Wert |
+|--------|------|
+| RAG-Chunks gesamt | ~183.000 |
+| Verarbeitete Chunks | ~183.000 (vollstaendig) |
+| Generierte Controls | **~2.120** |
+| Konversionsrate | ~1,2% (nur sicherheitsrelevante Chunks erzeugen Controls) |
+
+!!! info "Warum so wenige Controls?"
+ Die meisten RAG-Chunks sind Definitionen, Begriffsbestimmungen, Inhaltsverzeichnisse oder
+ Uebergangsvorschriften. Der Prefilter (Stufe 2) sortiert >50% aus, die Harmonisierung (Stufe 5)
+ entfernt weitere Duplikate. Nur konkrete, einzigartige Anforderungen werden zu Controls.
+
+---
+
+## Migration von Controls (Lokal → Production)
+
+Controls koennen ueber die REST-API von der lokalen Entwicklungsumgebung in die Production migriert werden.
+Jedes Control wird einzeln per `POST` mit der Referenz auf das Framework erstellt.
+
+```bash
+# 1. Control aus lokaler Umgebung exportieren
+curl -s https://macmini:8002/api/compliance/v1/canonical/controls/AUTH-001 | jq > control.json
+
+# 2. In Production importieren (mit framework_id)
+curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/controls \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "framework_id": "bp_security_v1",
+ "control_id": "AUTH-001",
+ "title": "Multi-Faktor-Authentifizierung",
+ "objective": "...",
+ "severity": "high",
+ "open_anchors": [...]
+ }'
+```
+
+!!! warning "Framework muss existieren"
+ Das Ziel-Framework (`bp_security_v1`) muss in der Production-DB bereits angelegt sein.
+ Falls nicht, zuerst das Framework erstellen:
+ ```bash
+ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/frameworks \
+ -H 'Content-Type: application/json' \
+ -d '{"framework_id": "bp_security_v1", "name": "BreakPilot Security", "version": "1.0"}'
+ ```
+
+---
+
## Dateien
| Datei | Typ | Beschreibung |
|-------|-----|-------------|
| `backend-compliance/migrations/044_canonical_control_library.sql` | SQL | 5 Tabellen + Seed-Daten |
+| `backend-compliance/migrations/046_control_generator.sql` | SQL | Job-Tracking, Chunk-Tracking, Blocked Sources |
| `backend-compliance/migrations/047_verification_method_category.sql` | SQL | verification_method + category Felder |
+| `backend-compliance/migrations/048_processing_path_expand.sql` | SQL | Erweiterte processing_path Werte |
| `backend-compliance/compliance/api/canonical_control_routes.py` | Python | REST API (8+ Endpoints) |
-| `backend-compliance/compliance/api/control_generator_routes.py` | Python | Generator API (Start/Status/Jobs) |
-| `backend-compliance/compliance/services/control_generator.py` | Python | 8-Stufen-Pipeline |
+| `backend-compliance/compliance/api/control_generator_routes.py` | Python | Generator API (Start/Status/Jobs/Stats) |
+| `backend-compliance/compliance/services/control_generator.py` | Python | 8-Stufen-Pipeline mit Batch Processing |
| `backend-compliance/compliance/services/license_gate.py` | Python | Lizenz-Gate-Logik |
| `backend-compliance/compliance/services/similarity_detector.py` | Python | Too-Close-Detektor (5 Metriken) |
| `backend-compliance/compliance/services/rag_client.py` | Python | RAG-Client (Search + Scroll) |
@@ -376,11 +586,25 @@ curl https://macmini:8002/api/compliance/v1/canonical/generate/jobs \
## Tests
-| Datei | Sprache | Tests |
-|-------|---------|-------|
-| `ai-compliance-sdk/internal/ucca/canonical_control_loader_test.go` | Go | 8 Tests |
-| `backend-compliance/compliance/tests/test_similarity_detector.py` | Python | 19 Tests |
-| `backend-compliance/tests/test_canonical_control_routes.py` | Python | 14 Tests |
-| `backend-compliance/tests/test_license_gate.py` | Python | 12 Tests |
-| `backend-compliance/tests/test_validate_controls.py` | Python | 14 Tests |
-| **Gesamt** | | **67 Tests** |
+| Datei | Sprache | Tests | Schwerpunkt |
+|-------|---------|-------|-------------|
+| `ai-compliance-sdk/internal/ucca/canonical_control_loader_test.go` | Go | 8 Tests | Control Loader, Multi-Index |
+| `backend-compliance/compliance/tests/test_similarity_detector.py` | Python | 19 Tests | Too-Close-Detektor, 5 Metriken |
+| `backend-compliance/tests/test_canonical_control_routes.py` | Python | 14 Tests | REST API Endpoints |
+| `backend-compliance/tests/test_license_gate.py` | Python | 12 Tests | Lizenz-Klassifikation |
+| `backend-compliance/tests/test_validate_controls.py` | Python | 14 Tests | CI/CD Validator |
+| `backend-compliance/tests/test_control_generator.py` | Python | 15 Tests | Pipeline, Batch, Lizenzregeln |
+| **Gesamt** | | **82 Tests** | |
+
+### Control Generator Tests (test_control_generator.py)
+
+Die Generator-Tests decken folgende Bereiche ab:
+
+- **`TestLicenseMapping`** (12 Tests) — Korrekte Zuordnung von `regulation_code` zu Lizenzregeln (Rule 1/2/3),
+ Case-Insensitivity, Rule 3 darf keine Quellennamen exponieren
+- **`TestDomainDetection`** (5 Tests) — Erkennung von AUTH, CRYPT, NET, DATA Domains aus Chunk-Text
+- **`TestJsonParsing`** (4 Tests) — Robustes Parsing von LLM-Antworten (plain JSON, Markdown-Fenced, mit Preamble)
+- **`TestGeneratedControlRules`** (3 Tests) — Rule 1 hat Originaltext, Rule 2 hat Citation, Rule 3 hat **nichts**
+- **`TestAnchorFinder`** (2 Tests) — RAG-Suche filtert Rule 3 Quellen aus, Web-Suche erkennt Frameworks
+- **`TestPipelineMocked`** (5 Tests) — End-to-End mit Mocks: Lizenz-Klassifikation, Rule 3 Blocking,
+ Hash-Deduplizierung, Config-Defaults (`batch_size: 5`), Rule 1 Citation-Generierung