From 2ed1c08acfefd5bdc67101dcf795b90a52c23cb3 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 14 Mar 2026 23:51:52 +0100 Subject: [PATCH] feat: enhance legal basis display, add batch processing tests and docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Backfill 81 controls with empty source_citation.source from generation_metadata - Add fallback to generation_metadata.source_regulation in ControlDetail blue box - Improve Rule 3 amber box text for reformulated controls - Add 30 new tests for batch processing (TestParseJsonArray, TestBatchSizeConfig, TestBatchProcessingLoop) — all 61 control generator tests passing - Fix stale test_config_defaults assertion (max_controls 50→0) - Update canonical-control-library.md with batch processing pipeline docs, processed chunks tracking, migration guide, and stats endpoint - Update testing.md with canonical control generator test section Co-Authored-By: Claude Opus 4.6 --- .../components/ControlDetail.tsx | 19 +- .../tests/test_control_generator.py | 609 +++++++++++++++++- docs-src/development/testing.md | 35 + .../sdk-modules/canonical-control-library.md | 264 +++++++- 4 files changed, 899 insertions(+), 28 deletions(-) diff --git a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx index 58d0b51..de6e2ef 100644 --- a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx +++ b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx @@ -179,9 +179,11 @@ export function ControlDetail({
- {ctrl.source_citation.source && ( + {ctrl.source_citation.source ? (

{ctrl.source_citation.source}

- )} + ) : ctrl.generation_metadata?.source_regulation ? ( +

{String(ctrl.generation_metadata.source_regulation)}

+ ) : null} {ctrl.source_citation.license && (

Lizenz: {ctrl.source_citation.license}

)} @@ -211,15 +213,18 @@ export function ControlDetail({ )} - {/* Impliziter Gesetzesbezug (Rule 3 — kein Originaltext, aber ggf. Gesetzesbezug ueber Anchors) */} + {/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */} {!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
-

- Dieser Control setzt implizit gesetzliche Anforderungen um (z.B. DSGVO Art. 32, NIS2 Art. 21). - Die konkreten Massnahmen leiten sich aus den Open-Source-Referenzen unten ab. -

+
+

Abgeleitet aus regulatorischen Anforderungen

+

+ Dieser Control wurde aus geschuetzten Quellen reformuliert (z.B. BSI Grundschutz, ISO 27001). + Die konkreten Massnahmen leiten sich aus den Open-Source-Referenzen unten ab. +

+
)} diff --git a/backend-compliance/tests/test_control_generator.py b/backend-compliance/tests/test_control_generator.py index 08406c3..6482a6b 100644 --- a/backend-compliance/tests/test_control_generator.py +++ b/backend-compliance/tests/test_control_generator.py @@ -305,7 +305,7 @@ class TestPipelineMocked: def test_config_defaults(self): config = GeneratorConfig() - assert config.max_controls == 50 + assert config.max_controls == 0 assert config.batch_size == 5 assert config.skip_processed is True assert config.dry_run is False @@ -340,3 +340,610 @@ class TestPipelineMocked: assert control.source_citation is not None assert "DSGVO" in control.source_citation["source"] assert control.customer_visible is True + + +# ============================================================================= +# JSON Array Parsing Tests (_parse_llm_json_array) +# ============================================================================= + +from compliance.services.control_generator import _parse_llm_json_array + + +class TestParseJsonArray: + """Tests for _parse_llm_json_array — batch LLM response parsing.""" + + def test_clean_json_array(self): + """A well-formed JSON array should be returned directly.""" + raw = json.dumps([ + {"title": "Control A", "objective": "Obj A"}, + {"title": "Control B", "objective": "Obj B"}, + ]) + result = _parse_llm_json_array(raw) + assert len(result) == 2 + assert result[0]["title"] == "Control A" + assert result[1]["title"] == "Control B" + + def test_json_array_in_markdown_code_block(self): + """JSON array wrapped in ```json ... ``` markdown fence.""" + inner = json.dumps([ + {"title": "Fenced A", "chunk_index": 1}, + {"title": "Fenced B", "chunk_index": 2}, + ]) + raw = f"Here is the result:\n```json\n{inner}\n```\nDone." + result = _parse_llm_json_array(raw) + assert len(result) == 2 + assert result[0]["title"] == "Fenced A" + assert result[1]["title"] == "Fenced B" + + def test_markdown_code_block_without_json_tag(self): + """Markdown fence without explicit 'json' language tag.""" + inner = json.dumps([{"title": "NoTag", "objective": "test"}]) + raw = f"```\n{inner}\n```" + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["title"] == "NoTag" + + def test_wrapper_object_controls_key(self): + """LLM wraps array in {"controls": [...]} — should unwrap.""" + raw = json.dumps({ + "controls": [ + {"title": "Wrapped A", "objective": "O1"}, + {"title": "Wrapped B", "objective": "O2"}, + ] + }) + result = _parse_llm_json_array(raw) + assert len(result) == 2 + assert result[0]["title"] == "Wrapped A" + + def test_wrapper_object_results_key(self): + """LLM wraps array in {"results": [...]} — should unwrap.""" + raw = json.dumps({ + "results": [ + {"title": "R1"}, + {"title": "R2"}, + {"title": "R3"}, + ] + }) + result = _parse_llm_json_array(raw) + assert len(result) == 3 + + def test_wrapper_object_items_key(self): + """LLM wraps array in {"items": [...]} — should unwrap.""" + raw = json.dumps({ + "items": [{"title": "Item1"}] + }) + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["title"] == "Item1" + + def test_wrapper_object_data_key(self): + """LLM wraps array in {"data": [...]} — should unwrap.""" + raw = json.dumps({ + "data": [{"title": "D1"}, {"title": "D2"}] + }) + result = _parse_llm_json_array(raw) + assert len(result) == 2 + + def test_single_dict_returned_as_list(self): + """A single JSON object (no array) is wrapped in a list.""" + raw = json.dumps({"title": "SingleControl", "objective": "Obj"}) + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["title"] == "SingleControl" + + def test_individual_json_objects_fallback(self): + """Multiple separate JSON objects (not in array) are collected.""" + raw = ( + 'Here are the controls:\n' + '{"title": "Ctrl1", "objective": "A"}\n' + '{"title": "Ctrl2", "objective": "B"}\n' + ) + result = _parse_llm_json_array(raw) + assert len(result) == 2 + titles = {r["title"] for r in result} + assert "Ctrl1" in titles + assert "Ctrl2" in titles + + def test_individual_objects_require_title(self): + """Fallback individual-object parsing only includes objects with 'title'.""" + raw = ( + '{"title": "HasTitle", "objective": "Yes"}\n' + '{"no_title_field": "skip_me"}\n' + ) + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["title"] == "HasTitle" + + def test_empty_string_returns_empty_list(self): + """Empty input returns empty list.""" + result = _parse_llm_json_array("") + assert result == [] + + def test_invalid_input_returns_empty_list(self): + """Completely invalid input returns empty list.""" + result = _parse_llm_json_array("this is not json at all, no braces anywhere") + assert result == [] + + def test_garbage_with_no_json_returns_empty(self): + """Random non-JSON text should return empty list.""" + result = _parse_llm_json_array("Hier ist meine Antwort: leider kann ich das nicht.") + assert result == [] + + def test_bracket_block_extraction(self): + """Array embedded in preamble text is extracted via bracket matching.""" + raw = 'Some preamble text...\n[{"title": "Extracted", "objective": "X"}]\nSome trailing text.' + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["title"] == "Extracted" + + def test_nested_objects_in_array(self): + """Array elements with nested objects (like scope) are parsed correctly.""" + raw = json.dumps([ + { + "title": "Nested", + "objective": "Test", + "scope": {"regions": ["EU", "DE"]}, + "requirements": ["Req1"], + } + ]) + result = _parse_llm_json_array(raw) + assert len(result) == 1 + assert result[0]["scope"]["regions"] == ["EU", "DE"] + + +# ============================================================================= +# Batch Size Configuration Tests +# ============================================================================= + +class TestBatchSizeConfig: + """Tests for batch_size configuration on GeneratorConfig.""" + + def test_default_batch_size(self): + config = GeneratorConfig() + assert config.batch_size == 5 + + def test_custom_batch_size(self): + config = GeneratorConfig(batch_size=10) + assert config.batch_size == 10 + + def test_batch_size_of_one(self): + config = GeneratorConfig(batch_size=1) + assert config.batch_size == 1 + + def test_batch_size_used_in_pipeline_constant(self): + """Verify that pipeline uses config.batch_size (not a hardcoded value).""" + config = GeneratorConfig(batch_size=3) + # BATCH_SIZE = config.batch_size or 5 — with batch_size=3 it should be 3 + batch_size = config.batch_size or 5 + assert batch_size == 3 + + def test_batch_size_zero_falls_back_to_five(self): + """batch_size=0 triggers `or 5` fallback in the pipeline loop.""" + config = GeneratorConfig(batch_size=0) + # Mimics the pipeline logic: BATCH_SIZE = config.batch_size or 5 + batch_size = config.batch_size or 5 + assert batch_size == 5 + + +# ============================================================================= +# Batch Processing Loop Tests (Mocked) +# ============================================================================= + +class TestBatchProcessingLoop: + """Tests for _process_batch, _structure_batch, _reformulate_batch with mocked LLM.""" + + def _make_chunk(self, regulation_code="owasp_asvs", article="V2.1.1", text="Test requirement"): + return RAGSearchResult( + text=text, + regulation_code=regulation_code, + regulation_name="OWASP ASVS" if "owasp" in regulation_code else "Test Source", + regulation_short="OWASP" if "owasp" in regulation_code else "TEST", + category="requirement", + article=article, + paragraph="", + source_url="https://example.com", + score=0.9, + ) + + @pytest.mark.asyncio + async def test_process_batch_splits_by_license_rule(self): + """_process_batch routes Rule 1+2 to _structure_batch and Rule 3 to _reformulate_batch.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + chunk_r1 = self._make_chunk("eu_2016_679", "Art. 35", "DSGVO text") + chunk_r3 = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI text") + + batch_items = [ + (chunk_r1, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}), + (chunk_r3, {"rule": 3, "name": "INTERNAL_ONLY"}), + ] + + # Mock _structure_batch and _reformulate_batch + structure_ctrl = GeneratedControl(title="Structured", license_rule=1, release_state="draft") + reform_ctrl = GeneratedControl(title="Reformed", license_rule=3, release_state="draft") + + mock_finder_instance = AsyncMock() + mock_finder_instance.find_anchors = AsyncMock(return_value=[]) + mock_finder_cls = MagicMock(return_value=mock_finder_instance) + + with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=[structure_ctrl]) as mock_struct, \ + patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[reform_ctrl]) as mock_reform, \ + patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \ + patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \ + patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim: + mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1) + config = GeneratorConfig(batch_size=5) + result = await pipeline._process_batch(batch_items, config, "test-job-123") + + mock_struct.assert_called_once() + mock_reform.assert_called_once() + # structure_batch received only the Rule 1 chunk + assert len(mock_struct.call_args[0][0]) == 1 + # reformulate_batch received only the Rule 3 chunk + assert len(mock_reform.call_args[0][0]) == 1 + + @pytest.mark.asyncio + async def test_structure_batch_calls_llm_and_parses(self): + """_structure_batch sends prompt to LLM and parses array response.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + + chunks = [ + self._make_chunk("eu_2016_679", "Art. 5", "Datensparsamkeit und Zweckbindung"), + self._make_chunk("eu_2016_679", "Art. 35", "DSFA bei hohem Risiko"), + ] + license_infos = [ + {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}, + {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}, + ] + + llm_response = json.dumps([ + { + "chunk_index": 1, + "title": "Datensparsamkeit", + "objective": "Nur notwendige Daten erheben.", + "rationale": "DSGVO Grundprinzip.", + "requirements": ["Datenminimierung"], + "test_procedure": ["Datenbestand pruefen"], + "evidence": ["Verarbeitungsverzeichnis"], + "severity": "high", + "tags": ["dsgvo", "datenschutz"], + }, + { + "chunk_index": 2, + "title": "DSFA Pflicht", + "objective": "DSFA bei hohem Risiko durchfuehren.", + "rationale": "Gesetzliche Pflicht.", + "requirements": ["DSFA erstellen"], + "test_procedure": ["DSFA Bericht pruefen"], + "evidence": ["DSFA Dokumentation"], + "severity": "high", + "tags": ["dsfa"], + }, + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response): + controls = await pipeline._structure_batch(chunks, license_infos) + + assert len(controls) == 2 + assert controls[0] is not None + assert controls[0].title == "Datensparsamkeit" + assert controls[0].license_rule == 1 + assert controls[0].source_original_text is not None + assert controls[0].customer_visible is True + assert controls[0].generation_metadata["processing_path"] == "structured_batch" + assert controls[0].generation_metadata["batch_size"] == 2 + + assert controls[1] is not None + assert controls[1].title == "DSFA Pflicht" + assert controls[1].license_rule == 1 + + @pytest.mark.asyncio + async def test_reformulate_batch_calls_llm_and_strips_source(self): + """_reformulate_batch produces Rule 3 controls without source info.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + + chunks = [ + self._make_chunk("bsi_tr03161", "O.Auth_1", "Multi-factor authentication for apps"), + ] + config = GeneratorConfig(batch_size=5) + + llm_response = json.dumps([ + { + "chunk_index": 1, + "title": "Starke Authentifizierung", + "objective": "Mehrstufige Anmeldung implementieren.", + "rationale": "Schutz vor unbefugtem Zugriff.", + "requirements": ["MFA einrichten"], + "test_procedure": ["MFA Funktionstest"], + "evidence": ["MFA Konfiguration"], + "severity": "critical", + "tags": ["auth", "mfa"], + } + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response): + controls = await pipeline._reformulate_batch(chunks, config) + + assert len(controls) == 1 + ctrl = controls[0] + assert ctrl is not None + assert ctrl.title == "Starke Authentifizierung" + assert ctrl.license_rule == 3 + assert ctrl.source_original_text is None + assert ctrl.source_citation is None + assert ctrl.customer_visible is False + assert ctrl.generation_metadata["processing_path"] == "llm_reform_batch" + # Must not contain BSI references + metadata_str = json.dumps(ctrl.generation_metadata) + assert "bsi" not in metadata_str.lower() + assert "TR-03161" not in metadata_str + + @pytest.mark.asyncio + async def test_structure_batch_maps_by_chunk_index(self): + """Controls are mapped back to the correct chunk via chunk_index.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + + chunks = [ + self._make_chunk("eu_2016_679", "Art. 5", "First chunk"), + self._make_chunk("eu_2016_679", "Art. 6", "Second chunk"), + self._make_chunk("eu_2016_679", "Art. 7", "Third chunk"), + ] + license_infos = [{"rule": 1, "name": "DSGVO", "license": "EU_LAW"}] * 3 + + # LLM returns them in reversed order + llm_response = json.dumps([ + { + "chunk_index": 3, + "title": "Third Control", + "objective": "Obj3", + "rationale": "Rat3", + "requirements": [], + "test_procedure": [], + "evidence": [], + "severity": "low", + "tags": [], + }, + { + "chunk_index": 1, + "title": "First Control", + "objective": "Obj1", + "rationale": "Rat1", + "requirements": [], + "test_procedure": [], + "evidence": [], + "severity": "high", + "tags": [], + }, + { + "chunk_index": 2, + "title": "Second Control", + "objective": "Obj2", + "rationale": "Rat2", + "requirements": [], + "test_procedure": [], + "evidence": [], + "severity": "medium", + "tags": [], + }, + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response): + controls = await pipeline._structure_batch(chunks, license_infos) + + assert len(controls) == 3 + # Verify correct mapping despite shuffled chunk_index + assert controls[0].title == "First Control" + assert controls[1].title == "Second Control" + assert controls[2].title == "Third Control" + + @pytest.mark.asyncio + async def test_structure_batch_falls_back_to_position(self): + """If chunk_index is missing, controls are assigned by position.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + + chunks = [ + self._make_chunk("eu_2016_679", "Art. 5", "Chunk A"), + self._make_chunk("eu_2016_679", "Art. 6", "Chunk B"), + ] + license_infos = [{"rule": 1, "name": "DSGVO", "license": "EU_LAW"}] * 2 + + # No chunk_index in response + llm_response = json.dumps([ + { + "title": "PositionA", + "objective": "ObjA", + "rationale": "RatA", + "requirements": [], + "test_procedure": [], + "evidence": [], + "severity": "medium", + "tags": [], + }, + { + "title": "PositionB", + "objective": "ObjB", + "rationale": "RatB", + "requirements": [], + "test_procedure": [], + "evidence": [], + "severity": "medium", + "tags": [], + }, + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock, return_value=llm_response): + controls = await pipeline._structure_batch(chunks, license_infos) + + assert len(controls) == 2 + assert controls[0].title == "PositionA" + assert controls[1].title == "PositionB" + + @pytest.mark.asyncio + async def test_process_batch_only_structure_no_reform(self): + """Batch with only Rule 1+2 items skips _reformulate_batch.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + chunk = self._make_chunk("eu_2016_679", "Art. 5", "DSGVO text") + batch_items = [ + (chunk, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}), + ] + + ctrl = GeneratedControl(title="OnlyStructure", license_rule=1, release_state="draft") + + mock_finder_instance = AsyncMock() + mock_finder_instance.find_anchors = AsyncMock(return_value=[]) + mock_finder_cls = MagicMock(return_value=mock_finder_instance) + + with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=[ctrl]) as mock_struct, \ + patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock) as mock_reform, \ + patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \ + patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls): + config = GeneratorConfig() + result = await pipeline._process_batch(batch_items, config, "job-1") + + mock_struct.assert_called_once() + mock_reform.assert_not_called() + assert len(result) == 1 + assert result[0].title == "OnlyStructure" + + @pytest.mark.asyncio + async def test_process_batch_only_reform_no_structure(self): + """Batch with only Rule 3 items skips _structure_batch.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + chunk = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI text") + batch_items = [ + (chunk, {"rule": 3, "name": "INTERNAL_ONLY"}), + ] + + ctrl = GeneratedControl(title="OnlyReform", license_rule=3, release_state="draft") + + mock_finder_instance = AsyncMock() + mock_finder_instance.find_anchors = AsyncMock(return_value=[]) + mock_finder_cls = MagicMock(return_value=mock_finder_instance) + + with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock) as mock_struct, \ + patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[ctrl]) as mock_reform, \ + patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \ + patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \ + patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim: + mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1) + config = GeneratorConfig() + result = await pipeline._process_batch(batch_items, config, "job-2") + + mock_struct.assert_not_called() + mock_reform.assert_called_once() + assert len(result) == 1 + assert result[0].title == "OnlyReform" + + @pytest.mark.asyncio + async def test_process_batch_mixed_rules(self): + """Batch with mixed Rule 1 and Rule 3 items calls both sub-methods.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + chunk_r1 = self._make_chunk("eu_2016_679", "Art. 5", "DSGVO") + chunk_r2 = self._make_chunk("owasp_asvs", "V2.1", "OWASP") + chunk_r3a = self._make_chunk("bsi_tr03161", "O.Auth_1", "BSI A") + chunk_r3b = self._make_chunk("iso_27001", "A.9.1", "ISO B") + + batch_items = [ + (chunk_r1, {"rule": 1, "name": "DSGVO", "license": "EU_LAW"}), + (chunk_r3a, {"rule": 3, "name": "INTERNAL_ONLY"}), + (chunk_r2, {"rule": 2, "name": "OWASP ASVS", "license": "CC-BY-SA-4.0", "attribution": "OWASP Foundation"}), + (chunk_r3b, {"rule": 3, "name": "INTERNAL_ONLY"}), + ] + + struct_ctrls = [ + GeneratedControl(title="DSGVO Ctrl", license_rule=1, release_state="draft"), + GeneratedControl(title="OWASP Ctrl", license_rule=2, release_state="draft"), + ] + reform_ctrls = [ + GeneratedControl(title="BSI Ctrl", license_rule=3, release_state="draft"), + GeneratedControl(title="ISO Ctrl", license_rule=3, release_state="draft"), + ] + + mock_finder_instance = AsyncMock() + mock_finder_instance.find_anchors = AsyncMock(return_value=[]) + mock_finder_cls = MagicMock(return_value=mock_finder_instance) + + with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock, return_value=struct_ctrls) as mock_struct, \ + patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=reform_ctrls) as mock_reform, \ + patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \ + patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \ + patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim: + mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.05, ngram_jaccard=0.05, lcs_ratio=0.05) + config = GeneratorConfig() + result = await pipeline._process_batch(batch_items, config, "job-mixed") + + # Both methods called + mock_struct.assert_called_once() + mock_reform.assert_called_once() + # structure_batch gets 2 items (Rule 1 + Rule 2) + assert len(mock_struct.call_args[0][0]) == 2 + # reformulate_batch gets 2 items (Rule 3 + Rule 3) + assert len(mock_reform.call_args[0][0]) == 2 + # Result has 4 controls total + assert len(result) == 4 + + @pytest.mark.asyncio + async def test_process_batch_empty_batch(self): + """Empty batch returns empty list.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + config = GeneratorConfig() + result = await pipeline._process_batch([], config, "job-empty") + assert result == [] + + @pytest.mark.asyncio + async def test_reformulate_batch_too_close_flagged(self): + """Rule 3 controls that are too similar to source get flagged.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db) + pipeline._existing_controls = [] + + chunk = self._make_chunk("bsi_tr03161", "O.Auth_1", "Authentication must use MFA") + batch_items = [ + (chunk, {"rule": 3, "name": "INTERNAL_ONLY"}), + ] + + ctrl = GeneratedControl( + title="Auth MFA", + objective="Authentication must use MFA", + rationale="Security", + license_rule=3, + release_state="draft", + generation_metadata={}, + ) + + # Simulate similarity FAIL (too close to source) + fail_report = MagicMock(status="FAIL", token_overlap=0.85, ngram_jaccard=0.9, lcs_ratio=0.88) + + mock_finder_instance = AsyncMock() + mock_finder_instance.find_anchors = AsyncMock(return_value=[]) + mock_finder_cls = MagicMock(return_value=mock_finder_instance) + + with patch.object(pipeline, "_structure_batch", new_callable=AsyncMock), \ + patch.object(pipeline, "_reformulate_batch", new_callable=AsyncMock, return_value=[ctrl]), \ + patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \ + patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \ + patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock, return_value=fail_report): + config = GeneratorConfig() + result = await pipeline._process_batch(batch_items, config, "job-tooclose") + + assert len(result) == 1 + assert result[0].release_state == "too_close" + assert result[0].generation_metadata["similarity_status"] == "FAIL" diff --git a/docs-src/development/testing.md b/docs-src/development/testing.md index 83d5e53..2e6bab4 100644 --- a/docs-src/development/testing.md +++ b/docs-src/development/testing.md @@ -209,3 +209,38 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst: ``` 3. **Tests ausfuehren**: `go test -v ./internal/services/...` 4. **Dokumentation aktualisieren** (siehe [Dokumentation](./documentation.md)) + +--- + +## Modul-spezifische Tests + +### Canonical Control Generator (82 Tests) + +Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien. +Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) fuer Details. + +```bash +# Alle Generator-Tests +cd backend-compliance && pytest -v tests/test_control_generator.py + +# Similarity Detector Tests +cd backend-compliance && pytest -v compliance/tests/test_similarity_detector.py + +# API Route Tests +cd backend-compliance && pytest -v tests/test_canonical_control_routes.py + +# License Gate Tests +cd backend-compliance && pytest -v tests/test_license_gate.py + +# CI/CD Validator Tests +cd backend-compliance && pytest -v tests/test_validate_controls.py +``` + +**Wichtig:** Die Generator-Tests nutzen Mocks fuer Anthropic-API und Qdrant — sie laufen ohne externe Abhaengigkeiten. +Die `TestPipelineMocked`-Klasse prueft insbesondere: + +- Korrekte Lizenz-Klassifikation (Rule 1/2/3 Verhalten) +- Rule 3 exponiert **keine** Quellennamen in `generation_metadata` +- SHA-256 Hash-Deduplizierung fuer Chunks +- Config-Defaults (`batch_size: 5`, `skip_processed: true`) +- Rule 1 Citation wird korrekt mit Gesetzesreferenz generiert diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md index 5177a81..48c0160 100644 --- a/docs-src/services/sdk-modules/canonical-control-library.md +++ b/docs-src/services/sdk-modules/canonical-control-library.md @@ -118,6 +118,13 @@ erDiagram | `GET` | `/v1/canonical/sources` | Quellenregister mit Berechtigungen | | `GET` | `/v1/canonical/licenses` | Lizenz-Matrix | | `POST` | `/v1/canonical/controls/{id}/similarity-check` | Too-Close-Pruefung | +| `POST` | `/v1/canonical/generate` | Generator-Job starten | +| `GET` | `/v1/canonical/generate/jobs` | Alle Generator-Jobs | +| `GET` | `/v1/canonical/generate/processed-stats` | Verarbeitungsstatistik pro Collection | +| `GET` | `/v1/canonical/generate/review-queue` | Controls zur Pruefung | +| `POST` | `/v1/canonical/generate/review/{control_id}` | Review abschliessen | +| `GET` | `/v1/canonical/blocked-sources` | Gesperrte Quellen (Rule 3) | +| `POST` | `/v1/canonical/blocked-sources/cleanup` | Cleanup-Workflow starten | ### Beispiel: Control abrufen @@ -224,7 +231,8 @@ Der Validator (`scripts/validate-controls.py`) prueft bei jedem Commit: ## Control Generator Pipeline -Automatische Generierung von Controls aus dem gesamten RAG-Korpus (170.000+ Chunks aus Gesetzen, Verordnungen und Standards). +Automatische Generierung von Controls aus dem gesamten RAG-Korpus (~183.000 Chunks aus Gesetzen, Verordnungen und Standards). +Aktueller Stand: **~2.120 Controls** generiert. ### 8-Stufen-Pipeline @@ -233,14 +241,15 @@ flowchart TD A[1. RAG Scroll] -->|Alle Chunks| B[2. Prefilter - Lokales LLM] B -->|Irrelevant| C[Als processed markieren] B -->|Relevant| D[3. License Classify] - D -->|Rule 1/2| E[4a. Structure - Anthropic] - D -->|Rule 3| F[4b. LLM Reform - Anthropic] - E --> G[5. Harmonization - Embeddings] - F --> G - G -->|Duplikat| H[Als Duplikat speichern] - G -->|Neu| I[6. Anchor Search] - I --> J[7. Store Control] - J --> K[8. Mark Processed] + D -->|Batch sammeln| E[4. Batch Processing - 5 Chunks/API-Call] + E -->|Rule 1/2| F[4a. Structure Batch - Anthropic] + E -->|Rule 3| G[4b. Reform Batch - Anthropic] + F --> H[5. Harmonization - Embeddings] + G --> H + H -->|Duplikat| I[Als Duplikat speichern] + H -->|Neu| J[6. Anchor Search] + J --> K[7. Store Control] + K --> L[8. Mark Processed] ``` ### Stufe 1: RAG Scroll (Vollstaendig) @@ -273,6 +282,67 @@ Dies spart >50% der Anthropic-API-Kosten. - **Rule 1+2:** Anthropic strukturiert den Originaltext in Control-Format (Titel, Ziel, Anforderungen) - **Rule 3:** Anthropic reformuliert vollstaendig — kein Originaltext, keine Quellennamen +### Batch Processing (Stufe 4 — Optimierung) + +Die Pipeline verarbeitet Chunks **nicht einzeln**, sondern sammelt sie in Batches von **5 Chunks pro API-Call**. +Das reduziert die Anzahl der Anthropic-API-Aufrufe um ~80% und beschleunigt die Generierung erheblich. + +#### Ablauf + +1. **Chunks sammeln:** Nach dem Prefilter werden relevante Chunks mit ihrer Lizenz-Info in `pending_batch` gesammelt +2. **Batch voll?** Sobald `batch_size` (Default: 5) erreicht ist, wird `_flush_batch()` aufgerufen +3. **`_process_batch()`** trennt den Batch nach Lizenzregel: + - **Rule 1+2 Chunks** → `_structure_batch()` — ein einziger Anthropic-Call fuer alle + - **Rule 3 Chunks** → `_reformulate_batch()` — ein einziger Anthropic-Call fuer alle +4. **Ergebnis:** JSON-Array mit genau N Controls, zurueck-gemappt per `chunk_index` + +#### `_structure_batch()` (Rule 1+2) + +Sendet alle freien/CC-BY Chunks in einem einzigen Prompt an Anthropic. Der Originaltext darf verwendet werden. +Jeder Chunk wird als `--- CHUNK N ---` Block formatiert, das LLM gibt ein JSON-Array mit `chunk_index` zurueck. + +```python +# Prompt-Auszug: +"Strukturiere die folgenden 5 Gesetzestexte jeweils als eigenstaendiges Control." +"Gib ein JSON-Array zurueck mit GENAU 5 Objekten." +``` + +**Processing Path:** `structured_batch` (in `generation_metadata`) + +#### `_reformulate_batch()` (Rule 3) + +Sendet alle eingeschraenkten Chunks in einem Prompt. Der Originaltext darf **nicht kopiert** werden. +Quellennamen und proprietaere Bezeichner werden im Prompt explizit verboten. + +```python +# Prompt-Auszug: +"KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur." +"NENNE NICHT die Quellen. Keine proprietaeren Bezeichner." +``` + +**Processing Path:** `llm_reform_batch` (in `generation_metadata`) + +#### Fallback bei Batch-Fehler + +Falls ein Batch-Call fehlschlaegt (z.B. Timeout, Parsing-Error), faellt die Pipeline automatisch auf **Einzelverarbeitung** zurueck: + +```python +except Exception as e: + logger.error("Batch processing failed: %s — falling back to single-chunk mode", e) + for chunk, _lic in batch: + ctrl = await self._process_single_chunk(chunk, config, job_id) +``` + +!!! info "Batch-Konfiguration" + | Parameter | Wert | Beschreibung | + |-----------|------|-------------| + | `batch_size` | 5 (Default) | Chunks pro API-Call | + | `max_tokens` | 8192 | Maximale Token-Laenge der LLM-Antwort | + | `LLM_TIMEOUT` | 180s | Timeout pro Anthropic-Call | + + Die `batch_size` ist ueber `GeneratorConfig` konfigurierbar. + Bei grosser Batch-Size steigt die Wahrscheinlichkeit fuer Parsing-Fehler. + ### Stufe 5: Harmonisierung (Embedding-basiert) Prueft per bge-m3 Embeddings (Cosine Similarity > 0.85), ob ein aehnliches Control existiert. @@ -310,7 +380,17 @@ system, risk, governance, hardware, identity | `CONTROL_GEN_ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-Modell fuer Formulierung | | `OLLAMA_URL` | `http://host.docker.internal:11434` | Lokaler Ollama-Server (Vorfilter) | | `CONTROL_GEN_OLLAMA_MODEL` | `qwen3:30b-a3b` | Lokales LLM fuer Vorfilter | -| `CONTROL_GEN_LLM_TIMEOUT` | `120` | Timeout in Sekunden | +| `CONTROL_GEN_LLM_TIMEOUT` | `180` | Timeout in Sekunden (erhoet fuer Batch-Calls) | + +**Pipeline-Konfiguration (via `GeneratorConfig`):** + +| Parameter | Default | Beschreibung | +|-----------|---------|-------------| +| `batch_size` | `5` | Chunks pro Anthropic-API-Call | +| `max_controls` | `0` | Limit (0 = alle Chunks verarbeiten) | +| `skip_processed` | `true` | Bereits verarbeitete Chunks ueberspringen | +| `dry_run` | `false` | Trockenlauf ohne DB-Schreibzugriffe | +| `skip_web_search` | `false` | Web-Suche fuer Anchor-Finder ueberspringen | ### Architektur-Entscheidung: Gesetzesverweise @@ -351,15 +431,145 @@ curl https://macmini:8002/api/compliance/v1/canonical/generate/jobs \ --- +## Processed Chunks Tracking + +Die Tabelle `canonical_processed_chunks` trackt **JEDEN** verarbeiteten RAG-Chunk per SHA-256-Hash. +Dadurch werden Chunks bei erneutem Pipeline-Lauf automatisch uebersprungen (`skip_processed: true`). + +### Tabelle: `canonical_processed_chunks` (Migration 046 + 048) + +| Spalte | Typ | Beschreibung | +|--------|-----|-------------| +| `id` | UUID | Primary Key | +| `chunk_hash` | VARCHAR(64) | SHA-256 Hash des Chunk-Textes | +| `collection` | VARCHAR(100) | Qdrant-Collection (z.B. `bp_compliance_gesetze`) | +| `regulation_code` | VARCHAR(100) | Quell-Regulation (z.B. `bdsg`, `eu_2016_679`) | +| `document_version` | VARCHAR(50) | Versions-Tracking | +| `source_license` | VARCHAR(50) | Lizenz der Quelle | +| `license_rule` | INTEGER | 1, 2 oder 3 | +| `processing_path` | VARCHAR(20) | Verarbeitungspfad (siehe unten) | +| `generated_control_ids` | JSONB | UUIDs der generierten Controls | +| `job_id` | UUID | Referenz auf `canonical_generation_jobs` | +| `processed_at` | TIMESTAMPTZ | Zeitstempel | + +**UNIQUE Constraint:** `(chunk_hash, collection, document_version)` — verhindert Doppelverarbeitung. + +### Processing Paths + +| Wert | Stufe | Bedeutung | +|------|-------|-----------| +| `prefilter_skip` | 2 | Lokaler LLM-Vorfilter: Chunk nicht sicherheitsrelevant | +| `structured` | 4a | Einzelner Chunk strukturiert (Rule 1/2) | +| `llm_reform` | 4b | Einzelner Chunk reformuliert (Rule 3) | +| `structured_batch` | 4a | Batch-Strukturierung (Rule 1/2, in `generation_metadata`) | +| `llm_reform_batch` | 4b | Batch-Reformulierung (Rule 3, in `generation_metadata`) | +| `no_control` | 4 | LLM konnte kein Control ableiten | +| `store_failed` | 7 | DB-Speichern fehlgeschlagen | +| `error` | — | Unerwarteter Fehler bei der Verarbeitung | + +!!! note "Batch-Pfade in generation_metadata" + Die Werte `structured_batch` und `llm_reform_batch` werden im `processing_path` der Datenbank gespeichert + **und** im `generation_metadata` JSON-Feld des Controls. So ist nachvollziehbar, ob ein Control + einzeln oder im Batch generiert wurde. + +### Beispiel-Query: Verarbeitungsstatistik + +```sql +SELECT + processing_path, + COUNT(*) as count +FROM canonical_processed_chunks +GROUP BY processing_path +ORDER BY count DESC; +``` + +--- + +## Statistiken (processed-stats Endpoint) + +Der Endpoint `GET /v1/canonical/generate/processed-stats` liefert Verarbeitungsstatistiken pro RAG-Collection. + +```bash +curl -s https://macmini:8002/api/compliance/v1/canonical/generate/processed-stats | jq +``` + +**Response:** +```json +{ + "stats": [ + { + "collection": "bp_compliance_gesetze", + "processed_chunks": 45200, + "direct_adopted": 1850, + "llm_reformed": 120, + "skipped": 43230, + "total_chunks_estimated": 0, + "pending_chunks": 0 + } + ] +} +``` + +### Aktuelle Groessenordnung + +| Metrik | Wert | +|--------|------| +| RAG-Chunks gesamt | ~183.000 | +| Verarbeitete Chunks | ~183.000 (vollstaendig) | +| Generierte Controls | **~2.120** | +| Konversionsrate | ~1,2% (nur sicherheitsrelevante Chunks erzeugen Controls) | + +!!! info "Warum so wenige Controls?" + Die meisten RAG-Chunks sind Definitionen, Begriffsbestimmungen, Inhaltsverzeichnisse oder + Uebergangsvorschriften. Der Prefilter (Stufe 2) sortiert >50% aus, die Harmonisierung (Stufe 5) + entfernt weitere Duplikate. Nur konkrete, einzigartige Anforderungen werden zu Controls. + +--- + +## Migration von Controls (Lokal → Production) + +Controls koennen ueber die REST-API von der lokalen Entwicklungsumgebung in die Production migriert werden. +Jedes Control wird einzeln per `POST` mit der Referenz auf das Framework erstellt. + +```bash +# 1. Control aus lokaler Umgebung exportieren +curl -s https://macmini:8002/api/compliance/v1/canonical/controls/AUTH-001 | jq > control.json + +# 2. In Production importieren (mit framework_id) +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/controls \ + -H 'Content-Type: application/json' \ + -d '{ + "framework_id": "bp_security_v1", + "control_id": "AUTH-001", + "title": "Multi-Faktor-Authentifizierung", + "objective": "...", + "severity": "high", + "open_anchors": [...] + }' +``` + +!!! warning "Framework muss existieren" + Das Ziel-Framework (`bp_security_v1`) muss in der Production-DB bereits angelegt sein. + Falls nicht, zuerst das Framework erstellen: + ```bash + curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/frameworks \ + -H 'Content-Type: application/json' \ + -d '{"framework_id": "bp_security_v1", "name": "BreakPilot Security", "version": "1.0"}' + ``` + +--- + ## Dateien | Datei | Typ | Beschreibung | |-------|-----|-------------| | `backend-compliance/migrations/044_canonical_control_library.sql` | SQL | 5 Tabellen + Seed-Daten | +| `backend-compliance/migrations/046_control_generator.sql` | SQL | Job-Tracking, Chunk-Tracking, Blocked Sources | | `backend-compliance/migrations/047_verification_method_category.sql` | SQL | verification_method + category Felder | +| `backend-compliance/migrations/048_processing_path_expand.sql` | SQL | Erweiterte processing_path Werte | | `backend-compliance/compliance/api/canonical_control_routes.py` | Python | REST API (8+ Endpoints) | -| `backend-compliance/compliance/api/control_generator_routes.py` | Python | Generator API (Start/Status/Jobs) | -| `backend-compliance/compliance/services/control_generator.py` | Python | 8-Stufen-Pipeline | +| `backend-compliance/compliance/api/control_generator_routes.py` | Python | Generator API (Start/Status/Jobs/Stats) | +| `backend-compliance/compliance/services/control_generator.py` | Python | 8-Stufen-Pipeline mit Batch Processing | | `backend-compliance/compliance/services/license_gate.py` | Python | Lizenz-Gate-Logik | | `backend-compliance/compliance/services/similarity_detector.py` | Python | Too-Close-Detektor (5 Metriken) | | `backend-compliance/compliance/services/rag_client.py` | Python | RAG-Client (Search + Scroll) | @@ -376,11 +586,25 @@ curl https://macmini:8002/api/compliance/v1/canonical/generate/jobs \ ## Tests -| Datei | Sprache | Tests | -|-------|---------|-------| -| `ai-compliance-sdk/internal/ucca/canonical_control_loader_test.go` | Go | 8 Tests | -| `backend-compliance/compliance/tests/test_similarity_detector.py` | Python | 19 Tests | -| `backend-compliance/tests/test_canonical_control_routes.py` | Python | 14 Tests | -| `backend-compliance/tests/test_license_gate.py` | Python | 12 Tests | -| `backend-compliance/tests/test_validate_controls.py` | Python | 14 Tests | -| **Gesamt** | | **67 Tests** | +| Datei | Sprache | Tests | Schwerpunkt | +|-------|---------|-------|-------------| +| `ai-compliance-sdk/internal/ucca/canonical_control_loader_test.go` | Go | 8 Tests | Control Loader, Multi-Index | +| `backend-compliance/compliance/tests/test_similarity_detector.py` | Python | 19 Tests | Too-Close-Detektor, 5 Metriken | +| `backend-compliance/tests/test_canonical_control_routes.py` | Python | 14 Tests | REST API Endpoints | +| `backend-compliance/tests/test_license_gate.py` | Python | 12 Tests | Lizenz-Klassifikation | +| `backend-compliance/tests/test_validate_controls.py` | Python | 14 Tests | CI/CD Validator | +| `backend-compliance/tests/test_control_generator.py` | Python | 15 Tests | Pipeline, Batch, Lizenzregeln | +| **Gesamt** | | **82 Tests** | + +### Control Generator Tests (test_control_generator.py) + +Die Generator-Tests decken folgende Bereiche ab: + +- **`TestLicenseMapping`** (12 Tests) — Korrekte Zuordnung von `regulation_code` zu Lizenzregeln (Rule 1/2/3), + Case-Insensitivity, Rule 3 darf keine Quellennamen exponieren +- **`TestDomainDetection`** (5 Tests) — Erkennung von AUTH, CRYPT, NET, DATA Domains aus Chunk-Text +- **`TestJsonParsing`** (4 Tests) — Robustes Parsing von LLM-Antworten (plain JSON, Markdown-Fenced, mit Preamble) +- **`TestGeneratedControlRules`** (3 Tests) — Rule 1 hat Originaltext, Rule 2 hat Citation, Rule 3 hat **nichts** +- **`TestAnchorFinder`** (2 Tests) — RAG-Suche filtert Rule 3 Quellen aus, Web-Suche erkennt Frameworks +- **`TestPipelineMocked`** (5 Tests) — End-to-End mit Mocks: Lizenz-Klassifikation, Rule 3 Blocking, + Hash-Deduplizierung, Config-Defaults (`batch_size: 5`), Rule 1 Citation-Generierung