diff --git a/backend-compliance/compliance/services/control_generator.py b/backend-compliance/compliance/services/control_generator.py index 6ed5e2e..fce4316 100644 --- a/backend-compliance/compliance/services/control_generator.py +++ b/backend-compliance/compliance/services/control_generator.py @@ -53,6 +53,11 @@ LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180")) HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate +# Pipeline version — increment when generation rules change materially. +# v1: Original (local LLM prefilter, old prompt) +# v2: Anthropic decides relevance, null for non-requirement chunks, annexes protected +PIPELINE_VERSION = 2 + ALL_COLLECTIONS = [ "bp_compliance_ce", "bp_compliance_gesetze", @@ -1663,7 +1668,7 @@ Kategorien: {CATEGORY_LIST_STR}""" license_rule, source_original_text, source_citation, customer_visible, generation_metadata, verification_method, category, generation_strategy, - target_audience + target_audience, pipeline_version ) VALUES ( :framework_id, :control_id, :title, :objective, :rationale, :scope, :requirements, :test_procedure, :evidence, @@ -1672,7 +1677,7 @@ Kategorien: {CATEGORY_LIST_STR}""" :license_rule, :source_original_text, :source_citation, :customer_visible, :generation_metadata, :verification_method, :category, :generation_strategy, - :target_audience + :target_audience, :pipeline_version ) ON CONFLICT (framework_id, control_id) DO NOTHING RETURNING id @@ -1702,6 +1707,7 @@ Kategorien: {CATEGORY_LIST_STR}""" "category": control.category, "generation_strategy": control.generation_strategy, "target_audience": json.dumps(control.target_audience) if control.target_audience else None, + "pipeline_version": PIPELINE_VERSION, }, ) self.db.commit() @@ -1728,11 +1734,13 @@ Kategorien: {CATEGORY_LIST_STR}""" INSERT INTO canonical_processed_chunks ( chunk_hash, collection, regulation_code, document_version, source_license, license_rule, - processing_path, generated_control_ids, job_id + 
processing_path, generated_control_ids, job_id, + pipeline_version ) VALUES ( :hash, :collection, :regulation_code, :doc_version, :license, :rule, - :path, :control_ids, CAST(:job_id AS uuid) + :path, :control_ids, CAST(:job_id AS uuid), + :pipeline_version ) ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING """), @@ -1746,6 +1754,7 @@ Kategorien: {CATEGORY_LIST_STR}""" "path": processing_path, "control_ids": json.dumps(control_ids), "job_id": job_id, + "pipeline_version": PIPELINE_VERSION, }, ) self.db.commit() diff --git a/backend-compliance/migrations/062_pipeline_version.sql b/backend-compliance/migrations/062_pipeline_version.sql new file mode 100644 index 0000000..c8c7cab --- /dev/null +++ b/backend-compliance/migrations/062_pipeline_version.sql @@ -0,0 +1,22 @@ +-- Migration 062: Add pipeline_version to track which generation rules produced each control/chunk +-- +-- v1 = Original pipeline (local LLM prefilter, old prompt without null-skip) +-- v2 = Improved pipeline (skip_prefilter, Anthropic decides relevance, annexes protected) +-- +-- This allows identifying controls that may need reprocessing when pipeline rules change. 
+ +ALTER TABLE canonical_controls + ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1; + +ALTER TABLE canonical_processed_chunks + ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1; + +-- Index for efficient querying by version +CREATE INDEX IF NOT EXISTS idx_canonical_controls_pipeline_version + ON canonical_controls (pipeline_version); + +CREATE INDEX IF NOT EXISTS idx_canonical_processed_chunks_pipeline_version + ON canonical_processed_chunks (pipeline_version); + +COMMENT ON COLUMN canonical_controls.pipeline_version IS 'Generation pipeline version: 1=original (local prefilter), 2=improved (Anthropic decides relevance, annexes protected)'; +COMMENT ON COLUMN canonical_processed_chunks.pipeline_version IS 'Pipeline version used when this chunk was processed'; diff --git a/backend-compliance/tests/test_control_generator.py b/backend-compliance/tests/test_control_generator.py index c8b2447..29c025d 100644 --- a/backend-compliance/tests/test_control_generator.py +++ b/backend-compliance/tests/test_control_generator.py @@ -8,10 +8,12 @@ from compliance.services.control_generator import ( _classify_regulation, _detect_domain, _parse_llm_json, + _parse_llm_json_array, GeneratorConfig, GeneratedControl, ControlGeneratorPipeline, REGULATION_LICENSE_MAP, + PIPELINE_VERSION, ) from compliance.services.anchor_finder import AnchorFinder, OpenAnchor from compliance.services.rag_client import RAGSearchResult @@ -91,7 +93,7 @@ class TestDomainDetection: assert _detect_domain("Multi-factor authentication and password policy") == "AUTH" def test_crypto_domain(self): - assert _detect_domain("TLS 1.3 encryption and certificate management") == "CRYPT" + assert _detect_domain("TLS 1.3 encryption and certificate management") == "CRYP" def test_network_domain(self): assert _detect_domain("Firewall rules and network segmentation") == "NET" @@ -807,7 +809,7 @@ class TestBatchProcessingLoop: patch.object(pipeline, "_check_harmonization", 
new_callable=AsyncMock, return_value=[]), \ patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls): config = GeneratorConfig() - result = await pipeline._process_batch(batch_items, config, "job-1") + result, qa_count = await pipeline._process_batch(batch_items, config, "job-1") mock_struct.assert_called_once() mock_reform.assert_not_called() @@ -839,7 +841,7 @@ class TestBatchProcessingLoop: patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim: mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1) config = GeneratorConfig() - result = await pipeline._process_batch(batch_items, config, "job-2") + result, qa_count = await pipeline._process_batch(batch_items, config, "job-2") mock_struct.assert_not_called() mock_reform.assert_called_once() @@ -885,7 +887,7 @@ class TestBatchProcessingLoop: patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim: mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.05, ngram_jaccard=0.05, lcs_ratio=0.05) config = GeneratorConfig() - result = await pipeline._process_batch(batch_items, config, "job-mixed") + result, qa_count = await pipeline._process_batch(batch_items, config, "job-mixed") # Both methods called mock_struct.assert_called_once() @@ -905,8 +907,9 @@ class TestBatchProcessingLoop: pipeline._existing_controls = [] config = GeneratorConfig() - result = await pipeline._process_batch([], config, "job-empty") + result, qa_count = await pipeline._process_batch([], config, "job-empty") assert result == [] + assert qa_count == 0 @pytest.mark.asyncio async def test_reformulate_batch_too_close_flagged(self): @@ -942,7 +945,7 @@ class TestBatchProcessingLoop: patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \ patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock, return_value=fail_report): config = 
GeneratorConfig() - result = await pipeline._process_batch(batch_items, config, "job-tooclose") + result, qa_count = await pipeline._process_batch(batch_items, config, "job-tooclose") assert len(result) == 1 assert result[0].release_state == "too_close" @@ -1112,3 +1115,194 @@ class TestRegulationFilter: results = await pipeline._scan_rag(config) assert len(results) == 2 + + +# ============================================================================= +# Pipeline Version Tests +# ============================================================================= + +class TestPipelineVersion: + """Tests for pipeline_version propagation in DB writes and null handling.""" + + def test_pipeline_version_constant_is_2(self): + assert PIPELINE_VERSION == 2 + + def test_store_control_includes_pipeline_version(self): + """_store_control must pass pipeline_version=PIPELINE_VERSION to the INSERT.""" + mock_db = MagicMock() + # Framework lookup returns a UUID + fw_row = MagicMock() + fw_row.__getitem__ = lambda self, idx: "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee" + mock_db.execute.return_value.fetchone.return_value = fw_row + + pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock()) + + control = GeneratedControl( + control_id="SEC-TEST-001", + title="Test Control", + objective="Test objective", + ) + pipeline._store_control(control, job_id="00000000-0000-0000-0000-000000000001") + + # The second call to db.execute is the INSERT + calls = mock_db.execute.call_args_list + assert len(calls) >= 2, f"Expected at least 2 db.execute calls, got {len(calls)}" + insert_call = calls[1] + params = insert_call[0][1] # positional arg 1 = params dict + assert "pipeline_version" in params + assert params["pipeline_version"] == PIPELINE_VERSION + + def test_mark_chunk_processed_includes_pipeline_version(self): + """_mark_chunk_processed must pass pipeline_version=PIPELINE_VERSION to the INSERT.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db, 
rag_client=MagicMock()) + + chunk = MagicMock() + chunk.text = "Some chunk text for hashing" + chunk.collection = "bp_compliance_ce" + chunk.regulation_code = "eu_2016_679" + + license_info = {"license": "CC0-1.0", "rule": 1} + + pipeline._mark_chunk_processed( + chunk=chunk, + license_info=license_info, + processing_path="structured_batch", + control_ids=["SEC-TEST-001"], + job_id="00000000-0000-0000-0000-000000000001", + ) + + calls = mock_db.execute.call_args_list + assert len(calls) >= 1 + insert_call = calls[0] + params = insert_call[0][1] + assert "pipeline_version" in params + assert params["pipeline_version"] == PIPELINE_VERSION + + @pytest.mark.asyncio + async def test_structure_batch_handles_null_results(self): + """When _parse_llm_json_array returns [dict, None, dict], the null entries produce None.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock()) + + # Three chunks + chunks = [] + license_infos = [] + for i in range(3): + c = MagicMock() + c.text = f"Chunk text number {i} with enough content for processing" + c.regulation_name = "DSGVO" + c.regulation_code = "eu_2016_679" + c.article = f"Art. {i + 1}" + c.paragraph = "" + c.source_url = "" + c.collection = "bp_compliance_ce" + chunks.append(c) + license_infos.append({"rule": 1, "name": "DSGVO", "license": "CC0-1.0"}) + + # LLM returns a JSON array: valid, null, valid + llm_response = json.dumps([ + { + "chunk_index": 1, + "title": "Datenschutz-Kontrolle 1", + "objective": "Schutz personenbezogener Daten", + "rationale": "DSGVO-Konformitaet", + "requirements": ["Req 1"], + "test_procedure": ["Test 1"], + "evidence": ["Nachweis 1"], + "severity": "high", + "tags": ["dsgvo"], + "domain": "DATA", + "category": "datenschutz", + "target_audience": ["unternehmen"], + "source_article": "Art. 
1", + "source_paragraph": "", + }, + None, + { + "chunk_index": 3, + "title": "Datenschutz-Kontrolle 3", + "objective": "Transparenzpflicht", + "rationale": "Information der Betroffenen", + "requirements": ["Req 3"], + "test_procedure": ["Test 3"], + "evidence": ["Nachweis 3"], + "severity": "medium", + "tags": ["transparenz"], + "domain": "DATA", + "category": "datenschutz", + "target_audience": ["unternehmen"], + "source_article": "Art. 3", + "source_paragraph": "", + }, + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = llm_response + controls = await pipeline._structure_batch(chunks, license_infos) + + assert len(controls) == 3 + assert controls[0] is not None + assert controls[1] is None # Null entry from LLM + assert controls[2] is not None + + @pytest.mark.asyncio + async def test_reformulate_batch_handles_null_results(self): + """When _parse_llm_json_array returns [dict, None, dict], the null entries produce None.""" + mock_db = MagicMock() + pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock()) + + chunks = [] + for i in range(3): + c = MagicMock() + c.text = f"Restricted chunk text number {i} with BSI content" + c.regulation_name = "BSI TR-03161" + c.regulation_code = "bsi_tr03161" + c.article = f"Section {i + 1}" + c.paragraph = "" + c.source_url = "" + c.collection = "bp_compliance_ce" + chunks.append(c) + + config = GeneratorConfig(domain="SEC") + + llm_response = json.dumps([ + { + "chunk_index": 1, + "title": "Sicherheitskontrolle 1", + "objective": "Authentifizierung absichern", + "rationale": "Best Practice", + "requirements": ["Req 1"], + "test_procedure": ["Test 1"], + "evidence": ["Nachweis 1"], + "severity": "high", + "tags": ["sicherheit"], + "domain": "SEC", + "category": "it-sicherheit", + "target_audience": ["it-abteilung"], + }, + None, + { + "chunk_index": 3, + "title": "Sicherheitskontrolle 3", + "objective": "Netzwerk segmentieren", + 
"rationale": "Angriffsoberflaeche reduzieren", + "requirements": ["Req 3"], + "test_procedure": ["Test 3"], + "evidence": ["Nachweis 3"], + "severity": "medium", + "tags": ["netzwerk"], + "domain": "NET", + "category": "netzwerksicherheit", + "target_audience": ["it-abteilung"], + }, + ]) + + with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = llm_response + controls = await pipeline._reformulate_batch(chunks, config) + + assert len(controls) == 3 + assert controls[0] is not None + assert controls[1] is None # Null entry from LLM + assert controls[2] is not None diff --git a/docs-src/development/testing.md b/docs-src/development/testing.md index 2e6bab4..f6aab2c 100644 --- a/docs-src/development/testing.md +++ b/docs-src/development/testing.md @@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst: ## Modul-spezifische Tests -### Canonical Control Generator (82 Tests) +### Canonical Control Generator (71+ Tests) Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien. -Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) fuer Details. +Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details. ```bash -# Alle Generator-Tests +# Alle Generator-Tests (71 Tests in 10 Klassen) cd backend-compliance && pytest -v tests/test_control_generator.py # Similarity Detector Tests @@ -237,10 +237,19 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py ``` **Wichtig:** Die Generator-Tests nutzen Mocks fuer Anthropic-API und Qdrant — sie laufen ohne externe Abhaengigkeiten. 
-Die `TestPipelineMocked`-Klasse prueft insbesondere: -- Korrekte Lizenz-Klassifikation (Rule 1/2/3 Verhalten) -- Rule 3 exponiert **keine** Quellennamen in `generation_metadata` -- SHA-256 Hash-Deduplizierung fuer Chunks -- Config-Defaults (`batch_size: 5`, `skip_processed: true`) -- Rule 1 Citation wird korrekt mit Gesetzesreferenz generiert +**Testklassen in `test_control_generator.py`:** + +| Klasse | Tests | Prueft | +|--------|-------|--------| +| `TestLicenseMapping` | 12 | Lizenz-Klassifikation (Rule 1/2/3), Case-Insensitivitaet | +| `TestDomainDetection` | 5 | Keyword-basierte Domain-Erkennung (AUTH, CRYP, NET, DATA) | +| `TestJsonParsing` | 4 | JSON-Parser fuer LLM-Responses (Markdown-Fencing, Preamble) | +| `TestGeneratedControlRules` | 3 | Rule-spezifische Felder (original_text, citation, source_info) | +| `TestAnchorFinder` | 2 | RAG-Suche + Web-Framework-Erkennung | +| `TestPipelineMocked` | 5 | End-to-End Pipeline mit Mocks (Lizenz, Hash-Dedup, Config) | +| `TestParseJsonArray` | 15 | JSON-Array-Parser (Wrapper-Objekte, Bracket-Extraction, Fallbacks) | +| `TestBatchSizeConfig` | 5 | Batch-Groesse-Konfiguration + Defaults | +| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) | +| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes | +| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform | diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md index 3ebd239..d218a2d 100644 --- a/docs-src/services/sdk-modules/canonical-control-library.md +++ b/docs-src/services/sdk-modules/canonical-control-library.md @@ -244,44 +244,36 @@ Der Validator (`scripts/validate-controls.py`) prueft bei jedem Commit: Automatische Generierung von Controls aus dem gesamten RAG-Korpus (~105.000 Chunks aus Gesetzen, Verordnungen und Standards). 
Aktueller Stand: **~4.738 Controls** generiert. -### 9-Stufen-Pipeline +!!! tip "Ausfuehrliche Dokumentation" + Siehe **[Control Generator Pipeline](control-generator-pipeline.md)** fuer die vollstaendige Referenz inkl. API-Endpoints, Konfiguration, Kosten und Pipeline-Versionen. + +### 7-Stufen-Pipeline (v2) ```mermaid flowchart TD - A[1. RAG Scroll] -->|max_chunks| B[2. Prefilter - Lokales LLM] - B -->|Irrelevant| C[Als processed markieren] - B -->|Relevant| D[3. License Classify] - D -->|Batch sammeln| E[4. Batch Processing - 5 Chunks/API-Call] - E -->|Rule 1/2| F[4a. Structure Batch - Anthropic] - E -->|Rule 3| G[4b. Reform Batch - Anthropic] - F --> QA[5. QA Validation - Lokales LLM] - G --> QA - QA -->|Mismatch| QAF[Auto-Fix Category/Domain] - QA -->|OK| H[6. Harmonization - Embeddings] - QAF --> H - H -->|Duplikat| I[Als Duplikat speichern] - H -->|Neu| J[7. Anchor Search] - J --> K[8. Store Control] - K --> L[9. Mark Processed] + A[1. RAG Scan] -->|Alle Chunks laden| B[2. License Classify] + B -->|Rule 1/2| C[3a. Structure Batch] + B -->|Rule 3| D[3b. Reform Batch] + C --> E[4. Harmonize] + D --> E + E -->|Duplikat| F[Als Duplikat markieren] + E -->|Neu| G[5. Anchor Search] + G --> H[6. Store Control] + H --> I[7. Mark Processed] ``` -### Stufe 1: RAG Scroll (Vollstaendig) +!!! info "Pipeline-Version v2 (seit 2026-03-17)" + - **Kein lokaler Vorfilter mehr** — Anthropic API entscheidet selbst ueber Chunk-Relevanz via null-Returns + - **Annexe geschuetzt** — Technische Anforderungen in Anhaengen werden nicht mehr uebersprungen + - **`pipeline_version`** Spalte in DB unterscheidet v1- von v2-Controls -Scrollt durch **ALLE** Chunks in allen RAG-Collections mittels Qdrant Scroll-API. -Kein Limit — jeder Chunk wird verarbeitet, um keine gesetzlichen Anforderungen zu uebersehen. +### Stufe 1: RAG Scan + +Scrollt durch **ALLE** Chunks in den konfigurierten RAG-Collections mittels Qdrant Scroll-API. 
+Optionaler `regulation_filter` beschraenkt auf bestimmte Regulierungen per Prefix-Matching. Bereits verarbeitete Chunks werden per SHA-256-Hash uebersprungen (`canonical_processed_chunks`). -### Stufe 2: Lokaler LLM-Vorfilter (Qwen 30B) - -**Kostenoptimierung:** Bevor ein Chunk an die Anthropic API geht, prueft das lokale Qwen-Modell (`qwen3:30b-a3b` auf Mac Mini), ob der Chunk eine konkrete Anforderung enthaelt. - -- **Relevant:** Pflichten ("muss", "soll"), technische Massnahmen, Datenschutz-Vorgaben -- **Irrelevant:** Definitionen, Inhaltsverzeichnisse, Begriffsbestimmungen, Uebergangsvorschriften - -Irrelevante Chunks werden als `prefilter_skip` markiert und nie wieder verarbeitet. -Dies spart >50% der Anthropic-API-Kosten. - ### Stufe 3: Lizenz-Klassifikation (3-Regel-System) | Regel | Lizenz | Original erlaubt? | Beispiel | diff --git a/docs-src/services/sdk-modules/control-generator-pipeline.md b/docs-src/services/sdk-modules/control-generator-pipeline.md new file mode 100644 index 0000000..309a516 --- /dev/null +++ b/docs-src/services/sdk-modules/control-generator-pipeline.md @@ -0,0 +1,540 @@ +# Control Generator Pipeline + +Automatische Generierung von Canonical Controls aus dem gesamten RAG-Korpus (~105.000 Chunks aus Gesetzen, Verordnungen und Standards). + +**Backend:** `backend-compliance/compliance/services/control_generator.py` +**Routes:** `backend-compliance/compliance/api/control_generator_routes.py` +**API-Prefix:** `/api/compliance/v1/canonical/generate` + +--- + +## Pipeline-Uebersicht + +Die Pipeline durchlaeuft 7 Stufen, um aus RAG-Chunks eigenstaendige Security/Compliance Controls zu erzeugen: + +```mermaid +flowchart TD + A[1. RAG Scan] -->|Alle Chunks laden| B[2. License Classify] + B -->|Rule 1/2| C[3a. Structure Batch] + B -->|Rule 3| D[3b. Reform Batch] + C --> E[4. Harmonize] + D --> E + E -->|Duplikat| F[Als Duplikat markieren] + E -->|Neu| G[5. Anchor Search] + G --> H[6. Store Control] + H --> I[7. 
Mark Processed] +``` + +| Stufe | Name | Beschreibung | +|-------|------|-------------| +| 1 | **RAG Scan** | Laedt unverarbeitete Chunks aus Qdrant (Scroll-API), filtert per SHA-256-Hash | +| 2 | **License Classify** | Bestimmt die Lizenzregel (Rule 1/2/3) anhand `regulation_code` | +| 3a | **Structure (Batch)** | Rule 1+2: Strukturiert Originaltext als Control (Anthropic API) | +| 3b | **Reform (Batch)** | Rule 3: Vollstaendige Reformulierung ohne Originaltext (Anthropic API) | +| 4 | **Harmonize** | Embedding-basierte Duplikaterkennung (bge-m3, Cosine > 0.85) | +| 5 | **Anchor Search** | Findet Open-Source-Referenzen (OWASP, NIST, ENISA) | +| 6 | **Store** | Persistiert Control in `canonical_controls` mit Metadaten | +| 7 | **Mark Processed** | Markiert jeden Chunk als verarbeitet (auch bei Skip/Error/Duplikat) | + +--- + +## Pipeline-Versionen + +Die Pipeline hat zwei Versionen. Die Version wird als `pipeline_version` auf `canonical_controls` und `canonical_processed_chunks` gespeichert. + +### v1 (Original) + +| Eigenschaft | Wert | +|-------------|------| +| **Vorfilter** | Lokales LLM (llama3.2 3B) entscheidet ob Chunk relevant | +| **Anthropic-Prompt** | Alter Prompt ohne null-Skip | +| **Annexe/Anhaenge** | Kein Schutz — wurden haeufig faelschlich als irrelevant uebersprungen | +| **`pipeline_version`** | `1` | + +### v2 (Aktuell) + +| Eigenschaft | Wert | +|-------------|------| +| **Vorfilter** | Optional (`skip_prefilter`). Wenn aktiviert, entscheidet Anthropic API selbst | +| **Anthropic-Prompt** | Neuer Prompt mit **null-Skip**: API gibt `null` fuer Chunks ohne Anforderung zurueck | +| **Annexe/Anhaenge** | Explizit geschuetzt — Prompt-Anweisung: "Anhaenge/Annexe enthalten oft KONKRETE technische Anforderungen — diese MUESSEN als Control erfasst werden!" | +| **`pipeline_version`** | `2` | + +#### Wesentliche Aenderungen v1 → v2 + +1. **Relevanz-Entscheidung an Anthropic delegiert** — Das lokale LLM (Vorfilter) ist optional. 
Die Anthropic API entscheidet selbst, welche Chunks Controls enthalten, indem sie `null` fuer irrelevante Chunks zurueckgibt. +2. **null-Skip im JSON-Array** — Das Ergebnis-Array enthaelt `null`-Eintraege fuer Chunks ohne umsetzbare Anforderung. Kein separater Vorfilter-Schritt noetig. +3. **Annexe/Anhaenge geschuetzt** — Explizite Prompt-Anweisung verhindert, dass technische Anforderungen in Anhaengen uebersprungen werden. + +#### Datenbank-Feld + +```sql +-- Migration 062 +ALTER TABLE canonical_controls + ADD COLUMN pipeline_version smallint NOT NULL DEFAULT 1; + +ALTER TABLE canonical_processed_chunks + ADD COLUMN pipeline_version smallint NOT NULL DEFAULT 1; +``` + +Neue Controls erhalten automatisch `pipeline_version = 2`. Bestehende (v1) behalten `1`, damit sie spaeter identifiziert und ggf. reprocessiert werden koennen. + +--- + +## Konfiguration + +### Request-Parameter (`GenerateRequest`) + +| Parameter | Typ | Default | Beschreibung | +|-----------|-----|---------|-------------| +| `collections` | `List[str]` | Alle 5 Collections | Qdrant-Collections zum Durchsuchen | +| `domain` | `str` | — | Filter auf eine Domain (z.B. `AUTH`, `NET`) | +| `regulation_filter` | `List[str]` | — | Prefix-Matching auf `regulation_code` (z.B. `["eu_2023_1230", "owasp_"]`) | +| `skip_prefilter` | `bool` | `false` | Ueberspringt lokalen LLM-Vorfilter, sendet alle Chunks an die Anthropic API | +| `batch_size` | `int` | `5` | Chunks pro Anthropic-API-Call | +| `max_controls` | `int` | `50` | Maximale Anzahl Controls pro Job (0 = unbegrenzt) | +| `max_chunks` | `int` | `1000` | Maximale Chunks pro Job (0 = unbegrenzt, respektiert Dokumentgrenzen) | +| `skip_web_search` | `bool` | `false` | Ueberspringt Web-Suche in der Anchor-Findung (Stufe 5) | +| `dry_run` | `bool` | `false` | Trockenlauf ohne DB-Schreibzugriffe (synchron, mit Controls im Response) | + +!!! info "`regulation_filter` — Prefix-Matching" + Der Filter vergleicht den `regulation_code` jedes Chunks per Prefix. 
+ Beispiel: `["eu_2023_1230"]` erfasst nur Chunks aus der Maschinenverordnung. + `["owasp_"]` erfasst alle OWASP-Dokumente (OWASP ASVS, OWASP SAMM, etc.). + Gross-/Kleinschreibung wird ignoriert. + +### Umgebungsvariablen + +| Variable | Default | Beschreibung | +|----------|---------|-------------| +| `ANTHROPIC_API_KEY` | — | API-Key fuer Anthropic Claude (Pflicht) | +| `CONTROL_GEN_ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-Modell fuer Strukturierung/Reformulierung | +| `OLLAMA_URL` | `http://host.docker.internal:11434` | Lokaler Ollama-Server (Vorfilter + QA) | +| `CONTROL_GEN_OLLAMA_MODEL` | `qwen3.5:35b-a3b` | Lokales LLM-Modell fuer Vorfilter und QA-Arbitrierung | +| `CONTROL_GEN_LLM_TIMEOUT` | `180` | Timeout in Sekunden pro Anthropic-API-Call | + +### Pipeline-interne Konstanten + +| Konstante | Wert | Beschreibung | +|-----------|------|-------------| +| `PIPELINE_VERSION` | `2` | Aktuelle Pipeline-Version | +| `HARMONIZATION_THRESHOLD` | `0.85` | Cosine-Similarity-Schwelle fuer Duplikaterkennung | +| `max_tokens` | `8192` | Maximale Token-Laenge der LLM-Antwort | + +--- + +## API Endpoints + +Alle Endpoints unter `/api/compliance/v1/canonical/`. 
+ +### Uebersicht + +| Methode | Pfad | Beschreibung | +|---------|------|-------------| +| `POST` | `/generate` | Generierungs-Job starten (laeuft im Hintergrund) | +| `GET` | `/generate/status/{job_id}` | Status eines laufenden Jobs abfragen | +| `GET` | `/generate/jobs` | Alle Jobs auflisten (paginiert) | +| `GET` | `/generate/processed-stats` | Verarbeitungsstatistik pro Collection | +| `GET` | `/generate/review-queue` | Controls zur manuellen Pruefung | +| `POST` | `/generate/review/{control_id}` | Review eines einzelnen Controls abschliessen | +| `POST` | `/generate/bulk-review` | Bulk-Review nach `release_state` | +| `POST` | `/generate/qa-reclassify` | QA-Reklassifizierung bestehender Controls | +| `GET` | `/blocked-sources` | Gesperrte Quellen (Rule 3) auflisten | +| `POST` | `/blocked-sources/cleanup` | Cleanup-Workflow fuer gesperrte Quellen starten | + +--- + +### POST `/v1/canonical/generate` — Job starten + +Startet einen Generierungs-Job im Hintergrund. Gibt sofort eine `job_id` zurueck. + +**Request:** + +```json +{ + "collections": ["bp_compliance_gesetze"], + "regulation_filter": ["eu_2023_1230"], + "skip_prefilter": false, + "batch_size": 5, + "max_chunks": 500, + "max_controls": 0, + "skip_web_search": false, + "dry_run": false +} +``` + +**Response (200):** + +```json +{ + "job_id": "a1b2c3d4-...", + "status": "running", + "message": "Generation started in background. Poll /generate/status/{job_id} for progress." 
+} +``` + +**Beispiel:** + +```bash +# Alle Chunks der Maschinenverordnung verarbeiten +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{ + "collections": ["bp_compliance_ce"], + "regulation_filter": ["eu_2023_1230"], + "max_chunks": 200, + "batch_size": 5 + }' +``` + +```bash +# Dry Run: Keine DB-Aenderungen, Controls im Response +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{ + "collections": ["bp_compliance_gesetze"], + "max_chunks": 10, + "dry_run": true + }' +``` + +```bash +# Ohne Vorfilter: Alle Chunks direkt an Anthropic API +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{ + "collections": ["bp_compliance_gesetze"], + "regulation_filter": ["bdsg"], + "skip_prefilter": true, + "max_chunks": 100 + }' +``` + +!!! warning "Kosten beachten" + Ohne `regulation_filter` und mit `max_chunks: 0` werden **alle** ~105.000 Chunks verarbeitet. + Das verursacht erhebliche Anthropic-API-Kosten (~$700). + +--- + +### GET `/v1/canonical/generate/status/{job_id}` — Job-Status + +Gibt den vollstaendigen Status eines Jobs zurueck inkl. Metriken und Fehler. + +**Beispiel:** + +```bash +curl https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/status/a1b2c3d4-... \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' +``` + +**Response:** + +```json +{ + "id": "a1b2c3d4-...", + "status": "completed", + "total_chunks_scanned": 500, + "controls_generated": 48, + "controls_verified": 45, + "controls_needs_review": 3, + "controls_too_close": 0, + "controls_duplicates_found": 12, + "controls_qa_fixed": 5, + "config": { "..." 
}, + "started_at": "2026-03-17T10:00:00+00:00", + "completed_at": "2026-03-17T10:15:32+00:00" +} +``` + +--- + +### GET `/v1/canonical/generate/jobs` — Alle Jobs + +Paginierte Liste aller Generierungs-Jobs. + +**Query-Parameter:** + +| Parameter | Default | Beschreibung | +|-----------|---------|-------------| +| `limit` | `20` | Anzahl Jobs (1-100) | +| `offset` | `0` | Offset fuer Paginierung | + +**Beispiel:** + +```bash +curl "https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/jobs?limit=5" \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' +``` + +--- + +### GET `/v1/canonical/generate/review-queue` — Review-Queue + +Listet Controls auf, die eine manuelle Pruefung benoetigen. + +**Query-Parameter:** + +| Parameter | Default | Beschreibung | +|-----------|---------|-------------| +| `release_state` | `needs_review` | Filter: `needs_review`, `too_close`, `duplicate` | +| `limit` | `50` | Anzahl (1-200) | + +**Beispiel:** + +```bash +curl "https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/review-queue?release_state=needs_review&limit=10" \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' +``` + +--- + +### POST `/v1/canonical/generate/review/{control_id}` — Review abschliessen + +Schliesst die manuelle Pruefung eines Controls ab. + +**Request:** + +```json +{ + "action": "approve", + "release_state": "draft", + "notes": "Inhaltlich korrekt, Severity passt." 
+} +``` + +**Moegliche `action`-Werte:** + +| Action | Neuer State | Beschreibung | +|--------|-------------|-------------| +| `approve` | `draft` (oder per `release_state` ueberschreiben) | Control freigeben | +| `reject` | `deprecated` | Control verwerfen | +| `needs_rework` | `needs_review` | Zurueck in die Queue | + +**Beispiel:** + +```bash +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/review/AUTH-042 \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{"action": "approve", "release_state": "draft"}' +``` + +--- + +### POST `/v1/canonical/generate/bulk-review` — Bulk-Review + +Aendert den `release_state` aller Controls, die einen bestimmten State haben. + +**Request:** + +```json +{ + "release_state": "needs_review", + "action": "approve", + "new_state": "draft" +} +``` + +**Beispiel:** + +```bash +# Alle needs_review Controls auf draft setzen +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/bulk-review \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{"release_state": "needs_review", "action": "approve", "new_state": "draft"}' +``` + +--- + +### GET `/v1/canonical/generate/processed-stats` — Verarbeitungsstatistik + +Liefert Statistiken pro RAG-Collection. 
+ +**Beispiel:** + +```bash +curl https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/processed-stats \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' +``` + +**Response:** + +```json +{ + "stats": [ + { + "collection": "bp_compliance_gesetze", + "processed_chunks": 45200, + "direct_adopted": 1850, + "llm_reformed": 120, + "skipped": 43230, + "total_chunks_estimated": 0, + "pending_chunks": 0 + } + ] +} +``` + +--- + +## Kosten und Performance + +### Kostenabschaetzung + +| Metrik | Wert | +|--------|------| +| **Kosten pro Chunk** | ~$0.0067 (Anthropic API, Batch-Modus) | +| **Yield (Controls/Chunks)** | ~4.5-10% (nur Chunks mit konkreten Anforderungen erzeugen Controls) | +| **Vorfilter-Ersparnis** | ~55% der API-Kosten wenn aktiviert (irrelevante Chunks werden lokal aussortiert) | + +### Performance-Kennzahlen + +| Metrik | Wert | +|--------|------| +| **Batch-Groesse** | 5 Chunks pro API-Call (Default) | +| **API-Aufrufe Reduktion** | ~80% weniger Aufrufe durch Batching | +| **LLM-Timeout** | 180 Sekunden pro Call | +| **QA-Overhead** | ~2s pro Control (nur bei Disagreement, ~10-15% der Controls) | + +### RAG Collections + +| Collection | Inhalte | Erwartete Regel | +|-----------|---------|----------------| +| `bp_compliance_gesetze` | Deutsche Gesetze (BDSG, TTDSG, TKG etc.) | Rule 1 | +| `bp_compliance_datenschutz` | Datenschutz-Leitlinien + EU-Verordnungen | Rule 1/2 | +| `bp_compliance_ce` | CE/Sicherheitsstandards | Rule 1/2/3 | +| `bp_dsfa_corpus` | DSFA-Korpus | Rule 1/2 | +| `bp_legal_templates` | Rechtsvorlagen | Rule 1 | + +### Aktuelle Groessenordnung + +| Metrik | Wert | +|--------|------| +| RAG-Chunks gesamt | ~105.000 (nach Dedup 2026-03-16) | +| Verarbeitete Chunks | ~105.000 | +| Generierte Controls | **~4.738** | +| Konversionsrate | ~4,5% | + +--- + +## Lizenz-Klassifikation (3-Regel-System) + +Jeder Chunk wird basierend auf `regulation_code` einer Lizenzregel zugeordnet: + +| Regel | Typ | Original erlaubt? 
| Beispiele | +|-------|-----|-------------------|----------| +| **Rule 1** (free_use) | EU-Gesetze, NIST, DE-Gesetze, Public Domain | Ja | DSGVO, BDSG, NIS2, AI Act | +| **Rule 2** (citation_required) | CC-BY, CC-BY-SA | Ja, mit Zitation | OWASP ASVS, OWASP SAMM | +| **Rule 3** (restricted) | Proprietaer | Nein, volle Reformulierung | BSI TR-03161, ISO 27001 | + +### Verarbeitung nach Regel + +- **Rule 1+2 → `_structure_batch()`**: Anthropic strukturiert den Originaltext als Control. Ein API-Call fuer den gesamten Batch. +- **Rule 3 → `_reformulate_batch()`**: Anthropic reformuliert vollstaendig — kein Originaltext, keine Quellennamen. Ein API-Call fuer den gesamten Batch. + +### Batch Processing + +Die Pipeline sammelt Chunks in Batches (Default: 5 Chunks) und sendet sie in einem einzigen Anthropic-API-Call. + +1. Relevante Chunks werden mit Lizenz-Info in `pending_batch` gesammelt +2. Bei `batch_size` erreicht → `_flush_batch()` +3. Batch wird nach Lizenzregel getrennt: Rule 1+2 → `_structure_batch()`, Rule 3 → `_reformulate_batch()` +4. Ergebnis: JSON-Array mit genau N Elementen (`null` fuer irrelevante Chunks) + +**Fallback:** Bei Batch-Fehler (Timeout, Parsing-Error) wird automatisch auf Einzelverarbeitung zurueckgefallen. + +--- + +## Chunk-Tracking (Processed Chunks) + +### Tabelle `canonical_processed_chunks` + +| Spalte | Typ | Beschreibung | +|--------|-----|-------------| +| `chunk_hash` | VARCHAR(64) | SHA-256 Hash des Chunk-Textes | +| `collection` | VARCHAR(100) | Qdrant-Collection | +| `regulation_code` | VARCHAR(100) | Quell-Regulation (z.B. 
`bdsg`, `eu_2016_679`) | +| `document_version` | VARCHAR | Version des Quelldokuments | +| `source_license` | VARCHAR | Lizenz des Quelldokuments | +| `license_rule` | INTEGER | 1, 2 oder 3 | +| `processing_path` | VARCHAR(20) | Wie der Chunk verarbeitet wurde | +| `generated_control_ids` | JSONB | UUIDs der generierten Controls | +| `pipeline_version` | SMALLINT | Pipeline-Version (1 oder 2) | +| `job_id` | UUID | Referenz auf den Generierungs-Job | + +**UNIQUE Constraint:** `(chunk_hash, collection, document_version)` — verhindert Doppelverarbeitung. + +### Processing Paths + +| Wert | Stufe | Bedeutung | +|------|-------|-----------| +| `prefilter_skip` | 2 | Lokaler LLM-Vorfilter: Chunk nicht relevant | +| `structured` | 3a | Einzelner Chunk strukturiert (Rule 1/2) | +| `structured_batch` | 3a | Batch-Strukturierung (Rule 1/2) | +| `llm_reform` | 3b | Einzelner Chunk reformuliert (Rule 3) | +| `llm_reform_batch` | 3b | Batch-Reformulierung (Rule 3) | +| `no_control` | 3 | LLM konnte kein Control ableiten (null im Array) | +| `store_failed` | 6 | DB-Speichern fehlgeschlagen | +| `error` | — | Unerwarteter Fehler | + +--- + +## QA Validation (Automatische Qualitaetspruefung) + +Die QA-Stufe validiert die Klassifizierung jedes generierten Controls: + +1. **LLM-Category:** Anthropic liefert `category` und `domain` im JSON-Response +2. **Keyword-Detection:** `_detect_category(chunk.text)` liefert eine zweite Meinung +3. **Stimmen beide ueberein?** → Schneller Pfad (kein QA noetig) +4. **Bei Disagreement:** Lokales LLM (Ollama) arbitriert +5. **Auto-Fix:** Category/Domain werden automatisch korrigiert + +Die QA-Metriken werden in `generation_metadata` gespeichert: + +```json +{ + "qa_category_fix": {"from": "authentication", "to": "finance", "reason": "IFRS-Thema"}, + "qa_domain_fix": {"from": "AUTH", "to": "FIN", "reason": "Finanzregulierung"} +} +``` + +### QA-Reklassifizierung bestehender Controls + +```bash +# Dry Run: Welche AUTH-Controls sind falsch klassifiziert? 
+curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/qa-reclassify \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{"limit": 50, "dry_run": true, "filter_domain_prefix": "AUTH"}' + +# Korrekturen anwenden: +curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/qa-reclassify \ + -H 'Content-Type: application/json' \ + -H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \ + -d '{"limit": 50, "dry_run": false, "filter_domain_prefix": "AUTH"}' +``` + +--- + +## Quelldateien + +| Datei | Beschreibung | +|-------|-------------| +| `backend-compliance/compliance/services/control_generator.py` | 7-Stufen-Pipeline mit Batch Processing | +| `backend-compliance/compliance/api/control_generator_routes.py` | REST API Endpoints | +| `backend-compliance/compliance/services/license_gate.py` | Lizenz-Gate-Logik | +| `backend-compliance/compliance/services/similarity_detector.py` | Too-Close-Detektor (5 Metriken) | +| `backend-compliance/compliance/services/rag_client.py` | RAG-Client (Qdrant Search + Scroll) | +| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen | +| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte | +| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte | +| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) | + +--- + +## Verwandte Dokumentation + +- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation +- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk diff --git a/mkdocs.yml b/mkdocs.yml index 5c6fc2e..97566cf 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -103,6 +103,7 @@ nav: - 
Dokumentengenerierung: services/sdk-modules/dokumentengenerierung.md - Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md - Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md + - Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md - Strategie: - Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md - Entwicklung: