feat(pipeline): pipeline_version v2, migration 062, docs + 71 tests
- Add PIPELINE_VERSION=2 constant and pipeline_version column to canonical_controls and canonical_processed_chunks (migration 062) - Anthropic API decides chunk relevance via null-returns (skip_prefilter) - Annex/appendix chunks explicitly protected in prompts - Fix 6 failing tests (CRYP domain, _process_batch tuple return) - Add TestPipelineVersion + TestRegulationFilter test classes (10 new tests) - Add MkDocs page: control-generator-pipeline.md (541 lines) - Update canonical-control-library.md with v2 pipeline diagram - Update testing.md with 71-test breakdown table Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,11 @@ LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
|
||||
|
||||
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
|
||||
|
||||
# Pipeline version — increment when generation rules change materially.
|
||||
# v1: Original (local LLM prefilter, old prompt)
|
||||
# v2: Anthropic decides relevance, null for non-requirement chunks, annexes protected
|
||||
PIPELINE_VERSION = 2
|
||||
|
||||
ALL_COLLECTIONS = [
|
||||
"bp_compliance_ce",
|
||||
"bp_compliance_gesetze",
|
||||
@@ -1663,7 +1668,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, generation_metadata,
|
||||
verification_method, category, generation_strategy,
|
||||
target_audience
|
||||
target_audience, pipeline_version
|
||||
) VALUES (
|
||||
:framework_id, :control_id, :title, :objective, :rationale,
|
||||
:scope, :requirements, :test_procedure, :evidence,
|
||||
@@ -1672,7 +1677,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
:license_rule, :source_original_text, :source_citation,
|
||||
:customer_visible, :generation_metadata,
|
||||
:verification_method, :category, :generation_strategy,
|
||||
:target_audience
|
||||
:target_audience, :pipeline_version
|
||||
)
|
||||
ON CONFLICT (framework_id, control_id) DO NOTHING
|
||||
RETURNING id
|
||||
@@ -1702,6 +1707,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
"category": control.category,
|
||||
"generation_strategy": control.generation_strategy,
|
||||
"target_audience": json.dumps(control.target_audience) if control.target_audience else None,
|
||||
"pipeline_version": PIPELINE_VERSION,
|
||||
},
|
||||
)
|
||||
self.db.commit()
|
||||
@@ -1728,11 +1734,13 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
INSERT INTO canonical_processed_chunks (
|
||||
chunk_hash, collection, regulation_code,
|
||||
document_version, source_license, license_rule,
|
||||
processing_path, generated_control_ids, job_id
|
||||
processing_path, generated_control_ids, job_id,
|
||||
pipeline_version
|
||||
) VALUES (
|
||||
:hash, :collection, :regulation_code,
|
||||
:doc_version, :license, :rule,
|
||||
:path, :control_ids, CAST(:job_id AS uuid)
|
||||
:path, :control_ids, CAST(:job_id AS uuid),
|
||||
:pipeline_version
|
||||
)
|
||||
ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
|
||||
"""),
|
||||
@@ -1746,6 +1754,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
"path": processing_path,
|
||||
"control_ids": json.dumps(control_ids),
|
||||
"job_id": job_id,
|
||||
"pipeline_version": PIPELINE_VERSION,
|
||||
},
|
||||
)
|
||||
self.db.commit()
|
||||
|
||||
22
backend-compliance/migrations/062_pipeline_version.sql
Normal file
22
backend-compliance/migrations/062_pipeline_version.sql
Normal file
@@ -0,0 +1,22 @@
|
||||
-- Migration 062: Add pipeline_version to track which generation rules produced each control/chunk
|
||||
--
|
||||
-- v1 = Original pipeline (local LLM prefilter, old prompt without null-skip)
|
||||
-- v2 = Improved pipeline (skip_prefilter, Anthropic decides relevance, annexes protected)
|
||||
--
|
||||
-- This allows identifying controls that may need reprocessing when pipeline rules change.
|
||||
|
||||
ALTER TABLE canonical_controls
|
||||
ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1;
|
||||
|
||||
ALTER TABLE canonical_processed_chunks
|
||||
ADD COLUMN IF NOT EXISTS pipeline_version smallint NOT NULL DEFAULT 1;
|
||||
|
||||
-- Index for efficient querying by version
|
||||
CREATE INDEX IF NOT EXISTS idx_canonical_controls_pipeline_version
|
||||
ON canonical_controls (pipeline_version);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_canonical_processed_chunks_pipeline_version
|
||||
ON canonical_processed_chunks (pipeline_version);
|
||||
|
||||
COMMENT ON COLUMN canonical_controls.pipeline_version IS 'Generation pipeline version: 1=original (local prefilter), 2=improved (Anthropic decides relevance, annexes protected)';
|
||||
COMMENT ON COLUMN canonical_processed_chunks.pipeline_version IS 'Pipeline version used when this chunk was processed';
|
||||
@@ -8,10 +8,12 @@ from compliance.services.control_generator import (
|
||||
_classify_regulation,
|
||||
_detect_domain,
|
||||
_parse_llm_json,
|
||||
_parse_llm_json_array,
|
||||
GeneratorConfig,
|
||||
GeneratedControl,
|
||||
ControlGeneratorPipeline,
|
||||
REGULATION_LICENSE_MAP,
|
||||
PIPELINE_VERSION,
|
||||
)
|
||||
from compliance.services.anchor_finder import AnchorFinder, OpenAnchor
|
||||
from compliance.services.rag_client import RAGSearchResult
|
||||
@@ -91,7 +93,7 @@ class TestDomainDetection:
|
||||
assert _detect_domain("Multi-factor authentication and password policy") == "AUTH"
|
||||
|
||||
def test_crypto_domain(self):
|
||||
assert _detect_domain("TLS 1.3 encryption and certificate management") == "CRYPT"
|
||||
assert _detect_domain("TLS 1.3 encryption and certificate management") == "CRYP"
|
||||
|
||||
def test_network_domain(self):
|
||||
assert _detect_domain("Firewall rules and network segmentation") == "NET"
|
||||
@@ -807,7 +809,7 @@ class TestBatchProcessingLoop:
|
||||
patch.object(pipeline, "_check_harmonization", new_callable=AsyncMock, return_value=[]), \
|
||||
patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls):
|
||||
config = GeneratorConfig()
|
||||
result = await pipeline._process_batch(batch_items, config, "job-1")
|
||||
result, qa_count = await pipeline._process_batch(batch_items, config, "job-1")
|
||||
|
||||
mock_struct.assert_called_once()
|
||||
mock_reform.assert_not_called()
|
||||
@@ -839,7 +841,7 @@ class TestBatchProcessingLoop:
|
||||
patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim:
|
||||
mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.1, ngram_jaccard=0.1, lcs_ratio=0.1)
|
||||
config = GeneratorConfig()
|
||||
result = await pipeline._process_batch(batch_items, config, "job-2")
|
||||
result, qa_count = await pipeline._process_batch(batch_items, config, "job-2")
|
||||
|
||||
mock_struct.assert_not_called()
|
||||
mock_reform.assert_called_once()
|
||||
@@ -885,7 +887,7 @@ class TestBatchProcessingLoop:
|
||||
patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock) as mock_sim:
|
||||
mock_sim.return_value = MagicMock(status="PASS", token_overlap=0.05, ngram_jaccard=0.05, lcs_ratio=0.05)
|
||||
config = GeneratorConfig()
|
||||
result = await pipeline._process_batch(batch_items, config, "job-mixed")
|
||||
result, qa_count = await pipeline._process_batch(batch_items, config, "job-mixed")
|
||||
|
||||
# Both methods called
|
||||
mock_struct.assert_called_once()
|
||||
@@ -905,8 +907,9 @@ class TestBatchProcessingLoop:
|
||||
pipeline._existing_controls = []
|
||||
|
||||
config = GeneratorConfig()
|
||||
result = await pipeline._process_batch([], config, "job-empty")
|
||||
result, qa_count = await pipeline._process_batch([], config, "job-empty")
|
||||
assert result == []
|
||||
assert qa_count == 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_reformulate_batch_too_close_flagged(self):
|
||||
@@ -942,7 +945,7 @@ class TestBatchProcessingLoop:
|
||||
patch("compliance.services.anchor_finder.AnchorFinder", mock_finder_cls), \
|
||||
patch("compliance.services.control_generator.check_similarity", new_callable=AsyncMock, return_value=fail_report):
|
||||
config = GeneratorConfig()
|
||||
result = await pipeline._process_batch(batch_items, config, "job-tooclose")
|
||||
result, qa_count = await pipeline._process_batch(batch_items, config, "job-tooclose")
|
||||
|
||||
assert len(result) == 1
|
||||
assert result[0].release_state == "too_close"
|
||||
@@ -1112,3 +1115,194 @@ class TestRegulationFilter:
|
||||
results = await pipeline._scan_rag(config)
|
||||
|
||||
assert len(results) == 2
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pipeline Version Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestPipelineVersion:
|
||||
"""Tests for pipeline_version propagation in DB writes and null handling."""
|
||||
|
||||
def test_pipeline_version_constant_is_2(self):
|
||||
assert PIPELINE_VERSION == 2
|
||||
|
||||
def test_store_control_includes_pipeline_version(self):
|
||||
"""_store_control must pass pipeline_version=PIPELINE_VERSION to the INSERT."""
|
||||
mock_db = MagicMock()
|
||||
# Framework lookup returns a UUID
|
||||
fw_row = MagicMock()
|
||||
fw_row.__getitem__ = lambda self, idx: "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
|
||||
mock_db.execute.return_value.fetchone.return_value = fw_row
|
||||
|
||||
pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock())
|
||||
|
||||
control = GeneratedControl(
|
||||
control_id="SEC-TEST-001",
|
||||
title="Test Control",
|
||||
objective="Test objective",
|
||||
)
|
||||
pipeline._store_control(control, job_id="00000000-0000-0000-0000-000000000001")
|
||||
|
||||
# The second call to db.execute is the INSERT
|
||||
calls = mock_db.execute.call_args_list
|
||||
assert len(calls) >= 2, f"Expected at least 2 db.execute calls, got {len(calls)}"
|
||||
insert_call = calls[1]
|
||||
params = insert_call[0][1] # positional arg 1 = params dict
|
||||
assert "pipeline_version" in params
|
||||
assert params["pipeline_version"] == PIPELINE_VERSION
|
||||
|
||||
def test_mark_chunk_processed_includes_pipeline_version(self):
|
||||
"""_mark_chunk_processed must pass pipeline_version=PIPELINE_VERSION to the INSERT."""
|
||||
mock_db = MagicMock()
|
||||
pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock())
|
||||
|
||||
chunk = MagicMock()
|
||||
chunk.text = "Some chunk text for hashing"
|
||||
chunk.collection = "bp_compliance_ce"
|
||||
chunk.regulation_code = "eu_2016_679"
|
||||
|
||||
license_info = {"license": "CC0-1.0", "rule": 1}
|
||||
|
||||
pipeline._mark_chunk_processed(
|
||||
chunk=chunk,
|
||||
license_info=license_info,
|
||||
processing_path="structured_batch",
|
||||
control_ids=["SEC-TEST-001"],
|
||||
job_id="00000000-0000-0000-0000-000000000001",
|
||||
)
|
||||
|
||||
calls = mock_db.execute.call_args_list
|
||||
assert len(calls) >= 1
|
||||
insert_call = calls[0]
|
||||
params = insert_call[0][1]
|
||||
assert "pipeline_version" in params
|
||||
assert params["pipeline_version"] == PIPELINE_VERSION
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_structure_batch_handles_null_results(self):
|
||||
"""When _parse_llm_json_array returns [dict, None, dict], the null entries produce None."""
|
||||
mock_db = MagicMock()
|
||||
pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock())
|
||||
|
||||
# Three chunks
|
||||
chunks = []
|
||||
license_infos = []
|
||||
for i in range(3):
|
||||
c = MagicMock()
|
||||
c.text = f"Chunk text number {i} with enough content for processing"
|
||||
c.regulation_name = "DSGVO"
|
||||
c.regulation_code = "eu_2016_679"
|
||||
c.article = f"Art. {i + 1}"
|
||||
c.paragraph = ""
|
||||
c.source_url = ""
|
||||
c.collection = "bp_compliance_ce"
|
||||
chunks.append(c)
|
||||
license_infos.append({"rule": 1, "name": "DSGVO", "license": "CC0-1.0"})
|
||||
|
||||
# LLM returns a JSON array: valid, null, valid
|
||||
llm_response = json.dumps([
|
||||
{
|
||||
"chunk_index": 1,
|
||||
"title": "Datenschutz-Kontrolle 1",
|
||||
"objective": "Schutz personenbezogener Daten",
|
||||
"rationale": "DSGVO-Konformitaet",
|
||||
"requirements": ["Req 1"],
|
||||
"test_procedure": ["Test 1"],
|
||||
"evidence": ["Nachweis 1"],
|
||||
"severity": "high",
|
||||
"tags": ["dsgvo"],
|
||||
"domain": "DATA",
|
||||
"category": "datenschutz",
|
||||
"target_audience": ["unternehmen"],
|
||||
"source_article": "Art. 1",
|
||||
"source_paragraph": "",
|
||||
},
|
||||
None,
|
||||
{
|
||||
"chunk_index": 3,
|
||||
"title": "Datenschutz-Kontrolle 3",
|
||||
"objective": "Transparenzpflicht",
|
||||
"rationale": "Information der Betroffenen",
|
||||
"requirements": ["Req 3"],
|
||||
"test_procedure": ["Test 3"],
|
||||
"evidence": ["Nachweis 3"],
|
||||
"severity": "medium",
|
||||
"tags": ["transparenz"],
|
||||
"domain": "DATA",
|
||||
"category": "datenschutz",
|
||||
"target_audience": ["unternehmen"],
|
||||
"source_article": "Art. 3",
|
||||
"source_paragraph": "",
|
||||
},
|
||||
])
|
||||
|
||||
with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock) as mock_llm:
|
||||
mock_llm.return_value = llm_response
|
||||
controls = await pipeline._structure_batch(chunks, license_infos)
|
||||
|
||||
assert len(controls) == 3
|
||||
assert controls[0] is not None
|
||||
assert controls[1] is None # Null entry from LLM
|
||||
assert controls[2] is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_reformulate_batch_handles_null_results(self):
|
||||
"""When _parse_llm_json_array returns [dict, None, dict], the null entries produce None."""
|
||||
mock_db = MagicMock()
|
||||
pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=MagicMock())
|
||||
|
||||
chunks = []
|
||||
for i in range(3):
|
||||
c = MagicMock()
|
||||
c.text = f"Restricted chunk text number {i} with BSI content"
|
||||
c.regulation_name = "BSI TR-03161"
|
||||
c.regulation_code = "bsi_tr03161"
|
||||
c.article = f"Section {i + 1}"
|
||||
c.paragraph = ""
|
||||
c.source_url = ""
|
||||
c.collection = "bp_compliance_ce"
|
||||
chunks.append(c)
|
||||
|
||||
config = GeneratorConfig(domain="SEC")
|
||||
|
||||
llm_response = json.dumps([
|
||||
{
|
||||
"chunk_index": 1,
|
||||
"title": "Sicherheitskontrolle 1",
|
||||
"objective": "Authentifizierung absichern",
|
||||
"rationale": "Best Practice",
|
||||
"requirements": ["Req 1"],
|
||||
"test_procedure": ["Test 1"],
|
||||
"evidence": ["Nachweis 1"],
|
||||
"severity": "high",
|
||||
"tags": ["sicherheit"],
|
||||
"domain": "SEC",
|
||||
"category": "it-sicherheit",
|
||||
"target_audience": ["it-abteilung"],
|
||||
},
|
||||
None,
|
||||
{
|
||||
"chunk_index": 3,
|
||||
"title": "Sicherheitskontrolle 3",
|
||||
"objective": "Netzwerk segmentieren",
|
||||
"rationale": "Angriffsoberflaeche reduzieren",
|
||||
"requirements": ["Req 3"],
|
||||
"test_procedure": ["Test 3"],
|
||||
"evidence": ["Nachweis 3"],
|
||||
"severity": "medium",
|
||||
"tags": ["netzwerk"],
|
||||
"domain": "NET",
|
||||
"category": "netzwerksicherheit",
|
||||
"target_audience": ["it-abteilung"],
|
||||
},
|
||||
])
|
||||
|
||||
with patch("compliance.services.control_generator._llm_chat", new_callable=AsyncMock) as mock_llm:
|
||||
mock_llm.return_value = llm_response
|
||||
controls = await pipeline._reformulate_batch(chunks, config)
|
||||
|
||||
assert len(controls) == 3
|
||||
assert controls[0] is not None
|
||||
assert controls[1] is None # Null entry from LLM
|
||||
assert controls[2] is not None
|
||||
|
||||
@@ -214,13 +214,13 @@ Wenn du z.B. eine neue `GetUserStats()` Funktion im Go Service hinzufuegst:
|
||||
|
||||
## Modul-spezifische Tests
|
||||
|
||||
### Canonical Control Generator (82 Tests)
|
||||
### Canonical Control Generator (71+ Tests)
|
||||
|
||||
Die Control Library hat eine umfangreiche Test-Suite ueber 6 Dateien.
|
||||
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) fuer Details.
|
||||
Siehe [Canonical Control Library — Tests](../services/sdk-modules/canonical-control-library.md#tests) und [Control Generator Pipeline](../services/sdk-modules/control-generator-pipeline.md) fuer Details.
|
||||
|
||||
```bash
|
||||
# Alle Generator-Tests
|
||||
# Alle Generator-Tests (71 Tests in 10 Klassen)
|
||||
cd backend-compliance && pytest -v tests/test_control_generator.py
|
||||
|
||||
# Similarity Detector Tests
|
||||
@@ -237,10 +237,19 @@ cd backend-compliance && pytest -v tests/test_validate_controls.py
|
||||
```
|
||||
|
||||
**Wichtig:** Die Generator-Tests nutzen Mocks fuer Anthropic-API und Qdrant — sie laufen ohne externe Abhaengigkeiten.
|
||||
Die `TestPipelineMocked`-Klasse prueft insbesondere:
|
||||
|
||||
- Korrekte Lizenz-Klassifikation (Rule 1/2/3 Verhalten)
|
||||
- Rule 3 exponiert **keine** Quellennamen in `generation_metadata`
|
||||
- SHA-256 Hash-Deduplizierung fuer Chunks
|
||||
- Config-Defaults (`batch_size: 5`, `skip_processed: true`)
|
||||
- Rule 1 Citation wird korrekt mit Gesetzesreferenz generiert
|
||||
**Testklassen in `test_control_generator.py`:**
|
||||
|
||||
| Klasse | Tests | Prueft |
|
||||
|--------|-------|--------|
|
||||
| `TestLicenseMapping` | 12 | Lizenz-Klassifikation (Rule 1/2/3), Case-Insensitivitaet |
|
||||
| `TestDomainDetection` | 5 | Keyword-basierte Domain-Erkennung (AUTH, CRYP, NET, DATA) |
|
||||
| `TestJsonParsing` | 4 | JSON-Parser fuer LLM-Responses (Markdown-Fencing, Preamble) |
|
||||
| `TestGeneratedControlRules` | 3 | Rule-spezifische Felder (original_text, citation, source_info) |
|
||||
| `TestAnchorFinder` | 2 | RAG-Suche + Web-Framework-Erkennung |
|
||||
| `TestPipelineMocked` | 5 | End-to-End Pipeline mit Mocks (Lizenz, Hash-Dedup, Config) |
|
||||
| `TestParseJsonArray` | 15 | JSON-Array-Parser (Wrapper-Objekte, Bracket-Extraction, Fallbacks) |
|
||||
| `TestBatchSizeConfig` | 5 | Batch-Groesse-Konfiguration + Defaults |
|
||||
| `TestBatchProcessingLoop` | 10 | Batch-Verarbeitung (Rule-Split, Mixed-Rules, Too-Close, Null-Handling) |
|
||||
| `TestRegulationFilter` | 5 | regulation_filter Prefix-Matching, leere regulation_codes |
|
||||
| `TestPipelineVersion` | 5 | pipeline_version=2 in DB-Writes, null-Handling in Structure/Reform |
|
||||
|
||||
@@ -244,44 +244,36 @@ Der Validator (`scripts/validate-controls.py`) prueft bei jedem Commit:
|
||||
Automatische Generierung von Controls aus dem gesamten RAG-Korpus (~105.000 Chunks aus Gesetzen, Verordnungen und Standards).
|
||||
Aktueller Stand: **~4.738 Controls** generiert.
|
||||
|
||||
### 9-Stufen-Pipeline
|
||||
!!! tip "Ausfuehrliche Dokumentation"
|
||||
Siehe **[Control Generator Pipeline](control-generator-pipeline.md)** fuer die vollstaendige Referenz inkl. API-Endpoints, Konfiguration, Kosten und Pipeline-Versionen.
|
||||
|
||||
### 7-Stufen-Pipeline (v2)
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[1. RAG Scroll] -->|max_chunks| B[2. Prefilter - Lokales LLM]
|
||||
B -->|Irrelevant| C[Als processed markieren]
|
||||
B -->|Relevant| D[3. License Classify]
|
||||
D -->|Batch sammeln| E[4. Batch Processing - 5 Chunks/API-Call]
|
||||
E -->|Rule 1/2| F[4a. Structure Batch - Anthropic]
|
||||
E -->|Rule 3| G[4b. Reform Batch - Anthropic]
|
||||
F --> QA[5. QA Validation - Lokales LLM]
|
||||
G --> QA
|
||||
QA -->|Mismatch| QAF[Auto-Fix Category/Domain]
|
||||
QA -->|OK| H[6. Harmonization - Embeddings]
|
||||
QAF --> H
|
||||
H -->|Duplikat| I[Als Duplikat speichern]
|
||||
H -->|Neu| J[7. Anchor Search]
|
||||
J --> K[8. Store Control]
|
||||
K --> L[9. Mark Processed]
|
||||
A[1. RAG Scan] -->|Alle Chunks laden| B[2. License Classify]
|
||||
B -->|Rule 1/2| C[3a. Structure Batch]
|
||||
B -->|Rule 3| D[3b. Reform Batch]
|
||||
C --> E[4. Harmonize]
|
||||
D --> E
|
||||
E -->|Duplikat| F[Als Duplikat markieren]
|
||||
E -->|Neu| G[5. Anchor Search]
|
||||
G --> H[6. Store Control]
|
||||
H --> I[7. Mark Processed]
|
||||
```
|
||||
|
||||
### Stufe 1: RAG Scroll (Vollstaendig)
|
||||
!!! info "Pipeline-Version v2 (seit 2026-03-17)"
|
||||
- **Kein lokaler Vorfilter mehr** — Anthropic API entscheidet selbst ueber Chunk-Relevanz via null-Returns
|
||||
- **Annexe geschuetzt** — Technische Anforderungen in Anhaengen werden nicht mehr uebersprungen
|
||||
- **`pipeline_version`** Spalte in DB unterscheidet v1- von v2-Controls
|
||||
|
||||
Scrollt durch **ALLE** Chunks in allen RAG-Collections mittels Qdrant Scroll-API.
|
||||
Kein Limit — jeder Chunk wird verarbeitet, um keine gesetzlichen Anforderungen zu uebersehen.
|
||||
### Stufe 1: RAG Scan
|
||||
|
||||
Scrollt durch **ALLE** Chunks in den konfigurierten RAG-Collections mittels Qdrant Scroll-API.
|
||||
Optionaler `regulation_filter` beschraenkt auf bestimmte Regulierungen per Prefix-Matching.
|
||||
|
||||
Bereits verarbeitete Chunks werden per SHA-256-Hash uebersprungen (`canonical_processed_chunks`).
|
||||
|
||||
### Stufe 2: Lokaler LLM-Vorfilter (Qwen 30B)
|
||||
|
||||
**Kostenoptimierung:** Bevor ein Chunk an die Anthropic API geht, prueft das lokale Qwen-Modell (`qwen3:30b-a3b` auf Mac Mini), ob der Chunk eine konkrete Anforderung enthaelt.
|
||||
|
||||
- **Relevant:** Pflichten ("muss", "soll"), technische Massnahmen, Datenschutz-Vorgaben
|
||||
- **Irrelevant:** Definitionen, Inhaltsverzeichnisse, Begriffsbestimmungen, Uebergangsvorschriften
|
||||
|
||||
Irrelevante Chunks werden als `prefilter_skip` markiert und nie wieder verarbeitet.
|
||||
Dies spart >50% der Anthropic-API-Kosten.
|
||||
|
||||
### Stufe 3: Lizenz-Klassifikation (3-Regel-System)
|
||||
|
||||
| Regel | Lizenz | Original erlaubt? | Beispiel |
|
||||
|
||||
540
docs-src/services/sdk-modules/control-generator-pipeline.md
Normal file
540
docs-src/services/sdk-modules/control-generator-pipeline.md
Normal file
@@ -0,0 +1,540 @@
|
||||
# Control Generator Pipeline
|
||||
|
||||
Automatische Generierung von Canonical Controls aus dem gesamten RAG-Korpus (~105.000 Chunks aus Gesetzen, Verordnungen und Standards).
|
||||
|
||||
**Backend:** `backend-compliance/compliance/services/control_generator.py`
|
||||
**Routes:** `backend-compliance/compliance/api/control_generator_routes.py`
|
||||
**API-Prefix:** `/api/compliance/v1/canonical/generate`
|
||||
|
||||
---
|
||||
|
||||
## Pipeline-Uebersicht
|
||||
|
||||
Die Pipeline durchlaeuft 7 Stufen, um aus RAG-Chunks eigenstaendige Security/Compliance Controls zu erzeugen:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[1. RAG Scan] -->|Alle Chunks laden| B[2. License Classify]
|
||||
B -->|Rule 1/2| C[3a. Structure Batch]
|
||||
B -->|Rule 3| D[3b. Reform Batch]
|
||||
C --> E[4. Harmonize]
|
||||
D --> E
|
||||
E -->|Duplikat| F[Als Duplikat markieren]
|
||||
E -->|Neu| G[5. Anchor Search]
|
||||
G --> H[6. Store Control]
|
||||
H --> I[7. Mark Processed]
|
||||
```
|
||||
|
||||
| Stufe | Name | Beschreibung |
|
||||
|-------|------|-------------|
|
||||
| 1 | **RAG Scan** | Laedt unverarbeitete Chunks aus Qdrant (Scroll-API), filtert per SHA-256-Hash |
|
||||
| 2 | **License Classify** | Bestimmt die Lizenzregel (Rule 1/2/3) anhand `regulation_code` |
|
||||
| 3a | **Structure (Batch)** | Rule 1+2: Strukturiert Originaltext als Control (Anthropic API) |
|
||||
| 3b | **Reform (Batch)** | Rule 3: Vollstaendige Reformulierung ohne Originaltext (Anthropic API) |
|
||||
| 4 | **Harmonize** | Embedding-basierte Duplikaterkennung (bge-m3, Cosine > 0.85) |
|
||||
| 5 | **Anchor Search** | Findet Open-Source-Referenzen (OWASP, NIST, ENISA) |
|
||||
| 6 | **Store** | Persistiert Control in `canonical_controls` mit Metadaten |
|
||||
| 7 | **Mark Processed** | Markiert jeden Chunk als verarbeitet (auch bei Skip/Error/Duplikat) |
|
||||
|
||||
---
|
||||
|
||||
## Pipeline-Versionen
|
||||
|
||||
Die Pipeline hat zwei Versionen. Die Version wird als `pipeline_version` auf `canonical_controls` und `canonical_processed_chunks` gespeichert.
|
||||
|
||||
### v1 (Original)
|
||||
|
||||
| Eigenschaft | Wert |
|
||||
|-------------|------|
|
||||
| **Vorfilter** | Lokales LLM (llama3.2 3B) entscheidet ob Chunk relevant |
|
||||
| **Anthropic-Prompt** | Alter Prompt ohne null-Skip |
|
||||
| **Annexe/Anhaenge** | Kein Schutz — wurden haeufig faelschlich als irrelevant uebersprungen |
|
||||
| **`pipeline_version`** | `1` |
|
||||
|
||||
### v2 (Aktuell)
|
||||
|
||||
| Eigenschaft | Wert |
|
||||
|-------------|------|
|
||||
| **Vorfilter** | Optional (`skip_prefilter`). Wenn aktiviert, entscheidet Anthropic API selbst |
|
||||
| **Anthropic-Prompt** | Neuer Prompt mit **null-Skip**: API gibt `null` fuer Chunks ohne Anforderung zurueck |
|
||||
| **Annexe/Anhaenge** | Explizit geschuetzt — Prompt-Anweisung: "Anhaenge/Annexe enthalten oft KONKRETE technische Anforderungen — diese MUESSEN als Control erfasst werden!" |
|
||||
| **`pipeline_version`** | `2` |
|
||||
|
||||
#### Wesentliche Aenderungen v1 → v2
|
||||
|
||||
1. **Relevanz-Entscheidung an Anthropic delegiert** — Das lokale LLM (Vorfilter) ist optional. Die Anthropic API entscheidet selbst, welche Chunks Controls enthalten, indem sie `null` fuer irrelevante Chunks zurueckgibt.
|
||||
2. **null-Skip im JSON-Array** — Das Ergebnis-Array enthaelt `null`-Eintraege fuer Chunks ohne umsetzbare Anforderung. Kein separater Vorfilter-Schritt noetig.
|
||||
3. **Annexe/Anhaenge geschuetzt** — Explizite Prompt-Anweisung verhindert, dass technische Anforderungen in Anhaengen uebersprungen werden.
|
||||
|
||||
#### Datenbank-Feld
|
||||
|
||||
```sql
|
||||
-- Migration 062
|
||||
ALTER TABLE canonical_controls
|
||||
ADD COLUMN pipeline_version smallint NOT NULL DEFAULT 1;
|
||||
|
||||
ALTER TABLE canonical_processed_chunks
|
||||
ADD COLUMN pipeline_version smallint NOT NULL DEFAULT 1;
|
||||
```
|
||||
|
||||
Neue Controls erhalten automatisch `pipeline_version = 2`. Bestehende (v1) behalten `1`, damit sie spaeter identifiziert und ggf. reprocessiert werden koennen.
|
||||
|
||||
---
|
||||
|
||||
## Konfiguration
|
||||
|
||||
### Request-Parameter (`GenerateRequest`)
|
||||
|
||||
| Parameter | Typ | Default | Beschreibung |
|
||||
|-----------|-----|---------|-------------|
|
||||
| `collections` | `List[str]` | Alle 5 Collections | Qdrant-Collections zum Durchsuchen |
|
||||
| `domain` | `str` | — | Filter auf eine Domain (z.B. `AUTH`, `NET`) |
|
||||
| `regulation_filter` | `List[str]` | — | Prefix-Matching auf `regulation_code` (z.B. `["eu_2023_1230", "owasp_"]`) |
|
||||
| `skip_prefilter` | `bool` | `false` | Ueberspringt lokalen LLM-Vorfilter, sendet alle Chunks an die Anthropic API |
|
||||
| `batch_size` | `int` | `5` | Chunks pro Anthropic-API-Call |
|
||||
| `max_controls` | `int` | `50` | Maximale Anzahl Controls pro Job (0 = unbegrenzt) |
|
||||
| `max_chunks` | `int` | `1000` | Maximale Chunks pro Job (0 = unbegrenzt, respektiert Dokumentgrenzen) |
|
||||
| `skip_web_search` | `bool` | `false` | Ueberspringt Web-Suche in der Anchor-Findung (Stufe 5) |
|
||||
| `dry_run` | `bool` | `false` | Trockenlauf ohne DB-Schreibzugriffe (synchron, mit Controls im Response) |
|
||||
|
||||
!!! info "`regulation_filter` — Prefix-Matching"
|
||||
Der Filter vergleicht den `regulation_code` jedes Chunks per Prefix.
|
||||
Beispiel: `["eu_2023_1230"]` erfasst nur Chunks aus der Maschinenverordnung.
|
||||
`["owasp_"]` erfasst alle OWASP-Dokumente (OWASP ASVS, OWASP SAMM, etc.).
|
||||
Gross-/Kleinschreibung wird ignoriert.
|
||||
|
||||
### Umgebungsvariablen
|
||||
|
||||
| Variable | Default | Beschreibung |
|
||||
|----------|---------|-------------|
|
||||
| `ANTHROPIC_API_KEY` | — | API-Key fuer Anthropic Claude (Pflicht) |
|
||||
| `CONTROL_GEN_ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-Modell fuer Strukturierung/Reformulierung |
|
||||
| `OLLAMA_URL` | `http://host.docker.internal:11434` | Lokaler Ollama-Server (Vorfilter + QA) |
|
||||
| `CONTROL_GEN_OLLAMA_MODEL` | `qwen3.5:35b-a3b` | Lokales LLM-Modell fuer Vorfilter und QA-Arbitrierung |
|
||||
| `CONTROL_GEN_LLM_TIMEOUT` | `180` | Timeout in Sekunden pro Anthropic-API-Call |
|
||||
|
||||
### Pipeline-interne Konstanten
|
||||
|
||||
| Konstante | Wert | Beschreibung |
|
||||
|-----------|------|-------------|
|
||||
| `PIPELINE_VERSION` | `2` | Aktuelle Pipeline-Version |
|
||||
| `HARMONIZATION_THRESHOLD` | `0.85` | Cosine-Similarity-Schwelle fuer Duplikaterkennung |
|
||||
| `max_tokens` | `8192` | Maximale Token-Laenge der LLM-Antwort |
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
Alle Endpoints unter `/api/compliance/v1/canonical/`.
|
||||
|
||||
### Uebersicht
|
||||
|
||||
| Methode | Pfad | Beschreibung |
|
||||
|---------|------|-------------|
|
||||
| `POST` | `/generate` | Generierungs-Job starten (laeuft im Hintergrund) |
|
||||
| `GET` | `/generate/status/{job_id}` | Status eines laufenden Jobs abfragen |
|
||||
| `GET` | `/generate/jobs` | Alle Jobs auflisten (paginiert) |
|
||||
| `GET` | `/generate/processed-stats` | Verarbeitungsstatistik pro Collection |
|
||||
| `GET` | `/generate/review-queue` | Controls zur manuellen Pruefung |
|
||||
| `POST` | `/generate/review/{control_id}` | Review eines einzelnen Controls abschliessen |
|
||||
| `POST` | `/generate/bulk-review` | Bulk-Review nach `release_state` |
|
||||
| `POST` | `/generate/qa-reclassify` | QA-Reklassifizierung bestehender Controls |
|
||||
| `GET` | `/blocked-sources` | Gesperrte Quellen (Rule 3) auflisten |
|
||||
| `POST` | `/blocked-sources/cleanup` | Cleanup-Workflow fuer gesperrte Quellen starten |
|
||||
|
||||
---
|
||||
|
||||
### POST `/v1/canonical/generate` — Job starten
|
||||
|
||||
Startet einen Generierungs-Job im Hintergrund. Gibt sofort eine `job_id` zurueck.
|
||||
|
||||
**Request:**
|
||||
|
||||
```json
|
||||
{
|
||||
"collections": ["bp_compliance_gesetze"],
|
||||
"regulation_filter": ["eu_2023_1230"],
|
||||
"skip_prefilter": false,
|
||||
"batch_size": 5,
|
||||
"max_chunks": 500,
|
||||
"max_controls": 0,
|
||||
"skip_web_search": false,
|
||||
"dry_run": false
|
||||
}
|
||||
```
|
||||
|
||||
**Response (200):**
|
||||
|
||||
```json
|
||||
{
|
||||
"job_id": "a1b2c3d4-...",
|
||||
"status": "running",
|
||||
"message": "Generation started in background. Poll /generate/status/{job_id} for progress."
|
||||
}
|
||||
```
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
# Alle Chunks der Maschinenverordnung verarbeiten
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{
|
||||
"collections": ["bp_compliance_ce"],
|
||||
"regulation_filter": ["eu_2023_1230"],
|
||||
"max_chunks": 200,
|
||||
"batch_size": 5
|
||||
}'
|
||||
```
|
||||
|
||||
```bash
|
||||
# Dry Run: Keine DB-Aenderungen, Controls im Response
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{
|
||||
"collections": ["bp_compliance_gesetze"],
|
||||
"max_chunks": 10,
|
||||
"dry_run": true
|
||||
}'
|
||||
```
|
||||
|
||||
```bash
|
||||
# Ohne Vorfilter: Alle Chunks direkt an Anthropic API
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{
|
||||
"collections": ["bp_compliance_gesetze"],
|
||||
"regulation_filter": ["bdsg"],
|
||||
"skip_prefilter": true,
|
||||
"max_chunks": 100
|
||||
}'
|
||||
```
|
||||
|
||||
!!! warning "Kosten beachten"
|
||||
Ohne `regulation_filter` und mit `max_chunks: 0` werden **alle** ~105.000 Chunks verarbeitet.
|
||||
Das verursacht erhebliche Anthropic-API-Kosten (~$700).
|
||||
|
||||
---
|
||||
|
||||
### GET `/v1/canonical/generate/status/{job_id}` — Job-Status
|
||||
|
||||
Gibt den vollstaendigen Status eines Jobs zurueck inkl. Metriken und Fehler.
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
curl https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/status/a1b2c3d4-... \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000'
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "a1b2c3d4-...",
|
||||
"status": "completed",
|
||||
"total_chunks_scanned": 500,
|
||||
"controls_generated": 48,
|
||||
"controls_verified": 45,
|
||||
"controls_needs_review": 3,
|
||||
"controls_too_close": 0,
|
||||
"controls_duplicates_found": 12,
|
||||
"controls_qa_fixed": 5,
|
||||
  "config": { "...": "..." },
|
||||
"started_at": "2026-03-17T10:00:00+00:00",
|
||||
"completed_at": "2026-03-17T10:15:32+00:00"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### GET `/v1/canonical/generate/jobs` — Alle Jobs
|
||||
|
||||
Paginierte Liste aller Generierungs-Jobs.
|
||||
|
||||
**Query-Parameter:**
|
||||
|
||||
| Parameter | Default | Beschreibung |
|
||||
|-----------|---------|-------------|
|
||||
| `limit` | `20` | Anzahl Jobs (1-100) |
|
||||
| `offset` | `0` | Offset fuer Paginierung |
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
curl "https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/jobs?limit=5" \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### GET `/v1/canonical/generate/review-queue` — Review-Queue
|
||||
|
||||
Listet Controls auf, die eine manuelle Pruefung benoetigen.
|
||||
|
||||
**Query-Parameter:**
|
||||
|
||||
| Parameter | Default | Beschreibung |
|
||||
|-----------|---------|-------------|
|
||||
| `release_state` | `needs_review` | Filter: `needs_review`, `too_close`, `duplicate` |
|
||||
| `limit` | `50` | Anzahl (1-200) |
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
curl "https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/review-queue?release_state=needs_review&limit=10" \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### POST `/v1/canonical/generate/review/{control_id}` — Review abschliessen
|
||||
|
||||
Schliesst die manuelle Pruefung eines Controls ab.
|
||||
|
||||
**Request:**
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "approve",
|
||||
"release_state": "draft",
|
||||
"notes": "Inhaltlich korrekt, Severity passt."
|
||||
}
|
||||
```
|
||||
|
||||
**Moegliche `action`-Werte:**
|
||||
|
||||
| Action | Neuer State | Beschreibung |
|
||||
|--------|-------------|-------------|
|
||||
| `approve` | `draft` (oder per `release_state` ueberschreiben) | Control freigeben |
|
||||
| `reject` | `deprecated` | Control verwerfen |
|
||||
| `needs_rework` | `needs_review` | Zurueck in die Queue |
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/review/AUTH-042 \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{"action": "approve", "release_state": "draft"}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### POST `/v1/canonical/generate/bulk-review` — Bulk-Review
|
||||
|
||||
Aendert den `release_state` aller Controls, die einen bestimmten State haben.
|
||||
|
||||
**Request:**
|
||||
|
||||
```json
|
||||
{
|
||||
"release_state": "needs_review",
|
||||
"action": "approve",
|
||||
"new_state": "draft"
|
||||
}
|
||||
```
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
# Alle needs_review Controls auf draft setzen
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/bulk-review \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{"release_state": "needs_review", "action": "approve", "new_state": "draft"}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### GET `/v1/canonical/generate/processed-stats` — Verarbeitungsstatistik
|
||||
|
||||
Liefert Statistiken pro RAG-Collection.
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```bash
|
||||
curl https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/processed-stats \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000'
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"stats": [
|
||||
{
|
||||
"collection": "bp_compliance_gesetze",
|
||||
"processed_chunks": 45200,
|
||||
"direct_adopted": 1850,
|
||||
"llm_reformed": 120,
|
||||
"skipped": 43230,
|
||||
"total_chunks_estimated": 0,
|
||||
"pending_chunks": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Kosten und Performance
|
||||
|
||||
### Kostenabschaetzung
|
||||
|
||||
| Metrik | Wert |
|
||||
|--------|------|
|
||||
| **Kosten pro Chunk** | ~$0.0067 (Anthropic API, Batch-Modus) |
|
||||
| **Yield (Controls/Chunks)** | ~4,5-10% (nur Chunks mit konkreten Anforderungen erzeugen Controls) |
|
||||
| **Vorfilter-Ersparnis** | ~55% der API-Kosten wenn aktiviert (irrelevante Chunks werden lokal aussortiert) |
|
||||
|
||||
### Performance-Kennzahlen
|
||||
|
||||
| Metrik | Wert |
|
||||
|--------|------|
|
||||
| **Batch-Groesse** | 5 Chunks pro API-Call (Default) |
|
||||
| **API-Aufrufe Reduktion** | ~80% weniger Aufrufe durch Batching |
|
||||
| **LLM-Timeout** | 180 Sekunden pro Call |
|
||||
| **QA-Overhead** | ~2s pro Control (nur bei Disagreement, ~10-15% der Controls) |
|
||||
|
||||
### RAG Collections
|
||||
|
||||
| Collection | Inhalte | Erwartete Regel |
|
||||
|-----------|---------|----------------|
|
||||
| `bp_compliance_gesetze` | Deutsche Gesetze (BDSG, TTDSG, TKG etc.) | Rule 1 |
|
||||
| `bp_compliance_datenschutz` | Datenschutz-Leitlinien + EU-Verordnungen | Rule 1/2 |
|
||||
| `bp_compliance_ce` | CE/Sicherheitsstandards | Rule 1/2/3 |
|
||||
| `bp_dsfa_corpus` | DSFA-Korpus | Rule 1/2 |
|
||||
| `bp_legal_templates` | Rechtsvorlagen | Rule 1 |
|
||||
|
||||
### Aktuelle Groessenordnung
|
||||
|
||||
| Metrik | Wert |
|
||||
|--------|------|
|
||||
| RAG-Chunks gesamt | ~105.000 (nach Dedup 2026-03-16) |
|
||||
| Verarbeitete Chunks | ~105.000 |
|
||||
| Generierte Controls | **~4.738** |
|
||||
| Konversionsrate | ~4,5% |
|
||||
|
||||
---
|
||||
|
||||
## Lizenz-Klassifikation (3-Regel-System)
|
||||
|
||||
Jeder Chunk wird basierend auf `regulation_code` einer Lizenzregel zugeordnet:
|
||||
|
||||
| Regel | Typ | Original erlaubt? | Beispiele |
|
||||
|-------|-----|-------------------|----------|
|
||||
| **Rule 1** (free_use) | EU-Gesetze, NIST, DE-Gesetze, Public Domain | Ja | DSGVO, BDSG, NIS2, AI Act |
|
||||
| **Rule 2** (citation_required) | CC-BY, CC-BY-SA | Ja, mit Zitation | OWASP ASVS, OWASP SAMM |
|
||||
| **Rule 3** (restricted) | Proprietaer | Nein, volle Reformulierung | BSI TR-03161, ISO 27001 |
|
||||
|
||||
### Verarbeitung nach Regel
|
||||
|
||||
- **Rule 1+2 → `_structure_batch()`**: Anthropic strukturiert den Originaltext als Control. Ein API-Call fuer den gesamten Batch.
|
||||
- **Rule 3 → `_reformulate_batch()`**: Anthropic reformuliert vollstaendig — kein Originaltext, keine Quellennamen. Ein API-Call fuer den gesamten Batch.
|
||||
|
||||
### Batch Processing
|
||||
|
||||
Die Pipeline sammelt Chunks in Batches (Default: 5 Chunks) und sendet sie in einem einzigen Anthropic-API-Call.
|
||||
|
||||
1. Relevante Chunks werden mit Lizenz-Info in `pending_batch` gesammelt
|
||||
2. Bei `batch_size` erreicht → `_flush_batch()`
|
||||
3. Batch wird nach Lizenzregel getrennt: Rule 1+2 → `_structure_batch()`, Rule 3 → `_reformulate_batch()`
|
||||
4. Ergebnis: JSON-Array mit genau N Elementen (`null` fuer irrelevante Chunks)
|
||||
|
||||
**Fallback:** Bei Batch-Fehler (Timeout, Parsing-Error) wird automatisch auf Einzelverarbeitung zurueckgefallen.
|
||||
|
||||
---
|
||||
|
||||
## Chunk-Tracking (Processed Chunks)
|
||||
|
||||
### Tabelle `canonical_processed_chunks`
|
||||
|
||||
| Spalte | Typ | Beschreibung |
|
||||
|--------|-----|-------------|
|
||||
| `chunk_hash` | VARCHAR(64) | SHA-256 Hash des Chunk-Textes |
|
||||
| `collection` | VARCHAR(100) | Qdrant-Collection |
|
||||
| `regulation_code` | VARCHAR(100) | Quell-Regulation (z.B. `bdsg`, `eu_2016_679`) |
|
||||
| `license_rule` | INTEGER | 1, 2 oder 3 |
|
||||
| `processing_path` | VARCHAR(20) | Wie der Chunk verarbeitet wurde |
|
||||
| `generated_control_ids` | JSONB | UUIDs der generierten Controls |
|
||||
| `pipeline_version` | SMALLINT | Pipeline-Version (1 oder 2) |
|
||||
| `job_id` | UUID | Referenz auf den Generierungs-Job |
|
||||
|
||||
**UNIQUE Constraint:** `(chunk_hash, collection, document_version)` — verhindert Doppelverarbeitung.
|
||||
|
||||
### Processing Paths
|
||||
|
||||
| Wert | Stufe | Bedeutung |
|
||||
|------|-------|-----------|
|
||||
| `prefilter_skip` | 2 | Lokaler LLM-Vorfilter: Chunk nicht relevant |
|
||||
| `structured` | 3a | Einzelner Chunk strukturiert (Rule 1/2) |
|
||||
| `structured_batch` | 3a | Batch-Strukturierung (Rule 1/2) |
|
||||
| `llm_reform` | 3b | Einzelner Chunk reformuliert (Rule 3) |
|
||||
| `llm_reform_batch` | 3b | Batch-Reformulierung (Rule 3) |
|
||||
| `no_control` | 3 | LLM konnte kein Control ableiten (null im Array) |
|
||||
| `store_failed` | 6 | DB-Speichern fehlgeschlagen |
|
||||
| `error` | — | Unerwarteter Fehler |
|
||||
|
||||
---
|
||||
|
||||
## QA Validation (Automatische Qualitaetspruefung)
|
||||
|
||||
Die QA-Stufe validiert die Klassifizierung jedes generierten Controls:
|
||||
|
||||
1. **LLM-Category:** Anthropic liefert `category` und `domain` im JSON-Response
|
||||
2. **Keyword-Detection:** `_detect_category(chunk.text)` liefert eine zweite Meinung
|
||||
3. **Stimmen beide ueberein?** → Schneller Pfad (kein QA noetig)
|
||||
4. **Bei Disagreement:** Lokales LLM (Ollama) arbitriert
|
||||
5. **Auto-Fix:** Category/Domain werden automatisch korrigiert
|
||||
|
||||
Die QA-Metriken werden in `generation_metadata` gespeichert:
|
||||
|
||||
```json
|
||||
{
|
||||
"qa_category_fix": {"from": "authentication", "to": "finance", "reason": "IFRS-Thema"},
|
||||
"qa_domain_fix": {"from": "AUTH", "to": "FIN", "reason": "Finanzregulierung"}
|
||||
}
|
||||
```
|
||||
|
||||
### QA-Reklassifizierung bestehender Controls
|
||||
|
||||
```bash
|
||||
# Dry Run: Welche AUTH-Controls sind falsch klassifiziert?
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/qa-reclassify \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{"limit": 50, "dry_run": true, "filter_domain_prefix": "AUTH"}'
|
||||
|
||||
# Korrekturen anwenden:
|
||||
curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/qa-reclassify \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H 'X-Tenant-ID: 550e8400-e29b-41d4-a716-446655440000' \
|
||||
-d '{"limit": 50, "dry_run": false, "filter_domain_prefix": "AUTH"}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quelldateien
|
||||
|
||||
| Datei | Beschreibung |
|
||||
|-------|-------------|
|
||||
| `backend-compliance/compliance/services/control_generator.py` | 7-Stufen-Pipeline mit Batch Processing |
|
||||
| `backend-compliance/compliance/api/control_generator_routes.py` | REST API Endpoints |
|
||||
| `backend-compliance/compliance/services/license_gate.py` | Lizenz-Gate-Logik |
|
||||
| `backend-compliance/compliance/services/similarity_detector.py` | Too-Close-Detektor (5 Metriken) |
|
||||
| `backend-compliance/compliance/services/rag_client.py` | RAG-Client (Qdrant Search + Scroll) |
|
||||
| `backend-compliance/migrations/046_control_generator.sql` | Job-Tracking, Chunk-Tracking Tabellen |
|
||||
| `backend-compliance/migrations/048_processing_path_expand.sql` | Erweiterte Processing-Path-Werte |
|
||||
| `backend-compliance/migrations/062_pipeline_version.sql` | `pipeline_version` Spalte |
|
||||
| `backend-compliance/tests/test_control_generator.py` | 15 Tests (Lizenz, Domain, Batch, Pipeline) |
|
||||
|
||||
---
|
||||
|
||||
## Verwandte Dokumentation
|
||||
|
||||
- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation
|
||||
- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk
|
||||
@@ -103,6 +103,7 @@ nav:
|
||||
- Dokumentengenerierung: services/sdk-modules/dokumentengenerierung.md
|
||||
- Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md
|
||||
- Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md
|
||||
- Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md
|
||||
- Strategie:
|
||||
- Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md
|
||||
- Entwicklung:
|
||||
|
||||
Reference in New Issue
Block a user