feat(pipeline): pipeline_version v2, migration 062, docs + 71 tests

- Add PIPELINE_VERSION=2 constant and pipeline_version column to
  canonical_controls and canonical_processed_chunks (migration 062)
- Anthropic API now decides chunk relevance by returning null for
  non-requirement chunks; the local LLM prefilter is skipped (skip_prefilter)
- Annex/appendix chunks explicitly protected in prompts
- Fix 6 failing tests (CRYP domain, _process_batch tuple return)
- Add TestPipelineVersion + TestRegulationFilter test classes (10 new tests)
- Add MkDocs page: control-generator-pipeline.md (541 lines)
- Update canonical-control-library.md with v2 pipeline diagram
- Update testing.md with 71-test breakdown table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 17:31:11 +01:00
parent 653aad57e3
commit a9e0869205
7 changed files with 815 additions and 48 deletions

View File

@@ -53,6 +53,11 @@ LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
# Pipeline version — increment when generation rules change materially.
# v1: Original (local LLM prefilter, old prompt)
# v2: Anthropic decides relevance, null for non-requirement chunks, annexes protected
PIPELINE_VERSION = 2
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_gesetze",
@@ -1663,7 +1668,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
license_rule, source_original_text, source_citation,
customer_visible, generation_metadata,
verification_method, category, generation_strategy,
target_audience
target_audience, pipeline_version
) VALUES (
:framework_id, :control_id, :title, :objective, :rationale,
:scope, :requirements, :test_procedure, :evidence,
@@ -1672,7 +1677,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
:license_rule, :source_original_text, :source_citation,
:customer_visible, :generation_metadata,
:verification_method, :category, :generation_strategy,
:target_audience
:target_audience, :pipeline_version
)
ON CONFLICT (framework_id, control_id) DO NOTHING
RETURNING id
@@ -1702,6 +1707,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
"category": control.category,
"generation_strategy": control.generation_strategy,
"target_audience": json.dumps(control.target_audience) if control.target_audience else None,
"pipeline_version": PIPELINE_VERSION,
},
)
self.db.commit()
@@ -1728,11 +1734,13 @@ Kategorien: {CATEGORY_LIST_STR}"""
INSERT INTO canonical_processed_chunks (
chunk_hash, collection, regulation_code,
document_version, source_license, license_rule,
processing_path, generated_control_ids, job_id
processing_path, generated_control_ids, job_id,
pipeline_version
) VALUES (
:hash, :collection, :regulation_code,
:doc_version, :license, :rule,
:path, :control_ids, CAST(:job_id AS uuid)
:path, :control_ids, CAST(:job_id AS uuid),
:pipeline_version
)
ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
"""),
@@ -1746,6 +1754,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
"path": processing_path,
"control_ids": json.dumps(control_ids),
"job_id": job_id,
"pipeline_version": PIPELINE_VERSION,
},
)
self.db.commit()