feat(pipeline): pipeline_version v2, migration 062, docs + 71 tests
- Add PIPELINE_VERSION=2 constant and pipeline_version column to canonical_controls and canonical_processed_chunks (migration 062)
- Anthropic API decides chunk relevance via null-returns (skip_prefilter)
- Annex/appendix chunks explicitly protected in prompts
- Fix 6 failing tests (CRYP domain, _process_batch tuple return)
- Add TestPipelineVersion + TestRegulationFilter test classes (10 new tests)
- Add MkDocs page: control-generator-pipeline.md (541 lines)
- Update canonical-control-library.md with v2 pipeline diagram
- Update testing.md with 71-test breakdown table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -53,6 +53,11 @@ LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
|
||||
|
||||
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
|
||||
|
||||
# Pipeline version — increment when generation rules change materially.
|
||||
# v1: Original (local LLM prefilter, old prompt)
|
||||
# v2: Anthropic decides relevance, null for non-requirement chunks, annexes protected
|
||||
PIPELINE_VERSION = 2
|
||||
|
||||
ALL_COLLECTIONS = [
|
||||
"bp_compliance_ce",
|
||||
"bp_compliance_gesetze",
|
||||
@@ -1663,7 +1668,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
license_rule, source_original_text, source_citation,
|
||||
customer_visible, generation_metadata,
|
||||
verification_method, category, generation_strategy,
|
||||
target_audience
|
||||
target_audience, pipeline_version
|
||||
) VALUES (
|
||||
:framework_id, :control_id, :title, :objective, :rationale,
|
||||
:scope, :requirements, :test_procedure, :evidence,
|
||||
@@ -1672,7 +1677,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
:license_rule, :source_original_text, :source_citation,
|
||||
:customer_visible, :generation_metadata,
|
||||
:verification_method, :category, :generation_strategy,
|
||||
:target_audience
|
||||
:target_audience, :pipeline_version
|
||||
)
|
||||
ON CONFLICT (framework_id, control_id) DO NOTHING
|
||||
RETURNING id
|
||||
@@ -1702,6 +1707,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
"category": control.category,
|
||||
"generation_strategy": control.generation_strategy,
|
||||
"target_audience": json.dumps(control.target_audience) if control.target_audience else None,
|
||||
"pipeline_version": PIPELINE_VERSION,
|
||||
},
|
||||
)
|
||||
self.db.commit()
|
||||
@@ -1728,11 +1734,13 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
INSERT INTO canonical_processed_chunks (
|
||||
chunk_hash, collection, regulation_code,
|
||||
document_version, source_license, license_rule,
|
||||
processing_path, generated_control_ids, job_id
|
||||
processing_path, generated_control_ids, job_id,
|
||||
pipeline_version
|
||||
) VALUES (
|
||||
:hash, :collection, :regulation_code,
|
||||
:doc_version, :license, :rule,
|
||||
:path, :control_ids, CAST(:job_id AS uuid)
|
||||
:path, :control_ids, CAST(:job_id AS uuid),
|
||||
:pipeline_version
|
||||
)
|
||||
ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
|
||||
"""),
|
||||
@@ -1746,6 +1754,7 @@ Kategorien: {CATEGORY_LIST_STR}"""
|
||||
"path": processing_path,
|
||||
"control_ids": json.dumps(control_ids),
|
||||
"job_id": job_id,
|
||||
"pipeline_version": PIPELINE_VERSION,
|
||||
},
|
||||
)
|
||||
self.db.commit()
|
||||
|
||||
Reference in New Issue
Block a user