Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 45s
CI/CD / test-python-document-crawler (push) Has been cancelled
CI/CD / test-python-dsms-gateway (push) Has been cancelled
CI/CD / validate-canonical-controls (push) Has been cancelled
CI/CD / deploy-hetzner (push) Has been cancelled
CI/CD / test-python-backend-compliance (push) Has been cancelled
Implements the Control Generator Pipeline that systematically generates
canonical security controls from 150k+ RAG chunks across all compliance
collections (BSI, NIST, OWASP, ENISA, EU laws, German laws).

Three license rules are enforced throughout:
- Rule 1 (free_use): Laws/Public Domain — original text preserved
- Rule 2 (citation_required): CC-BY/CC-BY-SA — text with citation
- Rule 3 (restricted): BSI/ISO — full reformulation, no source traces

New files:
- Migration 046: job tracking, chunk tracking, blocked sources tables
- control_generator.py: 7-stage pipeline (scan→classify→structure/reform→harmonize→anchor→store→mark)
- anchor_finder.py: RAG + DuckDuckGo open-source reference search
- control_generator_routes.py: REST API (generate, review, stats, blocked-sources)
- test_control_generator.py: license mapping, rule enforcement, anchor filtering tests

Modified:
- __init__.py: register control_generator_router
- route.ts: proxy generator/review/stats endpoints
- page.tsx: Generator modal, stats panel, state filter, review queue, license badges

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
104 lines
4.6 KiB
PL/PgSQL
104 lines
4.6 KiB
PL/PgSQL
-- Migration 046: Control Generator Pipeline
-- Adds job tracking, chunk tracking, and blocked-source tables, and extends
-- canonical_controls for the 3-license-rule system
-- (free_use, citation_required, restricted).
|
|
|
|
-- Open the transaction that wraps every statement below, up to the final COMMIT.
BEGIN;
|
|
|
|
-- =============================================================================
-- 1. Job tracking for generator runs
-- =============================================================================
-- One row per generator job: lifecycle status, the JSON config the job was
-- started with, per-run counters, a JSON list of errors, and timestamps.
-- Constraint names are spelled out and match the names PostgreSQL would
-- generate for the equivalent inline column constraints.

CREATE TABLE IF NOT EXISTS canonical_generation_jobs (
    id                        UUID        DEFAULT gen_random_uuid(),
    status                    VARCHAR(20) DEFAULT 'pending',
    config                    JSONB       NOT NULL,

    -- Per-run counters, all starting at zero.
    total_chunks_scanned      INT         DEFAULT 0,
    controls_generated        INT         DEFAULT 0,
    controls_verified         INT         DEFAULT 0,
    controls_needs_review     INT         DEFAULT 0,
    controls_too_close        INT         DEFAULT 0,
    controls_duplicates_found INT         DEFAULT 0,

    -- JSON array of errors; empty by default.
    errors                    JSONB       DEFAULT '[]'::jsonb,

    started_at                TIMESTAMPTZ,
    completed_at              TIMESTAMPTZ,
    created_at                TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT canonical_generation_jobs_pkey
        PRIMARY KEY (id),
    CONSTRAINT canonical_generation_jobs_status_check
        CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
);
|
|
|
|
-- =============================================================================
-- 2. Tracking which RAG chunks have been processed
-- =============================================================================
-- One row per processed chunk, keyed by
-- (chunk_hash, collection, document_version).
--
-- document_version is nullable. Under a plain UNIQUE constraint PostgreSQL
-- treats NULLs as distinct, so the same (chunk_hash, collection) pair could be
-- recorded any number of times whenever the version is unknown. NULLS NOT
-- DISTINCT (PostgreSQL 15+) makes a missing version count as a single value,
-- so the key also deduplicates unversioned chunks.
--
-- NOTE(review): IF NOT EXISTS leaves an already-created table untouched;
-- databases that ran the earlier form of this migration need a follow-up
-- migration to replace the old unique constraint.

CREATE TABLE IF NOT EXISTS canonical_processed_chunks (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Presumably a 64-character hex digest of the chunk content — confirm
    -- against the generator code.
    chunk_hash VARCHAR(64) NOT NULL,
    collection VARCHAR(100) NOT NULL,
    regulation_code VARCHAR(100),
    document_version VARCHAR(50),
    source_license VARCHAR(50),

    -- 1 = free_use, 2 = citation_required, 3 = restricted
    license_rule INTEGER NOT NULL
        CHECK (license_rule IN (1, 2, 3)),

    processing_path VARCHAR(20) NOT NULL
        CHECK (processing_path IN ('structured', 'llm_reform', 'skipped')),

    -- JSON array; empty by default.
    generated_control_ids JSONB DEFAULT '[]',

    -- Optional link to the generator job row; no ON DELETE action is declared,
    -- so the default (NO ACTION) applies.
    job_id UUID REFERENCES canonical_generation_jobs(id),

    processed_at TIMESTAMPTZ DEFAULT NOW(),

    UNIQUE NULLS NOT DISTINCT (chunk_hash, collection, document_version)
);
|
|
|
|
-- Single-column lookup indexes on canonical_processed_chunks: by collection,
-- by regulation code, and by the job that processed the chunk.
CREATE INDEX IF NOT EXISTS idx_cpc_collection
    ON canonical_processed_chunks USING btree (collection);

CREATE INDEX IF NOT EXISTS idx_cpc_regulation
    ON canonical_processed_chunks USING btree (regulation_code);

CREATE INDEX IF NOT EXISTS idx_cpc_job
    ON canonical_processed_chunks USING btree (job_id);
|
|
|
|
-- =============================================================================
-- 3. Blocked sources (Rule 3 documents to be deleted after generation)
-- =============================================================================
-- One row per blocked regulation_code, with a deletion workflow status and
-- optional pointers to the Qdrant collection and original file.
-- Constraint names are spelled out and match the names PostgreSQL would
-- generate for the equivalent inline column constraints.

CREATE TABLE IF NOT EXISTS canonical_blocked_sources (
    id                 UUID         DEFAULT gen_random_uuid(),
    regulation_code    VARCHAR(100) NOT NULL,
    document_title     VARCHAR(500) NOT NULL,
    reason             VARCHAR(500) DEFAULT 'Kommerziell nicht nutzbar — darf nicht mit KI verarbeitet werden',
    deletion_status    VARCHAR(20)  DEFAULT 'pending',
    qdrant_collection  VARCHAR(100),
    original_file_path TEXT,
    marked_at          TIMESTAMPTZ  DEFAULT CURRENT_TIMESTAMP,
    deleted_at         TIMESTAMPTZ,

    CONSTRAINT canonical_blocked_sources_pkey
        PRIMARY KEY (id),
    CONSTRAINT canonical_blocked_sources_regulation_code_key
        UNIQUE (regulation_code),
    CONSTRAINT canonical_blocked_sources_deletion_status_check
        CHECK (deletion_status IN ('pending', 'marked_for_deletion', 'deleted'))
);
|
|
|
|
-- =============================================================================
-- 4. Extend canonical_controls: release_state + 3-rule columns
-- =============================================================================

-- Replace the release_state check so it admits the generator states as well.
-- Drop and re-add run as one ALTER TABLE; DROP ... IF EXISTS keeps the
-- statement safe to run whether or not the old constraint is present.
ALTER TABLE canonical_controls
    DROP CONSTRAINT IF EXISTS canonical_controls_release_state_check,
    ADD CONSTRAINT canonical_controls_release_state_check
        CHECK (
            release_state IN (
                'draft', 'review', 'approved', 'deprecated',
                'needs_review', 'too_close', 'duplicate'
            )
        );
|
|
|
|
-- License rule: 1 = free_use, 2 = citation_required, 3 = restricted.
-- NULL is allowed (no rule recorded for the control).
ALTER TABLE canonical_controls
    ADD COLUMN IF NOT EXISTS license_rule INTEGER DEFAULT NULL;

-- Enforce the same value domain that canonical_processed_chunks.license_rule
-- enforces, so an out-of-range rule cannot be stored on a control. A NULL
-- value satisfies the check. Drop-then-add mirrors the release_state
-- constraint handling in this migration and keeps the statement re-runnable.
ALTER TABLE canonical_controls
    DROP CONSTRAINT IF EXISTS canonical_controls_license_rule_check,
    ADD CONSTRAINT canonical_controls_license_rule_check
        CHECK (license_rule IN (1, 2, 3));
|
|
|
|
-- Source provenance columns, intended for Rule 1 and Rule 2 controls only.
-- For Rule 3 controls both are meant to stay NULL; that is a convention of
-- the writer, not something a constraint here enforces.
ALTER TABLE canonical_controls
    -- Original text taken from the source.
    ADD COLUMN IF NOT EXISTS source_original_text TEXT,
    -- Citation info for the source.
    ADD COLUMN IF NOT EXISTS source_citation JSONB;
|
|
|
|
ALTER TABLE canonical_controls
    -- Whether source info may be shown to customers; defaults to TRUE.
    ADD COLUMN IF NOT EXISTS customer_visible BOOLEAN DEFAULT TRUE,
    -- Generation metadata (internal only, never shown to customers).
    ADD COLUMN IF NOT EXISTS generation_metadata JSONB;
|
|
|
|
-- Indexes supporting filters on license rule and on customer visibility.
CREATE INDEX IF NOT EXISTS idx_canonical_controls_license_rule
    ON canonical_controls USING btree (license_rule);

CREATE INDEX IF NOT EXISTS idx_canonical_controls_customer_visible
    ON canonical_controls USING btree (customer_visible);
|
|
|
|
-- Commit the transaction opened by BEGIN at the top of this migration.
COMMIT;
|