feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped

Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion on all checkpoints passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 21:41:48 +01:00
parent d2133dbfa2
commit 4f6bc8f6f6
50 changed files with 17299 additions and 198 deletions

View File

@@ -22,6 +22,7 @@ Endpoints:
from __future__ import annotations
import json
import logging
from typing import Any, Optional
@@ -277,8 +278,8 @@ async def list_framework_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience::jsonb @> (:ta)::jsonb"
params["ta"] = json.dumps([target_audience])
query += " ORDER BY control_id"
rows = db.execute(text(query), params).fetchall()
@@ -329,8 +330,8 @@ async def list_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience LIKE :ta_pattern"
params["ta_pattern"] = f'%"{target_audience}"%'
if source:
if source == "__none__":
query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
@@ -398,8 +399,8 @@ async def count_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience LIKE :ta_pattern"
params["ta_pattern"] = f'%"{target_audience}"%'
if source:
if source == "__none__":
query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"

View File

@@ -26,6 +26,13 @@ from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_detect_category,
_detect_domain,
_llm_local,
_parse_llm_json,
CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
@@ -42,6 +49,7 @@ class GenerateRequest(BaseModel):
domain: Optional[str] = None
collections: Optional[List[str]] = None
max_controls: int = 50
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
batch_size: int = 5
skip_web_search: bool = False
dry_run: bool = False
@@ -57,6 +65,7 @@ class GenerateResponse(BaseModel):
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
errors: list = []
controls: list = []
@@ -132,6 +141,7 @@ async def start_generation(req: GenerateRequest):
domain=req.domain,
batch_size=req.batch_size,
max_controls=req.max_controls,
max_chunks=req.max_chunks,
skip_web_search=req.skip_web_search,
dry_run=req.dry_run,
)
@@ -338,6 +348,188 @@ async def review_control(control_id: str, req: ReviewRequest):
db.close()
class BulkReviewRequest(BaseModel):
release_state: str # Filter: which controls to bulk-review
action: str # "approve" or "reject"
new_state: Optional[str] = None # Override target state
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
"""Bulk review all controls matching a release_state filter.
Example: reject all needs_review → sets them to deprecated.
"""
if req.release_state not in ("needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")
if req.action == "approve":
target = req.new_state or "draft"
elif req.action == "reject":
target = "deprecated"
else:
raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")
db = SessionLocal()
try:
result = db.execute(
text("""
UPDATE canonical_controls
SET release_state = :target, updated_at = NOW()
WHERE release_state = :source
RETURNING control_id
"""),
{"source": req.release_state, "target": target},
)
affected = [row[0] for row in result]
db.commit()
return {
"action": req.action,
"source_state": req.release_state,
"target_state": target,
"affected_count": len(affected),
}
finally:
db.close()
class QAReclassifyRequest(BaseModel):
limit: int = 100 # How many controls to reclassify per run
dry_run: bool = True # Preview only by default
filter_category: Optional[str] = None # Only reclassify controls of this category
filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
"""Run QA reclassification on existing controls using local LLM.
Finds controls where keyword-detection disagrees with current category/domain,
then uses Ollama to determine the correct classification.
"""
db = SessionLocal()
try:
# Load controls to check
where_clauses = ["release_state NOT IN ('deprecated')"]
params = {"limit": req.limit}
if req.filter_category:
where_clauses.append("category = :cat")
params["cat"] = req.filter_category
if req.filter_domain_prefix:
where_clauses.append("control_id LIKE :prefix")
params["prefix"] = f"{req.filter_domain_prefix}-%"
where_sql = " AND ".join(where_clauses)
rows = db.execute(
text(f"""
SELECT id, control_id, title, objective, category,
COALESCE(requirements::text, '[]') as requirements,
COALESCE(source_original_text, '') as source_text
FROM canonical_controls
WHERE {where_sql}
ORDER BY created_at DESC
LIMIT :limit
"""),
params,
).fetchall()
results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
for row in rows:
results["checked"] += 1
control_id = row[1]
title = row[2]
objective = row[3] or ""
current_category = row[4]
source_text = row[6] or objective
# Keyword detection on source text
kw_category = _detect_category(source_text) or _detect_category(objective)
kw_domain = _detect_domain(source_text)
current_prefix = control_id.split("-")[0] if "-" in control_id else ""
# Skip if keyword detection agrees with current classification
if kw_category == current_category and kw_domain == current_prefix:
continue
results["mismatches"] += 1
# Ask Ollama to arbitrate
try:
reqs_text = ""
try:
reqs = json.loads(row[5])
if isinstance(reqs, list):
reqs_text = ", ".join(str(r) for r in reqs[:3])
except Exception:
pass
prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}
Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""
raw = await _llm_local(prompt)
data = _parse_llm_json(raw)
if not data:
continue
qa_domain = data.get("domain", "").upper()
qa_category = data.get("category", "")
reason = data.get("reason", "")
fix_entry = {
"control_id": control_id,
"title": title[:80],
"old_category": current_category,
"old_domain": current_prefix,
"new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
"new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
"reason": reason,
}
category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
if category_changed and not req.dry_run:
db.execute(
text("""
UPDATE canonical_controls
SET category = :category, updated_at = NOW()
WHERE id = :id
"""),
{"id": row[0], "category": qa_category},
)
fix_entry["applied"] = True
else:
fix_entry["applied"] = False
results["fixes"].append(fix_entry)
except Exception as e:
results["errors"].append({"control_id": control_id, "error": str(e)})
if not req.dry_run:
db.commit()
return results
finally:
db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
"""Get processing statistics per collection."""

View File

@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
ALL_COLLECTIONS = [
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
"bp_compliance_gesetze", # German laws
"bp_compliance_datenschutz", # Data protection documents
"bp_dsfa_corpus", # DSFA corpus