feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped

Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion on all checkpoints passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 21:41:48 +01:00
parent d2133dbfa2
commit 4f6bc8f6f6
50 changed files with 17299 additions and 198 deletions

View File

@@ -22,6 +22,7 @@ Endpoints:
from __future__ import annotations
import json
import logging
from typing import Any, Optional
@@ -277,8 +278,8 @@ async def list_framework_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience::jsonb @> (:ta)::jsonb"
params["ta"] = json.dumps([target_audience])
query += " ORDER BY control_id"
rows = db.execute(text(query), params).fetchall()
@@ -329,8 +330,8 @@ async def list_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience LIKE :ta_pattern"
params["ta_pattern"] = f'%"{target_audience}"%'
if source:
if source == "__none__":
query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
@@ -398,8 +399,8 @@ async def count_controls(
query += " AND category = :cat"
params["cat"] = category
if target_audience:
query += " AND target_audience = :ta"
params["ta"] = target_audience
query += " AND target_audience LIKE :ta_pattern"
params["ta_pattern"] = f'%"{target_audience}"%'
if source:
if source == "__none__":
query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"

View File

@@ -26,6 +26,13 @@ from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_detect_category,
_detect_domain,
_llm_local,
_parse_llm_json,
CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
@@ -42,6 +49,7 @@ class GenerateRequest(BaseModel):
domain: Optional[str] = None
collections: Optional[List[str]] = None
max_controls: int = 50
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
batch_size: int = 5
skip_web_search: bool = False
dry_run: bool = False
@@ -57,6 +65,7 @@ class GenerateResponse(BaseModel):
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
errors: list = []
controls: list = []
@@ -132,6 +141,7 @@ async def start_generation(req: GenerateRequest):
domain=req.domain,
batch_size=req.batch_size,
max_controls=req.max_controls,
max_chunks=req.max_chunks,
skip_web_search=req.skip_web_search,
dry_run=req.dry_run,
)
@@ -338,6 +348,188 @@ async def review_control(control_id: str, req: ReviewRequest):
db.close()
class BulkReviewRequest(BaseModel):
release_state: str # Filter: which controls to bulk-review
action: str # "approve" or "reject"
new_state: Optional[str] = None # Override target state
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
"""Bulk review all controls matching a release_state filter.
Example: reject all needs_review → sets them to deprecated.
"""
if req.release_state not in ("needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")
if req.action == "approve":
target = req.new_state or "draft"
elif req.action == "reject":
target = "deprecated"
else:
raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")
db = SessionLocal()
try:
result = db.execute(
text("""
UPDATE canonical_controls
SET release_state = :target, updated_at = NOW()
WHERE release_state = :source
RETURNING control_id
"""),
{"source": req.release_state, "target": target},
)
affected = [row[0] for row in result]
db.commit()
return {
"action": req.action,
"source_state": req.release_state,
"target_state": target,
"affected_count": len(affected),
}
finally:
db.close()
class QAReclassifyRequest(BaseModel):
limit: int = 100 # How many controls to reclassify per run
dry_run: bool = True # Preview only by default
filter_category: Optional[str] = None # Only reclassify controls of this category
filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
"""Run QA reclassification on existing controls using local LLM.
Finds controls where keyword-detection disagrees with current category/domain,
then uses Ollama to determine the correct classification.
"""
db = SessionLocal()
try:
# Load controls to check
where_clauses = ["release_state NOT IN ('deprecated')"]
params = {"limit": req.limit}
if req.filter_category:
where_clauses.append("category = :cat")
params["cat"] = req.filter_category
if req.filter_domain_prefix:
where_clauses.append("control_id LIKE :prefix")
params["prefix"] = f"{req.filter_domain_prefix}-%"
where_sql = " AND ".join(where_clauses)
rows = db.execute(
text(f"""
SELECT id, control_id, title, objective, category,
COALESCE(requirements::text, '[]') as requirements,
COALESCE(source_original_text, '') as source_text
FROM canonical_controls
WHERE {where_sql}
ORDER BY created_at DESC
LIMIT :limit
"""),
params,
).fetchall()
results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
for row in rows:
results["checked"] += 1
control_id = row[1]
title = row[2]
objective = row[3] or ""
current_category = row[4]
source_text = row[6] or objective
# Keyword detection on source text
kw_category = _detect_category(source_text) or _detect_category(objective)
kw_domain = _detect_domain(source_text)
current_prefix = control_id.split("-")[0] if "-" in control_id else ""
# Skip if keyword detection agrees with current classification
if kw_category == current_category and kw_domain == current_prefix:
continue
results["mismatches"] += 1
# Ask Ollama to arbitrate
try:
reqs_text = ""
try:
reqs = json.loads(row[5])
if isinstance(reqs, list):
reqs_text = ", ".join(str(r) for r in reqs[:3])
except Exception:
pass
prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}
Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""
raw = await _llm_local(prompt)
data = _parse_llm_json(raw)
if not data:
continue
qa_domain = data.get("domain", "").upper()
qa_category = data.get("category", "")
reason = data.get("reason", "")
fix_entry = {
"control_id": control_id,
"title": title[:80],
"old_category": current_category,
"old_domain": current_prefix,
"new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
"new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
"reason": reason,
}
category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
if category_changed and not req.dry_run:
db.execute(
text("""
UPDATE canonical_controls
SET category = :category, updated_at = NOW()
WHERE id = :id
"""),
{"id": row[0], "category": qa_category},
)
fix_entry["applied"] = True
else:
fix_entry["applied"] = False
results["fixes"].append(fix_entry)
except Exception as e:
results["errors"].append({"control_id": control_id, "error": str(e)})
if not req.dry_run:
db.commit()
return results
finally:
db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
"""Get processing statistics per collection."""

View File

@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
ALL_COLLECTIONS = [
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
"bp_compliance_gesetze", # German laws
"bp_compliance_datenschutz", # Data protection documents
"bp_dsfa_corpus", # DSFA corpus

View File

@@ -0,0 +1,437 @@
"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
3-tier matching strategy:
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
"bp_legal_templates",
]
BACKFILL_SYSTEM_PROMPT = (
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
_SOURCE_ARTICLE_RE = re.compile(
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
@dataclass
class MatchResult:
article: str
paragraph: str
method: str # "hash", "regex", "llm"
@dataclass
class BackfillResult:
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = field(default_factory=list)
class CitationBackfill:
"""Backfill article/paragraph into existing control source_citations."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
self.db = db
self.rag = rag_client
self._rag_index: dict[str, RAGSearchResult] = {}
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
result = BackfillResult()
# Load controls needing backfill
controls = self._load_controls_needing_backfill(limit)
result.total_controls = len(controls)
logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))
if not controls:
return result
# Collect hashes we need to find — only build index for controls with source text
needed_hashes: set[str] = set()
for ctrl in controls:
src = ctrl.get("source_original_text")
if src:
needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
if needed_hashes:
# Build targeted RAG index — only scroll collections that our controls reference
logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
await self._build_rag_index_targeted(controls)
logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
else:
logger.info("No source_original_text found — skipping RAG index build")
# Process each control
for i, ctrl in enumerate(controls):
if i > 0 and i % 100 == 0:
logger.info("Backfill progress: %d/%d processed", i, result.total_controls)
try:
match = await self._match_control(ctrl)
if match:
if match.method == "hash":
result.matched_hash += 1
elif match.method == "regex":
result.matched_regex += 1
elif match.method == "llm":
result.matched_llm += 1
if not dry_run:
self._update_control(ctrl, match)
result.updated += 1
else:
logger.debug(
"DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
ctrl["control_id"], match.article, match.paragraph, match.method,
)
else:
result.unmatched += 1
except Exception as e:
error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
if not dry_run:
try:
self.db.commit()
except Exception as e:
logger.error("Backfill commit failed: %s", e)
result.errors.append(f"Commit failed: {e}")
logger.info(
"Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_hash, result.matched_regex,
result.matched_llm, result.unmatched, result.updated,
)
return result
def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
"""Load controls where source_citation exists but lacks separate 'article' key."""
query = """
SELECT id, control_id, source_citation, source_original_text,
generation_metadata, license_rule
FROM canonical_controls
WHERE license_rule IN (1, 2)
AND source_citation IS NOT NULL
AND (
source_citation->>'article' IS NULL
OR source_citation->>'article' = ''
)
ORDER BY control_id
"""
if limit > 0:
query += f" LIMIT {limit}"
result = self.db.execute(text(query))
cols = result.keys()
controls = []
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
# Parse JSON fields
for jf in ("source_citation", "generation_metadata"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
ctrl[jf] = {}
controls.append(ctrl)
return controls
async def _build_rag_index_targeted(self, controls: list[dict]):
"""Build RAG index by scrolling only collections relevant to our controls.
Uses regulation codes from generation_metadata to identify which collections
to search, falling back to all collections only if needed.
"""
# Determine which collections are relevant based on regulation codes
regulation_to_collection = self._map_regulations_to_collections(controls)
collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
logger.info("Targeted index: searching %d collections: %s",
len(collections_to_search), ", ".join(collections_to_search))
for collection in collections_to_search:
offset = None
page = 0
seen_offsets: set[str] = set()
while True:
chunks, next_offset = await self.rag.scroll(
collection=collection, offset=offset, limit=200,
)
if not chunks:
break
for chunk in chunks:
if chunk.text and len(chunk.text.strip()) >= 50:
h = hashlib.sha256(chunk.text.encode()).hexdigest()
self._rag_index[h] = chunk
page += 1
if page % 50 == 0:
logger.info("Indexing %s: page %d (%d chunks so far)",
collection, page, len(self._rag_index))
if not next_offset:
break
if next_offset in seen_offsets:
logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
break
seen_offsets.add(next_offset)
offset = next_offset
logger.info("Indexed collection %s: %d pages", collection, page)
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
"""Map regulation codes from controls to likely Qdrant collections."""
# Heuristic: regulation code prefix → collection
collection_map = {
"eu_": "bp_compliance_gesetze",
"dsgvo": "bp_compliance_datenschutz",
"bdsg": "bp_compliance_gesetze",
"ttdsg": "bp_compliance_gesetze",
"nist_": "bp_compliance_ce",
"owasp": "bp_compliance_ce",
"bsi_": "bp_compliance_ce",
"enisa": "bp_compliance_ce",
"at_": "bp_compliance_recht",
"fr_": "bp_compliance_recht",
"es_": "bp_compliance_recht",
}
result: dict[str, str] = {}
for ctrl in controls:
meta = ctrl.get("generation_metadata") or {}
reg = meta.get("source_regulation", "")
if not reg:
continue
for prefix, coll in collection_map.items():
if reg.startswith(prefix):
result[reg] = coll
break
else:
# Unknown regulation — search all
for coll in ALL_COLLECTIONS:
result[f"_all_{coll}"] = coll
return result
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
"""3-tier matching: hash → regex → LLM."""
# Tier 1: Hash match against RAG index
source_text = ctrl.get("source_original_text")
if source_text:
h = hashlib.sha256(source_text.encode()).hexdigest()
chunk = self._rag_index.get(h)
if chunk and (chunk.article or chunk.paragraph):
return MatchResult(
article=chunk.article or "",
paragraph=chunk.paragraph or "",
method="hash",
)
# Tier 2: Regex parse concatenated source
citation = ctrl.get("source_citation") or {}
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed and parsed["article"]:
return MatchResult(
article=parsed["article"],
paragraph="", # Regex can't extract paragraph from concatenated format
method="regex",
)
# Tier 3: Ollama LLM
if source_text:
return await self._llm_match(ctrl)
return None
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Use Ollama to identify article/paragraph from source text."""
citation = ctrl.get("source_citation") or {}
regulation_name = citation.get("source", "")
metadata = ctrl.get("generation_metadata") or {}
regulation_code = metadata.get("source_regulation", "")
source_text = ctrl.get("source_original_text", "")
prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
Gesetz: {regulation_name} (Code: {regulation_code})
Text:
---
{source_text[:2000]}
---
Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}
Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
try:
raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
data = _parse_json(raw)
if data and (data.get("article") or data.get("paragraph")):
return MatchResult(
article=data.get("article", ""),
paragraph=data.get("paragraph", ""),
method="llm",
)
except Exception as e:
logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
return None
def _update_control(self, ctrl: dict, match: MatchResult):
"""Update source_citation and generation_metadata in DB."""
citation = ctrl.get("source_citation") or {}
# Clean the source name: remove concatenated article if present
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed:
citation["source"] = parsed["name"]
# Add separate article/paragraph fields
citation["article"] = match.article
citation["paragraph"] = match.paragraph
# Update generation_metadata
metadata = ctrl.get("generation_metadata") or {}
if match.article:
metadata["source_article"] = match.article
metadata["source_paragraph"] = match.paragraph
metadata["backfill_method"] = match.method
metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
self.db.execute(
text("""
UPDATE canonical_controls
SET source_citation = :citation,
generation_metadata = :metadata,
updated_at = NOW()
WHERE id = CAST(:id AS uuid)
"""),
{
"id": ctrl["id"],
"citation": json.dumps(citation),
"metadata": json.dumps(metadata),
},
)
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.
Also handles '§' format: 'BDSG § 42'{name: 'BDSG', article: '§ 42'}.
"""
if not source:
return None
# Try Art./Artikel pattern
m = _SOURCE_ARTICLE_RE.match(source)
if m:
return {"name": m.group(1).strip(), "article": m.group(2).strip()}
# Try § pattern
m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
if m2:
return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
return None
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call Ollama chat API for backfill matching."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"options": {"num_predict": 256},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
return ""
data = resp.json()
msg = data.get("message", {})
if isinstance(msg, dict):
return msg.get("content", "")
return data.get("response", str(msg))
except Exception as e:
logger.error("Ollama backfill request failed: %s", e)
return ""
def _parse_json(raw: str) -> Optional[dict]:
"""Extract JSON object from LLM output."""
if not raw:
return None
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
# Try finding first { ... }
m = re.search(r"\{[^{}]*\}", raw)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None

View File

@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)
SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
@@ -54,7 +55,6 @@ HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_recht",
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
@@ -312,6 +312,12 @@ CATEGORY_KEYWORDS = {
"hygiene", "infektionsschutz", "pflege"],
}
VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
VERIFICATION_KEYWORDS = {
"code_review": ["source code", "code review", "static analysis", "sast", "dast",
"dependency check", "quellcode", "codeanalyse", "secure coding",
@@ -373,6 +379,7 @@ class GeneratorConfig(BaseModel):
domain: Optional[str] = None
batch_size: int = 5
max_controls: int = 0 # 0 = unlimited (process ALL chunks)
max_chunks: int = 0 # 0 = unlimited; >0 = stop after N chunks (respects document boundaries)
skip_processed: bool = True
skip_web_search: bool = False
dry_run: bool = False
@@ -418,6 +425,7 @@ class GeneratorResult:
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
chunks_skipped_prefilter: int = 0
errors: list = field(default_factory=list)
controls: list = field(default_factory=list)
@@ -713,7 +721,7 @@ class ControlGeneratorPipeline:
async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
"""Scroll through ALL chunks in RAG collections.
Uses the scroll endpoint to iterate over every chunk (not just top-K search).
Uses DIRECT Qdrant scroll API (bypasses Go SDK which has offset cycling bugs).
Filters out already-processed chunks by hash.
"""
collections = config.collections or ALL_COLLECTIONS
@@ -734,80 +742,105 @@ class ControlGeneratorPipeline:
seen_hashes: set[str] = set()
for collection in collections:
offset = None
page = 0
collection_total = 0
collection_new = 0
max_pages = 1000 # Safety limit: 1000 pages × 200 = 200K chunks max per collection
prev_chunk_count = -1 # Track stalls (same count means no progress)
stall_count = 0
qdrant_offset = None # Qdrant uses point ID as offset
while page < max_pages:
chunks, next_offset = await self.rag.scroll(
collection=collection,
offset=offset,
limit=200,
)
while True:
# Direct Qdrant scroll API — bypasses Go SDK offset cycling bug
try:
scroll_body: dict = {
"limit": 250,
"with_payload": True,
"with_vector": False,
}
if qdrant_offset is not None:
scroll_body["offset"] = qdrant_offset
if not chunks:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{QDRANT_URL}/collections/{collection}/points/scroll",
json=scroll_body,
)
if resp.status_code != 200:
logger.error("Qdrant scroll %s failed: %d %s", collection, resp.status_code, resp.text[:200])
break
data = resp.json().get("result", {})
points = data.get("points", [])
next_page_offset = data.get("next_page_offset")
except Exception as e:
logger.error("Qdrant scroll error for %s: %s", collection, e)
break
collection_total += len(chunks)
if not points:
break
for chunk in chunks:
if not chunk.text or len(chunk.text.strip()) < 50:
continue # Skip empty/tiny chunks
collection_total += len(points)
h = hashlib.sha256(chunk.text.encode()).hexdigest()
for point in points:
payload = point.get("payload", {})
# Different collections use different field names for text
chunk_text = (payload.get("chunk_text", "")
or payload.get("content", "")
or payload.get("text", "")
or payload.get("page_content", ""))
if not chunk_text or len(chunk_text.strip()) < 50:
continue
h = hashlib.sha256(chunk_text.encode()).hexdigest()
# Skip duplicates (same text in multiple collections)
if h in seen_hashes:
continue
seen_hashes.add(h)
# Skip already-processed
if h in processed_hashes:
continue
# Convert Qdrant point to RAGSearchResult
# Handle varying payload schemas across collections
reg_code = (payload.get("regulation_id", "")
or payload.get("regulation_code", "")
or payload.get("source_id", "")
or payload.get("source_code", ""))
reg_name = (payload.get("regulation_name_de", "")
or payload.get("regulation_name", "")
or payload.get("source_name", "")
or payload.get("guideline_name", "")
or payload.get("document_title", "")
or payload.get("filename", ""))
reg_short = (payload.get("regulation_short", "")
or reg_code)
chunk = RAGSearchResult(
text=chunk_text,
regulation_code=reg_code,
regulation_name=reg_name,
regulation_short=reg_short,
category=payload.get("category", "") or payload.get("data_type", ""),
article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
paragraph=payload.get("paragraph", ""),
source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
score=0.0,
collection=collection,
)
all_results.append(chunk)
collection_new += 1
page += 1
if page % 50 == 0:
if page % 100 == 0:
logger.info(
"Scrolling %s: page %d, %d total chunks, %d new unprocessed",
"Scrolling %s (direct Qdrant): page %d, %d total chunks, %d new unprocessed",
collection, page, collection_total, collection_new,
)
# Stop conditions
if not next_offset:
break
if next_page_offset is None:
break # Qdrant returns null when no more pages
# Detect stalls: if no NEW unique chunks found for several pages,
# we've likely cycled through all chunks in this collection.
# (Safer than offset dedup which breaks with mixed Qdrant ID types)
if collection_new == prev_chunk_count:
stall_count += 1
if stall_count >= 5:
logger.warning(
"Scroll stalled in %s at page %d — no new unique chunks for 5 pages (%d total, %d new) — stopping",
collection, page, collection_total, collection_new,
)
break
else:
stall_count = 0
prev_chunk_count = collection_new
offset = next_offset
if page >= max_pages:
logger.warning(
"Collection %s: reached max_pages limit (%d). %d chunks scrolled.",
collection, max_pages, collection_total,
)
qdrant_offset = next_page_offset
logger.info(
"Collection %s: %d total chunks scrolled, %d new unprocessed",
"Collection %s: %d total chunks scrolled (direct Qdrant), %d new unprocessed",
collection, collection_total, collection_new,
)
@@ -857,6 +890,11 @@ Gib JSON zurück mit diesen Feldern:
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.
Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
@@ -868,24 +906,29 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
domain = _detect_domain(chunk.text)
control = self._build_control_from_json(data, domain)
llm_article = str(data.get("source_article", "")).strip()
llm_paragraph = str(data.get("source_paragraph", "")).strip()
effective_article = llm_article or chunk.article or ""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 1
control.source_original_text = chunk.text
control.source_citation = {
"source": chunk.regulation_name,
"article": chunk.article or "",
"paragraph": chunk.paragraph or "",
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
"url": chunk.source_url or "",
}
control.customer_visible = True
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
if not control.category:
control.category = _detect_category(chunk.text)
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 1,
"source_regulation": chunk.regulation_code,
"source_article": chunk.article,
"source_paragraph": chunk.paragraph,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
}
return control
@@ -910,6 +953,11 @@ Gib JSON zurück mit diesen Feldern:
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.
Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name}, {chunk.article}"""
@@ -921,25 +969,30 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
domain = _detect_domain(chunk.text)
control = self._build_control_from_json(data, domain)
llm_article = str(data.get("source_article", "")).strip()
llm_paragraph = str(data.get("source_paragraph", "")).strip()
effective_article = llm_article or chunk.article or ""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 2
control.source_original_text = chunk.text
control.source_citation = {
"source": chunk.regulation_name,
"article": chunk.article or "",
"paragraph": chunk.paragraph or "",
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
"license_notice": attribution,
"url": chunk.source_url or "",
}
control.customer_visible = True
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
if not control.category:
control.category = _detect_category(chunk.text)
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 2,
"source_regulation": chunk.regulation_code,
"source_article": chunk.article,
"source_paragraph": chunk.paragraph,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
}
return control
@@ -968,7 +1021,8 @@ Gib JSON zurück mit diesen Feldern:
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)
- domain: Fachgebiet als Kuerzel (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "oeffentlicher_dienst")"""
raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
@@ -982,7 +1036,8 @@ Gib JSON zurück mit diesen Feldern:
control.source_citation = None # NEVER cite source
control.customer_visible = False # Only our formulation
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
if not control.category:
control.category = _detect_category(chunk.text)
# generation_metadata: NO source names, NO original texts
control.generation_metadata = {
"processing_path": "llm_reform",
@@ -1046,7 +1101,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen fuer die dieses Control relevant ist. Moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst"
- source_article: Artikel-/Paragraphen-Referenz aus dem Text extrahieren (z.B. "Artikel 10", "Art. 5", "§ 42", "Section 3"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text extrahieren (z.B. "Absatz 5", "Abs. 3", "Nr. 2", "(1)"). Leer lassen wenn nicht erkennbar.
{joined}"""
@@ -1071,26 +1129,32 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
domain = _detect_domain(chunk.text)
control = self._build_control_from_json(data, domain)
control.license_rule = lic["rule"]
# Use LLM-extracted article/paragraph, fall back to chunk metadata
llm_article = str(data.get("source_article", "")).strip()
llm_paragraph = str(data.get("source_paragraph", "")).strip()
effective_article = llm_article or chunk.article or ""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
if lic["rule"] in (1, 2):
control.source_original_text = chunk.text
control.source_citation = {
"source": chunk.regulation_name,
"article": chunk.article or "",
"paragraph": chunk.paragraph or "",
"article": effective_article,
"paragraph": effective_paragraph,
"license": lic.get("license", ""),
"license_notice": lic.get("attribution", ""),
"url": chunk.source_url or "",
}
control.customer_visible = True
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
if not control.category:
control.category = _detect_category(chunk.text)
same_doc = len(set(c.regulation_code for c in chunks)) == 1
control.generation_metadata = {
"processing_path": "structured_batch",
"license_rule": lic["rule"],
"source_regulation": chunk.regulation_code,
"source_article": chunk.article,
"source_paragraph": chunk.paragraph,
"source_article": effective_article,
"source_paragraph": effective_paragraph,
"batch_size": len(chunks),
"document_grouped": same_doc,
}
@@ -1133,6 +1197,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
{joined}"""
@@ -1159,7 +1224,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
control.source_citation = None
control.customer_visible = False
control.verification_method = _detect_verification_method(chunk.text)
control.category = _detect_category(chunk.text)
if not control.category:
control.category = _detect_category(chunk.text)
control.generation_metadata = {
"processing_path": "llm_reform_batch",
"license_rule": 3,
@@ -1209,6 +1275,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
all_controls[orig_idx] = ctrl
# Post-process all controls: harmonization + anchor search
# NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
# to avoid competing with Ollama prefilter for resources.
qa_fixed_count = 0
final: list[Optional[GeneratedControl]] = []
for i in range(len(batch_items)):
control = all_controls.get(i)
@@ -1245,7 +1314,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
else:
control.release_state = "needs_review"
# Control ID — prefer LLM-assigned domain over keyword detection
# Control ID — prefer QA-corrected or LLM-assigned domain over keyword detection
domain = (control.generation_metadata.get("_effective_domain")
or config.domain
or _detect_domain(control.objective))
@@ -1254,7 +1323,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
final.append(control)
return final
if qa_fixed_count:
logger.info("QA validation: fixed %d/%d controls in batch", qa_fixed_count, len(final))
return final, qa_fixed_count
# ── Stage 4: Harmonization ─────────────────────────────────────────
@@ -1337,11 +1408,15 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
# Use LLM-provided domain if available, fallback to keyword-detected domain
llm_domain = data.get("domain")
valid_domains = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
"AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
if llm_domain and llm_domain.upper() in valid_domains:
if llm_domain and llm_domain.upper() in VALID_DOMAINS:
domain = llm_domain.upper()
# Use LLM-provided category if available
llm_category = data.get("category")
category = None
if llm_category and llm_category in VALID_CATEGORIES:
category = llm_category
# Parse target_audience from LLM response
target_audience = data.get("target_audience")
if isinstance(target_audience, str):
@@ -1362,6 +1437,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
tags=tags[:20],
target_audience=target_audience,
category=category,
)
# Store effective domain for later control_id generation
control.generation_metadata["_effective_domain"] = domain
@@ -1395,6 +1471,79 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
pass
return f"{prefix}-001"
# ── Stage QA: Automated Quality Validation ───────────────────────
async def _qa_validate_control(
self, control: GeneratedControl, chunk_text: str
) -> tuple[GeneratedControl, bool]:
"""Cross-validate category/domain using keyword detection + local LLM.
Returns (control, was_fixed). Only triggers Ollama QA when the LLM
classification disagrees with keyword detection — keeps it fast.
"""
kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
kw_domain = _detect_domain(chunk_text)
llm_domain = control.generation_metadata.get("_effective_domain", "")
# If keyword and LLM agree → no QA needed
if control.category == kw_category and llm_domain == kw_domain:
return control, False
# Disagreement detected → ask local LLM to arbitrate
title = control.title[:100]
objective = control.objective[:200]
reqs = ", ".join(control.requirements[:3]) if control.requirements else "keine"
prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title}
Ziel: {objective}
Anforderungen: {reqs}
Aktuelle Zuordnung: domain={llm_domain}, category={control.category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""
try:
raw = await _llm_local(prompt)
data = _parse_llm_json(raw)
if not data:
return control, False
fixed = False
qa_domain = data.get("domain", "").upper()
qa_category = data.get("category", "")
reason = data.get("reason", "")
if qa_category and qa_category in VALID_CATEGORIES and qa_category != control.category:
old_cat = control.category
control.category = qa_category
control.generation_metadata["qa_category_fix"] = {
"from": old_cat, "to": qa_category, "reason": reason,
}
logger.info("QA fix: '%s' category '%s' -> '%s' (%s)",
title[:40], old_cat, qa_category, reason)
fixed = True
if qa_domain and qa_domain in VALID_DOMAINS and qa_domain != llm_domain:
control.generation_metadata["qa_domain_fix"] = {
"from": llm_domain, "to": qa_domain, "reason": reason,
}
control.generation_metadata["_effective_domain"] = qa_domain
logger.info("QA fix: '%s' domain '%s' -> '%s' (%s)",
title[:40], llm_domain, qa_domain, reason)
fixed = True
return control, fixed
except Exception as e:
logger.warning("QA validation failed for '%s': %s", title[:40], e)
return control, False
# ── Pipeline Orchestration ─────────────────────────────────────────
def _create_job(self, config: GeneratorConfig) -> str:
@@ -1605,10 +1754,28 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
len(chunks), len(doc_groups),
)
# Flatten back: chunks from same document are now adjacent
# ── Apply max_chunks limit respecting document boundaries ──
# Process complete documents until we exceed the limit.
# Never split a document across jobs.
chunks = []
for group_list in doc_groups.values():
chunks.extend(group_list)
if config.max_chunks and config.max_chunks > 0:
for group_key, group_list in doc_groups.items():
if chunks and len(chunks) + len(group_list) > config.max_chunks:
# Adding this document would exceed the limit — stop here
break
chunks.extend(group_list)
logger.info(
"max_chunks=%d: selected %d chunks from %d complete documents (of %d total groups)",
config.max_chunks, len(chunks),
len(set(c.regulation_code for c in chunks)),
len(doc_groups),
)
else:
# No limit: flatten all groups
for group_list in doc_groups.values():
chunks.extend(group_list)
result.total_chunks_scanned = len(chunks)
# Process chunks — batch mode (N chunks per Anthropic API call)
BATCH_SIZE = config.batch_size or 5
@@ -1633,7 +1800,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
len(batch), ", ".join(regs_in_batch),
)
try:
batch_controls = await self._process_batch(batch, config, job_id)
batch_controls, batch_qa_fixes = await self._process_batch(batch, config, job_id)
result.controls_qa_fixed += batch_qa_fixes
except Exception as e:
logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
# Fallback: process each chunk individually
@@ -1785,6 +1953,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
if not control.title or not control.objective:
return None
# NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
# Stage 4: Harmonization
duplicates = await self._check_harmonization(control)
if duplicates:
@@ -1809,8 +1979,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
else:
control.release_state = "needs_review"
# Generate control_id
domain = config.domain or _detect_domain(control.objective)
# Generate control_id — prefer QA-corrected or LLM-assigned domain
domain = (control.generation_metadata.get("_effective_domain")
or config.domain
or _detect_domain(control.objective))
control.control_id = self._generate_control_id(domain, self.db)
# Store job_id in metadata