feat: add policy library with 29 German policy templates
All checks were successful
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 34s
CI/CD / test-python-backend-compliance (push) Successful in 35s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Successful in 2s

Add 29 new document types (IT security, data, personnel, vendor, BCM
policies) to VALID_DOCUMENT_TYPES and 5 category pills to the document
generator UI. Include seed script for production DB population.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-14 22:37:33 +01:00
parent 637fab6fdb
commit 0171d611f6
4 changed files with 1842 additions and 54 deletions

View File

@@ -32,19 +32,26 @@ import {
// Filter pills for the document generator UI. Each pill maps a category key /
// display label to the set of document `types` it matches.
// `types: null` presumably means "no filter" (show everything) — confirm in
// the filtering code that consumes this table.
const CATEGORIES: { key: string; label: string; types: string[] | null }[] = [
  { key: 'all', label: 'Alle', types: null },
  // Legal / contract templates
  { key: 'privacy_policy', label: 'Datenschutz', types: ['privacy_policy'] },
  { key: 'terms', label: 'AGB', types: ['terms_of_service', 'agb', 'clause'] },
  { key: 'impressum', label: 'Impressum', types: ['impressum'] },
  { key: 'dpa', label: 'AVV/DPA', types: ['dpa'] },
  { key: 'nda', label: 'NDA', types: ['nda'] },
  { key: 'sla', label: 'SLA', types: ['sla'] },
  { key: 'acceptable_use', label: 'AUP', types: ['acceptable_use'] },
  { key: 'widerruf', label: 'Widerruf', types: ['widerruf'] },
  { key: 'cookie', label: 'Cookie', types: ['cookie_policy', 'cookie_banner'] },
  { key: 'cloud', label: 'Cloud', types: ['cloud_service_agreement'] },
  { key: 'misc', label: 'Weitere', types: ['community_guidelines', 'copyright_policy', 'data_usage_clause'] },
  { key: 'dsfa', label: 'DSFA', types: ['dsfa'] },
  // Security concepts (migration 051)
  { key: 'security', label: 'Sicherheitskonzepte', types: ['it_security_concept', 'data_protection_concept', 'backup_recovery_concept', 'logging_concept', 'incident_response_plan', 'access_control_concept', 'risk_management_concept'] },
  // Policy library (migration 054) — 29 policy templates across 5 pills
  { key: 'it_security_policies', label: 'IT-Sicherheit Policies', types: ['information_security_policy', 'access_control_policy', 'password_policy', 'encryption_policy', 'logging_policy', 'backup_policy', 'incident_response_policy', 'change_management_policy', 'patch_management_policy', 'asset_management_policy', 'cloud_security_policy', 'devsecops_policy', 'secrets_management_policy', 'vulnerability_management_policy'] },
  { key: 'data_policies', label: 'Daten-Policies', types: ['data_protection_policy', 'data_classification_policy', 'data_retention_policy', 'data_transfer_policy', 'privacy_incident_policy'] },
  // NOTE(review): 'acceptable_use' appears both in its own pill above and in
  // this HR pill, so matching documents show under two pills — confirm intended.
  { key: 'hr_policies', label: 'Personal-Policies', types: ['employee_security_policy', 'security_awareness_policy', 'acceptable_use', 'remote_work_policy', 'offboarding_policy'] },
  { key: 'vendor_policies', label: 'Lieferanten-Policies', types: ['vendor_risk_management_policy', 'third_party_security_policy', 'supplier_security_policy'] },
  { key: 'bcm_policies', label: 'BCM/Notfall', types: ['business_continuity_policy', 'disaster_recovery_policy', 'crisis_management_policy'] },
]
// =============================================================================

View File

@@ -58,6 +58,40 @@ VALID_DOCUMENT_TYPES = {
"incident_response_plan",
"access_control_concept",
"risk_management_concept",
# Policy templates — IT Security (Migration 054)
"information_security_policy",
"access_control_policy",
"password_policy",
"encryption_policy",
"logging_policy",
"backup_policy",
"incident_response_policy",
"change_management_policy",
"patch_management_policy",
"asset_management_policy",
"cloud_security_policy",
"devsecops_policy",
"secrets_management_policy",
"vulnerability_management_policy",
# Policy templates — Data (Migration 054)
"data_protection_policy",
"data_classification_policy",
"data_retention_policy",
"data_transfer_policy",
"privacy_incident_policy",
# Policy templates — Personnel (Migration 054)
"employee_security_policy",
"security_awareness_policy",
"remote_work_policy",
"offboarding_policy",
# Policy templates — Vendor/Supply Chain (Migration 054)
"vendor_risk_management_policy",
"third_party_security_policy",
"supplier_security_policy",
# Policy templates — BCM (Migration 054)
"business_continuity_policy",
"disaster_recovery_policy",
"crisis_management_policy",
}
VALID_STATUSES = {"published", "draft", "archived"}

View File

@@ -47,7 +47,7 @@ ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate
@@ -466,7 +466,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
}
payload = {
"model": ANTHROPIC_MODEL,
"max_tokens": 4096,
"max_tokens": 8192,
"messages": [{"role": "user", "content": prompt}],
}
if system_prompt:
@@ -488,7 +488,7 @@ async def _llm_anthropic(prompt: str, system_prompt: Optional[str] = None) -> st
return content[0].get("text", "")
return ""
except Exception as e:
logger.error("Anthropic request failed: %s", e)
logger.error("Anthropic request failed: %s (type: %s)", e, type(e).__name__)
return ""
@@ -598,6 +598,57 @@ def _parse_llm_json(raw: str) -> dict:
return {}
def _parse_llm_json_array(raw: str) -> list[dict]:
"""Extract a JSON array from LLM response — returns list of dicts."""
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
text = match.group(1) if match else raw
# Try parsing as array directly
try:
parsed = json.loads(text)
if isinstance(parsed, list):
return parsed
if isinstance(parsed, dict):
# Check if it wraps an array (e.g. {"controls": [...]})
for key in ("controls", "results", "items", "data"):
if key in parsed and isinstance(parsed[key], list):
return parsed[key]
return [parsed]
except json.JSONDecodeError:
pass
# Try finding [ ... ] block
bracket_match = re.search(r"\[.*\]", text, re.DOTALL)
if bracket_match:
try:
parsed = json.loads(bracket_match.group(0))
if isinstance(parsed, list):
return parsed
except json.JSONDecodeError:
pass
# Try finding multiple { ... } blocks (LLM sometimes returns separate objects)
objects = []
for obj_match in re.finditer(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", text, re.DOTALL):
try:
obj = json.loads(obj_match.group(0))
if isinstance(obj, dict) and obj.get("title"):
objects.append(obj)
except json.JSONDecodeError:
continue
if objects:
logger.info("Parsed %d individual JSON objects from batch response", len(objects))
return objects
# Fallback: try single object
single = _parse_llm_json(raw)
if single:
logger.info("Batch parse fallback: extracted single object")
else:
logger.warning("Batch parse failed — logging first 500 chars: %s", raw[:500])
return [single] if single else []
# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
@@ -606,11 +657,11 @@ REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe
Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner.
Antworte NUR mit validem JSON."""
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON."""
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
class ControlGeneratorPipeline:
@@ -881,6 +932,241 @@ Gib JSON zurück mit diesen Feldern:
}
return control
# ── Stage 3 BATCH: Multiple chunks in one API call ─────────────────
async def _structure_batch(
    self,
    chunks: list[RAGSearchResult],
    license_infos: list[dict],
) -> list[Optional[GeneratedControl]]:
    """Structure multiple free-use/citation chunks in a single Anthropic call.

    Builds one combined prompt with a "--- CHUNK n ---" section per chunk,
    asks the LLM for a JSON array with exactly one object per chunk, then
    maps each returned object back to its source chunk via the object's
    ``chunk_index`` field (falling back to list position).

    Args:
        chunks: RAG hits whose license permits verbatim reuse (rules 1/2).
        license_infos: license dicts parallel to ``chunks``; keys used here
            are "rule", "name", "license", "attribution".
    Returns:
        A list with the same length as ``chunks``; entries stay ``None``
        where the LLM produced no usable object for that slot.
    """
    chunk_entries = []
    for idx, (chunk, lic) in enumerate(zip(chunks, license_infos)):
        source_name = lic.get("name", chunk.regulation_name)
        # One numbered section per chunk; text capped at 2000 chars to bound prompt size.
        chunk_entries.append(
            f"--- CHUNK {idx + 1} ---\n"
            f"Text: {chunk.text[:2000]}\n"
            f"Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}\n"
            f"Lizenz: {source_name} ({lic.get('license', '')})"
        )
    joined = "\n\n".join(chunk_entries)
    # Prompt is intentionally German (generated controls are German); do not translate.
    prompt = f"""Strukturiere die folgenden {len(chunks)} Gesetzestexte jeweils als eigenstaendiges Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quellen sind jeweils angegeben).
WICHTIG:
- Erstelle fuer JEDEN Chunk ein separates Control mit verstaendlicher, praxisorientierter Formulierung.
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
- chunk_index: 1-basierter Index des Chunks (1, 2, 3, ...)
- title: Kurzer praegnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Saetze)
- rationale: Warum ist das wichtig? (1-2 Saetze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Pruefschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
{joined}"""
    raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
    results = _parse_llm_json_array(raw)
    logger.info("Batch structure: parsed %d results from API response", len(results))
    # Map results back to chunks by chunk_index (or by position if no index).
    controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
    for pos, data in enumerate(results):
        # Prefer the model-reported chunk_index; fall back to list position.
        idx = data.get("chunk_index")
        if idx is not None:
            # NOTE(review): int() raises if the model returns a non-numeric
            # chunk_index — that would abort the whole batch; confirm the
            # caller's exception handling covers this.
            idx = int(idx) - 1  # Convert 1-based (prompt contract) to 0-based
        else:
            idx = pos  # Use position as fallback
        if idx < 0 or idx >= len(chunks):
            logger.warning("Batch: chunk_index %d out of range (0-%d), using position %d", idx, len(chunks)-1, pos)
            idx = min(pos, len(chunks) - 1)
        chunk = chunks[idx]
        lic = license_infos[idx]
        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        control.license_rule = lic["rule"]
        if lic["rule"] in (1, 2):
            # Rules 1/2 permit quoting: keep the original text and a citation.
            control.source_original_text = chunk.text
            control.source_citation = {
                "source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
                "license": lic.get("license", ""),
                "license_notice": lic.get("attribution", ""),
                "url": chunk.source_url or "",
            }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured_batch",
            "license_rule": lic["rule"],
            "source_regulation": chunk.regulation_code,
            "source_article": chunk.article,
            "batch_size": len(chunks),
        }
        # NOTE(review): duplicate chunk_index values silently overwrite the
        # earlier result for the same slot — confirm that is acceptable.
        controls[idx] = control
    return controls
async def _reformulate_batch(
    self,
    chunks: list[RAGSearchResult],
    config: GeneratorConfig,
) -> list[Optional[GeneratedControl]]:
    """Reformulate multiple restricted (license rule 3) chunks in one Anthropic call.

    Unlike ``_structure_batch``, the source text must NOT be copied or cited:
    the prompt instructs the model to paraphrase, and the resulting controls
    carry no source text/citation and are not customer visible.

    Args:
        chunks: RAG hits whose license forbids verbatim reuse (rule 3).
        config: generator settings; ``config.domain`` overrides per-chunk
            domain detection when set.
    Returns:
        A list the same length as ``chunks``; ``None`` where no usable
        object came back for that slot.
    """
    chunk_entries = []
    for idx, chunk in enumerate(chunks):
        domain = config.domain or _detect_domain(chunk.text)
        # Source text capped at 1500 chars; labeled analysis-only in the prompt.
        chunk_entries.append(
            f"--- ASPEKT {idx + 1} ---\n"
            f"Domain: {domain}\n"
            f"Text (nur zur Analyse, NICHT kopieren, NICHT referenzieren):\n{chunk.text[:1500]}"
        )
    joined = "\n\n".join(chunk_entries)
    # Prompt is intentionally German (generated controls are German); do not translate.
    prompt = f"""Analysiere die folgenden {len(chunks)} Pruefaspekte und formuliere fuer JEDEN ein EIGENSTAENDIGES Security Control.
KOPIERE KEINE Saetze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quellen. Keine proprietaeren Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).
WICHTIG:
- Jedes Control muss eigenstaendig und vollstaendig sein — nicht auf andere Controls verweisen.
- Qualitaet ist wichtiger als Geschwindigkeit. Jedes Control muss die gleiche Qualitaet haben wie ein einzeln erstelltes.
Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat diese Felder:
- chunk_index: 1-basierter Index des Aspekts (1, 2, 3, ...)
- title: Kurzer eigenstaendiger Titel (max 100 Zeichen)
- objective: Eigenstaendige Formulierung des Ziels (1-3 Saetze)
- rationale: Eigenstaendige Begruendung (1-2 Saetze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Pruefschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)
{joined}"""
    raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
    results = _parse_llm_json_array(raw)
    logger.info("Batch reform: parsed %d results from API response", len(results))
    # Map results back to their chunk slot (same scheme as _structure_batch).
    controls: list[Optional[GeneratedControl]] = [None] * len(chunks)
    for pos, data in enumerate(results):
        idx = data.get("chunk_index")
        if idx is not None:
            idx = int(idx) - 1  # 1-based prompt contract → 0-based slot
        else:
            idx = pos  # fall back to position in the response array
        if idx < 0 or idx >= len(chunks):
            logger.warning("Batch reform: chunk_index %d out of range, using position %d", idx, pos)
            idx = min(pos, len(chunks) - 1)
        chunk = chunks[idx]
        domain = config.domain or _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
        # Rule 3: paraphrase only — never retain or cite the source text.
        control.license_rule = 3
        control.source_original_text = None
        control.source_citation = None
        control.customer_visible = False
        control.verification_method = _detect_verification_method(chunk.text)
        control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "llm_reform_batch",
            "license_rule": 3,
            "batch_size": len(chunks),
        }
        controls[idx] = control
    return controls
async def _process_batch(
    self,
    batch_items: list[tuple[RAGSearchResult, dict]],
    config: GeneratorConfig,
    job_id: str,
) -> list[Optional[GeneratedControl]]:
    """Process a batch of (chunk, license_info) through stages 3-5.

    Stage 3 runs in two sub-batches by license rule (structure vs. reform),
    stage 4 is the harmonization/duplicate check, stage 5 the anchor search
    plus release-state and control-ID assignment.

    Returns:
        One entry per input item, in the original ``batch_items`` order;
        ``None`` where no control could be generated.
    """
    # Split by license rule: Rule 1+2 → structure, Rule 3 → reform
    structure_items = [(c, l) for c, l in batch_items if l["rule"] in (1, 2)]
    reform_items = [(c, l) for c, l in batch_items if l["rule"] == 3]
    # Collected results keyed by the item's index in batch_items.
    all_controls: dict[int, Optional[GeneratedControl]] = {}
    if structure_items:
        s_chunks = [c for c, _ in structure_items]
        s_lics = [l for _, l in structure_items]
        s_controls = await self._structure_batch(s_chunks, s_lics)
        for (chunk, _), ctrl in zip(structure_items, s_controls):
            # Recover the item's original position by chunk identity.
            # NOTE(review): linear scan per item → O(n²) over the batch;
            # fine for small batches (default 5), revisit if sizes grow.
            orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
            all_controls[orig_idx] = ctrl
    if reform_items:
        r_chunks = [c for c, _ in reform_items]
        r_controls = await self._reformulate_batch(r_chunks, config)
        for (chunk, _), ctrl in zip(reform_items, r_controls):
            orig_idx = next(i for i, (c, _) in enumerate(batch_items) if c is chunk)
            if ctrl:
                # Too-Close-Check for Rule 3: reformulated text must not
                # resemble the source too closely.
                similarity = await check_similarity(chunk.text, f"{ctrl.objective} {ctrl.rationale}")
                if similarity.status == "FAIL":
                    ctrl.release_state = "too_close"
                    ctrl.generation_metadata["similarity_status"] = "FAIL"
                    ctrl.generation_metadata["similarity_scores"] = {
                        "token_overlap": similarity.token_overlap,
                        "ngram_jaccard": similarity.ngram_jaccard,
                        "lcs_ratio": similarity.lcs_ratio,
                    }
            all_controls[orig_idx] = ctrl
    # Post-process all controls: harmonization + anchor search
    final: list[Optional[GeneratedControl]] = []
    for i in range(len(batch_items)):
        control = all_controls.get(i)
        # Drop missing results and empty shells (no title AND no objective).
        if not control or (not control.title and not control.objective):
            final.append(None)
            continue
        # Too-close controls skip post-processing; state already final.
        if control.release_state == "too_close":
            final.append(control)
            continue
        # Harmonization: flag as duplicate when similar controls exist.
        duplicates = await self._check_harmonization(control)
        if duplicates:
            control.release_state = "duplicate"
            control.generation_metadata["similar_controls"] = duplicates
            final.append(control)
            continue
        # Anchor search — best-effort; failures only logged, never fatal.
        try:
            from .anchor_finder import AnchorFinder
            finder = AnchorFinder(self.rag)
            anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
            control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
        except Exception as e:
            logger.warning("Anchor search failed: %s", e)
        # Release state: rule 1/2 or anchored → draft, otherwise needs review.
        if control.license_rule in (1, 2):
            control.release_state = "draft"
        elif control.open_anchors:
            control.release_state = "draft"
        else:
            control.release_state = "needs_review"
        # Assign the control ID from the (configured or detected) domain.
        domain = config.domain or _detect_domain(control.objective)
        control.control_id = self._generate_control_id(domain, self.db)
        control.generation_metadata["job_id"] = job_id
        final.append(control)
    return final
# ── Stage 4: Harmonization ─────────────────────────────────────────
async def _check_harmonization(self, new_control: GeneratedControl) -> Optional[list]:
@@ -1168,6 +1454,7 @@ Gib JSON zurück mit diesen Feldern:
self.db.commit()
except Exception as e:
logger.warning("Failed to mark chunk processed: %s", e)
self.db.rollback()
# ── Main Pipeline ──────────────────────────────────────────────────
@@ -1192,9 +1479,71 @@ Gib JSON zurück mit diesen Feldern:
self._update_job(job_id, result)
return result
# Process chunks
# Process chunks — batch mode (N chunks per Anthropic API call)
BATCH_SIZE = config.batch_size or 5
controls_count = 0
chunks_skipped_prefilter = 0
pending_batch: list[tuple[RAGSearchResult, dict]] = [] # (chunk, license_info)
async def _flush_batch():
    """Send the pending batch to Anthropic and fold results into `result`.

    Closure over the enclosing pipeline run: reads/clears `pending_batch`,
    updates `result` counters, stores controls (unless dry run) and marks
    chunks processed. Falls back to per-chunk processing if the batch call
    fails as a whole.
    """
    nonlocal controls_count
    if not pending_batch:
        return
    # Snapshot and clear so new chunks can accumulate during processing.
    batch = pending_batch.copy()
    pending_batch.clear()
    logger.info("Processing batch of %d chunks via single API call...", len(batch))
    try:
        batch_controls = await self._process_batch(batch, config, job_id)
    except Exception as e:
        logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
        # Fallback: process each chunk individually
        batch_controls = []
        for chunk, _lic in batch:
            try:
                ctrl = await self._process_single_chunk(chunk, config, job_id)
                batch_controls.append(ctrl)
            except Exception as e2:
                logger.error("Single-chunk fallback also failed: %s", e2)
                batch_controls.append(None)
    for (chunk, lic_info), control in zip(batch, batch_controls):
        if control is None:
            # No control produced — still mark processed so we don't retry.
            if not config.dry_run:
                self._mark_chunk_processed(chunk, lic_info, "no_control", [], job_id)
            continue
        # Count by state
        if control.release_state == "too_close":
            result.controls_too_close += 1
        elif control.release_state == "duplicate":
            result.controls_duplicates_found += 1
        elif control.release_state == "needs_review":
            result.controls_needs_review += 1
        else:
            result.controls_verified += 1
        # Store (skipped in dry run; counters below still update either way)
        if not config.dry_run:
            ctrl_uuid = self._store_control(control, job_id)
            if ctrl_uuid:
                path = control.generation_metadata.get("processing_path", "structured_batch")
                self._mark_chunk_processed(chunk, lic_info, path, [ctrl_uuid], job_id)
            else:
                # Store failed — mark processed anyway to avoid endless retries.
                self._mark_chunk_processed(chunk, lic_info, "store_failed", [], job_id)
        result.controls_generated += 1
        result.controls.append(asdict(control))
        controls_count += 1
        # Feed fresh controls into the harmonization corpus for later chunks.
        if self._existing_controls is not None:
            self._existing_controls.append({
                "control_id": control.control_id,
                "title": control.title,
                "objective": control.objective,
            })
for i, chunk in enumerate(chunks):
try:
# Progress logging every 50 chunks
@@ -1210,65 +1559,24 @@ Gib JSON zurück mit diesen Feldern:
is_relevant, prefilter_reason = await _prefilter_chunk(chunk.text)
if not is_relevant:
chunks_skipped_prefilter += 1
# Mark as processed so we don't re-check next time
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "prefilter_skip", [], job_id
)
continue
control = await self._process_single_chunk(chunk, config, job_id)
if control is None:
# No control generated — still mark as processed
if not config.dry_run:
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "no_control", [], job_id
)
continue
# Classify license and add to batch
license_info = self._classify_license(chunk)
pending_batch.append((chunk, license_info))
# Count by state
if control.release_state == "too_close":
result.controls_too_close += 1
elif control.release_state == "duplicate":
result.controls_duplicates_found += 1
elif control.release_state == "needs_review":
result.controls_needs_review += 1
else:
result.controls_verified += 1
# Store (unless dry run)
if not config.dry_run:
ctrl_uuid = self._store_control(control, job_id)
if ctrl_uuid:
# Stage 7: Mark chunk processed
license_info = self._classify_license(chunk)
path = "llm_reform" if license_info["rule"] == 3 else "structured"
self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
else:
# Store failed — still mark as processed
license_info = self._classify_license(chunk)
self._mark_chunk_processed(
chunk, license_info, "store_failed", [], job_id
)
result.controls_generated += 1
result.controls.append(asdict(control))
controls_count += 1
# Add to existing controls for harmonization of next chunks
if self._existing_controls is not None:
self._existing_controls.append({
"control_id": control.control_id,
"title": control.title,
"objective": control.objective,
})
# Flush when batch is full
if len(pending_batch) >= BATCH_SIZE:
await _flush_batch()
except Exception as e:
error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
# Mark failed chunks as processed too (so we don't retry endlessly)
try:
if not config.dry_run:
license_info = self._classify_license(chunk)
@@ -1278,6 +1586,9 @@ Gib JSON zurück mit diesen Feldern:
except Exception:
pass
# Flush remaining chunks
await _flush_batch()
result.chunks_skipped_prefilter = chunks_skipped_prefilter
logger.info(
"Pipeline complete: %d controls generated, %d chunks skipped by prefilter, %d total chunks",

File diff suppressed because it is too large Load Diff