feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion once all checkpoints are passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
@@ -22,6 +22,7 @@ Endpoints:

 from __future__ import annotations

+import json
 import logging
 from typing import Any, Optional

@@ -277,8 +278,8 @@ async def list_framework_controls(
         query += " AND category = :cat"
         params["cat"] = category
     if target_audience:
-        query += " AND target_audience = :ta"
-        params["ta"] = target_audience
+        query += " AND target_audience::jsonb @> (:ta)::jsonb"
+        params["ta"] = json.dumps([target_audience])

     query += " ORDER BY control_id"
     rows = db.execute(text(query), params).fetchall()

@@ -329,8 +330,8 @@ async def list_controls(
         query += " AND category = :cat"
         params["cat"] = category
     if target_audience:
-        query += " AND target_audience = :ta"
-        params["ta"] = target_audience
+        query += " AND target_audience LIKE :ta_pattern"
+        params["ta_pattern"] = f'%"{target_audience}"%'
     if source:
         if source == "__none__":
             query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"

@@ -398,8 +399,8 @@ async def count_controls(
         query += " AND category = :cat"
         params["cat"] = category
     if target_audience:
-        query += " AND target_audience = :ta"
-        params["ta"] = target_audience
+        query += " AND target_audience LIKE :ta_pattern"
+        params["ta_pattern"] = f'%"{target_audience}"%'
     if source:
         if source == "__none__":
             query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
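Both variants above filter on an audience stored as a JSON array: where the column is JSONB, the containment operator does the matching; where it is plain text, a quoted LIKE pattern approximates it. A small illustration of what each side compares (values hypothetical):

import json

audience = "entwickler"
row_value = json.dumps(["entwickler", "unternehmen"])  # column content: '["entwickler", "unternehmen"]'

# JSONB containment: target_audience::jsonb @> '["entwickler"]'::jsonb
containment_param = json.dumps([audience])             # -> '["entwickler"]'

# LIKE fallback on a text column: match the quoted element inside the serialized array
like_pattern = f'%"{audience}"%'                       # -> '%"entwickler"%'
assert f'"{audience}"' in row_value                    # the substring test LIKE performs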
@@ -26,6 +26,13 @@ from compliance.services.control_generator import (
     ControlGeneratorPipeline,
     GeneratorConfig,
     ALL_COLLECTIONS,
+    VALID_CATEGORIES,
+    VALID_DOMAINS,
+    _detect_category,
+    _detect_domain,
+    _llm_local,
+    _parse_llm_json,
+    CATEGORY_LIST_STR,
 )
 from compliance.services.citation_backfill import CitationBackfill, BackfillResult
 from compliance.services.rag_client import get_rag_client

@@ -42,6 +49,7 @@ class GenerateRequest(BaseModel):
     domain: Optional[str] = None
     collections: Optional[List[str]] = None
     max_controls: int = 50
+    max_chunks: int = 1000  # Default: process max 1000 chunks per job (respects document boundaries)
     batch_size: int = 5
     skip_web_search: bool = False
     dry_run: bool = False

@@ -57,6 +65,7 @@ class GenerateResponse(BaseModel):
     controls_needs_review: int = 0
     controls_too_close: int = 0
     controls_duplicates_found: int = 0
+    controls_qa_fixed: int = 0
     errors: list = []
     controls: list = []

@@ -132,6 +141,7 @@ async def start_generation(req: GenerateRequest):
         domain=req.domain,
         batch_size=req.batch_size,
         max_controls=req.max_controls,
+        max_chunks=req.max_chunks,
         skip_web_search=req.skip_web_search,
         dry_run=req.dry_run,
     )

@@ -338,6 +348,188 @@ async def review_control(control_id: str, req: ReviewRequest):
        db.close()


class BulkReviewRequest(BaseModel):
    release_state: str  # Filter: which controls to bulk-review
    action: str  # "approve" or "reject"
    new_state: Optional[str] = None  # Override target state


@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
    """Bulk review all controls matching a release_state filter.

    Example: reject all needs_review → sets them to deprecated.
    """
    if req.release_state not in ("needs_review", "too_close", "duplicate"):
        raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")

    if req.action == "approve":
        target = req.new_state or "draft"
    elif req.action == "reject":
        target = "deprecated"
    else:
        raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")

    if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
        raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")

    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :target, updated_at = NOW()
                WHERE release_state = :source
                RETURNING control_id
            """),
            {"source": req.release_state, "target": target},
        )
        affected = [row[0] for row in result]
        db.commit()

        return {
            "action": req.action,
            "source_state": req.release_state,
            "target_state": target,
            "affected_count": len(affected),
        }
    finally:
        db.close()

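# Hedged usage sketch (not part of the diff): the host and route prefix depend
# on where this router is mounted, so the URL below is an assumption.
#
#   import httpx
#   resp = httpx.post(
#       "http://localhost:8000/generate/bulk-review",
#       json={"release_state": "needs_review", "action": "reject"},
#   )
#   resp.json()  # -> {"action": "reject", "source_state": "needs_review",
#                #     "target_state": "deprecated", "affected_count": <n>}
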
class QAReclassifyRequest(BaseModel):
    limit: int = 100  # How many controls to reclassify per run
    dry_run: bool = True  # Preview only by default
    filter_category: Optional[str] = None  # Only reclassify controls of this category
    filter_domain_prefix: Optional[str] = None  # Only reclassify controls with this prefix


@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
    """Run QA reclassification on existing controls using local LLM.

    Finds controls where keyword-detection disagrees with current category/domain,
    then uses Ollama to determine the correct classification.
    """
    db = SessionLocal()
    try:
        # Load controls to check
        where_clauses = ["release_state NOT IN ('deprecated')"]
        params = {"limit": req.limit}
        if req.filter_category:
            where_clauses.append("category = :cat")
            params["cat"] = req.filter_category
        if req.filter_domain_prefix:
            where_clauses.append("control_id LIKE :prefix")
            params["prefix"] = f"{req.filter_domain_prefix}-%"

        where_sql = " AND ".join(where_clauses)
        rows = db.execute(
            text(f"""
                SELECT id, control_id, title, objective, category,
                       COALESCE(requirements::text, '[]') as requirements,
                       COALESCE(source_original_text, '') as source_text
                FROM canonical_controls
                WHERE {where_sql}
                ORDER BY created_at DESC
                LIMIT :limit
            """),
            params,
        ).fetchall()

        results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}

        for row in rows:
            results["checked"] += 1
            control_id = row[1]
            title = row[2]
            objective = row[3] or ""
            current_category = row[4]
            source_text = row[6] or objective

            # Keyword detection on source text
            kw_category = _detect_category(source_text) or _detect_category(objective)
            kw_domain = _detect_domain(source_text)
            current_prefix = control_id.split("-")[0] if "-" in control_id else ""

            # Skip if keyword detection agrees with current classification
            if kw_category == current_category and kw_domain == current_prefix:
                continue

            results["mismatches"] += 1

            # Ask Ollama to arbitrate
            try:
                reqs_text = ""
                try:
                    reqs = json.loads(row[5])
                    if isinstance(reqs, list):
                        reqs_text = ", ".join(str(r) for r in reqs[:3])
                except Exception:
                    pass

                prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.

Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}

Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}

Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}

Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

                raw = await _llm_local(prompt)
                data = _parse_llm_json(raw)
                if not data:
                    continue

                qa_domain = data.get("domain", "").upper()
                qa_category = data.get("category", "")
                reason = data.get("reason", "")

                fix_entry = {
                    "control_id": control_id,
                    "title": title[:80],
                    "old_category": current_category,
                    "old_domain": current_prefix,
                    "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
                    "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
                    "reason": reason,
                }

                category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category

                if category_changed and not req.dry_run:
                    db.execute(
                        text("""
                            UPDATE canonical_controls
                            SET category = :category, updated_at = NOW()
                            WHERE id = :id
                        """),
                        {"id": row[0], "category": qa_category},
                    )
                    fix_entry["applied"] = True
                else:
                    fix_entry["applied"] = False

                results["fixes"].append(fix_entry)

            except Exception as e:
                results["errors"].append({"control_id": control_id, "error": str(e)})

        if not req.dry_run:
            db.commit()

        return results
    finally:
        db.close()


@router.get("/generate/processed-stats")
async def get_processed_stats():
    """Get processing statistics per collection."""

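Because dry_run defaults to True, the natural workflow for the qa-reclassify endpoint is preview-then-apply. A sketch (base URL assumed, as before; the category name is illustrative):

import httpx

BASE = "http://localhost:8000"  # assumption: router mounted at the app root

# 1) Preview: report mismatches without writing anything
preview = httpx.post(f"{BASE}/generate/qa-reclassify", json={"limit": 50}).json()
print(preview["mismatches"], len(preview["fixes"]))

# 2) Apply: restrict to one category and persist the LLM-arbitrated fixes
applied = httpx.post(
    f"{BASE}/generate/qa-reclassify",
    json={"limit": 50, "dry_run": False, "filter_category": "datenschutz"},
).json()
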
@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])
|
||||
|
||||
ALL_COLLECTIONS = [
|
||||
"bp_compliance_ce", # BSI-TR documents — primary Prüfaspekte source
|
||||
"bp_compliance_recht", # Legal texts (GDPR, AI Act, ...)
|
||||
"bp_compliance_gesetze", # German laws
|
||||
"bp_compliance_datenschutz", # Data protection documents
|
||||
"bp_dsfa_corpus", # DSFA corpus
|
||||
|
||||
437  backend-compliance/compliance/services/citation_backfill.py  Normal file
@@ -0,0 +1,437 @@
"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.

3-tier matching strategy:
    Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
    Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
    Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""

import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session

from .rag_client import ComplianceRAGClient, RAGSearchResult

logger = logging.getLogger(__name__)

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
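# Illustrative splits performed by the pattern above:
#   "DSGVO Art. 35"           -> name "DSGVO",  article "Art. 35"
#   "NIS2 Artikel 21 Abs. 2"  -> name "NIS2",   article "Artikel 21 Abs. 2"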

@dataclass
class MatchResult:
    article: str
    paragraph: str
    method: str  # "hash", "regex", "llm"


@dataclass
class BackfillResult:
    total_controls: int = 0
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
    updated: int = 0
    errors: list = field(default_factory=list)


class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations."""

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Collect hashes we need to find — only build index for controls with source text
        needed_hashes: set[str] = set()
        for ctrl in controls:
            src = ctrl.get("source_original_text")
            if src:
                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())

        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    if match.method == "hash":
                        result.matched_hash += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key."""
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        if limit > 0:
            query += f" LIMIT {limit}"

        result = self.db.execute(text(query))
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build RAG index by scrolling only collections relevant to our controls.

        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed.
        """
        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)

        logger.info("Targeted index: searching %d collections: %s",
                    len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            offset = None
            page = 0
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    if chunk.text and len(chunk.text.strip()) >= 50:
                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset

            logger.info("Indexed collection %s: %d pages", collection, page)

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections."""
        # Heuristic: regulation code prefix → collection
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — search all
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM."""

        # Tier 1: Hash match against RAG index
        source_text = ctrl.get("source_original_text")
        if source_text:
            h = hashlib.sha256(source_text.encode()).hexdigest()
            chunk = self._rag_index.get(h)
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM
        if source_text:
            return await self._llm_match(ctrl)

        return None

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text."""
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Update source_citation and generation_metadata in DB."""
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata
        metadata = ctrl.get("generation_metadata") or {}
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
        metadata["backfill_method"] = match.method
        metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )


def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.

    Also handles '§' format: 'BDSG § 42' → {name: 'BDSG', article: '§ 42'}.
    """
    if not source:
        return None

    # Try Art./Artikel pattern
    m = _SOURCE_ARTICLE_RE.match(source)
    if m:
        return {"name": m.group(1).strip(), "article": m.group(2).strip()}

    # Try § pattern
    m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
    if m2:
        return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}

    return None


async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API for backfill matching."""
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            if isinstance(msg, dict):
                return msg.get("content", "")
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""


def _parse_json(raw: str) -> Optional[dict]:
    """Extract JSON object from LLM output."""
    if not raw:
        return None
    # Try direct parse
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    # Try extracting from markdown code block
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass
    # Try finding first { ... }
    m = re.search(r"\{[^{}]*\}", raw)
    if m:
        try:
            return json.loads(m.group(0))
        except json.JSONDecodeError:
            pass
    return None
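To show how the pieces of the new service fit together, a minimal driver sketch — the session factory import is an assumption; everything else is used exactly as defined above:

import asyncio

from compliance.db import SessionLocal  # assumed location of the session factory
from compliance.services.citation_backfill import CitationBackfill
from compliance.services.rag_client import get_rag_client

async def main() -> None:
    db = SessionLocal()
    try:
        backfill = CitationBackfill(db, get_rag_client())
        result = await backfill.run(dry_run=True, limit=100)  # preview pass first
        print(f"hash={result.matched_hash} regex={result.matched_regex} "
              f"llm={result.matched_llm} unmatched={result.unmatched}")
    finally:
        db.close()

asyncio.run(main())
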
@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)

SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")

@@ -54,7 +55,6 @@ HARMONIZATION_THRESHOLD = 0.85 # Cosine similarity above this = duplicate

ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_recht",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",

@@ -312,6 +312,12 @@ CATEGORY_KEYWORDS = {
                "hygiene", "infektionsschutz", "pflege"],
 }

+VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
+VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
+                 "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
+
+CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
+
 VERIFICATION_KEYWORDS = {
     "code_review": ["source code", "code review", "static analysis", "sast", "dast",
                     "dependency check", "quellcode", "codeanalyse", "secure coding",

@@ -373,6 +379,7 @@ class GeneratorConfig(BaseModel):
     domain: Optional[str] = None
     batch_size: int = 5
     max_controls: int = 0  # 0 = unlimited (process ALL chunks)
+    max_chunks: int = 0  # 0 = unlimited; >0 = stop after N chunks (respects document boundaries)
     skip_processed: bool = True
     skip_web_search: bool = False
     dry_run: bool = False

@@ -418,6 +425,7 @@ class GeneratorResult:
     controls_needs_review: int = 0
     controls_too_close: int = 0
     controls_duplicates_found: int = 0
+    controls_qa_fixed: int = 0
     chunks_skipped_prefilter: int = 0
     errors: list = field(default_factory=list)
     controls: list = field(default_factory=list)

@@ -713,7 +721,7 @@ class ControlGeneratorPipeline:
     async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
         """Scroll through ALL chunks in RAG collections.

-        Uses the scroll endpoint to iterate over every chunk (not just top-K search).
+        Uses DIRECT Qdrant scroll API (bypasses Go SDK which has offset cycling bugs).
         Filters out already-processed chunks by hash.
         """
         collections = config.collections or ALL_COLLECTIONS

@@ -734,80 +742,105 @@ class ControlGeneratorPipeline:
         seen_hashes: set[str] = set()

         for collection in collections:
-            offset = None
             page = 0
             collection_total = 0
             collection_new = 0
-            max_pages = 1000  # Safety limit: 1000 pages × 200 = 200K chunks max per collection
-            prev_chunk_count = -1  # Track stalls (same count means no progress)
-            stall_count = 0
+            qdrant_offset = None  # Qdrant uses point ID as offset

-            while page < max_pages:
-                chunks, next_offset = await self.rag.scroll(
-                    collection=collection,
-                    offset=offset,
-                    limit=200,
-                )
-                if not chunks:
-                    break
-                collection_total += len(chunks)
-
-                for chunk in chunks:
-                    if not chunk.text or len(chunk.text.strip()) < 50:
-                        continue  # Skip empty/tiny chunks
-
-                    h = hashlib.sha256(chunk.text.encode()).hexdigest()
+            while True:
+                # Direct Qdrant scroll API — bypasses Go SDK offset cycling bug
+                try:
+                    scroll_body: dict = {
+                        "limit": 250,
+                        "with_payload": True,
+                        "with_vector": False,
+                    }
+                    if qdrant_offset is not None:
+                        scroll_body["offset"] = qdrant_offset
+
+                    async with httpx.AsyncClient(timeout=30.0) as client:
+                        resp = await client.post(
+                            f"{QDRANT_URL}/collections/{collection}/points/scroll",
+                            json=scroll_body,
+                        )
+                        if resp.status_code != 200:
+                            logger.error("Qdrant scroll %s failed: %d %s", collection, resp.status_code, resp.text[:200])
+                            break
+                        data = resp.json().get("result", {})
+                        points = data.get("points", [])
+                        next_page_offset = data.get("next_page_offset")
+                except Exception as e:
+                    logger.error("Qdrant scroll error for %s: %s", collection, e)
+                    break
+
+                if not points:
+                    break
+
+                collection_total += len(points)
+
+                for point in points:
+                    payload = point.get("payload", {})
+                    # Different collections use different field names for text
+                    chunk_text = (payload.get("chunk_text", "")
+                                  or payload.get("content", "")
+                                  or payload.get("text", "")
+                                  or payload.get("page_content", ""))
+                    if not chunk_text or len(chunk_text.strip()) < 50:
+                        continue
+
+                    h = hashlib.sha256(chunk_text.encode()).hexdigest()

                     # Skip duplicates (same text in multiple collections)
                     if h in seen_hashes:
                         continue
                     seen_hashes.add(h)

                     # Skip already-processed
                     if h in processed_hashes:
                         continue

+                    # Convert Qdrant point to RAGSearchResult
+                    # Handle varying payload schemas across collections
+                    reg_code = (payload.get("regulation_id", "")
+                                or payload.get("regulation_code", "")
+                                or payload.get("source_id", "")
+                                or payload.get("source_code", ""))
+                    reg_name = (payload.get("regulation_name_de", "")
+                                or payload.get("regulation_name", "")
+                                or payload.get("source_name", "")
+                                or payload.get("guideline_name", "")
+                                or payload.get("document_title", "")
+                                or payload.get("filename", ""))
+                    reg_short = (payload.get("regulation_short", "")
+                                 or reg_code)
+                    chunk = RAGSearchResult(
+                        text=chunk_text,
+                        regulation_code=reg_code,
+                        regulation_name=reg_name,
+                        regulation_short=reg_short,
+                        category=payload.get("category", "") or payload.get("data_type", ""),
+                        article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
+                        paragraph=payload.get("paragraph", ""),
+                        source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
+                        score=0.0,
+                        collection=collection,
+                    )
                     all_results.append(chunk)
                     collection_new += 1

                 page += 1
-                if page % 50 == 0:
+                if page % 100 == 0:
                     logger.info(
-                        "Scrolling %s: page %d, %d total chunks, %d new unprocessed",
+                        "Scrolling %s (direct Qdrant): page %d, %d total chunks, %d new unprocessed",
                         collection, page, collection_total, collection_new,
                     )

                 # Stop conditions
-                if not next_offset:
-                    break
+                if next_page_offset is None:
+                    break  # Qdrant returns null when no more pages

-                # Detect stalls: if no NEW unique chunks found for several pages,
-                # we've likely cycled through all chunks in this collection.
-                # (Safer than offset dedup which breaks with mixed Qdrant ID types)
-                if collection_new == prev_chunk_count:
-                    stall_count += 1
-                    if stall_count >= 5:
-                        logger.warning(
-                            "Scroll stalled in %s at page %d — no new unique chunks for 5 pages (%d total, %d new) — stopping",
-                            collection, page, collection_total, collection_new,
-                        )
-                        break
-                else:
-                    stall_count = 0
-                prev_chunk_count = collection_new
-
-                offset = next_offset
-
-            if page >= max_pages:
-                logger.warning(
-                    "Collection %s: reached max_pages limit (%d). %d chunks scrolled.",
-                    collection, max_pages, collection_total,
-                )
+                qdrant_offset = next_page_offset

             logger.info(
-                "Collection %s: %d total chunks scrolled, %d new unprocessed",
+                "Collection %s: %d total chunks scrolled (direct Qdrant), %d new unprocessed",
                 collection, collection_total, collection_new,
             )

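Distilled from the new code path above: the whole pagination contract of Qdrant's scroll endpoint is the next_page_offset cursor, which is null on the last page. A standalone sketch (synchronous; URL and collection name are placeholders):

import httpx

def scroll_all(qdrant_url: str, collection: str) -> list[dict]:
    """Fetch every point of a collection via the REST scroll API."""
    points: list[dict] = []
    offset = None
    while True:
        body: dict = {"limit": 250, "with_payload": True, "with_vector": False}
        if offset is not None:
            body["offset"] = offset
        resp = httpx.post(f"{qdrant_url}/collections/{collection}/points/scroll", json=body)
        resp.raise_for_status()
        result = resp.json()["result"]
        points.extend(result.get("points", []))
        offset = result.get("next_page_offset")
        if offset is None:  # null next_page_offset marks the final page
            return points
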
@@ -857,6 +890,11 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
+- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
+- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
+- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

 Text: {chunk.text[:2000]}
 Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""

@@ -868,24 +906,29 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""

         domain = _detect_domain(chunk.text)
         control = self._build_control_from_json(data, domain)
+        llm_article = str(data.get("source_article", "")).strip()
+        llm_paragraph = str(data.get("source_paragraph", "")).strip()
+        effective_article = llm_article or chunk.article or ""
+        effective_paragraph = llm_paragraph or chunk.paragraph or ""
         control.license_rule = 1
         control.source_original_text = chunk.text
         control.source_citation = {
             "source": chunk.regulation_name,
-            "article": chunk.article or "",
-            "paragraph": chunk.paragraph or "",
+            "article": effective_article,
+            "paragraph": effective_paragraph,
             "license": license_info.get("license", ""),
             "url": chunk.source_url or "",
         }
         control.customer_visible = True
         control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
         control.generation_metadata = {
             "processing_path": "structured",
             "license_rule": 1,
             "source_regulation": chunk.regulation_code,
-            "source_article": chunk.article,
-            "source_paragraph": chunk.paragraph,
+            "source_article": effective_article,
+            "source_paragraph": effective_paragraph,
         }
         return control

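A quick restatement of the fallback chain introduced above, which the remaining processing paths repeat verbatim (values hypothetical):

# LLM extraction present -> it wins; absent -> chunk metadata; both absent -> ""
assert ("Art. 10" or "Art. 32" or "") == "Art. 10"
assert (""        or "Art. 32" or "") == "Art. 32"
assert (""        or ""        or "") == ""
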
@@ -910,6 +953,11 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
+- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
+- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
+- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

 Text: {chunk.text[:2000]}
 Quelle: {chunk.regulation_name}, {chunk.article}"""

@@ -921,25 +969,30 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""

         domain = _detect_domain(chunk.text)
         control = self._build_control_from_json(data, domain)
+        llm_article = str(data.get("source_article", "")).strip()
+        llm_paragraph = str(data.get("source_paragraph", "")).strip()
+        effective_article = llm_article or chunk.article or ""
+        effective_paragraph = llm_paragraph or chunk.paragraph or ""
         control.license_rule = 2
         control.source_original_text = chunk.text
         control.source_citation = {
             "source": chunk.regulation_name,
-            "article": chunk.article or "",
-            "paragraph": chunk.paragraph or "",
+            "article": effective_article,
+            "paragraph": effective_paragraph,
             "license": license_info.get("license", ""),
             "license_notice": attribution,
             "url": chunk.source_url or "",
         }
         control.customer_visible = True
         control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
         control.generation_metadata = {
             "processing_path": "structured",
             "license_rule": 2,
             "source_regulation": chunk.regulation_code,
-            "source_article": chunk.article,
-            "source_paragraph": chunk.paragraph,
+            "source_article": effective_article,
+            "source_paragraph": effective_paragraph,
         }
         return control

@@ -968,7 +1021,8 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags (eigene Begriffe)
-- domain: Fachgebiet als Kuerzel (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
 - target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "oeffentlicher_dienst")"""

         raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)

@@ -982,7 +1036,8 @@ Gib JSON zurück mit diesen Feldern:
         control.source_citation = None  # NEVER cite source
         control.customer_visible = False  # Only our formulation
         control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
         # generation_metadata: NO source names, NO original texts
         control.generation_metadata = {
             "processing_path": "llm_reform",

@@ -1046,7 +1101,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen fuer die dieses Control relevant ist. Moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst"
- source_article: Artikel-/Paragraphen-Referenz aus dem Text extrahieren (z.B. "Artikel 10", "Art. 5", "§ 42", "Section 3"). Leer lassen wenn nicht erkennbar.
- source_paragraph: Absatz-Referenz aus dem Text extrahieren (z.B. "Absatz 5", "Abs. 3", "Nr. 2", "(1)"). Leer lassen wenn nicht erkennbar.

{joined}"""

@@ -1071,26 +1129,32 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
             domain = _detect_domain(chunk.text)
             control = self._build_control_from_json(data, domain)
             control.license_rule = lic["rule"]
+            # Use LLM-extracted article/paragraph, fall back to chunk metadata
+            llm_article = str(data.get("source_article", "")).strip()
+            llm_paragraph = str(data.get("source_paragraph", "")).strip()
+            effective_article = llm_article or chunk.article or ""
+            effective_paragraph = llm_paragraph or chunk.paragraph or ""
             if lic["rule"] in (1, 2):
                 control.source_original_text = chunk.text
                 control.source_citation = {
                     "source": chunk.regulation_name,
-                    "article": chunk.article or "",
-                    "paragraph": chunk.paragraph or "",
+                    "article": effective_article,
+                    "paragraph": effective_paragraph,
                     "license": lic.get("license", ""),
                     "license_notice": lic.get("attribution", ""),
                     "url": chunk.source_url or "",
                 }
                 control.customer_visible = True
             control.verification_method = _detect_verification_method(chunk.text)
-            control.category = _detect_category(chunk.text)
+            if not control.category:
+                control.category = _detect_category(chunk.text)
             same_doc = len(set(c.regulation_code for c in chunks)) == 1
             control.generation_metadata = {
                 "processing_path": "structured_batch",
                 "license_rule": lic["rule"],
                 "source_regulation": chunk.regulation_code,
-                "source_article": chunk.article,
-                "source_paragraph": chunk.paragraph,
+                "source_article": effective_article,
+                "source_paragraph": effective_paragraph,
                 "batch_size": len(chunks),
                 "document_grouped": same_doc,
             }

@@ -1133,6 +1197,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
 - severity: low/medium/high/critical
 - tags: Liste von Tags (eigene Begriffe)
 - domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
 - target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")

 {joined}"""

@@ -1159,7 +1224,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
             control.source_citation = None
             control.customer_visible = False
             control.verification_method = _detect_verification_method(chunk.text)
-            control.category = _detect_category(chunk.text)
+            if not control.category:
+                control.category = _detect_category(chunk.text)
             control.generation_metadata = {
                 "processing_path": "llm_reform_batch",
                 "license_rule": 3,

@@ -1209,6 +1275,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                 all_controls[orig_idx] = ctrl

         # Post-process all controls: harmonization + anchor search
+        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
+        # to avoid competing with Ollama prefilter for resources.
+        qa_fixed_count = 0
         final: list[Optional[GeneratedControl]] = []
         for i in range(len(batch_items)):
             control = all_controls.get(i)

@@ -1245,7 +1314,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
             else:
                 control.release_state = "needs_review"

-            # Control ID — prefer LLM-assigned domain over keyword detection
+            # Control ID — prefer QA-corrected or LLM-assigned domain over keyword detection
             domain = (control.generation_metadata.get("_effective_domain")
                       or config.domain
                       or _detect_domain(control.objective))

@@ -1254,7 +1323,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di

             final.append(control)

-        return final
+        if qa_fixed_count:
+            logger.info("QA validation: fixed %d/%d controls in batch", qa_fixed_count, len(final))
+        return final, qa_fixed_count

     # ── Stage 4: Harmonization ─────────────────────────────────────────

@@ -1337,11 +1408,15 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di

         # Use LLM-provided domain if available, fallback to keyword-detected domain
         llm_domain = data.get("domain")
-        valid_domains = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
-                         "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
-        if llm_domain and llm_domain.upper() in valid_domains:
+        if llm_domain and llm_domain.upper() in VALID_DOMAINS:
             domain = llm_domain.upper()

+        # Use LLM-provided category if available
+        llm_category = data.get("category")
+        category = None
+        if llm_category and llm_category in VALID_CATEGORIES:
+            category = llm_category
+
         # Parse target_audience from LLM response
         target_audience = data.get("target_audience")
         if isinstance(target_audience, str):

@@ -1362,6 +1437,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
             implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
             tags=tags[:20],
             target_audience=target_audience,
+            category=category,
         )
         # Store effective domain for later control_id generation
         control.generation_metadata["_effective_domain"] = domain

@@ -1395,6 +1471,79 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                pass
        return f"{prefix}-001"

    # ── Stage QA: Automated Quality Validation ───────────────────────

    async def _qa_validate_control(
        self, control: GeneratedControl, chunk_text: str
    ) -> tuple[GeneratedControl, bool]:
        """Cross-validate category/domain using keyword detection + local LLM.

        Returns (control, was_fixed). Only triggers Ollama QA when the LLM
        classification disagrees with keyword detection — keeps it fast.
        """
        kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
        kw_domain = _detect_domain(chunk_text)
        llm_domain = control.generation_metadata.get("_effective_domain", "")

        # If keyword and LLM agree → no QA needed
        if control.category == kw_category and llm_domain == kw_domain:
            return control, False

        # Disagreement detected → ask local LLM to arbitrate
        title = control.title[:100]
        objective = control.objective[:200]
        reqs = ", ".join(control.requirements[:3]) if control.requirements else "keine"
        prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.

Titel: {title}
Ziel: {objective}
Anforderungen: {reqs}

Aktuelle Zuordnung: domain={llm_domain}, category={control.category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}

Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}

Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""

        try:
            raw = await _llm_local(prompt)
            data = _parse_llm_json(raw)
            if not data:
                return control, False

            fixed = False
            qa_domain = data.get("domain", "").upper()
            qa_category = data.get("category", "")
            reason = data.get("reason", "")

            if qa_category and qa_category in VALID_CATEGORIES and qa_category != control.category:
                old_cat = control.category
                control.category = qa_category
                control.generation_metadata["qa_category_fix"] = {
                    "from": old_cat, "to": qa_category, "reason": reason,
                }
                logger.info("QA fix: '%s' category '%s' -> '%s' (%s)",
                            title[:40], old_cat, qa_category, reason)
                fixed = True

            if qa_domain and qa_domain in VALID_DOMAINS and qa_domain != llm_domain:
                control.generation_metadata["qa_domain_fix"] = {
                    "from": llm_domain, "to": qa_domain, "reason": reason,
                }
                control.generation_metadata["_effective_domain"] = qa_domain
                logger.info("QA fix: '%s' domain '%s' -> '%s' (%s)",
                            title[:40], llm_domain, qa_domain, reason)
                fixed = True

            return control, fixed

        except Exception as e:
            logger.warning("QA validation failed for '%s': %s", title[:40], e)
            return control, False

    # ── Pipeline Orchestration ─────────────────────────────────────────

    def _create_job(self, config: GeneratorConfig) -> str:

@@ -1605,10 +1754,28 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
             len(chunks), len(doc_groups),
         )

-        # Flatten back: chunks from same document are now adjacent
+        # ── Apply max_chunks limit respecting document boundaries ──
+        # Process complete documents until we exceed the limit.
+        # Never split a document across jobs.
         chunks = []
-        for group_list in doc_groups.values():
-            chunks.extend(group_list)
+        if config.max_chunks and config.max_chunks > 0:
+            for group_key, group_list in doc_groups.items():
+                if chunks and len(chunks) + len(group_list) > config.max_chunks:
+                    # Adding this document would exceed the limit — stop here
+                    break
+                chunks.extend(group_list)
+            logger.info(
+                "max_chunks=%d: selected %d chunks from %d complete documents (of %d total groups)",
+                config.max_chunks, len(chunks),
+                len(set(c.regulation_code for c in chunks)),
+                len(doc_groups),
+            )
+        else:
+            # No limit: flatten all groups
+            for group_list in doc_groups.values():
+                chunks.extend(group_list)

         result.total_chunks_scanned = len(chunks)

         # Process chunks — batch mode (N chunks per Anthropic API call)
         BATCH_SIZE = config.batch_size or 5

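The boundary rule above in isolation — doc_groups maps a document key to its chunks; the first document is always taken, and a document that would push the total past max_chunks is deferred whole to the next job:

def select_chunks(doc_groups: dict[str, list], max_chunks: int) -> list:
    """Reduced restatement of the selection loop above."""
    selected: list = []
    for group in doc_groups.values():
        if selected and len(selected) + len(group) > max_chunks:
            break  # never split a document across jobs
        selected.extend(group)
    return selected

# With max_chunks=4, doc "b" (3 chunks) would overflow 2+3 > 4, so only doc "a" is taken:
assert select_chunks({"a": [1, 2], "b": [3, 4, 5]}, 4) == [1, 2]
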
@@ -1633,7 +1800,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                 len(batch), ", ".join(regs_in_batch),
             )
             try:
-                batch_controls = await self._process_batch(batch, config, job_id)
+                batch_controls, batch_qa_fixes = await self._process_batch(batch, config, job_id)
+                result.controls_qa_fixed += batch_qa_fixes
             except Exception as e:
                 logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
                 # Fallback: process each chunk individually

@@ -1785,6 +1953,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
         if not control.title or not control.objective:
             return None

+        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
+
         # Stage 4: Harmonization
         duplicates = await self._check_harmonization(control)
         if duplicates:

@@ -1809,8 +1979,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
         else:
             control.release_state = "needs_review"

-        # Generate control_id
-        domain = config.domain or _detect_domain(control.objective)
+        # Generate control_id — prefer QA-corrected or LLM-assigned domain
+        domain = (control.generation_metadata.get("_effective_domain")
+                  or config.domain
+                  or _detect_domain(control.objective))
         control.control_id = self._generate_control_id(domain, self.db)

         # Store job_id in metadata

292 backend-compliance/migrations/059_wiki_cra_annex_i_detail.sql Normal file
@@ -0,0 +1,292 @@
-- Migration 059: CRA Annex I — Detaillierte Essential Cybersecurity Requirements
-- Erweitert den bestehenden Wiki-Artikel 'cra-security-controls' um Part 1 + Part 2,
-- Produktklassifizierung und ISO 27001 Mapping.
-- Zusaetzlich: Neuer Artikel fuer CRA-Produktklassifizierung und Konformitaetsbewertung.

-- ============================================================================
-- 1) Update: CRA Security Controls (Annex I) — Vollstaendige 8-Kategorien-Struktur
-- ============================================================================
UPDATE compliance_wiki_articles
SET
    title = 'CRA Annex I — Essential Cybersecurity Requirements (Vollstaendig)',
    summary = 'Annex I des CRA definiert die wesentlichen Cybersicherheitsanforderungen in zwei Teilen: Teil 1 (Produktsicherheit, 11 Anforderungen) und Teil 2 (Schwachstellenbehandlung, 8 Anforderungen). Daraus ergeben sich 40 konkrete Security-Controls in 8 Kategorien.',
    content = '## Ueberblick

Der **EU Cyber Resilience Act (CRA)**, Verordnung (EU) 2024/2847, legt in **Annex I** die **Essential Cybersecurity Requirements** fest, die alle Produkte mit digitalen Elementen erfuellen muessen. Annex I besteht aus zwei Teilen:

- **Teil 1 — Sicherheitsanforderungen an Produkte** (11 Kernanforderungen)
- **Teil 2 — Anforderungen an die Schwachstellenbehandlung** (8 Prozessanforderungen)

Daraus lassen sich **40 konkrete Security-Controls** in **8 thematischen Kategorien** ableiten. Diese Controls bilden die Grundlage fuer eine Cybersecurity-Compliance-Strategie.

---

## Teil 1: Sicherheitsanforderungen an Produkte

### Kategorie 1 — Secure-by-Design und Architektur

Diese Controls stellen sicher, dass Sicherheit von Anfang an in die Produktarchitektur integriert wird.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 1 | **Secure-by-Default-Konfiguration** | Annex I, 1(1) | Produkte muessen mit sicheren Standardeinstellungen ausgeliefert werden. Keine offenen Ports, keine aktivierten Debug-Schnittstellen, keine unnoetig laufenden Dienste. | A.8.9 |
| 2 | **Minimale Angriffsflaeche** | Annex I, 1(2) | Nur notwendige Schnittstellen, Dienste und Protokolle aktivieren. Jede zusaetzliche Funktionalitaet vergroessert die Angriffsflaeche und muss einzeln gerechtfertigt werden. | A.8.9, A.8.20 |
| 3 | **Sichere Systemarchitektur** | Annex I, 1(3) | Sicherheitskritische Komponenten muessen isoliert werden (Sandboxing, Containerisierung, Privilege Separation). Defense-in-Depth-Prinzip anwenden. | A.8.27 |
| 4 | **Least-Privilege-Prinzip** | Annex I, 1(3)(d) | Jede Komponente, jeder Prozess und jeder Benutzer erhaelt nur die minimal notwendigen Berechtigungen. Privilegien-Eskalation muss verhindert werden. | A.8.2, A.8.3 |
| 5 | **Manipulationsschutz** | Annex I, 1(3)(c) | Schutz vor unautorisierter Aenderung von Software und Konfiguration durch Integritaetsmechanismen (Code Signing, Secure Boot, TPM). | A.8.24 |
| 6 | **Integritaetspruefung** | Annex I, 1(3)(c) | Automatische Ueberpruefung der Integritaet von Software, Firmware und Konfigurationsdaten bei Start und Laufzeit. Hash-basierte Validierung und digitale Signaturen. | A.8.24 |

### Kategorie 2 — Authentifizierung und Zugriffskontrolle

Controls zur Sicherstellung, dass nur autorisierte Personen und Systeme Zugriff erhalten.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 7 | **Starke Authentifizierung** | Annex I, 1(3)(d) | Implementierung sicherer Authentifizierungsmechanismen. Multi-Faktor-Authentifizierung fuer administrative Zugriffe. Unterstuetzung moderner Standards (FIDO2, WebAuthn). | A.8.5 |
| 8 | **Keine Default-Passwoerter** | Annex I, 1(3)(d) | Produkte duerfen keine universellen Standardpasswoerter verwenden. Jedes Geraet muss ein individuelles Passwort erhalten oder den Benutzer zur Aenderung bei Ersteinrichtung zwingen. | A.8.5 |
| 9 | **Sicheres Credential-Management** | Annex I, 1(3)(d) | Zugangsdaten muessen verschluesselt gespeichert werden (bcrypt, Argon2id). Keine Klartextspeicherung. API-Keys und Tokens regelmaessig rotieren. | A.8.5 |
| 10 | **Sitzungsmanagement** | Annex I, 1(3)(d) | Sichere Session-Verwaltung mit Timeout, Token-Binding und Session-Invalidierung bei Logout oder Passwortwechsel. CSRF-Schutz implementieren. | A.8.5 |
| 11 | **Brute-Force-Schutz** | Annex I, 1(3)(d) | Schutz vor Brute-Force- und Credential-Stuffing-Angriffen durch Rate Limiting, Account Lockout und CAPTCHA-Mechanismen. | A.8.5, A.8.16 |
| 12 | **Rollenbasierte Autorisierung** | Annex I, 1(3)(d) | Implementierung von RBAC (Role-Based Access Control). Trennung von administrativen und Nutzerfunktionen. Prinzip der geringsten Privilegien durchsetzen. | A.8.2, A.8.3 |

### Kategorie 3 — Kryptografie und Datenschutz

Controls zum Schutz von Daten durch kryptografische Verfahren.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 13 | **Verschluesselung sensibler Daten** | Annex I, 1(3)(e) | Alle sensiblen Daten muessen verschluesselt werden — sowohl bei der Speicherung (at rest, AES-256) als auch bei der Uebertragung (in transit, TLS 1.2+). | A.8.24 |
| 14 | **Speicher-Schutz (Data at Rest)** | Annex I, 1(3)(e) | Verschluesselung gespeicherter Daten auf Festplatten, in Datenbanken und Backups. Schluessel getrennt von Daten speichern. | A.8.24 |
| 15 | **Transport-Schutz (Data in Transit)** | Annex I, 1(3)(e) | Alle Netzwerkkommunikation ueber TLS 1.2 oder hoeher. Veraltete Protokolle (SSL, TLS 1.0/1.1) deaktivieren. Certificate Pinning fuer kritische Verbindungen. | A.8.24 |
| 16 | **Sicheres Schluesselmanagement** | Annex I, 1(3)(e) | Kryptografische Schluessel in HSM oder Vault speichern. Regelmaessige Rotation (mind. jaehrlich). Dokumentation der Schluessel-Lebenszyklen. Sofortige Rotation bei Kompromittierungsverdacht. | A.8.24 |
| 17 | **Datenminimierung** | Annex I, 1(3)(f) | Nur Daten erfassen und verarbeiten, die fuer die Produktfunktion erforderlich sind. Personenbezogene Daten gemaess DSGVO-Grundsaetzen behandeln. | A.8.10, A.8.11 |

### Kategorie 4 — Secure Software Development Lifecycle

Controls fuer sichere Softwareentwicklung ueber den gesamten Lebenszyklus.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 18 | **Strukturierter SSDLC** | Annex I, 1(1) | Implementierung eines formalen Secure Software Development Lifecycle mit definierten Security Gates in jeder Phase (Requirements, Design, Implementation, Test, Release). | A.8.25, A.8.26 |
| 19 | **Systematische Code Reviews** | Annex I, 1(1) | Peer Reviews mit Security-Fokus fuer jeden Code-Commit. Einsatz von Checklisten fuer OWASP Top 10 und CWE Top 25. Security Champions in jedem Entwicklerteam. | A.8.25 |
| 20 | **Automatisierte Sicherheitstests** | Annex I, 1(1) | Static Application Security Testing (SAST), Dynamic Application Security Testing (DAST) und Software Composition Analysis (SCA) in der CI/CD-Pipeline. Secrets Detection fuer eingebettete Zugangsdaten. | A.8.25 |
| 21 | **Supply-Chain-Security** | Annex I, 1(5) | Systematische Pruefung aller Drittanbieter-Komponenten auf Schwachstellen und Lizenz-Compliance. Vertrauenswuerdigkeit von Lieferanten bewerten. | A.5.19, A.5.21 |
| 22 | **Dependency-Monitoring** | Annex I, 1(5) | Kontinuierliche Ueberwachung aller Abhaengigkeiten auf bekannte Schwachstellen (CVE). Automatische Benachrichtigung bei neuen CVEs in verwendeten Bibliotheken. | A.8.8, A.8.25 |
| 23 | **Software Bill of Materials (SBOM)** | Annex I, 1(5) | Fuer jedes Produkt ein maschinenlesbares SBOM fuehren (CycloneDX oder SPDX). Mindestens Top-Level-Abhaengigkeiten mit Name, Version und Lizenz dokumentieren. SBOM bei jedem Release aktualisieren. | A.8.25 |

### Kategorie 5 — Logging, Monitoring und Anomalie-Erkennung

Controls zur Erkennung und Nachverfolgung von Sicherheitsereignissen.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 24 | **Security-Logging** | Annex I, 1(3)(g) | Protokollierung aller sicherheitsrelevanten Ereignisse: Login-Versuche, Berechtigungsaenderungen, administrative Aktionen, API-Zugriffe, Fehler und Ausnahmen. Logs muessen Zeitstempel, Akteur, Aktion und Ergebnis enthalten. | A.8.15 |
| 25 | **Ereignis-Monitoring** | Annex I, 1(3)(g) | Zentrale Sammlung und Echtzeit-Ueberwachung sicherheitsrelevanter Events. Einsatz eines SIEM-Systems oder vergleichbarer Loesung. Korrelation von Events aus verschiedenen Quellen. | A.8.16 |
| 26 | **Anomalie-Erkennung** | Annex I, 1(3)(g) | Automatische Erkennung von Angriffsmustern und ungewoehnlichem Verhalten. Alarmierung bei Abweichungen von Baseline-Verhalten. Integration von Threat Intelligence Feeds. | A.8.16 |
| 27 | **Log-Integritaet und -Aufbewahrung** | Annex I, 1(3)(g) | Logs muessen manipulationssicher gespeichert werden (append-only, signiert oder WORM). Aufbewahrung mindestens 12 Monate. Zugriff auf Logs nur fuer autorisiertes Security-Personal. | A.8.15 |

### Kategorie 6 — Update- und Patch-Management

Controls fuer die sichere Bereitstellung und Installation von Updates.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 28 | **Sichere Update-Mechanismen** | Annex I, 1(4) | Updates muessen ueber sichere Kanaele verteilt werden (HTTPS, signierte Pakete). Automatische oder einfach zugaengliche Update-Moeglichkeit fuer Endnutzer. Rollback-Faehigkeit bei fehlerhaften Updates. | A.8.8, A.8.19 |
| 29 | **Update-Authentizitaet** | Annex I, 1(4) | Alle Updates muessen digital signiert sein. Signaturpruefung vor Installation erzwingen. Verwendung vertrauenswuerdiger Signaturschluessel mit dokumentierter Key Ceremony. | A.8.24 |
| 30 | **Update-Integritaet** | Annex I, 1(4) | Integritaetspruefung jedes Update-Pakets vor und nach Installation (Hash-Vergleich, Signatur-Verifikation). Manipulation waehrend der Uebertragung erkennen und ablehnen. | A.8.24 |
| 31 | **Lifecycle-Support** | Annex I, 1(4) | Security-Updates waehrend des gesamten erwarteten Produktlebenszyklus bereitstellen — mindestens **5 Jahre** ab Inverkehrbringen oder die erwartete Nutzungsdauer, je nachdem welcher Zeitraum laenger ist. End-of-Life klar kommunizieren. | A.8.8 |

---

## Teil 2: Anforderungen an die Schwachstellenbehandlung

### Kategorie 7 — Vulnerability Management

Controls fuer die systematische Identifikation, Bewertung und Behebung von Schwachstellen.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 32 | **Schwachstellen-Identifikation** | Annex I, 2(1) | Kontinuierliches CVE-Monitoring aller eingesetzten Komponenten. Regelmaessige Vulnerability Scans (woechentlich automatisiert). Bug-Bounty-Programme oder Responsible-Disclosure-Kanaele einrichten. | A.8.8 |
| 33 | **SBOM-Pflege und Analyse** | Annex I, 2(1) | SBOM aktuell halten und kontinuierlich gegen CVE-Datenbanken pruefen. Automatische Alarmierung bei neu entdeckten Schwachstellen in verwendeten Komponenten. | A.8.8, A.8.25 |
| 34 | **Risikobasierte Priorisierung** | Annex I, 2(2) | Schwachstellen nach CVSS-Score und tatsaechlichem Risiko priorisieren. Reaktionszeiten nach Schweregrad: Kritisch (24–72h), Hoch (7 Tage), Mittel (30 Tage), Niedrig (naechster Zyklus). | A.8.8 |
| 35 | **Coordinated Vulnerability Disclosure** | Annex I, 2(5) | Veroeffentlichung einer CVD-Policy mit klarem Meldeprozess. Kontaktadresse fuer Sicherheitsforscher bereitstellen. Eingangsbestaetigung innerhalb von 5 Werktagen. Koordinierte Veroeffentlichung nach Patch-Verfuegbarkeit. | A.5.5, A.5.6 |

### Kategorie 8 — Incident Response und Meldepflichten

Controls fuer die Erkennung, Behandlung und Meldung von Sicherheitsvorfaellen.

| # | Control | CRA-Referenz | Beschreibung | ISO 27001 Mapping |
|---|---------|-------------|-------------|-------------------|
| 36 | **Incident-Response-Prozess** | Annex I, 2(5) | Dokumentierter Prozess mit definierten Phasen: Detection → Classification → Containment → Investigation → Recovery → Reporting → Lessons Learned. Regelmaessige Uebungen (Tabletop Exercises). | A.5.24, A.5.25, A.5.26 |
| 37 | **Fruehwarnung (24h)** | Annex I, 2(7) + Art. 14(2)(a) | Bei aktiv ausgenutzten Schwachstellen oder schweren Vorfaellen: Fruehwarnung an ENISA und/oder zustaendige nationale Behoerde innerhalb von **24 Stunden** nach Kenntniserlangung. | A.5.24, A.5.26 |
| 38 | **Detaillierter Vorfallsbericht (72h)** | Annex I, 2(7) + Art. 14(2)(b) | Innerhalb von **72 Stunden**: Detaillierter Bericht mit Umfang, Auswirkung, Ursachenanalyse und eingeleiteten Gegenmassnahmen. Bei personenbezogenen Daten zusaetzlich Art. 33/34 DSGVO beachten. | A.5.24, A.5.26 |
| 39 | **Patch-Bereitstellung** | Annex I, 2(3) | Patches fuer gemeldete und bestaetigte Schwachstellen so schnell wie moeglich bereitstellen. Sicherheitshinweise (Security Advisories) an Kunden veroeffentlichen. CSAF-Format fuer maschinenlesbare Advisories empfohlen. | A.8.8 |
| 40 | **Dokumentation und Nachbereitung** | Annex I, 2(6) | Alle Schwachstellen und Vorfaelle lueckenlos dokumentieren und fuer mindestens 10 Jahre aufbewahren. Lessons-Learned-Prozess nach jedem bedeutenden Vorfall. Ergebnisse in Risikobewertung einfliessen lassen. | A.5.27 |

---

## Produktklassifizierung nach CRA

Der CRA unterscheidet vier Produktkategorien mit unterschiedlichen Konformitaetsanforderungen:

### Standardprodukte (Default)

**Beispiele:** einfache Apps, Desktop-Software, Spiele, Foto-Editoren

- **Konformitaetsbewertung:** Selbstbewertung (Modul A)
- **Anforderungen:** Alle Annex-I-Anforderungen, aber einfachster Nachweis
- **Betrifft:** ca. 90% aller Produkte

### Wichtige Produkte (Annex III) — Klasse I

**Beispiele:** Passwort-Manager, VPN-Software, Firewalls, Router, Smart-Home-Systeme, IoT-Geraete mit Sensorfunktion, SIEM-Systeme

- **Konformitaetsbewertung:** Harmonisierte Standards oder Drittanbieter-Bewertung
- **Anforderungen:** Alle Annex-I-Anforderungen + erhoehte Nachweispflichten
- **Betrifft:** ca. 8% aller Produkte

### Wichtige Produkte — Klasse II

**Beispiele:** Betriebssysteme, Hypervisoren, Container-Runtimes, Public-Key-Infrastruktur, industrielle Steuerungssysteme (ICS/SCADA)

- **Konformitaetsbewertung:** Verpflichtende Drittanbieter-Bewertung durch benannte Stelle
- **Anforderungen:** Alle Annex-I-Anforderungen + strengste Nachweispflichten
- **Betrifft:** ca. 2% aller Produkte

### Kritische Produkte (Annex IV)

**Beispiele:** Hardware-Security-Module (HSM), Smartcard-Chips, Secure Elements, Smart-Meter-Gateways

- **Konformitaetsbewertung:** Europaeisches Cybersicherheitszertifikat erforderlich (EUCC)
- **Anforderungen:** Hoechste Stufe — europaeische Zertifizierung obligatorisch

---

## Zuordnung der Controls zu Dokumenten

Diese 40 Controls koennen automatisiert zu folgenden Compliance-Dokumenten fuehren:

| Dokument | Controls | Beschreibung |
|----------|----------|-------------|
| **Cybersecurity Policy** | 1–40 | Uebergreifendes Grundsatzdokument fuer Cybersicherheit |
| **Secure Development Policy** | 18–23 | Richtlinie fuer den sicheren Entwicklungsprozess (SSDLC) |
| **Vulnerability Management Policy** | 32–35, 39 | CVD, Patching, SBOM-Analyse |
| **Incident Response Plan** | 36–38, 40 | 24h/72h Meldung, Eskalation, Nachbereitung |
| **Access Control Policy** | 7–12 | Authentifizierung, Autorisierung, Passwort-Richtlinie |
| **Cryptographic Policy** | 13–17 | Verschluesselung, Schluesselmanagement, Datenschutz |
| **Update/Patch Policy** | 28–31 | Update-Mechanismen, Signierung, Lifecycle-Support |
| **Logging & Monitoring Policy** | 24–27 | Security-Logging, SIEM, Anomalie-Erkennung |

---

## Zeitplan fuer die Umsetzung

| Datum | Meilenstein |
|-------|------------|
| 10.12.2024 | CRA in Kraft getreten |
| 11.06.2026 | Konformitaetsbewertungsstellen muessen benannt sein |
| 11.09.2026 | **Meldepflichten aktiv** (Controls 37, 38) |
| 11.12.2027 | **Volle Anwendung** — alle 40 Controls muessen umgesetzt sein, CE-Kennzeichnung erforderlich |

---

## Sanktionen bei Nicht-Einhaltung

| Verstoss | Maximales Bussgeld |
|----------|-------------------|
| Wesentliche Anforderungen (Annex I) | 15 Mio. EUR oder 2,5% des weltweiten Jahresumsatzes |
| Sonstige Pflichten | 10 Mio. EUR oder 2% des weltweiten Jahresumsatzes |
| Falsche/unvollstaendige Informationen | 5 Mio. EUR oder 1% des weltweiten Jahresumsatzes |',
    legal_refs = ARRAY['Annex I CRA', 'Annex III CRA', 'Annex IV CRA', 'Art. 13 CRA', 'Art. 14 CRA', 'Art. 15 CRA', 'Art. 64 CRA', '(EU) 2024/2847'],
    tags = ARRAY['security-controls', 'annex-i', 'secure-by-design', 'authentifizierung', 'kryptografie', 'sbom', 'vulnerability', 'patching', 'incident-response', 'produktklassifizierung', 'iso-27001', 'ssdlc'],
    relevance = 'critical',
    updated_at = NOW()
WHERE id = 'cra-security-controls';

-- ============================================================================
-- 2) Neuer Artikel: CRA-Konformitaetsbewertung — Praktischer Leitfaden
-- ============================================================================
INSERT INTO compliance_wiki_articles (id, category_id, title, summary, content, legal_refs, tags, relevance, source_urls) VALUES
('cra-konformitaet', 'cra',
'CRA-Konformitaetsbewertung — Praktischer Leitfaden',
'Schritt-fuer-Schritt-Anleitung zur CRA-Konformitaetsbewertung: Produktklassifizierung, Dokumentation, Self-Assessment vs. Drittanbieter-Pruefung, CE-Kennzeichnung.',
'## Ueberblick

Jeder Hersteller muss vor dem Inverkehrbringen eine **Konformitaetsbewertung** durchfuehren, um nachzuweisen, dass sein Produkt die Essential Cybersecurity Requirements (Annex I) erfuellt. Der Aufwand haengt von der Produktkategorie ab.

## Schritt 1: Produkt klassifizieren

Bestimmen Sie, ob Ihr Produkt unter eine der Sonderkategorien faellt:

### Entscheidungsbaum

```
Ist das Produkt in Annex IV gelistet?
  → Ja: Kritisches Produkt → Europaeische Zertifizierung (EUCC)
  → Nein: Weiter

Ist das Produkt in Annex III, Klasse II gelistet?
  → Ja: Wichtig Klasse II → Drittanbieter-Bewertung (Pflicht)
  → Nein: Weiter

Ist das Produkt in Annex III, Klasse I gelistet?
  → Ja: Wichtig Klasse I → Harmonisierte Standards ODER Drittanbieter
  → Nein: Standardprodukt → Selbstbewertung (Modul A)
```

## Schritt 2: Cybersecurity-Risikobewertung

Fuehren Sie eine systematische Risikoanalyse durch:

1. **Assets identifizieren** — Welche Daten verarbeitet das Produkt? Welche Schnittstellen hat es?
2. **Bedrohungen analysieren** — STRIDE-Methodik oder vergleichbar anwenden
3. **Schwachstellen bewerten** — Bekannte CVEs, Design-Schwaechen, Konfigurationsfehler
4. **Risiken priorisieren** — Eintrittswahrscheinlichkeit × Auswirkung
5. **Massnahmen definieren** — Welche Controls aus Annex I adressieren welches Risiko?

## Schritt 3: Controls implementieren

Setzen Sie die relevanten Controls aus den 8 Kategorien um (siehe Artikel „CRA Annex I — Essential Cybersecurity Requirements“). Dokumentieren Sie fuer jedes Control:

- **Status**: Implementiert / In Bearbeitung / Nicht anwendbar
- **Nachweis**: Wie wird die Umsetzung belegt? (Code, Konfiguration, Test, Policy)
- **Verantwortlich**: Wer ist zustaendig?

## Schritt 4: Technische Dokumentation

Die technische Dokumentation muss enthalten:

- Beschreibung des Produkts und seiner Funktionen
- Cybersecurity-Risikobewertung
- Angewandte harmonisierte Normen
- Nachweis der Einhaltung jeder Annex-I-Anforderung
- SBOM (Software Bill of Materials)
- Informationen zum Support-Zeitraum

## Schritt 5: Konformitaetserklaerung und CE-Kennzeichnung

Nach erfolgreicher Bewertung:

1. **EU-Konformitaetserklaerung** ausstellen
2. **CE-Kennzeichnung** anbringen
3. **Dokumentation** mindestens 10 Jahre aufbewahren
4. Produkt darf in der EU vertrieben werden

## Haeufige Fehler

| Fehler | Konsequenz |
|--------|-----------|
| Default-Passwoerter nicht entfernt | Verstoss gegen Annex I, 1(3)(d) |
| Kein SBOM erstellt | Verstoss gegen Annex I, 1(5) |
| Kein Update-Mechanismus | Verstoss gegen Annex I, 1(4) |
| Keine CVD-Policy | Verstoss gegen Annex I, 2(5) |
| Support-Zeitraum nicht definiert | Verstoss gegen Art. 13(8) |

## Empfehlung

Nutzen Sie die **BreakPilot Compliance SDK Control Library**, um den Umsetzungsstand Ihrer CRA-Controls systematisch zu tracken und automatisiert Nachweise zu generieren.',
ARRAY['Annex I CRA', 'Annex II CRA', 'Annex III CRA', 'Annex IV CRA', 'Annex V CRA', 'Art. 13 CRA', 'Art. 24 CRA', 'Art. 25 CRA', 'Art. 26 CRA', 'Art. 27 CRA'],
ARRAY['konformitaet', 'ce-kennzeichnung', 'self-assessment', 'technische-dokumentation', 'sbom', 'risikobewertung'],
'important',
ARRAY['https://eur-lex.europa.eu/eli/reg/2024/2847/oj/eng'])
ON CONFLICT (id) DO NOTHING;
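A quick sanity check after applying migration 059, sketched in the same SQLAlchemy style the backend handlers use. The helper name is invented; the table, columns and ids come from the migration itself:

```python
# Hypothetical smoke check for migration 059; `db` is a SQLAlchemy session.
from sqlalchemy import text

def check_migration_059(db) -> None:
    rows = db.execute(text(
        "SELECT id, relevance FROM compliance_wiki_articles "
        "WHERE id IN ('cra-security-controls', 'cra-konformitaet')"
    )).fetchall()
    assert len(rows) == 2, "both CRA wiki articles should exist after migration 059"
```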
221 backend-compliance/tests/test_citation_backfill.py Normal file
@@ -0,0 +1,221 @@
"""Tests for citation_backfill.py — article/paragraph enrichment."""
|
||||
import hashlib
|
||||
import json
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from compliance.services.citation_backfill import (
|
||||
CitationBackfill,
|
||||
MatchResult,
|
||||
_parse_concatenated_source,
|
||||
_parse_json,
|
||||
)
|
||||
from compliance.services.rag_client import RAGSearchResult
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit tests: _parse_concatenated_source
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class TestParseConcatenatedSource:
|
||||
def test_dsgvo_art(self):
|
||||
result = _parse_concatenated_source("DSGVO Art. 35")
|
||||
assert result == {"name": "DSGVO", "article": "Art. 35"}
|
||||
|
||||
def test_nis2_artikel(self):
|
||||
result = _parse_concatenated_source("NIS2 Artikel 21 Abs. 2")
|
||||
assert result == {"name": "NIS2", "article": "Artikel 21 Abs. 2"}
|
||||
|
||||
def test_long_name_with_article(self):
|
||||
result = _parse_concatenated_source("Verordnung (EU) 2024/1689 (KI-Verordnung) Art. 6")
|
||||
assert result == {"name": "Verordnung (EU) 2024/1689 (KI-Verordnung)", "article": "Art. 6"}
|
||||
|
||||
def test_paragraph_sign(self):
|
||||
result = _parse_concatenated_source("BDSG § 42")
|
||||
assert result == {"name": "BDSG", "article": "§ 42"}
|
||||
|
||||
def test_paragraph_sign_with_abs(self):
|
||||
result = _parse_concatenated_source("TTDSG § 25 Abs. 1")
|
||||
assert result == {"name": "TTDSG", "article": "§ 25 Abs. 1"}
|
||||
|
||||
def test_no_article(self):
|
||||
result = _parse_concatenated_source("DSGVO")
|
||||
assert result is None
|
||||
|
||||
def test_empty_string(self):
|
||||
result = _parse_concatenated_source("")
|
||||
assert result is None
|
||||
|
||||
def test_none(self):
|
||||
result = _parse_concatenated_source(None)
|
||||
assert result is None
|
||||
|
||||
def test_just_name_no_article(self):
|
||||
result = _parse_concatenated_source("Cyber Resilience Act")
|
||||
assert result is None
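For reference, one plausible implementation that satisfies exactly the cases above. This regex is an assumption, not the shipped code; the real `_parse_concatenated_source` lives in `compliance/services/citation_backfill.py`:

```python
import re

# Hypothetical re-implementation consistent with the tests in this class:
# a lazy name prefix, then an article reference starting with Art./Artikel/§.
_ARTICLE_RE = re.compile(r"^(?P<name>.+?)\s+(?P<article>(?:Art\.|Artikel|§)\s*\d+.*)$")

def parse_concatenated_source(source):
    if not source:
        return None
    m = _ARTICLE_RE.match(source.strip())
    if m:
        return {"name": m.group("name"), "article": m.group("article")}
    return None
```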


# =============================================================================
# Unit tests: _parse_json
# =============================================================================


class TestParseJson:
    def test_direct_json(self):
        result = _parse_json('{"article": "Art. 35", "paragraph": "Abs. 1"}')
        assert result == {"article": "Art. 35", "paragraph": "Abs. 1"}

    def test_markdown_code_block(self):
        raw = '```json\n{"article": "§ 42", "paragraph": ""}\n```'
        result = _parse_json(raw)
        assert result == {"article": "§ 42", "paragraph": ""}

    def test_text_with_json(self):
        raw = 'Der Artikel ist {"article": "Art. 6", "paragraph": "Abs. 2"} wie beschrieben.'
        result = _parse_json(raw)
        assert result == {"article": "Art. 6", "paragraph": "Abs. 2"}

    def test_empty(self):
        assert _parse_json("") is None
        assert _parse_json(None) is None

    def test_no_json(self):
        assert _parse_json("Das ist kein JSON.") is None


# =============================================================================
# Integration tests: CitationBackfill matching
# =============================================================================


def _make_rag_chunk(text="Test text", article="Art. 35", paragraph="Abs. 1",
                    regulation_code="eu_2016_679", regulation_name="DSGVO"):
    return RAGSearchResult(
        text=text,
        regulation_code=regulation_code,
        regulation_name=regulation_name,
        regulation_short="DSGVO",
        category="datenschutz",
        article=article,
        paragraph=paragraph,
        source_url="https://example.com",
        score=0.0,
        collection="bp_compliance_gesetze",
    )


class TestCitationBackfillMatching:
    def setup_method(self):
        self.db = MagicMock()
        self.rag = MagicMock()
        self.backfill = CitationBackfill(db=self.db, rag_client=self.rag)

    @pytest.mark.asyncio
    async def test_hash_match(self):
        """Tier 1: exact text hash matches a RAG chunk."""
        source_text = "Dies ist ein Gesetzestext mit spezifischen Anforderungen an die Datensicherheit."
        chunk = _make_rag_chunk(text=source_text, article="Art. 32", paragraph="Abs. 1")
        h = hashlib.sha256(source_text.encode()).hexdigest()
        self.backfill._rag_index = {h: chunk}

        ctrl = {
            "control_id": "DATA-001",
            "source_original_text": source_text,
            "source_citation": {"source": "DSGVO Art. 32"},
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        result = await self.backfill._match_control(ctrl)
        assert result is not None
        assert result.method == "hash"
        assert result.article == "Art. 32"
        assert result.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_regex_match(self):
        """Tier 2: regex parses concatenated source when no hash match."""
        self.backfill._rag_index = {}

        ctrl = {
            "control_id": "NET-010",
            "source_original_text": None,  # No original text available
            "source_citation": {"source": "NIS2 Artikel 21"},
            "generation_metadata": {},
        }

        result = await self.backfill._match_control(ctrl)
        assert result is not None
        assert result.method == "regex"
        assert result.article == "Artikel 21"

    @pytest.mark.asyncio
    async def test_llm_match(self):
        """Tier 3: Ollama LLM identifies article/paragraph."""
        self.backfill._rag_index = {}

        ctrl = {
            "control_id": "AUTH-005",
            "source_original_text": "Verantwortliche muessen geeignete technische Massnahmen treffen...",
            "source_citation": {"source": "DSGVO"},  # No article in source
            "generation_metadata": {"source_regulation": "eu_2016_679"},
        }

        with patch("compliance.services.citation_backfill._llm_ollama", new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = '{"article": "Art. 25", "paragraph": "Abs. 1"}'
            result = await self.backfill._match_control(ctrl)

        assert result is not None
        assert result.method == "llm"
        assert result.article == "Art. 25"
        assert result.paragraph == "Abs. 1"

    @pytest.mark.asyncio
    async def test_no_match(self):
        """No match when no source text and no parseable source."""
        self.backfill._rag_index = {}

        ctrl = {
            "control_id": "SEC-001",
            "source_original_text": None,
            "source_citation": {"source": "Unknown Source"},
            "generation_metadata": {},
        }

        result = await self.backfill._match_control(ctrl)
        assert result is None

    def test_update_control_cleans_source(self):
        """_update_control splits concatenated source and adds article/paragraph."""
        ctrl = {
            "id": "test-uuid-123",
            "control_id": "DATA-001",
            "source_citation": {"source": "DSGVO Art. 32", "license": "EU_LAW"},
            "generation_metadata": {"processing_path": "structured"},
        }
        match = MatchResult(article="Art. 32", paragraph="Abs. 1", method="hash")

        self.backfill._update_control(ctrl, match)

        call_args = self.db.execute.call_args
        params = call_args[1] if call_args[1] else call_args[0][1]
        citation = json.loads(params["citation"])
        metadata = json.loads(params["metadata"])

        assert citation["source"] == "DSGVO"  # Cleaned: article removed
        assert citation["article"] == "Art. 32"
        assert citation["paragraph"] == "Abs. 1"
        assert metadata["source_paragraph"] == "Abs. 1"
        assert metadata["backfill_method"] == "hash"
        assert "backfill_at" in metadata

    def test_rule3_not_loaded(self):
        """Verify the SQL query only loads Rule 1+2 controls."""
        # Simulate what _load_controls_needing_backfill does
        self.db.execute.return_value = MagicMock(keys=lambda: [], __iter__=lambda s: iter([]))
        self.backfill._load_controls_needing_backfill()

        sql_text = str(self.db.execute.call_args[0][0].text)
        assert "license_rule IN (1, 2)" in sql_text
        assert "source_citation IS NOT NULL" in sql_text