feat(pipeline): implement Control Dependency Engine (Block 9)

Core engine (dependency_engine.py):
- 5 dependency types: prerequisite, supersedes, compensating_control,
  conditional_requirement, scope_exclusion
- Generic condition evaluator (JSONB rules with AND/OR/NOT/field ops)
- Priority-based conflict resolution
- Cycle detection (DFS) + topological sort
- Full evaluation with MCP-compatible dependency_resolution trace
- 39 tests all passing (incl. GHV scenario from user requirements)

Automatic generator (dependency_generator.py):
- Ontology-based: same normalized_object + phase sequence -> prerequisite
- Pattern-based: define->implement, implement->monitor, etc.
- Domain packs: YAML rules for GDPR, AI Act, CRA, Security, Labor Contracts
- 14 tests all passing

API routes (dependency_routes.py):
- CRUD for dependencies
- POST /evaluate with dependency resolution
- POST /generate (auto-generation with dry_run)
- POST /validate (cycle detection)
- GET /graph (nodes + edges for visualization)

Prompt enhancement (decomposition_pass.py):
- Added dependency_hints + lifecycle_phase_order to Pass 0b prompt
- Stored in generation_metadata for post-processing

DB migration: control_dependencies + control_evaluation_results tables

126 tests total, all passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-26 20:28:10 +02:00
parent 5aaa62dca7
commit 42ab5ead26
14 changed files with 2421 additions and 2 deletions

View File

@@ -351,3 +351,33 @@ def build_canonical_key(
if asset_scope:
parts.append(asset_scope)
return ":".join(parts)
# ============================================================================
# PHASE ORDERING (for dependency engine — lifecycle sequence)
# ============================================================================
# Maps a lifecycle phase name to its position in the control lifecycle
# (1 = earliest, 13 = latest). Phases sharing a number (e.g.
# "definition"/"governance" = 2, "operation"/"training" = 6,
# "assessment"/"remediation" = 10) are treated as equal in ordering.
PHASE_ORDER: dict[str, int] = {
    "scope": 1,
    "definition": 2,
    "governance": 2,
    "design": 3,
    "implementation": 4,
    "configuration": 5,
    "operation": 6,
    "training": 6,
    "monitoring": 7,
    "testing": 8,
    "review": 9,
    "assessment": 10,
    "remediation": 10,
    "validation": 11,
    "reporting": 12,
    "evidence": 13,
}
def get_phase_order(action_type: str) -> int:
    """Return the lifecycle phase position (1-13) for *action_type*.

    A phase missing from PHASE_ORDER falls back to 6, the "operation"
    slot in the middle of the lifecycle.
    """
    return PHASE_ORDER.get(get_phase(action_type), 6)

View File

@@ -220,6 +220,9 @@ class AtomicControlCandidate:
pass_criteria: list = field(default_factory=list)
fail_criteria: list = field(default_factory=list)
check_type: str = ""
# Dependency Engine Felder
dependency_hints: list = field(default_factory=list)
lifecycle_phase_order: int = 0
def to_dict(self) -> dict:
return {
@@ -238,6 +241,8 @@ class AtomicControlCandidate:
"pass_criteria": self.pass_criteria,
"fail_criteria": self.fail_criteria,
"check_type": self.check_type,
"dependency_hints": self.dependency_hints,
"lifecycle_phase_order": self.lifecycle_phase_order,
}
@@ -2133,7 +2138,9 @@ Antworte als JSON:
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting",
"check_type": "technical_config_check|document_clause_check|code_pattern_check|evidence_check|interview_required",
"merge_key": "action_type:normalized_object:control_phase"
"merge_key": "action_type:normalized_object:control_phase",
"dependency_hints": ["dependency_type:action_type:normalized_object (Voraussetzungen, Ersetzungen, Kompensationen)"],
"lifecycle_phase_order": "1-13 (1=scope, 2=definition, 4=implementation, 7=monitoring, 8=testing, 12=reporting)"
}}"""
@@ -2229,7 +2236,9 @@ Jedes Control hat dieses Format:
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting",
"check_type": "technical_config_check|document_clause_check|code_pattern_check|evidence_check|interview_required",
"merge_key": "action_type:normalized_object:control_phase"
"merge_key": "action_type:normalized_object:control_phase",
"dependency_hints": ["dependency_type:action_type:normalized_object (Voraussetzungen, Ersetzungen, Kompensationen)"],
"lifecycle_phase_order": "1-13 (1=scope, 2=definition, 4=implementation, 7=monitoring, 8=testing, 12=reporting)"
}}"""
@@ -2971,6 +2980,8 @@ class DecompositionPass:
pass_criteria=_ensure_list(parsed.get("pass_criteria", [])),
fail_criteria=_ensure_list(parsed.get("fail_criteria", [])),
check_type=parsed.get("check_type", ""),
dependency_hints=_ensure_list(parsed.get("dependency_hints", [])),
lifecycle_phase_order=int(parsed.get("lifecycle_phase_order", 0) or 0),
)
# Store merge_key from LLM output in metadata
llm_merge_key = parsed.get("merge_key", "")
@@ -2980,6 +2991,12 @@ class DecompositionPass:
atomic.parent_control_uuid = obl["parent_uuid"]
atomic.obligation_candidate_id = obl["candidate_id"]
# Set lifecycle_phase_order deterministically if not set by LLM
if not atomic.lifecycle_phase_order:
from services.control_ontology import classify_action, get_phase_order
action_type = classify_action(obl.get("action", "") or atomic.title)
atomic.lifecycle_phase_order = get_phase_order(action_type)
# Cap severity for implementation-specific obligations
if obl.get("is_implementation_specific") and atomic.severity in (
"critical", "high"
@@ -3438,6 +3455,9 @@ class DecompositionPass:
"pass_criteria": atomic.pass_criteria or [],
"fail_criteria": atomic.fail_criteria or [],
"check_type": atomic.check_type or "",
# Dependency Engine Felder
"dependency_hints": atomic.dependency_hints or [],
"lifecycle_phase_order": atomic.lifecycle_phase_order or 0,
}),
"framework_id": "14b1bdd2-abc7-4a43-adae-14471ee5c7cf",
},

View File

@@ -0,0 +1,599 @@
"""
Control Dependency Engine — evaluates control statuses considering
inter-control dependencies.
Pure functions (no DB coupling) for:
- Generic condition evaluation (JSONB rules -> bool)
- Effect application (modifies target status)
- Cycle detection (DFS-based)
- Topological sort (evaluation order)
- Full evaluation resolution with priority-based conflict handling
DB interaction is in separate load/store functions at the bottom.
"""
from __future__ import annotations
import json
import logging
import uuid
from collections import defaultdict
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Optional
from sqlalchemy import text
logger = logging.getLogger(__name__)
# ============================================================================
# ENUMS
# ============================================================================
class DependencyType(str, Enum):
    """The five kinds of inter-control relationships the engine understands."""

    PREREQUISITE = "prerequisite"
    CONDITIONAL_REQUIREMENT = "conditional_requirement"
    SUPERSEDES = "supersedes"
    COMPENSATING_CONTROL = "compensating_control"
    SCOPE_EXCLUSION = "scope_exclusion"
class EvaluationStatus(str, Enum):
    """Valid control statuses; effects may only set one of these values."""

    PASS = "pass"
    FAIL = "fail"
    NOT_APPLICABLE = "not_applicable"
    PARTIALLY_SATISFIED = "partially_satisfied"
    COMPENSATED_FAIL = "compensated_fail"  # a failing control offset by a compensating control
    REVIEW_REQUIRED = "review_required"  # also assigned to controls caught in dependency cycles
# Default priority per dependency type (lower = higher priority).
# When several dependency conditions fire for the same target control,
# only the effect with the lowest priority number is applied.
DEFAULT_PRIORITIES: dict[str, int] = {
    "supersedes": 10,
    "scope_exclusion": 20,
    "prerequisite": 50,
    "conditional_requirement": 70,
    "compensating_control": 80,
}
# ============================================================================
# DATA CLASSES
# ============================================================================
@dataclass
class Dependency:
    """A directed edge in the control dependency graph (source -> target)."""

    id: str = ""  # DB UUID; empty for dependencies not yet persisted
    source_control_id: str = ""  # control whose status feeds the condition
    target_control_id: str = ""  # control whose status the effect may change
    dependency_type: str = "prerequisite"  # one of DependencyType values
    condition: dict = field(default_factory=dict)  # JSONB rule tree; {} = always fires
    effect: dict = field(default_factory=dict)  # e.g. {"set_status": "not_applicable"}
    priority: int = 100  # lower number wins on conflict (see DEFAULT_PRIORITIES)
    generation_method: str = "manual"  # manual | ontology | pattern | domain_pack
    is_active: bool = True  # inactive edges are ignored by the engine
@dataclass
class ControlState:
    """In-memory representation of a control's evaluation state."""

    control_id: str = ""
    raw_status: str = "fail"  # status before dependency resolution
    resolved_status: str = ""  # empty until resolution has run
    context: dict = field(default_factory=dict)  # per-control extra context
@dataclass
class EvaluationResult:
    """Outcome of evaluating a single control within one evaluation run."""

    control_id: str = ""
    evaluation_run_id: str = ""  # UUID shared by all results of one run
    raw_status: str = "fail"  # status as measured, before dependencies
    resolved_status: str = "fail"  # status after applying dependency effects
    dependency_resolution: list = field(default_factory=list)  # MCP-compatible trace entries
    confidence: float = 1.0  # lowered when dependencies fire or cycles are detected
    reasoning: str = ""  # human-readable explanation (set for cycle cases)
# ============================================================================
# CONDITION EVALUATOR
# ============================================================================
def _resolve_field(field_path: str, context: dict) -> Any:
"""Resolve a dot-notation field path against a nested dict.
Examples:
_resolve_field("source.status", {"source": {"status": "pass"}}) -> "pass"
_resolve_field("context.company_size", {"context": {"company_size": "large"}}) -> "large"
"""
parts = field_path.split(".")
current = context
for part in parts:
if isinstance(current, dict):
current = current.get(part)
else:
return None
return current
def _evaluate_single_clause(clause: dict, context: dict) -> bool:
"""Evaluate a single {field, op, value} clause."""
field_path = clause.get("field", "")
op = clause.get("op", "==")
expected = clause.get("value")
actual = _resolve_field(field_path, context)
if op == "==":
return actual == expected
elif op == "!=":
return actual != expected
elif op == "in":
if isinstance(expected, list):
return actual in expected
return False
elif op == "not_in":
if isinstance(expected, list):
return actual not in expected
return True
elif op == ">":
try:
return float(actual) > float(expected)
except (TypeError, ValueError):
return False
elif op == "<":
try:
return float(actual) < float(expected)
except (TypeError, ValueError):
return False
elif op == ">=":
try:
return float(actual) >= float(expected)
except (TypeError, ValueError):
return False
elif op == "<=":
try:
return float(actual) <= float(expected)
except (TypeError, ValueError):
return False
elif op == "contains":
if isinstance(actual, (list, set, tuple)):
return expected in actual
if isinstance(actual, str):
return str(expected) in actual
return False
return False
def evaluate_condition(condition: dict, context: dict) -> bool:
"""Evaluate a generic condition against a context dict.
Supports:
- Empty condition -> True (always matches)
- Simple clause: {"field": "source.status", "op": "==", "value": "pass"}
- Compound AND: {"operator": "AND", "clauses": [...]}
- Compound OR: {"operator": "OR", "clauses": [...]}
- Negation: {"operator": "NOT", "clause": {...}}
"""
if not condition:
return True
operator = condition.get("operator")
if operator == "AND":
clauses = condition.get("clauses", [])
return all(evaluate_condition(c, context) for c in clauses)
if operator == "OR":
clauses = condition.get("clauses", [])
return any(evaluate_condition(c, context) for c in clauses)
if operator == "NOT":
clause = condition.get("clause", {})
return not evaluate_condition(clause, context)
# Simple clause with field/op/value
if "field" in condition:
return _evaluate_single_clause(condition, context)
return True
# ============================================================================
# EFFECT APPLIER
# ============================================================================
def apply_effect(effect: dict, current_status: str) -> str:
    """Return the status an effect maps to, or *current_status* unchanged.

    Only {"set_status": "<valid EvaluationStatus value>"} is honoured;
    a missing or invalid set_status leaves the status as-is.
    """
    requested = effect.get("set_status")
    known = {member.value for member in EvaluationStatus}
    return requested if requested in known else current_status
# ============================================================================
# CYCLE DETECTION
# ============================================================================
# DFS colouring states: unvisited / on the current DFS stack / fully explored.
WHITE, GRAY, BLACK = 0, 1, 2


def detect_cycles(dependencies: list[Dependency]) -> list[list[str]]:
    """Find cycles among active dependency edges via depth-first search.

    Returns one list of control IDs per detected cycle; an empty result
    means the graph is acyclic. Inactive edges are ignored.
    """
    adjacency: dict[str, list[str]] = defaultdict(list)
    nodes: set[str] = set()
    for edge in dependencies:
        if not edge.is_active:
            continue
        adjacency[edge.source_control_id].append(edge.target_control_id)
        nodes.add(edge.source_control_id)
        nodes.add(edge.target_control_id)

    colour = dict.fromkeys(nodes, WHITE)
    parent: dict[str, Optional[str]] = dict.fromkeys(nodes)
    found: list[list[str]] = []

    def visit(node: str) -> None:
        colour[node] = GRAY
        for nxt in adjacency.get(node, []):
            state = colour.get(nxt, WHITE)
            if state == GRAY:
                # Back edge to an ancestor on the stack: walk the parent
                # chain to reconstruct the loop's membership.
                loop = [nxt, node]
                step = parent.get(node)
                while step and step != nxt:
                    loop.append(step)
                    step = parent.get(step)
                found.append(loop)
            elif state == WHITE:
                parent[nxt] = node
                visit(nxt)
        colour[node] = BLACK

    for start in nodes:
        if colour[start] == WHITE:
            visit(start)
    return found
def topological_sort(dependencies: list[Dependency]) -> list[str]:
    """Order control IDs so sources (prerequisites) precede their targets.

    Kahn's algorithm over active edges; ties are broken lexicographically
    so the output is deterministic. Controls that appear in no dependency
    are omitted; nodes stuck in cycles never reach in-degree zero and are
    likewise left out.
    """
    successors: dict[str, list[str]] = defaultdict(list)
    incoming: dict[str, int] = defaultdict(int)
    nodes: set[str] = set()
    for edge in dependencies:
        if not edge.is_active:
            continue
        # source -> target: the source must be evaluated first.
        successors[edge.source_control_id].append(edge.target_control_id)
        incoming.setdefault(edge.source_control_id, 0)
        incoming[edge.target_control_id] += 1
        nodes.add(edge.source_control_id)
        nodes.add(edge.target_control_id)

    ready = [n for n in nodes if incoming[n] == 0]
    order: list[str] = []
    while ready:
        node = min(ready)  # deterministic: smallest available ID first
        ready.remove(node)
        order.append(node)
        for nxt in successors.get(node, []):
            incoming[nxt] -= 1
            if incoming[nxt] == 0:
                ready.append(nxt)
    return order
# ============================================================================
# MAIN EVALUATION ENGINE
# ============================================================================
def evaluate_controls(
    control_states: dict[str, ControlState],
    dependencies: list[Dependency],
    context: dict,
) -> dict[str, EvaluationResult]:
    """Evaluate all controls considering dependencies.

    Args:
        control_states: control_id -> ControlState (with raw_status)
        dependencies: all active dependencies
        context: company profile (industry, company_size, scope_signals, etc.)

    Returns:
        control_id -> EvaluationResult (with resolved_status + trace)

    Algorithm:
        1. Build adjacency (target -> dependencies)
        2. Detect cycles -> involved controls = review_required
        3. Topological sort for evaluation order
        4. For each control: evaluate conditions, apply highest-priority effect
        5. Record full dependency trace for MCP output
    """
    # One run id groups every result produced by this call.
    evaluation_run_id = str(uuid.uuid4())

    # 1. Build adjacency: target_control_id -> list of dependencies
    target_deps: dict[str, list[Dependency]] = defaultdict(list)
    for dep in dependencies:
        if dep.is_active:
            target_deps[dep.target_control_id].append(dep)

    # 2. Cycle detection
    cycles = detect_cycles(dependencies)
    cycle_controls: set[str] = set()
    for cycle in cycles:
        cycle_controls.update(cycle)

    # 3. Topological sort (excluding cycle controls)
    safe_deps = [
        d for d in dependencies
        if d.is_active
        and d.source_control_id not in cycle_controls
        and d.target_control_id not in cycle_controls
    ]
    eval_order = topological_sort(safe_deps)
    # Add remaining controls (those not in any dependency + cycle controls)
    all_ids = set(control_states.keys())
    remaining = all_ids - set(eval_order)
    eval_order.extend(sorted(remaining))

    # 4. Iterate and evaluate
    results: dict[str, EvaluationResult] = {}
    for control_id in eval_order:
        state = control_states.get(control_id)
        if not state:
            # A dependency references a control we have no state for — skip.
            continue
        # Cycle controls -> review_required (cannot be resolved automatically)
        if control_id in cycle_controls:
            results[control_id] = EvaluationResult(
                control_id=control_id,
                evaluation_run_id=evaluation_run_id,
                raw_status=state.raw_status,
                resolved_status=EvaluationStatus.REVIEW_REQUIRED.value,
                dependency_resolution=[{"cycle_detected": True}],
                confidence=0.5,
                reasoning="Zyklische Abhaengigkeit erkannt — manuelle Pruefung erforderlich.",
            )
            continue
        # Collect dependencies targeting this control
        deps_for_target = target_deps.get(control_id, [])
        if not deps_for_target:
            # No incoming dependencies: the raw status stands unchanged.
            results[control_id] = EvaluationResult(
                control_id=control_id,
                evaluation_run_id=evaluation_run_id,
                raw_status=state.raw_status,
                resolved_status=state.raw_status,
                confidence=1.0,
            )
            continue
        # Evaluate each dependency's condition, highest priority first.
        matching_effects: list[tuple[int, dict, Dependency]] = []
        trace: list[dict] = []
        for dep in sorted(deps_for_target, key=lambda d: d.priority):
            source_state = control_states.get(dep.source_control_id)
            source_result = results.get(dep.source_control_id)
            # Prefer the source's already-resolved status; topological order
            # means sources are normally evaluated before their targets.
            source_status = "unknown"
            if source_result:
                source_status = source_result.resolved_status
            elif source_state:
                source_status = source_state.raw_status
            eval_ctx = {
                "source": {"status": source_status},
                "target": {"status": state.raw_status},
                "context": context,
            }
            condition_met = evaluate_condition(dep.condition, eval_ctx)
            trace_entry = {
                "dependency_id": dep.id,
                "dependency_type": dep.dependency_type,
                "source_control_id": dep.source_control_id,
                "source_status": source_status,
                "condition_met": condition_met,
                "effect_applied": dep.effect if condition_met else None,
                "priority": dep.priority,
            }
            trace.append(trace_entry)
            if condition_met:
                matching_effects.append((dep.priority, dep.effect, dep))
        # Apply highest-priority (lowest number) effect; others are recorded
        # in the trace but do not change the status.
        resolved = state.raw_status
        if matching_effects:
            matching_effects.sort(key=lambda x: x[0])
            _, best_effect, _ = matching_effects[0]
            resolved = apply_effect(best_effect, state.raw_status)
        results[control_id] = EvaluationResult(
            control_id=control_id,
            evaluation_run_id=evaluation_run_id,
            raw_status=state.raw_status,
            resolved_status=resolved,
            dependency_resolution=trace,
            confidence=_compute_confidence(trace),
        )
    return results
def _compute_confidence(trace: list[dict]) -> float:
"""Compute confidence based on dependency resolution trace."""
if not trace:
return 1.0
met_count = sum(1 for t in trace if t.get("condition_met"))
total = len(trace)
if met_count == 0:
return 1.0 # No dependencies fired -> raw status stands
if met_count == 1:
return 0.95 # Single dependency resolved
# Multiple dependencies -> slightly lower confidence
return max(0.7, 1.0 - (met_count - 1) * 0.1)
# ============================================================================
# DB INTERACTION (separate from pure logic)
# ============================================================================
def load_dependencies_for_controls(
    db, control_ids: list[str],
) -> list[Dependency]:
    """Load all active dependencies involving the given control IDs.

    Args:
        db: SQLAlchemy session/connection exposing ``execute``.
        control_ids: control UUIDs as strings; an empty list short-circuits.

    Returns:
        Dependency objects where a given control appears as source OR target.
    """
    if not control_ids:
        return []
    # CAST(:ids AS uuid[]) lets a plain Python list of strings be bound.
    rows = db.execute(
        text("""
            SELECT id, source_control_id, target_control_id,
                   dependency_type, condition, effect, priority,
                   generation_method, is_active
            FROM control_dependencies
            WHERE is_active = TRUE
              AND (source_control_id = ANY(CAST(:ids AS uuid[]))
                   OR target_control_id = ANY(CAST(:ids AS uuid[])))
        """),
        {"ids": control_ids},
    ).fetchall()
    return [
        Dependency(
            id=str(r[0]),
            source_control_id=str(r[1]),
            target_control_id=str(r[2]),
            dependency_type=r[3],
            # JSONB columns should deserialize to dicts; anything else -> {}.
            condition=r[4] if isinstance(r[4], dict) else {},
            effect=r[5] if isinstance(r[5], dict) else {},
            priority=r[6],
            generation_method=r[7],
            is_active=r[8],
        )
        for r in rows
    ]
def load_all_active_dependencies(db) -> list[Dependency]:
    """Load all active dependencies, ordered by ascending priority.

    Args:
        db: SQLAlchemy session/connection exposing ``execute``.
    """
    rows = db.execute(
        text("""
            SELECT id, source_control_id, target_control_id,
                   dependency_type, condition, effect, priority,
                   generation_method, is_active
            FROM control_dependencies
            WHERE is_active = TRUE
            ORDER BY priority
        """),
    ).fetchall()
    return [
        Dependency(
            id=str(r[0]),
            source_control_id=str(r[1]),
            target_control_id=str(r[2]),
            dependency_type=r[3],
            # JSONB columns should deserialize to dicts; anything else -> {}.
            condition=r[4] if isinstance(r[4], dict) else {},
            effect=r[5] if isinstance(r[5], dict) else {},
            priority=r[6],
            generation_method=r[7],
            is_active=r[8],
        )
        for r in rows
    ]
def store_dependency(db, dep: Dependency) -> str:
    """Insert a dependency, return its UUID.

    Upserts on (source_control_id, target_control_id, dependency_type):
    an existing edge gets its condition/effect/priority refreshed.
    Committing the transaction is the caller's responsibility.
    """
    row = db.execute(
        text("""
            INSERT INTO control_dependencies
                (source_control_id, target_control_id, dependency_type,
                 condition, effect, priority, generation_method, generation_metadata)
            VALUES
                (CAST(:src AS uuid), CAST(:tgt AS uuid), :dtype,
                 CAST(:cond AS jsonb), CAST(:eff AS jsonb), :prio, :gmethod, CAST(:gmeta AS jsonb))
            ON CONFLICT (source_control_id, target_control_id, dependency_type)
            DO UPDATE SET
                condition = EXCLUDED.condition,
                effect = EXCLUDED.effect,
                priority = EXCLUDED.priority,
                updated_at = NOW()
            RETURNING id::text
        """),
        {
            "src": dep.source_control_id,
            "tgt": dep.target_control_id,
            "dtype": dep.dependency_type,
            "cond": json.dumps(dep.condition),
            "eff": json.dumps(dep.effect),
            "prio": dep.priority,
            "gmethod": dep.generation_method,
            "gmeta": json.dumps({}),  # generation_metadata is currently always empty here
        },
    ).fetchone()
    return row[0] if row else ""
def store_evaluation_results(
    db, results: dict[str, EvaluationResult], company_profile: dict,
) -> int:
    """Batch insert evaluation results. Returns row count.

    Upserts on (control_id, evaluation_run_id); status, trace and
    confidence are refreshed on conflict, while company_profile and
    reasoning keep their originally inserted values. Committing the
    transaction is the caller's responsibility.
    """
    count = 0
    for result in results.values():
        db.execute(
            text("""
                INSERT INTO control_evaluation_results
                    (control_id, evaluation_run_id, company_profile,
                     raw_status, resolved_status, dependency_resolution,
                     confidence, reasoning)
                VALUES
                    (CAST(:cid AS uuid), CAST(:rid AS uuid), CAST(:prof AS jsonb),
                     :raw, :resolved, CAST(:trace AS jsonb),
                     :conf, :reason)
                ON CONFLICT (control_id, evaluation_run_id)
                DO UPDATE SET
                    resolved_status = EXCLUDED.resolved_status,
                    dependency_resolution = EXCLUDED.dependency_resolution,
                    confidence = EXCLUDED.confidence
            """),
            {
                "cid": result.control_id,
                "rid": result.evaluation_run_id,
                "prof": json.dumps(company_profile),
                "raw": result.raw_status,
                "resolved": result.resolved_status,
                "trace": json.dumps(result.dependency_resolution),
                "conf": result.confidence,
                "reason": result.reasoning,
            },
        )
        count += 1
    return count

View File

@@ -0,0 +1,381 @@
"""
Dependency Generator — automatic discovery of control dependencies.
Three strategies:
1. Ontology-based: same normalized_object + phase sequence -> prerequisite
2. Pattern-based: known patterns (define->implement, implement->monitor, etc.)
3. Domain packs: YAML-defined rules for specific regulatory domains
"""
from __future__ import annotations
import logging
import os
import re
from collections import defaultdict
from typing import Optional
import yaml
from services.dependency_engine import Dependency, DEFAULT_PRIORITIES
logger = logging.getLogger(__name__)
# ============================================================================
# PHASE ORDERING (imported from ontology)
# ============================================================================
from services.control_ontology import PHASE_ORDER
# ============================================================================
# PATTERN RULES
# ============================================================================
# Built-in pattern rules for generate_pattern_dependencies.
# Rule schema:
#   source_filter / target_filter: match controls by action_type
#     ("action_type" = exact match, "action_type_in" = membership)
#   match_on: "normalized_object" restricts pairs to the same object
#   dependency_type / condition / effect / priority: copied onto each
#     generated Dependency
PATTERN_RULES: list[dict] = [
    {
        "name": "define_before_implement",
        "source_filter": {"action_type": "define"},
        "target_filter": {"action_type": "implement"},
        "match_on": "normalized_object",
        "dependency_type": "prerequisite",
        "condition": {},
        "effect": {"set_status": "review_required"},
        "priority": 50,
    },
    {
        "name": "implement_before_monitor",
        "source_filter": {"action_type_in": ["implement", "configure", "enforce"]},
        "target_filter": {"action_type_in": ["monitor", "review", "test"]},
        "match_on": "normalized_object",
        "dependency_type": "prerequisite",
        "condition": {},
        "effect": {"set_status": "review_required"},
        "priority": 50,
    },
    {
        "name": "define_before_enforce",
        "source_filter": {"action_type": "define"},
        "target_filter": {"action_type": "enforce"},
        "match_on": "normalized_object",
        "dependency_type": "prerequisite",
        "condition": {},
        "effect": {"set_status": "review_required"},
        "priority": 50,
    },
    {
        "name": "implement_before_validate",
        "source_filter": {"action_type_in": ["implement", "configure"]},
        "target_filter": {"action_type_in": ["validate", "verify"]},
        "match_on": "normalized_object",
        "dependency_type": "prerequisite",
        "condition": {},
        "effect": {"set_status": "review_required"},
        "priority": 50,
    },
    {
        # Slightly lower priority: training is a softer prerequisite.
        "name": "train_before_review",
        "source_filter": {"action_type": "train"},
        "target_filter": {"action_type_in": ["review", "assess"]},
        "match_on": "normalized_object",
        "dependency_type": "prerequisite",
        "condition": {},
        "effect": {"set_status": "review_required"},
        "priority": 60,
    },
]
# ============================================================================
# HELPER: Parse merge_key into components
# ============================================================================
def _parse_merge_key(merge_key: str) -> dict:
"""Parse 'action_type:normalized_object:phase[:asset_scope]' into components."""
parts = merge_key.split(":")
result = {
"action_type": parts[0] if len(parts) > 0 else "",
"normalized_object": parts[1] if len(parts) > 1 else "",
"phase": parts[2] if len(parts) > 2 else "",
"asset_scope": parts[3] if len(parts) > 3 else "",
}
return result
def _get_control_merge_key(control: dict) -> str:
"""Extract merge_key from a control dict (from generation_metadata or top-level)."""
mk = control.get("merge_key", "")
if not mk:
meta = control.get("generation_metadata", {})
if isinstance(meta, str):
try:
import json
meta = json.loads(meta)
except (ValueError, TypeError):
meta = {}
mk = meta.get("merge_group_hint", "")
return mk
# ============================================================================
# ONTOLOGY-BASED GENERATOR
# ============================================================================
def generate_ontology_dependencies(controls: list[dict]) -> list[Dependency]:
    """Derive prerequisite edges from lifecycle phase ordering.

    Controls sharing the same normalized_object are ordered by lifecycle
    phase; every strictly-earlier control becomes a prerequisite of every
    strictly-later one (all ordered pairs, not only adjacent phases).
    Controls on the same phase level get no edge between them.

    Side effect: annotates each grouped control dict in place with
    ``_parsed_mk`` and ``_phase_order`` for reuse by the pattern generator.
    """
    by_object: dict[str, list[dict]] = defaultdict(list)
    for ctrl in controls:
        merge_key = _get_control_merge_key(ctrl)
        if not merge_key:
            continue
        parsed = _parse_merge_key(merge_key)
        if not parsed["normalized_object"]:
            continue
        ctrl["_parsed_mk"] = parsed
        ctrl["_phase_order"] = PHASE_ORDER.get(parsed["phase"], 6)
        by_object[parsed["normalized_object"]].append(ctrl)

    edges: list[Dependency] = []
    for members in by_object.values():
        if len(members) < 2:
            continue
        members.sort(key=lambda c: c["_phase_order"])
        for idx, earlier in enumerate(members):
            for later in members[idx + 1:]:
                if earlier["_phase_order"] >= later["_phase_order"]:
                    continue
                edges.append(
                    Dependency(
                        source_control_id=earlier.get("id", earlier.get("control_id", "")),
                        target_control_id=later.get("id", later.get("control_id", "")),
                        dependency_type="prerequisite",
                        condition={},
                        effect={"set_status": "review_required"},
                        priority=DEFAULT_PRIORITIES["prerequisite"],
                        generation_method="ontology",
                    )
                )
    return edges
# ============================================================================
# PATTERN-BASED GENERATOR
# ============================================================================
def _matches_filter(control: dict, filter_: dict) -> bool:
"""Check if a control matches a pattern filter."""
parsed = control.get("_parsed_mk", {})
action = parsed.get("action_type", "")
if "action_type" in filter_:
if action != filter_["action_type"]:
return False
if "action_type_in" in filter_:
if action not in filter_["action_type_in"]:
return False
return True
def generate_pattern_dependencies(
    controls: list[dict],
    rules: Optional[list[dict]] = None,
) -> list[Dependency]:
    """Create dependencies by applying pattern *rules* to control pairs.

    Defaults to the module-level PATTERN_RULES when *rules* is None.
    When a rule has ``match_on == "normalized_object"``, only pairs with
    the same non-empty normalized_object qualify. Self-edges are skipped.
    """
    active_rules = PATTERN_RULES if rules is None else rules

    # Make sure every control carries a parsed merge key for filtering.
    for ctrl in controls:
        if "_parsed_mk" not in ctrl:
            key = _get_control_merge_key(ctrl)
            ctrl["_parsed_mk"] = _parse_merge_key(key) if key else {}

    found: list[Dependency] = []
    for rule in active_rules:
        src_candidates = [c for c in controls if _matches_filter(c, rule["source_filter"])]
        tgt_candidates = [c for c in controls if _matches_filter(c, rule["target_filter"])]
        same_object = rule.get("match_on") == "normalized_object"
        for src in src_candidates:
            src_id = src.get("id", src.get("control_id", ""))
            src_obj = src.get("_parsed_mk", {}).get("normalized_object", "")
            for tgt in tgt_candidates:
                tgt_id = tgt.get("id", tgt.get("control_id", ""))
                if src_id == tgt_id:
                    continue
                if same_object:
                    tgt_obj = tgt.get("_parsed_mk", {}).get("normalized_object", "")
                    if not src_obj or src_obj != tgt_obj:
                        continue
                found.append(
                    Dependency(
                        source_control_id=src_id,
                        target_control_id=tgt_id,
                        dependency_type=rule["dependency_type"],
                        condition=rule.get("condition", {}),
                        effect=rule.get("effect", {"set_status": "review_required"}),
                        priority=rule.get("priority", 100),
                        generation_method="pattern",
                    )
                )
    return found
# ============================================================================
# DOMAIN PACK GENERATOR
# ============================================================================
def load_domain_pack(path: str) -> dict:
    """Load a YAML domain pack.

    Returns {} for an empty file. Uses yaml.safe_load, so a pack cannot
    execute arbitrary Python on load.
    """
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
def _title_matches(title: str, patterns: list[str]) -> bool:
"""Check if a title contains any of the given patterns (case-insensitive)."""
title_lower = title.lower()
return any(p.lower() in title_lower for p in patterns)
def generate_domain_dependencies(
    controls: list[dict],
    domain_pack_dir: str = "",
) -> list[Dependency]:
    """Generate dependencies from every YAML domain pack in a directory.

    Defaults to <project>/data/domain_packs next to this package. Packs
    are processed in sorted filename order; a missing directory yields [].
    Controls are matched against each rule's source/target
    ``title_contains`` patterns (case-insensitive substrings).
    """
    pack_dir = domain_pack_dir or os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "data", "domain_packs"
    )
    if not os.path.isdir(pack_dir):
        return []

    results: list[Dependency] = []
    pack_files = [f for f in sorted(os.listdir(pack_dir)) if f.endswith((".yaml", ".yml"))]
    for fname in pack_files:
        pack = load_domain_pack(os.path.join(pack_dir, fname))
        for rule in pack.get("rules", []):
            src_patterns = rule.get("source_match", {}).get("title_contains", [])
            tgt_patterns = rule.get("target_match", {}).get("title_contains", [])
            # Rules without patterns on either side match no controls.
            src_list = [
                c for c in controls
                if src_patterns and _title_matches(c.get("title", ""), src_patterns)
            ]
            tgt_list = [
                c for c in controls
                if tgt_patterns and _title_matches(c.get("title", ""), tgt_patterns)
            ]
            dep_type = rule.get("dependency_type", "prerequisite")
            for src in src_list:
                src_id = src.get("id", src.get("control_id", ""))
                for tgt in tgt_list:
                    tgt_id = tgt.get("id", tgt.get("control_id", ""))
                    if src_id == tgt_id:
                        continue
                    results.append(
                        Dependency(
                            source_control_id=src_id,
                            target_control_id=tgt_id,
                            dependency_type=dep_type,
                            condition=rule.get("condition", {
                                "field": "source.status", "op": "==", "value": "pass",
                            }),
                            effect=rule.get("effect", {"set_status": "not_applicable"}),
                            priority=rule.get("priority", DEFAULT_PRIORITIES.get(dep_type, 100)),
                            generation_method="domain_pack",
                        )
                    )
    return results
# ============================================================================
# TOP-LEVEL GENERATOR
# ============================================================================
def generate_all_dependencies(
    controls: list[dict],
    enable_ontology: bool = True,
    enable_patterns: bool = True,
    enable_domain_packs: bool = True,
    domain_pack_dir: str = "",
) -> tuple[list[Dependency], dict]:
    """Run the enabled generators and return (unique_dependencies, stats).

    Deduplication is by the (source, target, dependency_type) triple; the
    first occurrence wins, so ontology edges take precedence over pattern
    edges, which take precedence over domain-pack edges.
    """
    stats = {
        "ontology_generated": 0,
        "pattern_generated": 0,
        "domain_generated": 0,
        "total_before_dedup": 0,
        "total_unique": 0,
        "duplicates_removed": 0,
    }
    collected: list[Dependency] = []

    if enable_ontology:
        deps = generate_ontology_dependencies(controls)
        stats["ontology_generated"] = len(deps)
        collected.extend(deps)
    if enable_patterns:
        deps = generate_pattern_dependencies(controls)
        stats["pattern_generated"] = len(deps)
        collected.extend(deps)
    if enable_domain_packs:
        deps = generate_domain_dependencies(controls, domain_pack_dir)
        stats["domain_generated"] = len(deps)
        collected.extend(deps)

    stats["total_before_dedup"] = len(collected)

    unique: list[Dependency] = []
    seen: set[tuple[str, str, str]] = set()
    for dep in collected:
        triple = (dep.source_control_id, dep.target_control_id, dep.dependency_type)
        if triple in seen:
            continue
        seen.add(triple)
        unique.append(dep)

    stats["total_unique"] = len(unique)
    stats["duplicates_removed"] = stats["total_before_dedup"] - stats["total_unique"]
    return unique, stats