# breakpilot-compliance/backend-compliance/tests/test_knowledge_intake.py

"""Tests for Knowledge Intake — classify a document and assess its impact (no extraction, no LLM).

Acceptance: build a deterministic index from existing knowledge; for an incoming document, surface
which capabilities / playbooks / patterns / reference scenarios / obligations it probably touches,
whether it is a new domain, and triage it (HIGH / LOW / NONE / NEW_DOMAIN). The point: of N documents,
which few actually change our knowledge.
"""

from __future__ import annotations

from compliance.knowledge_intake import (
    DocumentDescriptor, ImpactLevel, KnowledgeIndex,
    assess_document_impact, build_knowledge_index,
)

# Transition-pattern fixtures. TP-A writes its goal's ``to`` as a single dict,
# TP-B writes it as a list of dicts, so both shapes are present in the input.
PATTERNS = [
    {
        "id": "TP-A",
        "transition_goal": {"to": {"regulation": "CRA"}},
        "delta_requirements": [
            {"capability": "sbom_creation", "covers_targets": ["CRA"]},
            {"capability": "coordinated_vulnerability_disclosure", "covers_targets": ["CRA"]},
        ],
    },
    {
        "id": "TP-B",
        "transition_goal": {"to": [{"regulation": "CRA"}, {"regulation": "MaschinenVO"}]},
        "delta_requirements": [
            {"capability": "machine_guards", "covers_targets": ["MaschinenVO"]},
        ],
    },
]

# Playbook fixtures: only ``sbom_creation`` has a playbook.
PLAYBOOKS = [{"capability_id": "sbom_creation"}]

# Reference-scenario fixture; its goal entry uses the ``target`` key instead of ``regulation``.
RTS = [{"id": "RTS-1", "transition_goal": {"to": [{"target": "CRA"}]}}]


def _index():
    """Build the knowledge index shared by the tests from the module-level fixtures.

    The obligation index maps the regulation ``"CRA"`` to the two obligation
    ids ``"o1"`` and ``"o2"``.
    """
    obligations_by_regulation = {"CRA": ["o1", "o2"]}
    return build_knowledge_index(
        PATTERNS,
        PLAYBOOKS,
        RTS,
        obligation_index=obligations_by_regulation,
    )


def test_build_index_extracts_regs_caps_playbooks():
    """The built index exposes regulations, capability mappings, playbooks, patterns and scenarios."""
    index = _index()

    # Both regulations named by the pattern fixtures must be known.
    assert "CRA" in index.regulations
    assert "MaschinenVO" in index.regulations

    assert index.capability_regulations["sbom_creation"] == ["CRA"]
    assert index.playbook_capabilities == ["sbom_creation"]

    # TP-B declares its goal as a list of targets; both entries must be picked up.
    assert index.transition_patterns["TP-B"] == ["CRA", "MaschinenVO"]
    # RTS-1 names its regulation under the ``target`` key.
    assert index.reference_scenarios["RTS-1"] == ["CRA"]


def test_affected_by_regulation():
    """A document declaring CRA touches the CRA capability, pattern, scenario and obligations."""
    document = DocumentDescriptor(document_id="d", regulations=["CRA"])
    impact = assess_document_impact(document, _index())

    assert "sbom_creation" in impact.affected_capabilities
    assert "TP-A" in impact.affected_transition_patterns
    assert "RTS-1" in impact.affected_reference_scenarios
    assert impact.affected_obligations == ["o1", "o2"]
    # CRA is a known regulation, so this must not be flagged as a new domain.
    assert not impact.new_domain


def test_affected_by_keyword_even_without_regulation():
    """The keyword 'sbom' alone, with no declared regulation, still surfaces the capability."""
    document = DocumentDescriptor(document_id="d", keywords=["sbom"])
    impact = assess_document_impact(document, _index())

    assert "sbom_creation" in impact.affected_capabilities
    assert "sbom_creation" in impact.affected_playbooks


def test_playbooks_are_affected_caps_with_a_playbook():
    """Only affected capabilities that have a playbook are reported as affected playbooks."""
    document = DocumentDescriptor(document_id="d", regulations=["CRA"])
    impact = assess_document_impact(document, _index())

    # coordinated_vulnerability_disclosure has no playbook in the fixtures,
    # so sbom_creation is the only expected entry.
    expected_playbooks = ["sbom_creation"]
    assert impact.affected_playbooks == expected_playbooks


def test_new_domain_when_only_unknown_regulations():
    """A document citing only a regulation absent from the index is triaged as a new domain."""
    document = DocumentDescriptor(document_id="d", regulations=["UmweltVO"])
    impact = assess_document_impact(document, _index())

    assert impact.new_domain
    assert impact.impact_level == ImpactLevel.NEW_DOMAIN
    assert impact.unknown_regulations == ["UmweltVO"]
    assert impact.affected_capabilities == []


def test_none_when_nothing_matches():
    """A document whose only keyword matches nothing gets impact NONE and an 'ignorierbar' hint."""
    document = DocumentDescriptor(document_id="d", keywords=["newsletter"])
    impact = assess_document_impact(document, _index())

    assert impact.impact_level == ImpactLevel.NONE
    assert not impact.new_domain
    assert "ignorierbar" in impact.recommendation


def test_high_impact_triage():
    """A CRA document against the shared index is triaged HIGH with a 'priorisieren' hint."""
    document = DocumentDescriptor(document_id="d", regulations=["CRA"])
    impact = assess_document_impact(document, _index())

    # Triage rule as noted by the test author: >= 3 affected caps OR a playbook -> HIGH
    assert impact.impact_level == ImpactLevel.HIGH
    assert "priorisieren" in impact.recommendation


def test_low_impact_when_small_and_no_playbook():
    """One affected capability and no playbooks yields LOW impact."""
    # Hand-built minimal index: a single capability "x" tied to CRA, no playbooks.
    minimal_index = KnowledgeIndex(
        regulations=["CRA"],
        capability_regulations={"x": ["CRA"]},
        playbook_capabilities=[],
    )
    document = DocumentDescriptor(document_id="d", regulations=["CRA"])
    impact = assess_document_impact(document, minimal_index)

    assert impact.impact_level == ImpactLevel.LOW
    assert impact.affected_capabilities == ["x"]


def test_classification_echoed():
    """The result echoes the document's classification back to the caller.

    Expected echo: regulations unchanged, the keyword 'SBOM' lower-cased to
    'sbom', and the document type wrapped in a one-element list.
    """
    document = DocumentDescriptor(
        document_id="d",
        regulations=["CRA"],
        keywords=["SBOM"],
        document_type="faq",
    )
    impact = assess_document_impact(document, _index())

    assert impact.classification["regulations"] == ["CRA"]
    assert impact.classification["keywords"] == ["sbom"]
    assert impact.classification["document_type"] == ["faq"]


def test_deterministic():
    """Assessing the same document twice against the same index gives identical dumps."""
    index = _index()
    document = DocumentDescriptor(document_id="d", regulations=["CRA"], keywords=["sbom"])

    # Run both assessments before dumping either result.
    first_run = assess_document_impact(document, index)
    second_run = assess_document_impact(document, index)

    first_dump = first_run.model_dump()
    second_dump = second_run.model_dump()
    assert first_dump == second_dump