"""Tests for Knowledge Intake — classify a document and assess its impact (no extraction, no LLM). Acceptance: build a deterministic index from existing knowledge; for an incoming document, surface which capabilities / playbooks / patterns / reference scenarios / obligations it probably touches, whether it is a new domain, and triage it (HIGH / LOW / NONE / NEW_DOMAIN). The point: of N documents, which few actually change our knowledge. """ from __future__ import annotations from compliance.knowledge_intake import ( DocumentDescriptor, ImpactLevel, KnowledgeIndex, assess_document_impact, build_knowledge_index, ) PATTERNS = [ {"id": "TP-A", "transition_goal": {"to": {"regulation": "CRA"}}, "delta_requirements": [{"capability": "sbom_creation", "covers_targets": ["CRA"]}, {"capability": "coordinated_vulnerability_disclosure", "covers_targets": ["CRA"]}]}, {"id": "TP-B", "transition_goal": {"to": [{"regulation": "CRA"}, {"regulation": "MaschinenVO"}]}, "delta_requirements": [{"capability": "machine_guards", "covers_targets": ["MaschinenVO"]}]}, ] PLAYBOOKS = [{"capability_id": "sbom_creation"}] RTS = [{"id": "RTS-1", "transition_goal": {"to": [{"target": "CRA"}]}}] def _index(): return build_knowledge_index(PATTERNS, PLAYBOOKS, RTS, obligation_index={"CRA": ["o1", "o2"]}) def test_build_index_extracts_regs_caps_playbooks(): idx = _index() assert "CRA" in idx.regulations and "MaschinenVO" in idx.regulations assert idx.capability_regulations["sbom_creation"] == ["CRA"] assert idx.playbook_capabilities == ["sbom_creation"] assert idx.transition_patterns["TP-B"] == ["CRA", "MaschinenVO"] # list-form to[] handled assert idx.reference_scenarios["RTS-1"] == ["CRA"] # target key handled def test_affected_by_regulation(): kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["CRA"]), _index()) assert "sbom_creation" in kp.affected_capabilities assert "TP-A" in kp.affected_transition_patterns and "RTS-1" in kp.affected_reference_scenarios assert kp.affected_obligations == ["o1", "o2"] assert not kp.new_domain def test_affected_by_keyword_even_without_regulation(): # a doc with no declared regulation but keyword 'sbom' still finds the capability kp = assess_document_impact(DocumentDescriptor(document_id="d", keywords=["sbom"]), _index()) assert "sbom_creation" in kp.affected_capabilities assert "sbom_creation" in kp.affected_playbooks def test_playbooks_are_affected_caps_with_a_playbook(): kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["CRA"]), _index()) assert kp.affected_playbooks == ["sbom_creation"] # cvd has no playbook here def test_new_domain_when_only_unknown_regulations(): kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["UmweltVO"]), _index()) assert kp.new_domain and kp.impact_level == ImpactLevel.NEW_DOMAIN assert kp.unknown_regulations == ["UmweltVO"] assert kp.affected_capabilities == [] def test_none_when_nothing_matches(): kp = assess_document_impact(DocumentDescriptor(document_id="d", keywords=["newsletter"]), _index()) assert kp.impact_level == ImpactLevel.NONE and not kp.new_domain assert "ignorierbar" in kp.recommendation def test_high_impact_triage(): kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["CRA"]), _index()) # >= 3 affected caps OR a playbook -> HIGH assert kp.impact_level == ImpactLevel.HIGH assert "priorisieren" in kp.recommendation def test_low_impact_when_small_and_no_playbook(): idx = KnowledgeIndex(regulations=["CRA"], capability_regulations={"x": ["CRA"]}, playbook_capabilities=[]) kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["CRA"]), idx) assert kp.impact_level == ImpactLevel.LOW and kp.affected_capabilities == ["x"] def test_classification_echoed(): kp = assess_document_impact(DocumentDescriptor(document_id="d", regulations=["CRA"], keywords=["SBOM"], document_type="faq"), _index()) assert kp.classification["regulations"] == ["CRA"] assert kp.classification["keywords"] == ["sbom"] and kp.classification["document_type"] == ["faq"] def test_deterministic(): idx = _index() d = DocumentDescriptor(document_id="d", regulations=["CRA"], keywords=["sbom"]) a = assess_document_impact(d, idx) b = assess_document_impact(d, idx) assert a.model_dump() == b.model_dump()