07e392913f
Phase A1. The real knowledge production is not writing — it is TARGETED UPDATING: when 20 documents arrive, which 5 change our knowledge and which 15 are ignorable? Before the parser, Knowledge Intake classifies a new document (no content extraction) and intersects its signals with an index of the existing knowledge to emit a Knowledge Package (an impact analysis). - compliance/knowledge_intake/: build_knowledge_index(patterns, playbooks, reference_scenarios, obligation_index) + assess_document_impact(descriptor, index) -> KnowledgePackage. Deterministic, NO content extraction, NO LLM. Surfaces affected capabilities / playbooks / transition patterns / reference scenarios / (injected) obligations, whether it is a new domain, and a triage level (HIGH / LOW / NONE / NEW_DOMAIN) with a recommendation. - ADR-006: Knowledge Intake = classify + impact before extraction; full factory Intake -> Package -> Parser -> Draft -> Review -> Published; phase order A1 Intake / A2 Draft / A3 Review. - reference suite: "Knowledge Intake" section triages 3 example documents (CRA SBOM-FAQ -> high, 14C/2PB/3RTS/2Obl; environmental guidance -> new_domain; marketing blog -> ignorable). Section lives in _helpers.py to keep generate.py under the 500-LOC budget. - Honest known refinement surfaced by intake: regulation-ID normalization (CRA vs Cyber Resilience Act). 10 intake tests (60 with the adjacent modules), mypy --strict clean (16 files), check-loc 0. Product code with no app caller + ADR/reference = non-runtime -> no deploy (ADR-001). Freeze-safe. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
98 lines
4.5 KiB
Python
98 lines
4.5 KiB
Python
"""Tests for Knowledge Intake — classify a document and assess its impact (no extraction, no LLM).
|
|
|
|
Acceptance: build a deterministic index from existing knowledge; for an incoming document, surface
which capabilities / playbooks / patterns / reference scenarios / obligations it probably touches,
whether it is a new domain, and triage it (HIGH / LOW / NONE / NEW_DOMAIN). The point: of N incoming
documents, identify the few that actually change our knowledge.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from compliance.knowledge_intake import (
|
|
DocumentDescriptor, ImpactLevel, KnowledgeIndex,
|
|
assess_document_impact, build_knowledge_index,
|
|
)
|
|
|
|
# Transition-pattern fixtures. TP-A gives its target as a single dict under "to",
# TP-B as a list of dicts, so both shapes are exercised by the index builder.
PATTERNS = [
    {
        "id": "TP-A",
        "transition_goal": {"to": {"regulation": "CRA"}},
        "delta_requirements": [
            {"capability": "sbom_creation", "covers_targets": ["CRA"]},
            {"capability": "coordinated_vulnerability_disclosure", "covers_targets": ["CRA"]},
        ],
    },
    {
        "id": "TP-B",
        "transition_goal": {
            "to": [
                {"regulation": "CRA"},
                {"regulation": "MaschinenVO"},
            ],
        },
        "delta_requirements": [
            {"capability": "machine_guards", "covers_targets": ["MaschinenVO"]},
        ],
    },
]

# Only sbom_creation has a playbook in this fixture set.
PLAYBOOKS = [
    {"capability_id": "sbom_creation"},
]

# Reference scenario fixture; note it names its goal via the "target" key
# rather than "regulation".
RTS = [
    {
        "id": "RTS-1",
        "transition_goal": {"to": [{"target": "CRA"}]},
    },
]
|
|
|
|
|
|
def _index():
    """Build the knowledge index shared by the tests from the module-level fixtures.

    Passes PATTERNS, PLAYBOOKS and RTS positionally and injects two obligations
    ("o1", "o2") for the "CRA" regulation via ``obligation_index``.
    """
    obligations = {"CRA": ["o1", "o2"]}
    return build_knowledge_index(PATTERNS, PLAYBOOKS, RTS, obligation_index=obligations)
|
|
|
|
|
|
def test_build_index_extracts_regs_caps_playbooks():
    """The built index exposes regulations, capability links, playbooks, patterns and scenarios."""
    index = _index()

    # Both regulations named by the pattern fixtures must be indexed.
    for regulation in ("CRA", "MaschinenVO"):
        assert regulation in index.regulations

    assert index.capability_regulations["sbom_creation"] == ["CRA"]
    assert index.playbook_capabilities == ["sbom_creation"]

    # TP-B declares its targets as a list under "to".
    assert index.transition_patterns["TP-B"] == ["CRA", "MaschinenVO"]  # list-form to[] handled
    # RTS-1 names its goal with "target" instead of "regulation".
    assert index.reference_scenarios["RTS-1"] == ["CRA"]  # target key handled
|
|
|
|
|
|
def test_affected_by_regulation():
    """A document declaring CRA surfaces the CRA-linked capability, pattern, scenario and obligations."""
    descriptor = DocumentDescriptor(document_id="d", regulations=["CRA"])
    package = assess_document_impact(descriptor, _index())

    assert "sbom_creation" in package.affected_capabilities
    assert "TP-A" in package.affected_transition_patterns
    assert "RTS-1" in package.affected_reference_scenarios
    # The obligations come from the obligation_index injected by _index().
    assert package.affected_obligations == ["o1", "o2"]
    assert not package.new_domain
|
|
|
|
|
|
def test_affected_by_keyword_even_without_regulation():
    """A keyword alone is enough to surface a capability and its playbook."""
    # No regulation is declared; the keyword 'sbom' must still find sbom_creation.
    descriptor = DocumentDescriptor(document_id="d", keywords=["sbom"])
    package = assess_document_impact(descriptor, _index())

    for affected in (package.affected_capabilities, package.affected_playbooks):
        assert "sbom_creation" in affected
|
|
|
|
|
|
def test_playbooks_are_affected_caps_with_a_playbook():
    """Affected playbooks are limited to affected capabilities that have a playbook."""
    descriptor = DocumentDescriptor(document_id="d", regulations=["CRA"])
    package = assess_document_impact(descriptor, _index())

    # coordinated_vulnerability_disclosure is CRA-linked but PLAYBOOKS has no entry for it.
    expected_playbooks = ["sbom_creation"]
    assert package.affected_playbooks == expected_playbooks  # cvd has no playbook here
|
|
|
|
|
|
def test_new_domain_when_only_unknown_regulations():
    """A document citing only a regulation absent from the index is triaged as a new domain."""
    descriptor = DocumentDescriptor(document_id="d", regulations=["UmweltVO"])
    package = assess_document_impact(descriptor, _index())

    assert package.new_domain
    assert package.impact_level == ImpactLevel.NEW_DOMAIN
    assert package.unknown_regulations == ["UmweltVO"]
    # Nothing in the existing knowledge is expected to be touched.
    assert package.affected_capabilities == []
|
|
|
|
|
|
def test_none_when_nothing_matches():
    """A document whose only signal is an unrelated keyword gets impact NONE and is not a new domain."""
    descriptor = DocumentDescriptor(document_id="d", keywords=["newsletter"])
    package = assess_document_impact(descriptor, _index())

    assert package.impact_level == ImpactLevel.NONE
    assert not package.new_domain
    # The recommendation text is German; it must flag the document as ignorable.
    assert "ignorierbar" in package.recommendation
|
|
|
|
|
|
def test_high_impact_triage():
    """A CRA document against the shared index is triaged HIGH and recommended for prioritisation."""
    descriptor = DocumentDescriptor(document_id="d", regulations=["CRA"])
    package = assess_document_impact(descriptor, _index())

    # >= 3 affected caps OR a playbook -> HIGH
    assert package.impact_level == ImpactLevel.HIGH
    # The recommendation text is German; it must ask to prioritise the document.
    assert "priorisieren" in package.recommendation
|
|
|
|
|
|
def test_low_impact_when_small_and_no_playbook():
    """One affected capability and no playbook yields impact LOW."""
    # Hand-built minimal index: a single capability "x" tied to CRA, no playbooks.
    small_index = KnowledgeIndex(
        regulations=["CRA"],
        capability_regulations={"x": ["CRA"]},
        playbook_capabilities=[],
    )
    descriptor = DocumentDescriptor(document_id="d", regulations=["CRA"])
    package = assess_document_impact(descriptor, small_index)

    assert package.impact_level == ImpactLevel.LOW
    assert package.affected_capabilities == ["x"]
|
|
|
|
|
|
def test_classification_echoed():
    """The package echoes the descriptor's classification signals back to the caller."""
    descriptor = DocumentDescriptor(
        document_id="d",
        regulations=["CRA"],
        keywords=["SBOM"],
        document_type="faq",
    )
    echoed = assess_document_impact(descriptor, _index()).classification

    assert echoed["regulations"] == ["CRA"]
    # The upper-case keyword is expected back in lower case.
    assert echoed["keywords"] == ["sbom"]
    # The scalar document_type is expected back wrapped in a list.
    assert echoed["document_type"] == ["faq"]
|
|
|
|
|
|
def test_deterministic():
    """Assessing the same descriptor twice against the same index gives identical dumps."""
    index = _index()
    descriptor = DocumentDescriptor(document_id="d", regulations=["CRA"], keywords=["sbom"])

    first_run = assess_document_impact(descriptor, index)
    second_run = assess_document_impact(descriptor, index)

    # Compare the serialised form so every field of the package is covered.
    assert first_run.model_dump() == second_run.model_dump()
|