feat(knowledge-intake): classify a document + assess its impact before extraction
Phase A1. The real knowledge production is not writing — it is TARGETED UPDATING: when 20 documents arrive, which 5 change our knowledge and which 15 are ignorable? Before the parser, Knowledge Intake classifies a new document (no content extraction) and intersects its signals with an index of the existing knowledge to emit a Knowledge Package (an impact analysis). - compliance/knowledge_intake/: build_knowledge_index(patterns, playbooks, reference_scenarios, obligation_index) + assess_document_impact(descriptor, index) -> KnowledgePackage. Deterministic, NO content extraction, NO LLM. Surfaces affected capabilities / playbooks / transition patterns / reference scenarios / (injected) obligations, whether it is a new domain, and a triage level (HIGH / LOW / NONE / NEW_DOMAIN) with a recommendation. - ADR-006: Knowledge Intake = classify + impact before extraction; full factory Intake -> Package -> Parser -> Draft -> Review -> Published; phase order A1 Intake / A2 Draft / A3 Review. - reference suite: "Knowledge Intake" section triages 3 example documents (CRA SBOM-FAQ -> high, 14C/2PB/3RTS/2Obl; environmental guidance -> new_domain; marketing blog -> ignorable). Section lives in _helpers.py to keep generate.py under the 500-LOC budget. - Honest known refinement surfaced by intake: regulation-ID normalization (CRA vs Cyber Resilience Act). 10 intake tests (60 with the adjacent modules), mypy --strict clean (16 files), check-loc 0. Product code with no app caller + ADR/reference = non-runtime -> no deploy (ADR-001). Freeze-safe. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
"""Knowledge Intake — classify an incoming document and assess its impact on existing knowledge.
|
||||
|
||||
The stage BEFORE the parser: no content extraction, only classification. Intersects a document's signals
|
||||
(regulations + keywords) with an index of the existing knowledge to emit a `KnowledgePackage` — which
|
||||
capabilities / playbooks / patterns / reference scenarios / obligations it probably touches, whether
|
||||
it is a new domain, and how much review it warrants. Deterministic, no LLM, no new corpus (freeze v1.0).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .engine import assess_document_impact, build_knowledge_index
|
||||
from .schemas import (
|
||||
DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage,
|
||||
)
|
||||
|
||||
# Public API of the package: the two engine entry points first, then the
# schema types they consume and produce.
__all__ = [
    "build_knowledge_index", "assess_document_impact",
    "DocumentDescriptor", "KnowledgeIndex", "KnowledgePackage", "ImpactLevel",
]
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Knowledge Intake — classify a document and assess its impact on existing knowledge.
|
||||
|
||||
The real Knowledge Production is not writing — it is TARGETED UPDATING: when 20 documents arrive,
|
||||
which 5 actually change our knowledge and which 15 are ignorable? Intake answers this deterministically
|
||||
by intersecting a document's signals (declared regulations + keywords) with an index of the existing
|
||||
knowledge (capabilities, playbooks, transition patterns, reference scenarios, injected obligations).
|
||||
It performs NO content extraction (that is the later parser stage) and uses NO LLM.
|
||||
|
||||
Pipeline: Knowledge Intake -> Knowledge Package -> Parser -> Draft Generator -> Review -> Published.
|
||||
Pure, deterministic, computed-not-stored. No new corpus/meta-model class (freeze v1.0). Python 3.9.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
from .schemas import DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage
|
||||
|
||||
|
||||
def _targets(goal_to: Any) -> List[str]:
|
||||
"""Extract target regulations from a transition_goal.to (single dict OR list of targets)."""
|
||||
out: List[str] = []
|
||||
items = goal_to if isinstance(goal_to, list) else [goal_to]
|
||||
for it in items:
|
||||
if isinstance(it, dict):
|
||||
reg = it.get("regulation") or it.get("target") or it.get("framework")
|
||||
if reg:
|
||||
out.append(str(reg))
|
||||
return out
|
||||
|
||||
|
||||
def _goal_targets(entry: Dict[str, Any]) -> List[str]:
    """Return the target regulations of ``entry['transition_goal']['to']``; ``[]`` if the goal is absent, null or not a dict."""
    goal = entry.get("transition_goal")
    return _targets(goal.get("to") if isinstance(goal, dict) else None)


def _id_text(value: Any) -> str:
    """Stringify an identifier; a missing or null id becomes ``""`` instead of the literal ``"None"``."""
    return "" if value is None else str(value)


def build_knowledge_index(
    patterns: List[Dict[str, Any]],
    playbooks: List[Dict[str, Any]],
    reference_scenarios: List[Dict[str, Any]],
    obligation_index: Optional[Dict[str, List[str]]] = None,
) -> KnowledgeIndex:
    """Assemble the matching index from already-loaded knowledge dicts (file I/O stays in the caller).

    Args:
        patterns: Transition-pattern dicts. Read keys: ``id``, ``transition_goal.to``,
            ``likely_covered`` and ``delta_requirements`` (lists of dicts with
            ``capability`` and optional ``covers_targets``).
        playbooks: Playbook dicts; only ``capability_id`` is read.
        reference_scenarios: Scenario dicts; ``id`` and ``transition_goal.to`` are read.
        obligation_index: Optional injected mapping regulation -> obligation ids.

    Returns:
        A ``KnowledgeIndex`` whose ``regulations`` is the sorted union of every
        regulation seen in patterns, scenarios, capability coverage and the
        obligation-index keys.

    Keys that are present but null (e.g. an empty YAML value) are treated like
    missing keys, and non-dict coverage items are skipped, so partially filled
    knowledge files do not abort index construction.
    """
    tp: Dict[str, List[str]] = {}
    cap_regs: Dict[str, List[str]] = {}
    for p in patterns:
        pid = _id_text(p.get("id"))
        targets = _goal_targets(p)
        if pid:
            tp[pid] = targets
        # ``or []`` guards against keys that exist with an explicit null value.
        items = list(p.get("likely_covered") or []) + list(p.get("delta_requirements") or [])
        for item in items:
            if not isinstance(item, dict):
                continue
            cap = item.get("capability")
            if not cap:
                continue
            # A capability without its own covers_targets inherits the pattern's targets.
            regs = [str(t) for t in (item.get("covers_targets") or [])] or targets
            key = str(cap)
            cap_regs[key] = sorted(set(cap_regs.get(key, [])) | set(regs))

    rts: Dict[str, List[str]] = {}
    for r in reference_scenarios:
        rid = _id_text(r.get("id"))
        if rid:  # scenarios without an id cannot be referenced, so they are not indexed
            rts[rid] = _goal_targets(r)

    obl = obligation_index or {}
    regulations = sorted(
        {t for ts in tp.values() for t in ts}
        | {t for ts in rts.values() for t in ts}
        | {t for ts in cap_regs.values() for t in ts}
        | set(obl.keys())
    )
    return KnowledgeIndex(
        regulations=regulations,
        capability_regulations=cap_regs,
        playbook_capabilities=sorted({_id_text(pb.get("capability_id")) for pb in playbooks} - {""}),
        transition_patterns=tp,
        reference_scenarios=rts,
        obligation_index=dict(obl),  # shallow copy so the index does not alias the caller's dict
    )
|
||||
|
||||
|
||||
def _kw_match(keywords: Set[str], capability: str) -> bool:
|
||||
tokens = set(capability.lower().split("_"))
|
||||
return bool(keywords & tokens) or capability.lower() in keywords
|
||||
|
||||
|
||||
def assess_document_impact(descriptor: DocumentDescriptor, index: KnowledgeIndex) -> KnowledgePackage:
    """Triage one document against the knowledge index and return its impact analysis.

    Matching is exact on regulation ids and, for capabilities only, additionally
    on lower-cased keywords (see ``_kw_match``). Every result list is sorted.

    Triage order:
      1. NEW_DOMAIN - the document declares regulations and none is in the index.
      2. NONE       - nothing at all was matched.
      3. HIGH       - at least 3 capabilities, or any playbook, or at least 5 obligations.
      4. LOW        - anything else.
    """
    declared = set(descriptor.regulations)
    lowered_keywords = {word.lower() for word in descriptor.keywords}
    corpus_regs = set(index.regulations)

    is_new_domain = bool(declared) and declared.isdisjoint(corpus_regs)
    not_in_corpus = sorted(declared.difference(corpus_regs))

    def hits(regs: List[str]) -> bool:
        # True when the document declares at least one of the given regulations.
        return not declared.isdisjoint(regs)

    hit_caps = sorted(
        cap
        for cap, regs in index.capability_regulations.items()
        if hits(regs) or _kw_match(lowered_keywords, cap)
    )
    # Playbooks are keyed by capability, so only already-affected capabilities qualify.
    hit_playbooks = sorted(set(index.playbook_capabilities).intersection(hit_caps))
    hit_patterns = sorted(pid for pid, regs in index.transition_patterns.items() if hits(regs))
    hit_scenarios = sorted(rid for rid, regs in index.reference_scenarios.items() if hits(regs))

    obligation_ids = set()
    for regulation in declared:
        obligation_ids.update(index.obligation_index.get(regulation, []))
    hit_obligations = sorted(obligation_ids)

    counts = (len(hit_caps), len(hit_playbooks), len(hit_patterns), len(hit_scenarios), len(hit_obligations))

    if is_new_domain:
        level = ImpactLevel.NEW_DOMAIN
        advice = "Neue Domäne — Corpus-Intake nötig (kein bestehendes Wissen betroffen)."
    elif sum(counts) == 0:
        level = ImpactLevel.NONE
        advice = "Wahrscheinlich ignorierbar — betrifft keinen bekannten Wissensbaustein."
    elif len(hit_caps) >= 3 or hit_playbooks or len(hit_obligations) >= 5:
        level = ImpactLevel.HIGH
        advice = "Gezielter Review priorisieren — hoher Impact auf bestehendes Wissen."
    else:
        level = ImpactLevel.LOW
        advice = "Gezielter Review — geringer, eingegrenzter Impact."

    domain_note = "NEUE Domäne" if is_new_domain else "keine neue Domäne"
    summary = (
        "Betrifft %d Capabilities, %d Playbooks, %d Patterns, %d Reference Scenarios, %d Obligations; %s."
        % (counts + (domain_note,))
    )

    # Echo of the classification inputs; document_type is wrapped in a list to fit Dict[str, List[str]].
    doc_types = [descriptor.document_type] if descriptor.document_type else []
    classification = {
        "regulations": sorted(declared),
        "keywords": sorted(lowered_keywords),
        "document_type": doc_types,
    }

    return KnowledgePackage(
        document_id=descriptor.document_id,
        classification=classification,
        new_domain=is_new_domain,
        unknown_regulations=not_in_corpus,
        affected_capabilities=hit_caps,
        affected_playbooks=hit_playbooks,
        affected_transition_patterns=hit_patterns,
        affected_reference_scenarios=hit_scenarios,
        affected_obligations=hit_obligations,
        impact_level=level,
        impact_summary=summary,
        recommendation=advice,
    )
|
||||
@@ -0,0 +1,62 @@
|
||||
"""Schemas for Knowledge Intake — classify a new document and assess its IMPACT (no extraction yet).
|
||||
|
||||
Before the parser/draft stages, Intake answers "which parts of our knowledge base are affected at
all?". It does NOT extract content — it only classifies the document and intersects its signals
|
||||
with an index of the existing knowledge (capabilities, playbooks, transition patterns, reference
|
||||
scenarios, injected obligations) to emit a `KnowledgePackage` (an impact analysis). Deterministic,
|
||||
computed-not-stored, no new corpus, no new meta-model class (freeze v1.0). Python 3.9 compatible.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Dict, List
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ImpactLevel(str, Enum):
    """Triage level assigned to an incoming document.

    NONE        touches nothing known; likely ignorable.
    LOW         touches a little; targeted review.
    HIGH        touches a lot; prioritise review.
    NEW_DOMAIN  references only unknown regulations; needs domain intake.
    """

    NONE = "none"
    LOW = "low"
    HIGH = "high"
    NEW_DOMAIN = "new_domain"
|
||||
|
||||
|
||||
class DocumentDescriptor(BaseModel):
    """Classification inputs for one incoming document; carries no content body."""

    document_id: str
    title: str = ""
    # Issuing body, e.g. BSI, ENISA, EU.
    source: str = ""
    # Kind of document, e.g. guidance, faq, regulation, recommendation.
    document_type: str = ""
    # Regulations the document declares it references.
    regulations: List[str] = Field(default_factory=list)
    # Lightweight topic signals, e.g. "sbom".
    keywords: List[str] = Field(default_factory=list)
    product_types: List[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class KnowledgeIndex(BaseModel):
    """Deterministic lookup structure over the EXISTING knowledge, used to match incoming documents."""

    # Every regulation the corpus knows.
    regulations: List[str] = Field(default_factory=list)
    # capability -> regulations it covers (covers_targets).
    capability_regulations: Dict[str, List[str]] = Field(default_factory=dict)
    # Capabilities that HAVE a playbook.
    playbook_capabilities: List[str] = Field(default_factory=list)
    # pattern_id -> target regulations.
    transition_patterns: Dict[str, List[str]] = Field(default_factory=dict)
    # rts_id -> regulations.
    reference_scenarios: Dict[str, List[str]] = Field(default_factory=dict)
    # regulation -> obligation ids; INJECTED by the caller.
    obligation_index: Dict[str, List[str]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class KnowledgePackage(BaseModel):
    """Impact analysis for a single document: which existing knowledge it probably touches, and how much."""

    document_id: str
    # Echo of the document's regulations / keywords / document type.
    classification: Dict[str, List[str]] = Field(default_factory=dict)

    # Domain novelty.
    new_domain: bool = False
    unknown_regulations: List[str] = Field(default_factory=list)

    # Knowledge elements the document probably touches.
    affected_capabilities: List[str] = Field(default_factory=list)
    affected_playbooks: List[str] = Field(default_factory=list)
    affected_transition_patterns: List[str] = Field(default_factory=list)
    affected_reference_scenarios: List[str] = Field(default_factory=list)
    affected_obligations: List[str] = Field(default_factory=list)

    # Triage verdict with human-readable explanation.
    impact_level: ImpactLevel = ImpactLevel.NONE
    impact_summary: str = ""
    recommendation: str = ""
|
||||
Reference in New Issue
Block a user