feat(knowledge-intake): classify a document + assess its impact before extraction

Phase A1. The real knowledge production is not writing — it is TARGETED UPDATING: when 20 documents
arrive, which 5 change our knowledge and which 15 are ignorable? Before the parser, Knowledge Intake
classifies a new document (no content extraction) and intersects its signals with an index of the
existing knowledge to emit a Knowledge Package (an impact analysis).

- compliance/knowledge_intake/: build_knowledge_index(patterns, playbooks, reference_scenarios,
  obligation_index) + assess_document_impact(descriptor, index) -> KnowledgePackage. Deterministic,
  NO content extraction, NO LLM. Surfaces affected capabilities / playbooks / transition patterns /
  reference scenarios / (injected) obligations, whether it is a new domain, and a triage level
  (HIGH / LOW / NONE / NEW_DOMAIN) with a recommendation.
- ADR-006: Knowledge Intake = classify + impact before extraction; full factory Intake -> Package ->
  Parser -> Draft -> Review -> Published; phase order A1 Intake / A2 Draft / A3 Review.
- reference suite: "Knowledge Intake" section triages 3 example documents (CRA SBOM-FAQ -> high,
  14C/2PB/3RTS/2Obl; environmental guidance -> new_domain; marketing blog -> ignorable). Section
  lives in _helpers.py to keep generate.py under the 500-LOC budget.
- Honest known refinement surfaced by intake: regulation-ID normalization (CRA vs Cyber Resilience Act).

10 intake tests (60 with the adjacent modules), mypy --strict clean (16 files), check-loc 0.
Product code with no app caller + ADR/reference = non-runtime -> no deploy (ADR-001). Freeze-safe.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-27 13:58:59 +02:00
parent d51bcd77c7
commit 07e392913f
8 changed files with 419 additions and 2 deletions
@@ -0,0 +1,23 @@
"""Knowledge Intake — classify an incoming document and assess its impact on existing knowledge.
The stage BEFORE the parser: no content extraction, only classification (Einordnung). Intersects a document's signals
(regulations + keywords) with an index of the existing knowledge to emit a `KnowledgePackage` — which
capabilities / playbooks / patterns / reference scenarios / obligations it probably touches, whether
it is a new domain, and how much review it warrants. Deterministic, no LLM, no new corpus (freeze v1.0).
"""
from __future__ import annotations
from .engine import assess_document_impact, build_knowledge_index
from .schemas import (
DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage,
)
# Public API of the package: the two engine entry points first, then the schema types
# re-exported from .schemas.
__all__ = [
    "build_knowledge_index",
    "assess_document_impact",
    "DocumentDescriptor",
    "KnowledgeIndex",
    "KnowledgePackage",
    "ImpactLevel",
]
@@ -0,0 +1,111 @@
"""Knowledge Intake — classify a document and assess its impact on existing knowledge.
The real Knowledge Production is not writing — it is TARGETED UPDATING: when 20 documents arrive,
which 5 actually change our knowledge and which 15 are ignorable? Intake answers this deterministically
by intersecting a document's signals (declared regulations + keywords) with an index of the existing
knowledge (capabilities, playbooks, transition patterns, reference scenarios, injected obligations).
It performs NO content extraction (that is the later parser stage) and uses NO LLM.
Pipeline: Knowledge Intake -> Knowledge Package -> Parser -> Draft Generator -> Review -> Published.
Pure, deterministic, computed-not-stored. No new corpus/meta-model class (freeze v1.0). Python 3.9.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional, Set
from .schemas import DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage
def _targets(goal_to: Any) -> List[str]:
"""Extract target regulations from a transition_goal.to (single dict OR list of targets)."""
out: List[str] = []
items = goal_to if isinstance(goal_to, list) else [goal_to]
for it in items:
if isinstance(it, dict):
reg = it.get("regulation") or it.get("target") or it.get("framework")
if reg:
out.append(str(reg))
return out
def build_knowledge_index(
    patterns: List[Dict[str, Any]],
    playbooks: List[Dict[str, Any]],
    reference_scenarios: List[Dict[str, Any]],
    obligation_index: Optional[Dict[str, List[str]]] = None,
) -> KnowledgeIndex:
    """Assemble the matching index from already-loaded knowledge dicts (file I/O stays in the caller).

    Args:
        patterns: Transition-pattern dicts. Read keys: ``id``, ``transition_goal.to``,
            ``likely_covered`` and ``delta_requirements`` (lists of items with
            ``capability`` and optional ``covers_targets``).
        playbooks: Playbook dicts; only ``capability_id`` is read.
        reference_scenarios: Reference-scenario dicts; ``id`` and ``transition_goal.to`` are read.
        obligation_index: Optional injected mapping regulation -> obligation ids.

    Returns:
        A ``KnowledgeIndex`` whose ``regulations`` is the sorted union of every regulation
        seen in patterns, reference scenarios, capability coverage and the obligation index.

    Keys that are present but null (or of an unexpected shape) are treated like missing
    keys instead of raising, so partially filled knowledge files do not abort the build.
    """
    tp: Dict[str, List[str]] = {}
    cap_sets: Dict[str, Set[str]] = {}
    for p in patterns:
        pid = str(p.get("id", ""))
        targets = _goal_targets(p)
        if pid:
            tp[pid] = targets
        # ``or []`` guards against an explicit null value, which ``.get(key, [])`` lets through.
        items = list(p.get("likely_covered") or []) + list(p.get("delta_requirements") or [])
        for item in items:
            if not isinstance(item, dict):
                continue
            cap = item.get("capability")
            if not cap:
                continue
            # An item without its own covers_targets inherits the pattern's targets.
            regs = [str(t) for t in item.get("covers_targets") or []] or targets
            cap_sets.setdefault(str(cap), set()).update(regs)
    # Sort once per capability at the end; dict order stays first-seen as before.
    cap_regs: Dict[str, List[str]] = {cap: sorted(regs) for cap, regs in cap_sets.items()}

    rts = {str(r.get("id", "")): _goal_targets(r) for r in reference_scenarios}
    rts.pop("", None)  # scenarios without an id cannot be referenced

    obl = obligation_index or {}
    regulations = sorted(
        {t for ts in tp.values() for t in ts}
        | {t for ts in rts.values() for t in ts}
        | {t for ts in cap_regs.values() for t in ts}
        | set(obl.keys())
    )
    return KnowledgeIndex(
        regulations=regulations,
        capability_regulations=cap_regs,
        playbook_capabilities=sorted({str(pb.get("capability_id", "")) for pb in playbooks} - {""}),
        transition_patterns=tp,
        reference_scenarios=rts,
        obligation_index=dict(obl),
    )


def _goal_targets(entry: Dict[str, Any]) -> List[str]:
    """Return the targets of ``entry['transition_goal']['to']``; a missing, null or non-dict goal yields []."""
    goal = entry.get("transition_goal")
    return _targets(goal.get("to") if isinstance(goal, dict) else None)
def _kw_match(keywords: Set[str], capability: str) -> bool:
tokens = set(capability.lower().split("_"))
return bool(keywords & tokens) or capability.lower() in keywords
def assess_document_impact(descriptor: DocumentDescriptor, index: KnowledgeIndex) -> KnowledgePackage:
    """Classify the document and compute which existing knowledge it probably touches, and how much.

    Matching rules:
      * capabilities: regulation overlap OR keyword match against the capability id;
      * playbooks: affected capabilities that have a playbook;
      * transition patterns / reference scenarios: regulation overlap;
      * obligations: every id listed in the index for a declared regulation.

    Triage: NEW_DOMAIN when regulations are declared but none is known to the index;
    NONE when nothing is touched; HIGH for >= 3 capabilities, any playbook, or
    >= 5 obligations; LOW otherwise.
    """
    declared = set(descriptor.regulations)
    keywords = {word.lower() for word in descriptor.keywords}

    def touches(regs: List[str]) -> bool:
        # True when the document declares at least one of these regulations.
        return not declared.isdisjoint(regs)

    hit_caps = sorted(
        cap
        for cap, regs in index.capability_regulations.items()
        if touches(regs) or _kw_match(keywords, cap)
    )
    hit_playbooks = sorted(set(hit_caps).intersection(index.playbook_capabilities))
    hit_patterns = sorted(pid for pid, regs in index.transition_patterns.items() if touches(regs))
    hit_scenarios = sorted(rid for rid, regs in index.reference_scenarios.items() if touches(regs))
    hit_obligations = sorted(
        {oid for reg in declared for oid in index.obligation_index.get(reg, [])}
    )

    corpus_regs = set(index.regulations)
    not_in_corpus = sorted(declared - corpus_regs)
    # New domain: something is declared, and nothing declared is known to the index.
    is_new_domain = bool(declared) and declared.isdisjoint(corpus_regs)

    counts = (
        len(hit_caps), len(hit_playbooks), len(hit_patterns),
        len(hit_scenarios), len(hit_obligations),
    )
    if is_new_domain:
        level = ImpactLevel.NEW_DOMAIN
        rec = "Neue Domäne — Corpus-Intake nötig (kein bestehendes Wissen betroffen)."
    elif not any(counts):
        level = ImpactLevel.NONE
        rec = "Wahrscheinlich ignorierbar — betrifft keinen bekannten Wissensbaustein."
    elif len(hit_caps) >= 3 or hit_playbooks or len(hit_obligations) >= 5:
        level = ImpactLevel.HIGH
        rec = "Gezielter Review priorisieren — hoher Impact auf bestehendes Wissen."
    else:
        level = ImpactLevel.LOW
        rec = "Gezielter Review — geringer, eingegrenzter Impact."

    domain_label = "NEUE Domäne" if is_new_domain else "keine neue Domäne"
    summary = (
        "Betrifft %d Capabilities, %d Playbooks, %d Patterns, %d Reference Scenarios, %d Obligations; %s."
        % (counts + (domain_label,))
    )

    doc_types = [descriptor.document_type] if descriptor.document_type else []
    return KnowledgePackage(
        document_id=descriptor.document_id,
        classification={
            "regulations": sorted(declared),
            "keywords": sorted(keywords),
            "document_type": doc_types,
        },
        new_domain=is_new_domain,
        unknown_regulations=not_in_corpus,
        affected_capabilities=hit_caps,
        affected_playbooks=hit_playbooks,
        affected_transition_patterns=hit_patterns,
        affected_reference_scenarios=hit_scenarios,
        affected_obligations=hit_obligations,
        impact_level=level,
        impact_summary=summary,
        recommendation=rec,
    )
@@ -0,0 +1,62 @@
"""Schemas for Knowledge Intake — classify a new document and assess its IMPACT (no extraction yet).
Before the parser/draft stages, Intake answers "which parts of our knowledge base are affected at
all?". It does NOT extract content — it only classifies the document and intersects its signals
with an index of the existing knowledge (capabilities, playbooks, transition patterns, reference
scenarios, injected obligations) to emit a `KnowledgePackage` (an impact analysis). Deterministic,
computed-not-stored, no new corpus, no new meta-model class (freeze v1.0). Python 3.9 compatible.
"""
from __future__ import annotations
from enum import Enum
from typing import Dict, List
from pydantic import BaseModel, Field
class ImpactLevel(str, Enum):
    """Triage level assigned to a document by Knowledge Intake."""
    # Members subclass ``str``, so each one compares equal to its plain string value.
    NONE = "none"              # touches nothing known -> likely ignorable
    LOW = "low"                # touches a little -> targeted review
    HIGH = "high"              # touches a lot -> prioritise review
    NEW_DOMAIN = "new_domain"  # references only unknown regulations -> domain intake
class DocumentDescriptor(BaseModel):
    """Lightweight signals of an incoming document — NO content body, only classification inputs."""
    # Only document_id is required; every other field defaults to empty.
    document_id: str  # identifier of the incoming document
    title: str = ""
    source: str = ""          # e.g. BSI, ENISA, EU
    document_type: str = ""   # e.g. guidance, faq, regulation, recommendation
    regulations: List[str] = Field(default_factory=list)   # declared regulations it references
    keywords: List[str] = Field(default_factory=list)      # lightweight topic signals (e.g. sbom)
    # NOTE(review): presumably the product categories the document concerns — confirm with callers.
    product_types: List[str] = Field(default_factory=list)
class KnowledgeIndex(BaseModel):
    """A deterministic index of the EXISTING knowledge to match an incoming document against."""
    # Every field defaults to empty, so an index with no knowledge loaded is valid.
    regulations: List[str] = Field(default_factory=list)                        # all regulations the corpus knows
    capability_regulations: Dict[str, List[str]] = Field(default_factory=dict)  # capability -> covers_targets
    playbook_capabilities: List[str] = Field(default_factory=list)              # capabilities that HAVE a playbook
    transition_patterns: Dict[str, List[str]] = Field(default_factory=dict)     # pattern_id -> target regulations
    reference_scenarios: Dict[str, List[str]] = Field(default_factory=dict)     # rts_id -> regulations
    obligation_index: Dict[str, List[str]] = Field(default_factory=dict)        # regulation -> obligation ids (INJECTED)
class KnowledgePackage(BaseModel):
    """The impact analysis for one document — what of our knowledge it probably touches, and how much."""
    document_id: str  # identifier of the assessed document
    classification: Dict[str, List[str]] = Field(default_factory=dict)  # echoed regulations/keywords/types
    new_domain: bool = False  # flag paired with ImpactLevel.NEW_DOMAIN
    unknown_regulations: List[str] = Field(default_factory=list)  # declared regulations the index does not know
    # The affected_* lists name the existing knowledge items the document probably touches.
    affected_capabilities: List[str] = Field(default_factory=list)
    affected_playbooks: List[str] = Field(default_factory=list)
    affected_transition_patterns: List[str] = Field(default_factory=list)
    affected_reference_scenarios: List[str] = Field(default_factory=list)
    affected_obligations: List[str] = Field(default_factory=list)
    impact_level: ImpactLevel = ImpactLevel.NONE  # triage result; defaults to "touches nothing known"
    impact_summary: str = ""   # human-readable summary text
    recommendation: str = ""   # suggested next step for the reviewer