feat(knowledge-intake): classify a document + assess its impact before extraction

Phase A1. The real knowledge production is not writing — it is TARGETED UPDATING: when 20 documents
arrive, which 5 change our knowledge and which 15 are ignorable? Before the parser, Knowledge Intake
classifies a new document (no content extraction) and intersects its signals with an index of the
existing knowledge to emit a Knowledge Package (an impact analysis).

- compliance/knowledge_intake/: build_knowledge_index(patterns, playbooks, reference_scenarios,
  obligation_index) + assess_document_impact(descriptor, index) -> KnowledgePackage. Deterministic,
  NO content extraction, NO LLM. Surfaces affected capabilities / playbooks / transition patterns /
  reference scenarios / (injected) obligations, whether it is a new domain, and a triage level
  (HIGH / LOW / NONE / NEW_DOMAIN) with a recommendation.
- ADR-006: Knowledge Intake = classify + impact before extraction; full factory Intake -> Package ->
  Parser -> Draft -> Review -> Published; phase order A1 Intake / A2 Draft / A3 Review.
- reference suite: "Knowledge Intake" section triages 3 example documents (CRA SBOM-FAQ -> high,
  14C/2PB/3RTS/2Obl; environmental guidance -> new_domain; marketing blog -> ignorable). Section
  lives in _helpers.py to keep generate.py under the 500-LOC budget.
- Known limitation surfaced by intake: regulation IDs are not yet normalized, so "CRA" and "Cyber Resilience Act" are treated as different regulations.

10 intake tests (60 with the adjacent modules), mypy --strict clean (16 files), check-loc 0.
Product code with no app caller + ADR/reference = non-runtime -> no deploy (ADR-001). Freeze-safe.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-27 13:58:59 +02:00
parent d51bcd77c7
commit 07e392913f
8 changed files with 419 additions and 2 deletions
@@ -0,0 +1,111 @@
"""Knowledge Intake — classify a document and assess its impact on existing knowledge.
The real Knowledge Production is not writing — it is TARGETED UPDATING: when 20 documents arrive,
which 5 actually change our knowledge and which 15 are ignorable? Intake answers this deterministically
by intersecting a document's signals (declared regulations + keywords) with an index of the existing
knowledge (capabilities, playbooks, transition patterns, reference scenarios, injected obligations).
It performs NO content extraction (that is the later parser stage) and uses NO LLM.
Pipeline: Knowledge Intake -> Knowledge Package -> Parser -> Draft Generator -> Review -> Published.
Pure, deterministic, computed-not-stored. No new corpus/meta-model class (freeze v1.0). Python 3.9.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional, Set
from .schemas import DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage
def _targets(goal_to: Any) -> List[str]:
"""Extract target regulations from a transition_goal.to (single dict OR list of targets)."""
out: List[str] = []
items = goal_to if isinstance(goal_to, list) else [goal_to]
for it in items:
if isinstance(it, dict):
reg = it.get("regulation") or it.get("target") or it.get("framework")
if reg:
out.append(str(reg))
return out
def build_knowledge_index(
    patterns: List[Dict[str, Any]],
    playbooks: List[Dict[str, Any]],
    reference_scenarios: List[Dict[str, Any]],
    obligation_index: Optional[Dict[str, List[str]]] = None,
) -> KnowledgeIndex:
    """Build the matching index from knowledge dicts the caller has already loaded.

    No file I/O happens here; the function only reshapes its arguments.

    Args:
        patterns: Transition patterns. Read keys: ``id``, ``transition_goal.to``,
            and the item lists ``likely_covered`` / ``delta_requirements`` whose
            entries may carry ``capability`` and ``covers_targets``.
        playbooks: Playbooks; only ``capability_id`` is read.
        reference_scenarios: Reference scenarios; ``id`` and
            ``transition_goal.to`` are read.
        obligation_index: Optional mapping of regulation -> obligation ids.
            ``None`` (or an empty mapping) is treated as no obligations.

    Returns:
        A ``KnowledgeIndex`` holding the sorted union of all regulations seen,
        the per-capability regulation lists (sorted, de-duplicated), the sorted
        playbook capability ids, the pattern and scenario target lists keyed by
        id, and a shallow copy of the obligation index.
    """
    pattern_targets: Dict[str, List[str]] = {}
    capability_sets: Dict[str, Set[str]] = {}
    for pattern in patterns:
        pattern_id = str(pattern.get("id", ""))
        goal_targets = _targets(pattern.get("transition_goal", {}).get("to"))
        if pattern_id:
            pattern_targets[pattern_id] = goal_targets
        entries = list(pattern.get("likely_covered", [])) + list(pattern.get("delta_requirements", []))
        for entry in entries:
            capability = entry.get("capability")
            if not capability:
                continue
            # An entry without its own covers_targets inherits the pattern's targets.
            own_targets = [str(t) for t in entry.get("covers_targets", [])]
            capability_sets.setdefault(str(capability), set()).update(own_targets or goal_targets)
    capability_regulations = {cap: sorted(regs) for cap, regs in capability_sets.items()}

    scenario_targets: Dict[str, List[str]] = {}
    for scenario in reference_scenarios:
        scenario_id = str(scenario.get("id", ""))
        scenario_goal = _targets(scenario.get("transition_goal", {}).get("to"))
        if scenario_id:  # scenarios without an id are not indexed
            scenario_targets[scenario_id] = scenario_goal

    obligations = obligation_index or {}
    known: Set[str] = set(obligations)
    for mapping in (pattern_targets, scenario_targets, capability_regulations):
        for target_list in mapping.values():
            known.update(target_list)

    playbook_ids = {str(pb.get("capability_id", "")) for pb in playbooks}
    playbook_ids.discard("")  # playbooks lacking a capability_id contribute nothing

    return KnowledgeIndex(
        regulations=sorted(known),
        capability_regulations=capability_regulations,
        playbook_capabilities=sorted(playbook_ids),
        transition_patterns=pattern_targets,
        reference_scenarios=scenario_targets,
        obligation_index=dict(obligations),
    )
def _kw_match(keywords: Set[str], capability: str) -> bool:
tokens = set(capability.lower().split("_"))
return bool(keywords & tokens) or capability.lower() in keywords
def assess_document_impact(descriptor: DocumentDescriptor, index: KnowledgeIndex) -> KnowledgePackage:
    """Classify a document and report which indexed knowledge it overlaps with.

    Matching uses only the descriptor's declared regulations and keywords:

    * capabilities match when they share a regulation with the document or when
      a lower-cased keyword matches the capability id (see ``_kw_match``);
    * playbooks are the matched capabilities that also have a playbook;
    * transition patterns and reference scenarios match on shared regulations;
    * obligations are looked up per declared regulation in the obligation index.

    Triage, evaluated in this order:

    1. NEW_DOMAIN — regulations are declared and none of them is in the index;
    2. NONE — nothing at all matched;
    3. HIGH — at least 3 capabilities, or any playbook, or at least 5 obligations;
    4. LOW — everything else.

    Returns:
        A ``KnowledgePackage`` with the classification, the affected items
        (each list sorted), the declared regulations missing from the index,
        the impact level, a summary line and a recommendation.
    """
    declared = set(descriptor.regulations)
    indexed = set(index.regulations)
    keywords = {word.lower() for word in descriptor.keywords}
    is_new_domain = bool(declared) and declared.isdisjoint(indexed)

    def _shares_regulation(regs: List[str]) -> bool:
        return not declared.isdisjoint(regs)

    hit_capabilities = sorted(
        cap
        for cap, regs in index.capability_regulations.items()
        if _shares_regulation(regs) or _kw_match(keywords, cap)
    )
    hit_playbooks = sorted(set(index.playbook_capabilities).intersection(hit_capabilities))
    hit_patterns = sorted(
        pid for pid, regs in index.transition_patterns.items() if _shares_regulation(regs)
    )
    hit_scenarios = sorted(
        rid for rid, regs in index.reference_scenarios.items() if _shares_regulation(regs)
    )
    obligation_pool: Set[str] = set()
    for regulation in declared:
        obligation_pool.update(index.obligation_index.get(regulation, []))
    hit_obligations = sorted(obligation_pool)

    counts = (
        len(hit_capabilities), len(hit_playbooks), len(hit_patterns),
        len(hit_scenarios), len(hit_obligations),
    )

    # The order of these checks is the triage priority: new domain beats everything.
    if is_new_domain:
        level = ImpactLevel.NEW_DOMAIN
        rec = "Neue Domäne — Corpus-Intake nötig (kein bestehendes Wissen betroffen)."
    elif not any(counts):
        level = ImpactLevel.NONE
        rec = "Wahrscheinlich ignorierbar — betrifft keinen bekannten Wissensbaustein."
    elif len(hit_capabilities) >= 3 or hit_playbooks or len(hit_obligations) >= 5:
        level = ImpactLevel.HIGH
        rec = "Gezielter Review priorisieren — hoher Impact auf bestehendes Wissen."
    else:
        level = ImpactLevel.LOW
        rec = "Gezielter Review — geringer, eingegrenzter Impact."

    domain_label = "NEUE Domäne" if is_new_domain else "keine neue Domäne"
    summary = (
        "Betrifft %d Capabilities, %d Playbooks, %d Patterns, %d Reference Scenarios, %d Obligations; %s."
        % (counts + (domain_label,))
    )

    doc_types = [descriptor.document_type] if descriptor.document_type else []
    return KnowledgePackage(
        document_id=descriptor.document_id,
        classification={
            "regulations": sorted(declared),
            "keywords": sorted(keywords),
            "document_type": doc_types,
        },
        new_domain=is_new_domain,
        unknown_regulations=sorted(declared - indexed),
        affected_capabilities=hit_capabilities,
        affected_playbooks=hit_playbooks,
        affected_transition_patterns=hit_patterns,
        affected_reference_scenarios=hit_scenarios,
        affected_obligations=hit_obligations,
        impact_level=level,
        impact_summary=summary,
        recommendation=rec,
    )