"""Knowledge Intake — classify a document and assess its impact on existing knowledge. The real Knowledge Production is not writing — it is TARGETED UPDATING: when 20 documents arrive, which 5 actually change our knowledge and which 15 are ignorable? Intake answers this deterministically by intersecting a document's signals (declared regulations + keywords) with an index of the existing knowledge (capabilities, playbooks, transition patterns, reference scenarios, injected obligations). It performs NO content extraction (that is the later parser stage) and uses NO LLM. Pipeline: Knowledge Intake -> Knowledge Package -> Parser -> Draft Generator -> Review -> Published. Pure, deterministic, computed-not-stored. No new corpus/meta-model class (freeze v1.0). Python 3.9. """ from __future__ import annotations from typing import Any, Dict, List, Optional, Set from .schemas import DocumentDescriptor, ImpactLevel, KnowledgeIndex, KnowledgePackage def _targets(goal_to: Any) -> List[str]: """Extract target regulations from a transition_goal.to (single dict OR list of targets).""" out: List[str] = [] items = goal_to if isinstance(goal_to, list) else [goal_to] for it in items: if isinstance(it, dict): reg = it.get("regulation") or it.get("target") or it.get("framework") if reg: out.append(str(reg)) return out def build_knowledge_index( patterns: List[Dict[str, Any]], playbooks: List[Dict[str, Any]], reference_scenarios: List[Dict[str, Any]], obligation_index: Optional[Dict[str, List[str]]] = None, ) -> KnowledgeIndex: """Assemble the matching index from already-loaded knowledge dicts (file I/O stays in the caller).""" tp: Dict[str, List[str]] = {} cap_regs: Dict[str, List[str]] = {} for p in patterns: pid = str(p.get("id", "")) targets = _targets(p.get("transition_goal", {}).get("to")) if pid: tp[pid] = targets for item in list(p.get("likely_covered", [])) + list(p.get("delta_requirements", [])): cap = item.get("capability") if not cap: continue regs = [str(t) for t in item.get("covers_targets", [])] or targets cap_regs.setdefault(str(cap), []) cap_regs[str(cap)] = sorted(set(cap_regs[str(cap)]) | set(regs)) rts = {str(r.get("id", "")): _targets(r.get("transition_goal", {}).get("to")) for r in reference_scenarios} rts.pop("", None) obl = obligation_index or {} regulations = sorted( {t for ts in tp.values() for t in ts} | {t for ts in rts.values() for t in ts} | {t for ts in cap_regs.values() for t in ts} | set(obl.keys()) ) return KnowledgeIndex( regulations=regulations, capability_regulations=cap_regs, playbook_capabilities=sorted({str(pb.get("capability_id", "")) for pb in playbooks} - {""}), transition_patterns=tp, reference_scenarios=rts, obligation_index=dict(obl), ) def _kw_match(keywords: Set[str], capability: str) -> bool: tokens = set(capability.lower().split("_")) return bool(keywords & tokens) or capability.lower() in keywords def assess_document_impact(descriptor: DocumentDescriptor, index: KnowledgeIndex) -> KnowledgePackage: """Classify the document and compute which existing knowledge it probably touches, and how much.""" doc_regs = set(descriptor.regulations) known = set(index.regulations) unknown = sorted(doc_regs - known) new_domain = bool(doc_regs) and not (doc_regs & known) kw = {k.lower() for k in descriptor.keywords} caps = sorted(c for c, regs in index.capability_regulations.items() if (set(regs) & doc_regs) or _kw_match(kw, c)) playbooks = sorted(set(caps) & set(index.playbook_capabilities)) patterns = sorted(pid for pid, regs in index.transition_patterns.items() if set(regs) & doc_regs) scenarios = sorted(rid for rid, regs in index.reference_scenarios.items() if set(regs) & doc_regs) obligations = sorted({o for r in doc_regs for o in index.obligation_index.get(r, [])}) total = len(caps) + len(playbooks) + len(patterns) + len(scenarios) + len(obligations) if new_domain: level, rec = ImpactLevel.NEW_DOMAIN, "Neue Domäne — Corpus-Intake nötig (kein bestehendes Wissen betroffen)." elif total == 0: level, rec = ImpactLevel.NONE, "Wahrscheinlich ignorierbar — betrifft keinen bekannten Wissensbaustein." elif len(caps) >= 3 or playbooks or len(obligations) >= 5: level, rec = ImpactLevel.HIGH, "Gezielter Review priorisieren — hoher Impact auf bestehendes Wissen." else: level, rec = ImpactLevel.LOW, "Gezielter Review — geringer, eingegrenzter Impact." summary = "Betrifft %d Capabilities, %d Playbooks, %d Patterns, %d Reference Scenarios, %d Obligations; %s." % ( len(caps), len(playbooks), len(patterns), len(scenarios), len(obligations), "NEUE Domäne" if new_domain else "keine neue Domäne", ) return KnowledgePackage( document_id=descriptor.document_id, classification={"regulations": sorted(doc_regs), "keywords": sorted(kw), "document_type": [descriptor.document_type] if descriptor.document_type else []}, new_domain=new_domain, unknown_regulations=unknown, affected_capabilities=caps, affected_playbooks=playbooks, affected_transition_patterns=patterns, affected_reference_scenarios=scenarios, affected_obligations=obligations, impact_level=level, impact_summary=summary, recommendation=rec, )