# breakpilot-compliance/backend-compliance/compliance/knowledge_intake/schemas.py

"""Schemas for Knowledge Intake — classify a new document and assess its IMPACT (no extraction yet).

Before the parser/draft stages, Intake answers "which parts of our knowledge base are affected at
all?". It does NOT extract content — it only classifies the document and intersects its signals
with an index of the existing knowledge (capabilities, playbooks, transition patterns, reference
scenarios, injected obligations) to emit a `KnowledgePackage` (an impact analysis). Deterministic,
computed-not-stored, no new corpus, no new meta-model class (freeze v1.0). Python 3.9 compatible.
"""

from __future__ import annotations

from enum import Enum
from typing import Dict, List

from pydantic import BaseModel, Field


class ImpactLevel(str, Enum):
    """How strongly an incoming document touches the existing knowledge.

    Members are also `str` instances, so each one compares equal to its plain string value.
    """

    # Touches nothing known -> likely ignorable.
    NONE = "none"
    # Touches a little -> targeted review.
    LOW = "low"
    # Touches a lot -> prioritise review.
    HIGH = "high"
    # References only unknown regulations -> domain intake.
    NEW_DOMAIN = "new_domain"


class DocumentDescriptor(BaseModel):
    """Classification inputs for one incoming document.

    Carries lightweight signals only — the document's content body is NOT part of this model.
    `document_id` is the only required field; strings default to "" and lists default to empty.
    """

    document_id: str
    title: str = ""
    # Origin of the document, e.g. BSI, ENISA, EU.
    source: str = ""
    # Kind of document, e.g. guidance, faq, regulation, recommendation.
    document_type: str = ""
    # Regulations the document declares that it references.
    regulations: List[str] = Field(default_factory=list)
    # Lightweight topic signals, e.g. sbom.
    keywords: List[str] = Field(default_factory=list)
    product_types: List[str] = Field(default_factory=list)


class KnowledgeIndex(BaseModel):
    """Deterministic index of the EXISTING knowledge that an incoming document is matched against.

    Every field is optional and defaults to an empty list or an empty mapping.
    """

    # All regulations the corpus knows.
    regulations: List[str] = Field(default_factory=list)
    # capability -> covers_targets
    capability_regulations: Dict[str, List[str]] = Field(default_factory=dict)
    # Capabilities that HAVE a playbook.
    playbook_capabilities: List[str] = Field(default_factory=list)
    # pattern_id -> target regulations
    transition_patterns: Dict[str, List[str]] = Field(default_factory=dict)
    # rts_id -> regulations
    reference_scenarios: Dict[str, List[str]] = Field(default_factory=dict)
    # regulation -> obligation ids (INJECTED)
    obligation_index: Dict[str, List[str]] = Field(default_factory=dict)


class KnowledgePackage(BaseModel):
    """Impact analysis for a single document: which existing knowledge it probably touches, and how much.

    Only `document_id` is required. All other fields start out "empty": empty collections,
    empty strings, `new_domain=False` and `impact_level=ImpactLevel.NONE`.
    """

    document_id: str

    # Echoed regulations/keywords/types.
    classification: Dict[str, List[str]] = Field(default_factory=dict)

    # Unknown-territory signals.
    new_domain: bool = False
    unknown_regulations: List[str] = Field(default_factory=list)

    # Parts of the existing knowledge the document probably touches, grouped by kind.
    affected_capabilities: List[str] = Field(default_factory=list)
    affected_playbooks: List[str] = Field(default_factory=list)
    affected_transition_patterns: List[str] = Field(default_factory=list)
    affected_reference_scenarios: List[str] = Field(default_factory=list)
    affected_obligations: List[str] = Field(default_factory=list)

    # Overall verdict plus its free-text explanation and suggested next step.
    impact_level: ImpactLevel = ImpactLevel.NONE
    impact_summary: str = ""
    recommendation: str = ""