"""Observation Model — the empirical learning unit (Task 59a: model BEFORE persistence/API). The learning point is NOT the hypothesis, it is the QUESTION. A hypothesis ("ISO 27001 suggests supplier management") produces a question ("Is there a documented supplier-security process?"), and the answer is rarely binary — "yes" / "no" / "partial, only critical suppliers" / "certified but not lived" are very different observations. So the chain is: Hypothesis -> Question -> Observation -> (Review) -> Hypothesis Two principles (durable): - Richer than confirmed/refuted: an Observation carries an `observation_type` (confirmed / partial / refuted / not_applicable / unknown), a free-text answer, a scope_note ("only critical suppliers"), and whether evidence was uploaded. - REVIEW GATE: a raw answer NEVER changes a hypothesis directly. Only REVIEWED observations calibrate; otherwise the system learns from outliers. Hypotheses stay curated knowledge; confidence is COMPUTED from the reviewed observation stream (keyed by hypothesis id), not stored on the hypothesis. This module defines the model + the deterministic statistics it enables (a DISTRIBUTION, not a single %). Persistence (store), aggregation across customers and hypothesis calibration are later tasks (59b/c/d). Pure, no I/O. Python 3.9 compatible. """ from __future__ import annotations from enum import Enum from typing import Dict, List, Optional, Sequence from pydantic import BaseModel, Field class ObservationType(str, Enum): CONFIRMED = "confirmed" PARTIAL = "partial" REFUTED = "refuted" NOT_APPLICABLE = "not_applicable" UNKNOWN = "unknown" class Observation(BaseModel): """One real-onboarding answer to one hypothesis-driven question. The raw empirical unit.""" hypothesis_id: str capability: str = "" # denormalised for convenient aggregation question: str = "" # the question that was actually asked answer: str = "" # the customer's raw answer (free text) observation_type: ObservationType = ObservationType.UNKNOWN scope_note: Optional[str] = None # "only critical suppliers" / "only DE" / "not lived" evidence_uploaded: bool = False reviewed: bool = False # the review gate: only reviewed obs calibrate reviewed_by: Optional[str] = None # observation types that count as evidence for/against the capability (n/a + unknown do not) _FOR_AGAINST = (ObservationType.CONFIRMED, ObservationType.PARTIAL, ObservationType.REFUTED) def empirical_distribution( observations: Sequence[Observation], reviewed_only: bool = True ) -> Dict[str, int]: """Count observations per type — the DISTRIBUTION (e.g. confirmed 61 / partial 31 / refuted 8), far richer than a single percentage. By default only REVIEWED observations count (the review gate).""" dist = {t.value: 0 for t in ObservationType} for o in observations: if o.reviewed or not reviewed_only: dist[o.observation_type.value] += 1 return dist def empirical_confidence( observations: Sequence[Observation], reviewed_only: bool = True ) -> Optional[float]: """Confidence from the reviewed stream: (confirmed + 0.5*partial) / (confirmed+partial+refuted). `not_applicable` and `unknown` are excluded from the denominator (they are not evidence either way). `None` until any for/against observation is reviewed — never an expert/LLM score.""" dist = empirical_distribution(observations, reviewed_only) base = dist[ObservationType.CONFIRMED.value] + dist[ObservationType.PARTIAL.value] + dist[ObservationType.REFUTED.value] if base == 0: return None return round((dist[ObservationType.CONFIRMED.value] + 0.5 * dist[ObservationType.PARTIAL.value]) / base, 2) def reviewed(observations: Sequence[Observation]) -> List[Observation]: """The calibration set: only reviewed observations (a raw answer never updates a hypothesis).""" return [o for o in observations if o.reviewed]