feat: Observation Model — the empirical learning unit, defined BEFORE persistence (Task 59a)

The learning point is not the hypothesis, it is the QUESTION — and confirmed/refuted is too coarse. "partial, only critical suppliers" or "certified but not lived" are not "wrong"; they are valuable knowledge. So the chain is Hypothesis -> Question -> Observation -> (Review) -> Hypothesis, and the observation model must be defined cleanly before any store/API (otherwise thousands of too-coarse observations would have to be migrated later).

compliance/onboarding/observations.py:
- ObservationType: confirmed / partial / refuted / not_applicable / unknown (richer than binary).
- Observation: {hypothesis_id, capability, question, answer (free text), observation_type, scope_note ("only critical suppliers"), evidence_uploaded, reviewed, reviewed_by}.
- empirical_distribution() -> a DISTRIBUTION (confirmed 61 / partial 31 / refuted 8), not one %.
- empirical_confidence() -> (confirmed + 0.5*partial) / (confirmed+partial+refuted); n.a./unknown excluded; None until calibrated.
- REVIEW GATE: only reviewed observations calibrate — a raw answer never changes a hypothesis (no learning from outliers).

Refactor: the hypothesis is now PURE curated knowledge — the binary observations counter and any confidence are removed from CapabilityHypothesis and the YAML; confidence is COMPUTED from the separate reviewed observation stream.

Pure, mypy --strict clean. Persistence/aggregation/calibration are 59b/c/d. Non-runtime -> no deploy. 12 tests pass, check-loc 0.
2026-06-28 13:31:43 +02:00
parent 59b7006e5a
commit 98d616d82b
5 changed files with 143 additions and 76 deletions
@@ -11,12 +11,16 @@ from __future__ import annotations
 from .engine import advisor_start, apply_answer
 from .hypotheses import (
    CapabilityHypothesis,
-    HypothesisObservations,
-    empirical_confidence,
    inferred_hypotheses,
-    record_observation,
    resolve_for_certifications,
 )
+from .observations import (
+    Observation,
+    ObservationType,
+    empirical_confidence,
+    empirical_distribution,
+    reviewed,
+)
 from .schemas import (
    AdvisorMeasure,
    AdvisorQuestion,
@@ -36,9 +40,11 @@ __all__ = [
    "InferredAssumption",
    "RejectedAssumption",
    "CapabilityHypothesis",
-    "HypothesisObservations",
-    "empirical_confidence",
-    "record_observation",
    "inferred_hypotheses",
    "resolve_for_certifications",
+    "Observation",
+    "ObservationType",
+    "empirical_distribution",
+    "empirical_confidence",
+    "reviewed",
 ]
@@ -11,17 +11,15 @@ long-term moat. The library is DATA, loaded outside this module and injected. Py

 from __future__ import annotations

-from typing import Dict, List, Optional, Sequence
+from typing import Dict, List, Sequence

 from pydantic import BaseModel, Field


-class HypothesisObservations(BaseModel):
-    confirmed: int = 0
-    refuted: int = 0
-
-
 class CapabilityHypothesis(BaseModel):
+    """Curated knowledge only. Confidence is NOT stored here — it is computed from the reviewed
+    observation stream (see observations.py); a raw answer never changes a hypothesis (review gate)."""
+
    id: str
    capability: str
    supported_by: List[str] = Field(default_factory=list)        # certifications that suggest this capability
@@ -29,24 +27,9 @@ class CapabilityHypothesis(BaseModel):
    verification_required: bool = True                           # Welt-1: never auto-satisfied
    question_intent: str = "verify_existence"
    expected_evidence: List[str] = Field(default_factory=list)
-    observations: HypothesisObservations = Field(default_factory=HypothesisObservations)
    kind: str = "shared"                                         # shared / specific


-def empirical_confidence(obs: HypothesisObservations) -> Optional[float]:
-    """Confidence from observations only: confirmed / (confirmed+refuted). None until any are recorded."""
-    n = obs.confirmed + obs.refuted
-    return round(obs.confirmed / n, 2) if n else None
-
-
-def record_observation(obs: HypothesisObservations, confirmed: bool) -> HypothesisObservations:
-    """One real-onboarding observation -> updated counts (the empirical calibration step)."""
-    return HypothesisObservations(
-        confirmed=obs.confirmed + (1 if confirmed else 0),
-        refuted=obs.refuted + (0 if confirmed else 1),
-    )
-
-
 def inferred_hypotheses(
    certifications: Sequence[str], library: Sequence[CapabilityHypothesis]
 ) -> List[CapabilityHypothesis]:
@@ -0,0 +1,85 @@
+"""Observation Model — the empirical learning unit (Task 59a: model BEFORE persistence/API).
+
+The learning point is NOT the hypothesis, it is the QUESTION. A hypothesis ("ISO 27001 suggests supplier
+management") produces a question ("Is there a documented supplier-security process?"), and the answer is
+rarely binary — "yes" / "no" / "partial, only critical suppliers" / "certified but not lived" are very
+different observations. So the chain is:
+
+    Hypothesis -> Question -> Observation -> (Review) -> Hypothesis
+
+Two principles (durable):
+  - Richer than confirmed/refuted: an Observation carries an `observation_type` (confirmed / partial /
+    refuted / not_applicable / unknown), a free-text answer, a scope_note ("only critical suppliers"),
+    and whether evidence was uploaded.
+  - REVIEW GATE: a raw answer NEVER changes a hypothesis directly. Only REVIEWED observations calibrate;
+    otherwise the system learns from outliers. Hypotheses stay curated knowledge; confidence is COMPUTED
+    from the reviewed observation stream (keyed by hypothesis id), not stored on the hypothesis.
+
+This module defines the model + the deterministic statistics it enables (a DISTRIBUTION, not a single
+%). Persistence (store), aggregation across customers and hypothesis calibration are later tasks
+(59b/c/d). Pure, no I/O. Python 3.9 compatible.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Dict, List, Optional, Sequence
+
+from pydantic import BaseModel, Field
+
+
+class ObservationType(str, Enum):          # closed set of outcomes for one answer; members are also `str`
+    CONFIRMED = "confirmed"                # evidence for the capability (weight 1 in empirical_confidence)
+    PARTIAL = "partial"                    # counted at weight 0.5 in empirical_confidence
+    REFUTED = "refuted"                    # evidence against (denominator only in empirical_confidence)
+    NOT_APPLICABLE = "not_applicable"      # left out of empirical_confidence; still counted in the distribution
+    UNKNOWN = "unknown"                    # default for a new Observation; left out of empirical_confidence
+
+
+class Observation(BaseModel):
+    """One real-onboarding answer to one hypothesis-driven question. The raw empirical unit."""
+
+    hypothesis_id: str                                  # required; the only field without a default
+    capability: str = ""                                # denormalised for convenient aggregation
+    question: str = ""                                  # the question that was actually asked
+    answer: str = ""                                    # the customer's raw answer (free text)
+    observation_type: ObservationType = ObservationType.UNKNOWN  # classification of the answer; UNKNOWN until set
+    scope_note: Optional[str] = None                    # "only critical suppliers" / "only DE" / "not lived"
+    evidence_uploaded: bool = False                     # whether evidence was uploaded with the answer
+    reviewed: bool = False                              # the review gate: only reviewed obs calibrate
+    reviewed_by: Optional[str] = None                   # presumably the reviewer's identifier; not cross-checked with `reviewed`
+
+
+# Observation types that count as evidence for/against the capability (n/a + unknown do not).
+_FOR_AGAINST = (ObservationType.CONFIRMED, ObservationType.PARTIAL, ObservationType.REFUTED)  # NOTE(review): unused in this module
+
+
+def empirical_distribution(
+    observations: Sequence[Observation], reviewed_only: bool = True
+) -> Dict[str, int]:
+    """Count observations per type, keyed by the type's string value — the DISTRIBUTION (e.g. confirmed 61 /
+    partial 31 / refuted 8), far richer than a single percentage. By default only REVIEWED observations count."""
+    dist = {t.value: 0 for t in ObservationType}       # every type is present, 0 when never seen
+    for o in observations:
+        if o.reviewed or not reviewed_only:             # reviewed_only=False lets unreviewed answers through
+            dist[o.observation_type.value] += 1
+    return dist
+
+
+def empirical_confidence(
+    observations: Sequence[Observation], reviewed_only: bool = True
+) -> Optional[float]:
+    """Confidence from the reviewed stream, rounded to 2 places: (confirmed + 0.5*partial) / (confirmed+partial+refuted).
+
+    `not_applicable` and `unknown` are excluded from the denominator (they are not evidence either way).
+    `None` until a for/against observation is counted (reviewed ones, unless reviewed_only=False); never an expert/LLM score."""
+    dist = empirical_distribution(observations, reviewed_only)
+    base = dist[ObservationType.CONFIRMED.value] + dist[ObservationType.PARTIAL.value] + dist[ObservationType.REFUTED.value]
+    if base == 0:  # no for/against evidence counted -> report "no confidence" instead of dividing by zero
+        return None
+    return round((dist[ObservationType.CONFIRMED.value] + 0.5 * dist[ObservationType.PARTIAL.value]) / base, 2)
+
+
+def reviewed(observations: Sequence[Observation]) -> List[Observation]:
+    """Return the calibration set: the reviewed observations, in input order, as a new list (a raw answer never updates a hypothesis)."""
+    return [o for o in observations if o.reviewed]