# breakpilot-compliance/backend-compliance/compliance/onboarding/observations.py

"""Observation Model — the empirical learning unit (Task 59a: model BEFORE persistence/API).

The learning point is NOT the hypothesis, it is the QUESTION. A hypothesis ("ISO 27001 suggests supplier
management") produces a question ("Is there a documented supplier-security process?"), and the answer is
rarely binary — "yes" / "no" / "partial, only critical suppliers" / "certified but not lived" are very
different observations. So the chain is:

    Hypothesis -> Question -> Observation -> (Review) -> Hypothesis

Two principles (durable):
  - Richer than confirmed/refuted: an Observation carries an `observation_type` (confirmed / partial /
    refuted / not_applicable / unknown), a free-text answer, a scope_note ("only critical suppliers"),
    and whether evidence was uploaded.
  - REVIEW GATE: a raw answer NEVER changes a hypothesis directly. Only REVIEWED observations calibrate;
    otherwise the system learns from outliers. Hypotheses stay curated knowledge; confidence is COMPUTED
    from the reviewed observation stream (keyed by hypothesis id), not stored on the hypothesis.

This module defines the model + the deterministic statistics it enables (a DISTRIBUTION, not a single
%). Persistence (store), aggregation across customers and hypothesis calibration are later tasks
(59b/c/d). Pure, no I/O. Python 3.9 compatible.
"""

from __future__ import annotations

from enum import Enum
from typing import Dict, List, Optional, Sequence

from pydantic import BaseModel, Field


class ObservationType(str, Enum):
    """Outcome category of a single observation (richer than confirmed/refuted).

    Mixes in ``str``, so every member is also an instance of ``str`` carrying its value.
    CONFIRMED, PARTIAL and REFUTED are the categories treated as evidence for/against a
    capability (see ``_FOR_AGAINST``); NOT_APPLICABLE and UNKNOWN are not.
    """

    CONFIRMED = "confirmed"
    PARTIAL = "partial"                     # e.g. "partial, only critical suppliers"
    REFUTED = "refuted"
    NOT_APPLICABLE = "not_applicable"       # not evidence either way
    UNKNOWN = "unknown"                     # not evidence either way; default type of an Observation


class Observation(BaseModel):
    """One real-onboarding answer to one hypothesis-driven question. The raw empirical unit.

    Only ``hypothesis_id`` is required; every other field has a default. ``reviewed`` is the review
    gate: by default the statistics helpers in this module only count observations where it is set.
    No validator is declared here, so nothing in this class ties ``reviewed_by`` to ``reviewed``.
    """

    hypothesis_id: str                                  # id of the hypothesis this observation belongs to
    capability: str = ""                                # denormalised for convenient aggregation
    question: str = ""                                  # the question that was actually asked
    answer: str = ""                                    # the customer's raw answer (free text)
    observation_type: ObservationType = ObservationType.UNKNOWN  # classified outcome; UNKNOWN until set
    scope_note: Optional[str] = None                    # "only critical suppliers" / "only DE" / "not lived"
    evidence_uploaded: bool = False                     # whether evidence was uploaded with the answer
    reviewed: bool = False                              # the review gate: only reviewed obs calibrate
    reviewed_by: Optional[str] = None                   # presumably the reviewer's identifier — confirm with caller


# Observation types that count as evidence for or against the capability.
# NOT_APPLICABLE and UNKNOWN are deliberately left out: they are not evidence either way.
_FOR_AGAINST = (
    ObservationType.CONFIRMED,
    ObservationType.PARTIAL,
    ObservationType.REFUTED,
)


def empirical_distribution(
    observations: Sequence[Observation], reviewed_only: bool = True
) -> Dict[str, int]:
    """Tally observations by type and return the full DISTRIBUTION rather than one percentage.

    The result always has one key per ``ObservationType`` value (in enum definition order), with 0
    for types that never occurred, e.g. confirmed 61 / partial 31 / refuted 8.

    Args:
        observations: the observation stream to tally.
        reviewed_only: when True (default) unreviewed observations are skipped (the review gate);
            when False every observation is counted.

    Returns:
        Mapping of observation-type value -> number of counted observations of that type.
    """
    counts: Dict[str, int] = dict.fromkeys((kind.value for kind in ObservationType), 0)
    for obs in observations:
        # Review gate: a raw, unreviewed answer is ignored unless the caller opts out.
        if not obs.reviewed and reviewed_only:
            continue
        counts[obs.observation_type.value] += 1
    return counts


def empirical_confidence(
    observations: Sequence[Observation], reviewed_only: bool = True
) -> Optional[float]:
    """Confidence from the reviewed stream: (confirmed + 0.5*partial) / (confirmed+partial+refuted).

    `not_applicable` and `unknown` are excluded from the denominator (they are not evidence either
    way); the denominator is derived from `_FOR_AGAINST` so the two cannot drift apart.

    Args:
        observations: the observation stream to score.
        reviewed_only: when True (default) only reviewed observations are counted (the review gate).

    Returns:
        The confidence rounded to 2 decimals, or `None` until any for/against observation has been
        counted — never an expert/LLM score.
    """
    dist = empirical_distribution(observations, reviewed_only)
    # Denominator: every observation that is evidence for or against the capability.
    base = sum(dist[kind.value] for kind in _FOR_AGAINST)
    if base == 0:
        return None
    # A partial observation counts as half a confirmation.
    weighted = dist[ObservationType.CONFIRMED.value] + 0.5 * dist[ObservationType.PARTIAL.value]
    return round(weighted / base, 2)


def reviewed(observations: Sequence[Observation]) -> List[Observation]:
    """Return the calibration set: the observations whose ``reviewed`` flag is set, in input order.

    Unreviewed (raw) answers are dropped, because a raw answer never updates a hypothesis.
    """
    return list(filter(lambda obs: obs.reviewed, observations))