From 98d616d82be4420996b640a3f1eea745524d3521 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 28 Jun 2026 13:31:43 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20Observation=20Model=20=E2=80=94=20the?= =?UTF-8?q?=20empirical=20learning=20unit,=20defined=20BEFORE=20persistenc?= =?UTF-8?q?e=20(Task=2059a)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The learning point is not the hypothesis, it is the QUESTION — and confirmed/refuted is too coarse. "partial, only critical suppliers" or "certified but not lived" are not "wrong", they are valuable knowledge. So the chain is Hypothesis -> Question -> Observation -> (Review) -> Hypothesis, and the observation model must be defined cleanly before any store/API (else thousands of too-coarse observations get migrated later). compliance/onboarding/observations.py: - ObservationType: confirmed / partial / refuted / not_applicable / unknown (richer than binary). - Observation: {hypothesis_id, capability, question, answer (free text), observation_type, scope_note ("only critical suppliers"), evidence_uploaded, reviewed, reviewed_by}. - empirical_distribution() -> a DISTRIBUTION (confirmed 61 / partial 31 / refuted 8), not one %. - empirical_confidence() -> (confirmed + 0.5*partial) / (confirmed+partial+refuted); n.a./unknown excluded; None until calibrated. - REVIEW GATE: only reviewed observations calibrate — a raw answer never changes a hypothesis (no learning from outliers). Refactor: the hypothesis is now PURE curated knowledge — the binary observations counter and any confidence are removed from CapabilityHypothesis and the YAML; confidence is COMPUTED from the separate reviewed observation stream. Pure, mypy --strict clean. Persistence/aggregation/calibration are 59b/c/d. Non-runtime -> no deploy. 12 tests pass, check-loc 0. --- .../compliance/onboarding/__init__.py | 18 ++-- .../compliance/onboarding/hypotheses.py | 25 +----- .../compliance/onboarding/observations.py | 85 +++++++++++++++++++ .../certification_hypotheses/hypotheses.yaml | 64 ++++++-------- .../tests/test_certification_hypotheses.py | 27 ++++-- 5 files changed, 143 insertions(+), 76 deletions(-) create mode 100644 backend-compliance/compliance/onboarding/observations.py diff --git a/backend-compliance/compliance/onboarding/__init__.py b/backend-compliance/compliance/onboarding/__init__.py index 1dc96da1..2a6f5447 100644 --- a/backend-compliance/compliance/onboarding/__init__.py +++ b/backend-compliance/compliance/onboarding/__init__.py @@ -11,12 +11,16 @@ from __future__ import annotations from .engine import advisor_start, apply_answer from .hypotheses import ( CapabilityHypothesis, - HypothesisObservations, - empirical_confidence, inferred_hypotheses, - record_observation, resolve_for_certifications, ) +from .observations import ( + Observation, + ObservationType, + empirical_confidence, + empirical_distribution, + reviewed, +) from .schemas import ( AdvisorMeasure, AdvisorQuestion, @@ -36,9 +40,11 @@ __all__ = [ "InferredAssumption", "RejectedAssumption", "CapabilityHypothesis", - "HypothesisObservations", - "empirical_confidence", - "record_observation", "inferred_hypotheses", "resolve_for_certifications", + "Observation", + "ObservationType", + "empirical_distribution", + "empirical_confidence", + "reviewed", ] diff --git a/backend-compliance/compliance/onboarding/hypotheses.py b/backend-compliance/compliance/onboarding/hypotheses.py index 1150eeb5..31687e16 100644 --- a/backend-compliance/compliance/onboarding/hypotheses.py +++ b/backend-compliance/compliance/onboarding/hypotheses.py @@ -11,17 +11,15 @@ long-term moat. The library is DATA, loaded outside this module and injected. Py from __future__ import annotations -from typing import Dict, List, Optional, Sequence +from typing import Dict, List, Sequence from pydantic import BaseModel, Field -class HypothesisObservations(BaseModel): - confirmed: int = 0 - refuted: int = 0 - - class CapabilityHypothesis(BaseModel): + """Curated knowledge only. Confidence is NOT stored here — it is computed from the reviewed + observation stream (see observations.py); a raw answer never changes a hypothesis (review gate).""" + id: str capability: str supported_by: List[str] = Field(default_factory=list) # certifications that suggest this capability @@ -29,24 +27,9 @@ class CapabilityHypothesis(BaseModel): verification_required: bool = True # Welt-1: never auto-satisfied question_intent: str = "verify_existence" expected_evidence: List[str] = Field(default_factory=list) - observations: HypothesisObservations = Field(default_factory=HypothesisObservations) kind: str = "shared" # shared / specific -def empirical_confidence(obs: HypothesisObservations) -> Optional[float]: - """Confidence from observations only: confirmed / (confirmed+refuted). None until any are recorded.""" - n = obs.confirmed + obs.refuted - return round(obs.confirmed / n, 2) if n else None - - -def record_observation(obs: HypothesisObservations, confirmed: bool) -> HypothesisObservations: - """One real-onboarding observation -> updated counts (the empirical calibration step).""" - return HypothesisObservations( - confirmed=obs.confirmed + (1 if confirmed else 0), - refuted=obs.refuted + (0 if confirmed else 1), - ) - - def inferred_hypotheses( certifications: Sequence[str], library: Sequence[CapabilityHypothesis] ) -> List[CapabilityHypothesis]: diff --git a/backend-compliance/compliance/onboarding/observations.py b/backend-compliance/compliance/onboarding/observations.py new file mode 100644 index 00000000..37a411a9 --- /dev/null +++ b/backend-compliance/compliance/onboarding/observations.py @@ -0,0 +1,85 @@ +"""Observation Model — the empirical learning unit (Task 59a: model BEFORE persistence/API). + +The learning point is NOT the hypothesis, it is the QUESTION. A hypothesis ("ISO 27001 suggests supplier +management") produces a question ("Is there a documented supplier-security process?"), and the answer is +rarely binary — "yes" / "no" / "partial, only critical suppliers" / "certified but not lived" are very +different observations. So the chain is: + + Hypothesis -> Question -> Observation -> (Review) -> Hypothesis + +Two principles (durable): + - Richer than confirmed/refuted: an Observation carries an `observation_type` (confirmed / partial / + refuted / not_applicable / unknown), a free-text answer, a scope_note ("only critical suppliers"), + and whether evidence was uploaded. + - REVIEW GATE: a raw answer NEVER changes a hypothesis directly. Only REVIEWED observations calibrate; + otherwise the system learns from outliers. Hypotheses stay curated knowledge; confidence is COMPUTED + from the reviewed observation stream (keyed by hypothesis id), not stored on the hypothesis. + +This module defines the model + the deterministic statistics it enables (a DISTRIBUTION, not a single +%). Persistence (store), aggregation across customers and hypothesis calibration are later tasks +(59b/c/d). Pure, no I/O. Python 3.9 compatible. +""" + +from __future__ import annotations + +from enum import Enum +from typing import Dict, List, Optional, Sequence + +from pydantic import BaseModel, Field + + +class ObservationType(str, Enum): + CONFIRMED = "confirmed" + PARTIAL = "partial" + REFUTED = "refuted" + NOT_APPLICABLE = "not_applicable" + UNKNOWN = "unknown" + + +class Observation(BaseModel): + """One real-onboarding answer to one hypothesis-driven question. The raw empirical unit.""" + + hypothesis_id: str + capability: str = "" # denormalised for convenient aggregation + question: str = "" # the question that was actually asked + answer: str = "" # the customer's raw answer (free text) + observation_type: ObservationType = ObservationType.UNKNOWN + scope_note: Optional[str] = None # "only critical suppliers" / "only DE" / "not lived" + evidence_uploaded: bool = False + reviewed: bool = False # the review gate: only reviewed obs calibrate + reviewed_by: Optional[str] = None + + +# observation types that count as evidence for/against the capability (n/a + unknown do not) +_FOR_AGAINST = (ObservationType.CONFIRMED, ObservationType.PARTIAL, ObservationType.REFUTED) + + +def empirical_distribution( + observations: Sequence[Observation], reviewed_only: bool = True +) -> Dict[str, int]: + """Count observations per type — the DISTRIBUTION (e.g. confirmed 61 / partial 31 / refuted 8), + far richer than a single percentage. By default only REVIEWED observations count (the review gate).""" + dist = {t.value: 0 for t in ObservationType} + for o in observations: + if o.reviewed or not reviewed_only: + dist[o.observation_type.value] += 1 + return dist + + +def empirical_confidence( + observations: Sequence[Observation], reviewed_only: bool = True +) -> Optional[float]: + """Confidence from the reviewed stream: (confirmed + 0.5*partial) / (confirmed+partial+refuted). + + `not_applicable` and `unknown` are excluded from the denominator (they are not evidence either way). + `None` until any for/against observation is reviewed — never an expert/LLM score.""" + dist = empirical_distribution(observations, reviewed_only) + base = dist[ObservationType.CONFIRMED.value] + dist[ObservationType.PARTIAL.value] + dist[ObservationType.REFUTED.value] + if base == 0: + return None + return round((dist[ObservationType.CONFIRMED.value] + 0.5 * dist[ObservationType.PARTIAL.value]) / base, 2) + + +def reviewed(observations: Sequence[Observation]) -> List[Observation]: + """The calibration set: only reviewed observations (a raw answer never updates a hypothesis).""" + return [o for o in observations if o.reviewed] diff --git a/backend-compliance/knowledge/certification_hypotheses/hypotheses.yaml b/backend-compliance/knowledge/certification_hypotheses/hypotheses.yaml index e0011b92..ca4ac312 100644 --- a/backend-compliance/knowledge/certification_hypotheses/hypotheses.yaml +++ b/backend-compliance/knowledge/certification_hypotheses/hypotheses.yaml @@ -10,83 +10,67 @@ # Multi-certification then merges AUTOMATICALLY (a company's inferred caps = every hypothesis whose # supported_by intersects its certs). capability ids match the existing transition patterns. # -# `confidence.empirical` stays NULL until calibrated from REAL onboardings (observations.confirmed / -# refuted) — never an LLM/expert score. Capabilities a cert does NOT suggest (SBOM, CVD, support period, -# signed updates) simply have NO hypothesis -> they always stay in the delta and get asked. AI first -# draft (~95%), expert review + customer calibration follow. No norm text reproduced. No real names. +# Confidence is NOT stored on the hypothesis — it is COMPUTED from a SEPARATE, reviewed observation +# stream (observations.py): each answer is a richer Observation (confirmed/partial/refuted/n.a./unknown +# + scope note), and a raw answer NEVER changes a hypothesis directly (review gate). Capabilities a cert +# does NOT suggest (SBOM, CVD, support period, signed updates) simply have NO hypothesis -> they always +# stay in the delta and get asked. AI first draft (~95%), expert review + customer calibration follow. +# No norm text reproduced. No real names. hypotheses: # ── SHARED CORE — management-system capabilities that recur across certifications ─────────── - {id: HYP-document_control, capability: document_and_change_control, relationship: supports, kind: shared, supported_by: [ISO9001, ISO13485, ISO27001, TISAX, ASPICE, IATF16949], - verification_required: true, question_intent: verify_existence, expected_evidence: [document_control_procedure], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [document_control_procedure]} - {id: HYP-incident_management, capability: incident_management, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443, ISO13485], - verification_required: true, question_intent: verify_existence, expected_evidence: [incident_procedure], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [incident_procedure]} - {id: HYP-supplier_security, capability: supplier_security, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [supplier_security_records], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [supplier_security_records]} - {id: HYP-supplier_evaluation, capability: supplier_evaluation, relationship: supports, kind: shared, supported_by: [ISO9001, IATF16949, ISO13485], - verification_required: true, question_intent: verify_existence, expected_evidence: [supplier_evaluation_records], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [supplier_evaluation_records]} - {id: HYP-access_control, capability: access_control_and_authentication, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [access_control_policy], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [access_control_policy]} - {id: HYP-logging_monitoring, capability: security_logging_and_monitoring, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [logging_configuration], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [logging_configuration]} - {id: HYP-asset_config, capability: asset_and_configuration_management, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [asset_inventory], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [asset_inventory]} - {id: HYP-vuln_management, capability: technical_vulnerability_management, relationship: partially_supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: confirm_product_scope, expected_evidence: [vulnerability_management_process], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: confirm_product_scope, expected_evidence: [vulnerability_management_process]} - {id: HYP-isms, capability: information_security_management, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX], - verification_required: true, question_intent: verify_existence, expected_evidence: [isms_scope], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [isms_scope]} - {id: HYP-cryptography, capability: cryptography, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX, IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [crypto_policy], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [crypto_policy]} - {id: HYP-training, capability: security_awareness_training, relationship: supports, kind: shared, supported_by: [ISO27001, TISAX], - verification_required: true, question_intent: verify_existence, expected_evidence: [training_records], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [training_records]} - {id: HYP-prototype_protection, capability: protect_prototypes, relationship: supports, kind: shared, supported_by: [TISAX], - verification_required: true, question_intent: verify_existence, expected_evidence: [prototype_protection_policy], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [prototype_protection_policy]} - {id: HYP-release_approval, capability: release_and_approval_process, relationship: supports, kind: shared, supported_by: [ISO9001, IATF16949, ISO13485], - verification_required: true, question_intent: verify_existence, expected_evidence: [release_procedure], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [release_procedure]} - {id: HYP-ce_conformity, capability: ce_conformity_assessment_and_technical_documentation, relationship: partially_supports, kind: shared, supported_by: [ISO9001, IATF16949], - verification_required: true, question_intent: request_evidence, expected_evidence: [technical_documentation], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: request_evidence, expected_evidence: [technical_documentation]} # ── CERT-SPECIFIC — capabilities a single domain's certificate suggests ───────────────────── - {id: HYP-secure_dev, capability: secure_development_lifecycle, relationship: partially_supports, kind: specific, supported_by: [IEC62443, ASPICE], - verification_required: true, question_intent: verify_existence, expected_evidence: [secure_development_policy], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [secure_development_policy]} - {id: HYP-csms, capability: cybersecurity_management_system, relationship: supports, kind: specific, supported_by: [IEC62443], - verification_required: true, question_intent: verify_existence, expected_evidence: [csms_records], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [csms_records]} - {id: HYP-environmental_docs, capability: environmental_management_documentation, relationship: supports, kind: specific, supported_by: [ISO14001], - verification_required: true, question_intent: verify_existence, expected_evidence: [environmental_aspects_register], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [environmental_aspects_register]} - {id: HYP-software_process, capability: assess_software_process_capability, relationship: supports, kind: specific, supported_by: [ASPICE], - verification_required: true, question_intent: verify_existence, expected_evidence: [aspice_assessment], - confidence: {empirical: null}, observations: {confirmed: 0, refuted: 0}} + verification_required: true, question_intent: verify_existence, expected_evidence: [aspice_assessment]} diff --git a/backend-compliance/tests/test_certification_hypotheses.py b/backend-compliance/tests/test_certification_hypotheses.py index 5c2d7056..e2aec149 100644 --- a/backend-compliance/tests/test_certification_hypotheses.py +++ b/backend-compliance/tests/test_certification_hypotheses.py @@ -14,12 +14,13 @@ import yaml from compliance.onboarding import ( CapabilityHypothesis, - HypothesisObservations, + Observation, + ObservationType, OnboardingInput, advisor_start, empirical_confidence, + empirical_distribution, inferred_hypotheses, - record_observation, resolve_for_certifications, ) from compliance.transition_reasoning import TargetRequirement @@ -47,13 +48,21 @@ def test_multi_certification_merges_automatically(): assert "sbom_creation" not in caps and "secure_signed_update_distribution" not in caps -def test_empirical_confidence_is_computed_not_assigned(): - obs = HypothesisObservations() - assert empirical_confidence(obs) is None # null until observed - obs = record_observation(obs, True) - obs = record_observation(obs, True) - obs = record_observation(obs, False) - assert empirical_confidence(obs) == 0.67 # 2 / 3, from observations only +def test_observations_are_richer_than_binary_and_review_gated(): + # the learning unit is the QUESTION; an answer can be partial with a scope note, not just yes/no + raw = [Observation(hypothesis_id="HYP-supplier", observation_type=ObservationType.CONFIRMED)] + assert empirical_confidence(raw) is None # unreviewed -> does NOT calibrate (review gate) + obs = [ + Observation(hypothesis_id="HYP-supplier", observation_type=ObservationType.CONFIRMED, reviewed=True), + Observation(hypothesis_id="HYP-supplier", observation_type=ObservationType.PARTIAL, + scope_note="nur kritische Lieferanten", reviewed=True), + Observation(hypothesis_id="HYP-supplier", observation_type=ObservationType.REFUTED, reviewed=True), + Observation(hypothesis_id="HYP-supplier", observation_type=ObservationType.NOT_APPLICABLE, reviewed=True), + ] + dist = empirical_distribution(obs) # a DISTRIBUTION, not a single percentage + assert dist["confirmed"] == 1 and dist["partial"] == 1 and dist["refuted"] == 1 and dist["not_applicable"] == 1 + # confidence = (confirmed + 0.5*partial) / (confirmed+partial+refuted); n.a. excluded from the base + assert empirical_confidence(obs) == 0.5 def test_resolve_adapts_to_advisor_input():