feat(onboarding): Observation Log — append-only JSONL calibration store (Task 59b/c v1)

Per the user's decision (2026-06-28): observations are CALIBRATION data for the knowledge base, NOT business data and NOT product-DB data. So they live with the other versioned knowledge artifacts as an append-only JSONL log under knowledge/observations/ — NO migration, NO DB. (A real persistence layer is only warranted once thousands of onboardings exist; not before.) - ObservationRecord = Observation + log metadata (observation_id, timestamp [caller-stamped, no hidden clock], customer_archetype [anonymised — NEVER a real name], evidence, provenance, knowledge_version). - append_observation() writes one JSON line; append-only, lines are never rewritten. A later review is a NEW line with the same observation_id; load_observations(reconcile=True) keeps the latest per id. - load_observations() reads a single .jsonl or a directory of monthly .jsonl files. - aggregate_by_hypothesis() (59c) -> per-hypothesis distribution + confidence, COMPUTED from the log (computed-not-stored); the review gate (reviewed-only) is enforced in empirical_distribution/confidence. - review_queue() -> the unreviewed worklist. Observation -> Review -> Accepted -> recompute, never Observation -> confidence++. Nothing is ever written back to a hypothesis. You can `rm` the log and recompute, `git diff` it over months, or rebuild confidence under a new policy — fully consistent with computed-not-stored and the product/knowledge data separation. Non-runtime (module + tests only, no endpoint) -> origin/main, NO dev deploy. 5 new tests (append-only, review supersession, review-gate statistics, queue, monthly-file load); 27 onboarding tests pass, mypy --strict clean (9 modules), check-loc 0. 59d (surface computed confidence at runtime) stays a later step.
2026-06-28 16:29:54 +02:00
parent e54f3cde94
commit 7df15010ff
4 changed files with 197 additions and 0 deletions
@@ -0,0 +1,73 @@
+"""Observation Log — append-only JSONL store + computed statistics (Task 59b/c v1).
+
+Pins the user's decision (2026-06-28): observations are CALIBRATION data, not product data -> an
+append-only JSONL log under knowledge/observations/, NO DB, NO migration. Distribution and confidence are
+COMPUTED from the log; only REVIEWED observations calibrate (review gate); a later review is a new line
+that supersedes by observation_id. Nothing is ever written back to a hypothesis.
+"""
+
+from __future__ import annotations
+
+from compliance.onboarding import (
+    ObservationRecord,
+    ObservationType,
+    aggregate_by_hypothesis,
+    append_observation,
+    load_observations,
+    review_queue,
+)
+
+
+def _rec(oid, hyp, otype, reviewed=False, **kw):
+    return ObservationRecord(
+        observation_id=oid, hypothesis_id=hyp, observation_type=otype, reviewed=reviewed,
+        timestamp="2026-07-01T00:00:00Z", customer_archetype="machine_builder+ISO27001", **kw)
+
+
+def test_append_only_round_trip(tmp_path):
+    p = str(tmp_path / "obs.jsonl")
+    append_observation(_rec("o1", "HYP-secure_dev", ObservationType.CONFIRMED, reviewed=True), p)
+    append_observation(_rec("o2", "HYP-secure_dev", ObservationType.REFUTED, reviewed=True), p)
+    recs = load_observations(p)
+    assert {r.observation_id for r in recs} == {"o1", "o2"}
+    assert all(r.customer_archetype == "machine_builder+ISO27001" for r in recs)  # anonymised archetype, not a name
+
+
+def test_review_supersedes_by_id_append_only(tmp_path):
+    p = str(tmp_path / "obs.jsonl")
+    append_observation(_rec("o1", "HYP-x", ObservationType.CONFIRMED, reviewed=False), p)   # raw answer
+    append_observation(_rec("o1", "HYP-x", ObservationType.CONFIRMED, reviewed=True,
+                            reviewed_by="anna"), p)                                          # later review event
+    assert len(load_observations(p, reconcile=False)) == 2                                  # both lines kept (append-only)
+    recs = load_observations(p)                                                             # reconciled
+    assert len(recs) == 1 and recs[0].reviewed and recs[0].reviewed_by == "anna"
+
+
+def test_statistics_apply_the_review_gate(tmp_path):
+    p = str(tmp_path / "obs.jsonl")
+    append_observation(_rec("a", "HYP-sdl", ObservationType.CONFIRMED, reviewed=True), p)
+    append_observation(_rec("b", "HYP-sdl", ObservationType.CONFIRMED, reviewed=True), p)
+    append_observation(_rec("c", "HYP-sdl", ObservationType.REFUTED, reviewed=True), p)
+    append_observation(_rec("d", "HYP-sdl", ObservationType.CONFIRMED, reviewed=False), p)  # unreviewed -> ignored
+    stats = {s.hypothesis_id: s for s in aggregate_by_hypothesis(load_observations(p))}
+    s = stats["HYP-sdl"]
+    assert s.total_count == 4 and s.reviewed_count == 3
+    assert s.distribution["confirmed"] == 2 and s.distribution["refuted"] == 1   # unreviewed one excluded
+    assert s.confidence == round(2 / 3, 2)                                        # (2 + 0.5*0) / 3
+
+
+def test_review_queue_lists_unreviewed(tmp_path):
+    p = str(tmp_path / "obs.jsonl")
+    append_observation(_rec("a", "HYP-y", ObservationType.CONFIRMED, reviewed=True), p)
+    append_observation(_rec("b", "HYP-y", ObservationType.PARTIAL, reviewed=False), p)
+    q = review_queue(load_observations(p))
+    assert [r.observation_id for r in q] == ["b"]
+
+
+def test_load_directory_of_monthly_files(tmp_path):
+    d = tmp_path / "observations"
+    d.mkdir()
+    append_observation(_rec("a", "HYP-z", ObservationType.CONFIRMED, reviewed=True), str(d / "2026-06.jsonl"))
+    append_observation(_rec("b", "HYP-z", ObservationType.REFUTED, reviewed=True), str(d / "2026-07.jsonl"))
+    recs = load_observations(str(d))
+    assert {r.observation_id for r in recs} == {"a", "b"}