Files
breakpilot-compliance/backend-compliance/tests/test_observation_log.py
T
Benjamin Admin 7df15010ff feat(onboarding): Observation Log — append-only JSONL calibration store (Task 59b/c v1)
Per the user's decision (2026-06-28): observations are CALIBRATION data for the knowledge base, NOT
business data and NOT product-DB data. So they live with the other versioned knowledge artifacts as an
append-only JSONL log under knowledge/observations/ — NO migration, NO DB. (A real persistence layer is
only warranted once thousands of onboardings exist; not before.)

  - ObservationRecord = Observation + log metadata (observation_id, timestamp [caller-stamped, no hidden
    clock], customer_archetype [anonymised — NEVER a real name], evidence, provenance, knowledge_version).
  - append_observation() writes one JSON line; append-only, lines are never rewritten. A later review is a
    NEW line with the same observation_id; load_observations(reconcile=True) keeps the latest per id.
  - load_observations() reads a single .jsonl or a directory of monthly .jsonl files.
  - aggregate_by_hypothesis() (59c) -> per-hypothesis distribution + confidence, COMPUTED from the log
    (computed-not-stored); the review gate (reviewed-only) is enforced in empirical_distribution/confidence.
  - review_queue() -> the unreviewed worklist. Observation -> Review -> Accepted -> recompute, never
    Observation -> confidence++. Nothing is ever written back to a hypothesis.

You can `rm` the log and recompute, `git diff` it over months, or rebuild confidence under a new policy —
fully consistent with computed-not-stored and the product/knowledge data separation.

Non-runtime (module + tests only, no endpoint) -> origin/main, NO dev deploy. 5 new tests (append-only,
review supersession, review-gate statistics, queue, monthly-file load); 27 onboarding tests pass, mypy
--strict clean (9 modules), check-loc 0. 59d (surface computed confidence at runtime) stays a later step.
2026-06-28 16:29:54 +02:00

74 lines
3.7 KiB
Python

"""Observation Log — append-only JSONL store + computed statistics (Task 59b/c v1).
Pins the user's decision (2026-06-28): observations are CALIBRATION data, not product data -> an
append-only JSONL log under knowledge/observations/, NO DB, NO migration. Distribution and confidence are
COMPUTED from the log; only REVIEWED observations calibrate (review gate); a later review is a new line
that supersedes by observation_id. Nothing is ever written back to a hypothesis.
"""
from __future__ import annotations
from compliance.onboarding import (
ObservationRecord,
ObservationType,
aggregate_by_hypothesis,
append_observation,
load_observations,
review_queue,
)
def _rec(oid, hyp, otype, reviewed=False, **kw):
    """Build an ``ObservationRecord`` for the tests in this module.

    The timestamp and the customer archetype are fixed literals, so records
    differ only in the fields a test passes in explicitly.

    Args:
        oid: Value for ``observation_id``.
        hyp: Value for ``hypothesis_id``.
        otype: Value for ``observation_type``.
        reviewed: Value for ``reviewed``; defaults to ``False``.
        **kw: Extra keyword arguments forwarded unchanged to
            ``ObservationRecord`` (e.g. ``reviewed_by``).

    Returns:
        The constructed ``ObservationRecord``.
    """
    base_fields = {
        "observation_id": oid,
        "hypothesis_id": hyp,
        "observation_type": otype,
        "reviewed": reviewed,
        "timestamp": "2026-07-01T00:00:00Z",
        "customer_archetype": "machine_builder+ISO27001",
    }
    # Unpacking both mappings keeps the "duplicate keyword -> TypeError"
    # behaviour of spelling the keywords out next to **kw.
    return ObservationRecord(**base_fields, **kw)
def test_append_only_round_trip(tmp_path):
    """Two records appended to one file are both returned by ``load_observations``."""
    log_path = str(tmp_path / "obs.jsonl")
    appended = (
        ("o1", ObservationType.CONFIRMED),
        ("o2", ObservationType.REFUTED),
    )
    for oid, otype in appended:
        append_observation(_rec(oid, "HYP-secure_dev", otype, reviewed=True), log_path)

    loaded = load_observations(log_path)

    assert {rec.observation_id for rec in loaded} == {"o1", "o2"}
    # The archetype round-trips as the anonymised label, not a customer name.
    for rec in loaded:
        assert rec.customer_archetype == "machine_builder+ISO27001"
def test_review_supersedes_by_id_append_only(tmp_path):
    """A later line with the same ``observation_id`` wins on a reconciled load.

    Both lines must still be returned when ``reconcile=False`` is passed.
    """
    log_path = str(tmp_path / "obs.jsonl")

    raw_answer = _rec("o1", "HYP-x", ObservationType.CONFIRMED, reviewed=False)
    append_observation(raw_answer, log_path)
    # The review is appended as a second line reusing the id "o1".
    review_event = _rec(
        "o1", "HYP-x", ObservationType.CONFIRMED, reviewed=True, reviewed_by="anna"
    )
    append_observation(review_event, log_path)

    every_line = load_observations(log_path, reconcile=False)
    assert len(every_line) == 2  # both lines kept (append-only)

    reconciled = load_observations(log_path)  # default call is expected to reconcile
    assert len(reconciled) == 1
    survivor = reconciled[0]
    assert survivor.reviewed
    assert survivor.reviewed_by == "anna"
def test_statistics_apply_the_review_gate(tmp_path):
    """Only reviewed observations count towards distribution and confidence.

    Four observations are logged for one hypothesis, three of them reviewed;
    the unreviewed one must appear in ``total_count`` only.
    """
    log_path = str(tmp_path / "obs.jsonl")
    fixtures = (
        ("a", ObservationType.CONFIRMED, True),
        ("b", ObservationType.CONFIRMED, True),
        ("c", ObservationType.REFUTED, True),
        ("d", ObservationType.CONFIRMED, False),  # unreviewed -> ignored
    )
    for oid, otype, is_reviewed in fixtures:
        append_observation(_rec(oid, "HYP-sdl", otype, reviewed=is_reviewed), log_path)

    # Index the aggregated statistics by hypothesis id (last entry wins).
    by_hypothesis = {}
    for stat in aggregate_by_hypothesis(load_observations(log_path)):
        by_hypothesis[stat.hypothesis_id] = stat
    sdl = by_hypothesis["HYP-sdl"]

    assert sdl.total_count == 4
    assert sdl.reviewed_count == 3
    # The unreviewed "confirmed" observation is excluded from the distribution.
    assert sdl.distribution["confirmed"] == 2
    assert sdl.distribution["refuted"] == 1
    assert sdl.confidence == round(2 / 3, 2)  # (2 + 0.5*0) / 3
def test_review_queue_lists_unreviewed(tmp_path):
    """``review_queue`` returns only the observation that was logged unreviewed."""
    log_path = str(tmp_path / "obs.jsonl")
    already_reviewed = _rec("a", "HYP-y", ObservationType.CONFIRMED, reviewed=True)
    append_observation(already_reviewed, log_path)
    awaiting_review = _rec("b", "HYP-y", ObservationType.PARTIAL, reviewed=False)
    append_observation(awaiting_review, log_path)

    pending = review_queue(load_observations(log_path))
    pending_ids = [rec.observation_id for rec in pending]

    assert pending_ids == ["b"]
def test_load_directory_of_monthly_files(tmp_path):
    """Passing a directory to ``load_observations`` returns records from every file in it."""
    log_dir = tmp_path / "observations"
    log_dir.mkdir()

    # One record per file, the files named after consecutive months.
    june_file = str(log_dir / "2026-06.jsonl")
    append_observation(_rec("a", "HYP-z", ObservationType.CONFIRMED, reviewed=True), june_file)
    july_file = str(log_dir / "2026-07.jsonl")
    append_observation(_rec("b", "HYP-z", ObservationType.REFUTED, reviewed=True), july_file)

    loaded = load_observations(str(log_dir))
    loaded_ids = {rec.observation_id for rec in loaded}

    assert loaded_ids == {"a", "b"}