fix(onboarding): separate observation vs requirement signals — a demanded SBOM is not a present SBOM

Semantic correction of the knowledge base BEFORE the empirical loop (#59) is built — otherwise the
Observation Store would learn from already-misclassified signals. The Silent Pass conflated two kinds of
signal into one: an OBSERVATION ("I saw an SBOM in the repo") and a REQUIREMENT ("a tender DEMANDS an
SBOM"). They were aliased to the same canonical id, so a tender clause read as "SBOM already present" and
suppressed the very question that should have been asked.

Fix — make the kind explicit and authoritative (no new architecture, data + thin wiring):
  - `kind` ∈ {observation, requirement} on ProducedSignal (producer may declare) and on the canonical
    SignalVocabularyEntry (AUTHORITATIVE — a mislabelled producer cannot collapse the two).
  - Vocabulary split: sbom_file_found → sbom_present (obs) + sbom_required (req);
    security_txt_or_cvd_policy → cvd_policy_present (obs) + psirt_required (req); add signed_updates_required.
    Requirement signals are intentionally UNMAPPED in intake_signal_map (they describe a target, not state).
  - silent_intake() consumes ONLY kind==observation; requirement signals are preserved in
    `requirements_seen` (visible/auditable) but NEVER become a detected capability.
  - normalize_signals() stamps the vocabulary's kind onto every IntakeSignal; unknown ids still pass through.

This is the same Observation-vs-Requirement split the Requirements Verification Platform rests on:
observations are reality, requirements are targets, and their comparison is the delta. A tender / OEM spec /
law now produces requirement signals; scanners / repos / documents produce observation signals.

Tests: rewrote the two test_signal_producer cases that previously ASSERTED the bug (tender == repo) to pin
the correct split. Added regressions: `requires_sbom` yields no capability and stays in requirements_seen,
while `cyclonedx_found` still detects sbom_creation; an endpoint-level regression that a tender requirement
does not auto-detect and the gap stays asked; and vocabulary-kind-overrides-mislabelled-producer. 25 onboarding
tests pass, mypy --strict clean, demo runs, check-loc 0. Runtime effect → deploy + smoke. (Fix A; partial-vs-
detected decoupling follows as Fix B before #59.)
This commit is contained in:
Benjamin Admin
2026-06-28 15:52:50 +02:00
parent b5b6cdddb3
commit c39787ad96
7 changed files with 121 additions and 42 deletions
@@ -47,6 +47,20 @@ def test_advisor_start_returns_full_payload():
assert "sbom_creation" not in {q["capability_id"] for q in d["top_5_questions"]} # detected -> not asked
def test_requirement_signal_does_not_auto_detect_capability():
    """A tender that DEMANDS an SBOM must not be treated as an SBOM being present.

    Posts a single requirement-style finding (`requires_sbom` from a tender) to the
    advisor-start endpoint and checks that `sbom_creation` is not auto-detected and
    is still surfaced as an open gap (either asked or listed in the delta).
    """
    tender_demand = {"signal_id": "requires_sbom", "source_type": "tender", "provenance": "tender §4.2"}
    payload = dict(_BODY, scanner_findings=[tender_demand])

    resp = _client.post("/onboarding/advisor-start", json=payload)
    assert resp.status_code == 200, resp.text
    data = resp.json()

    # demanded != present: the requirement must not show up as a detected capability
    assert "sbom_creation" not in data["auto_detected"]

    # the capability has to remain an open gap: asked as a question or part of the delta
    asked_ids = {question["capability_id"] for question in data["top_5_questions"]}
    assert "sbom_creation" in asked_ids or "sbom_creation" in data["capability_delta"]
def test_unknown_target_is_404():
body = dict(_BODY, target="NOPE")
r = _client.post("/onboarding/advisor-start", json=body)
@@ -1,9 +1,10 @@
"""Signal Producer + Normalizer — one signal language for all sources.
"""Signal Producer + Normalizer — one signal language, but TWO signal KINDS.
Pins the abstraction the user asked for: every source emits the same ProducedSignal, and the Normalizer
reduces producer-specific signal ids to ONE canonical signal via a vocabulary. The Silent Pass therefore
cannot tell whether "SBOM present" came from a website, a repo, a PDF, a tender or the user — and gets no
per-scanner logic.
Pins the abstraction: every source emits the same ProducedSignal, and the Normalizer reduces
producer-specific ids to ONE canonical signal via a vocabulary. CRITICAL: an OBSERVATION ("I saw an
SBOM") and a REQUIREMENT ("a tender DEMANDS an SBOM") must NEVER collapse to the same signal — a
demanded SBOM is not a present one. kind is authoritative on the canonical vocabulary entry, and the
Silent Pass consumes only observations.
"""
from __future__ import annotations
@@ -27,24 +28,47 @@ _MAP = [SignalMapping(**m) for m in yaml.safe_load(
open(os.path.join(_DIR, "..", "knowledge", "onboarding", "intake_signal_map.yaml"), encoding="utf-8"))["mappings"]]
def test_different_producers_yield_the_same_canonical_signal():
# the SAME fact, emitted by four totally different producers with different raw ids
def test_observation_producers_yield_one_canonical_signal():
# the SAME OBSERVATION, emitted by three different producers with different raw ids
produced = [
ProducedSignal(signal_id="cyclonedx_found", source_type="repository", provenance="sbom.cdx.json"),
ProducedSignal(signal_id="spdx_found", source_type="repository", provenance="sbom.spdx"),
ProducedSignal(signal_id="sbom_uploaded", source_type="document", provenance="customer_upload.pdf"),
ProducedSignal(signal_id="requires_sbom", source_type="tender", provenance="tender §4.2"),
]
normalized = normalize_signals(produced, _VOCAB)
assert {s.signal for s in normalized} == {"sbom_file_found"} # all reduced to ONE canonical signal
assert {s.source for s in normalized} == {"repository", "document", "tender"} # provenance preserved
assert {s.signal for s in normalized} == {"sbom_present"} # all reduced to ONE canonical observation
assert {s.kind for s in normalized} == {"observation"} # all observations
assert {s.source for s in normalized} == {"repository", "document"} # provenance preserved
def test_silent_pass_consumes_normalized_signals_source_agnostic():
# a tender that "requires SBOM" produces the same effect as a repo that HAS one
from_repo = normalize_signals([ProducedSignal(signal_id="cyclonedx_found", source_type="repository", evidence="sbom")], _VOCAB)
def test_requirement_and_observation_never_collapse():
    """A demanded SBOM (tender) and an observed SBOM (repo) normalize to different signals."""
    produced = [
        # observation: the repository actually contains an SBOM
        ProducedSignal(signal_id="cyclonedx_found", source_type="repository"),
        # requirement: the tender merely demands one
        ProducedSignal(signal_id="requires_sbom", source_type="tender", provenance="tender §4.2"),
    ]

    # index the canonical signal id by the kind stamped during normalization
    signal_for_kind = {}
    for entry in normalize_signals(produced, _VOCAB):
        signal_for_kind[entry.kind] = entry.signal

    observed = signal_for_kind["observation"]
    assert observed == "sbom_present"
    demanded = signal_for_kind["requirement"]
    assert demanded == "sbom_required"
    assert observed != demanded
def test_requirement_signal_produces_no_capability():
# the regression the whole fix is about: a DEMANDED SBOM yields NO detected capability,
# but is preserved as a requirement; a real SBOM in the repo still IS detected.
from_tender = normalize_signals([ProducedSignal(signal_id="requires_sbom", source_type="tender")], _VOCAB)
assert silent_intake(from_repo, _MAP).capability_ids() == silent_intake(from_tender, _MAP).capability_ids() == ["sbom_creation"]
res_tender = silent_intake(from_tender, _MAP)
assert res_tender.capability_ids() == [] # NOT read as present
assert res_tender.requirements_seen == ["sbom_required"] # but preserved + visible
from_repo = normalize_signals([ProducedSignal(signal_id="cyclonedx_found", source_type="repository", evidence="sbom")], _VOCAB)
assert silent_intake(from_repo, _MAP).capability_ids() == ["sbom_creation"]
def test_vocabulary_kind_overrides_a_mislabelled_producer():
    """The vocabulary's kind wins even when the producer tags a requirement as an observation."""
    mislabelled = ProducedSignal(signal_id="requires_sbom", source_type="tender", kind="observation")
    normalized = normalize_signals([mislabelled], _VOCAB)

    first = normalized[0]
    assert first.signal == "sbom_required"
    assert first.kind == "requirement"
def test_unknown_signal_passes_through_not_dropped():
@@ -36,8 +36,8 @@ _REQ += [TargetRequirement(capability_id=d["capability"], expected_evidence=d.ge
# scanner findings (injected): a machine builder with a public CVD policy, an SBOM + signed releases in
# the repo, a product risk-assessment doc, and a cloud-connected PLC product.
_SIGNALS = [
IntakeSignal(source="website", signal="security_txt_or_cvd_policy", detail="/.well-known/security.txt"),
IntakeSignal(source="repository", signal="sbom_file_found", detail="sbom.cdx.json"),
IntakeSignal(source="website", signal="cvd_policy_present", detail="/.well-known/security.txt"),
IntakeSignal(source="repository", signal="sbom_present", detail="sbom.cdx.json"),
IntakeSignal(source="repository", signal="signed_releases"),
IntakeSignal(source="document", signal="product_risk_assessment_doc"),
IntakeSignal(source="product", signal="cloud_connectivity"),