feat: Signal Producer interface + Normalizer — one signal language for all sources (before #58)

Not scanner stubs — the scanners exist. The Silent Pass needs only their UNIFIED output. This adds the
small common DATA FORMAT (not a new module/framework) the user asked for, exactly the Requirement-
Source / MCAP / regulation-alias pattern: many inputs, one language.

  Producer A / B / C  ->  normalize_signals (vocabulary: id + aliases)  ->  canonical IntakeSignal  ->  Silent Pass

- ProducedSignal {signal_id, source_type, confidence, evidence, provenance} = what ANY source emits
  (website scanner, repo scanner, PDF parser, tender parser, API, the user).
- knowledge/onboarding/signal_vocabulary.yaml reduces producer dialects to a canonical signal: "SBOM
  present" arrives as cyclonedx_found / spdx_found / sbom_uploaded / requires_sbom (tender) — all become
  `sbom_file_found`. The Silent Pass cannot tell where it came from -> no per-scanner special logic, ever.
- Unknown signals pass through (a new producer stays visible). confidence/evidence/provenance flow to
  the detected capability for the audit trail.

A tender that "requires SBOM" now produces the same effect as a repo that HAS one — fits Vision V2
(Requirement Source over Regulation). Endpoint (#58) then has its final shape: POST -> Producers ->
Normalizer -> Silent Pass -> Profile -> Delta -> Questions -> Roadmap. Non-runtime -> no deploy. mypy
--strict clean, 14 onboarding tests pass, check-loc 0.
This commit is contained in:
Benjamin Admin
2026-06-28 14:49:57 +02:00
parent 9c33582412
commit c2c8f7e424
7 changed files with 184 additions and 16 deletions
@@ -21,6 +21,11 @@ from .observations import (
empirical_distribution,
reviewed,
)
from .signals import (
ProducedSignal,
SignalVocabularyEntry,
normalize_signals,
)
from .silent_intake import (
DetectedCapability,
IntakeSignal,
@@ -61,4 +66,7 @@ __all__ = [
"DetectedCapability",
"ProductFact",
"SilentIntakeResult",
"ProducedSignal",
"SignalVocabularyEntry",
"normalize_signals",
]
@@ -0,0 +1,61 @@
"""Signal Producer interface + Normalizer — one signal language for all sources (NOT new architecture).
The platform already HAS scanners (website, repo/code, SBOM, security headers, TLS, SPF/DKIM/DMARC,
document analysis, RAG over uploads, product classification). The Silent Pass does not want a
WebsiteScanner or a RepoScanner — it wants their UNIFIED output. So every source (a scanner, a PDF
parser, a tender parser, an API, or the user) emits the SAME `ProducedSignal`
{signal_id, source_type, confidence, evidence, provenance}, and `normalize_signals` reduces producer-
specific signal ids to ONE canonical signal id via a vocabulary (id + aliases) — exactly the
Requirement-Source / MCAP / regulation-alias pattern. The Silent Pass then never gets per-scanner logic.
A common DATA FORMAT, not a new module/framework. Later a tender (`requires_sbom`) or an OEM spec
(`supplier_requires_psirt`) produces the same stream as a website — the Silent Pass cannot tell the
difference. Pure, deterministic, no I/O. Python 3.9 compatible.
"""
from __future__ import annotations
from typing import Dict, List, Optional, Sequence
from pydantic import BaseModel, Field
from .silent_intake import IntakeSignal
class ProducedSignal(BaseModel):
    """Common record emitted by every signal producer, whatever the source.

    Fields:
        signal_id: Id the producer used; may be a raw (producer-specific) id
            or already a canonical one.
        source_type: Kind of source, e.g. website / repository / document /
            product / tender / oem / user / api. Defaults to "".
        confidence: Producer-supplied confidence. Defaults to 1.0.
        evidence: The artifact found (already in hand), or None.
        provenance: Origin of the signal, e.g. url / filename / tender clause /
            "customer statement". Defaults to "".
    """

    signal_id: str
    source_type: str = ""
    confidence: float = 1.0
    evidence: Optional[str] = None
    provenance: str = ""
class SignalVocabularyEntry(BaseModel):
    """A single vocabulary row: one canonical signal id plus its synonyms.

    Fields:
        id: The canonical signal id.
        aliases: Producer-specific ids that mean the same thing as ``id``.
            Defaults to a fresh empty list per instance.
    """

    id: str
    aliases: List[str] = Field(default_factory=list)
def normalize_signals(
    produced: Sequence[ProducedSignal], vocabulary: Sequence[SignalVocabularyEntry]
) -> List[IntakeSignal]:
    """Translate producer signals into canonical IntakeSignal records.

    Each produced signal id is looked up in a table built from the
    vocabulary (every canonical id maps to itself, every alias maps to its
    canonical id). Ids that are not in the table are kept unchanged, so a
    signal from a new producer is passed through rather than dropped.

    Args:
        produced: Signals as emitted by the producers, in any dialect.
        vocabulary: Canonical ids with their aliases. Entries are applied in
            order, so on a key collision the later entry wins.

    Returns:
        One IntakeSignal per produced signal, in input order, carrying the
        producer's source_type (as ``source``), confidence, evidence and
        provenance.
    """
    canonical_by_id: Dict[str, str] = {}
    for entry in vocabulary:
        # Canonical id first, then its aliases — same write order as a
        # plain nested loop, so collisions resolve identically.
        canonical_by_id[entry.id] = entry.id
        canonical_by_id.update(dict.fromkeys(entry.aliases, entry.id))

    return [
        IntakeSignal(
            source=item.source_type,
            signal=canonical_by_id.get(item.signal_id, item.signal_id),
            confidence=item.confidence,
            evidence=item.evidence,
            provenance=item.provenance,
        )
        for item in produced
    ]
@@ -20,11 +20,15 @@ from pydantic import BaseModel, Field
class IntakeSignal(BaseModel):
"""One finding a scanner/parser produced (no LLM here — the scanners are upstream)."""
"""A CANONICAL signal the Silent Pass consumes. Producer-agnostic: the same `signal` may have come
from a website, a repo, a PDF, a tender or the user — normalize_signals() unified them (see signals.py)."""
source: str # website / repository / document / product
signal: str # signal id, e.g. "sbom_file_found"
detail: str = "" # optional (url, filename) for the audit trail
source: str # source_type: website / repository / document / product / tender / user
signal: str # CANONICAL signal id, e.g. "sbom_file_found"
confidence: float = 1.0 # carried from the producer
evidence: Optional[str] = None # the artifact already in hand
provenance: str = "" # where it came from (url / filename / tender clause) — audit trail
detail: str = "" # free-text (kept for back-compat)
class SignalMapping(BaseModel):
@@ -43,6 +47,8 @@ class DetectedCapability(BaseModel):
relationship: str = "detected"
source: str = "" # which signal/source detected it (audit trail)
evidence: Optional[str] = None
confidence: float = 1.0 # carried from the producing signal
provenance: str = "" # where the signal came from
class ProductFact(BaseModel):
@@ -82,7 +88,8 @@ def silent_intake(
if m.capability and m.capability not in caps:
caps[m.capability] = DetectedCapability(
capability=m.capability, relationship=m.relationship,
source="%s:%s" % (s.source, s.signal), evidence=m.evidence)
source="%s:%s" % (s.source, s.signal), evidence=m.evidence,
confidence=s.confidence, provenance=s.provenance)
if m.evidence:
evidence.add(m.evidence)
if m.product_fact: