"""Signal Producer interface + Normalizer — one signal language for all sources (NOT new architecture). The platform already HAS scanners (website, repo/code, SBOM, security headers, TLS, SPF/DKIM/DMARC, document analysis, RAG over uploads, product classification). The Silent Pass does not want a WebsiteScanner or a RepoScanner — it wants their UNIFIED output. So every source (a scanner, a PDF parser, a tender parser, an API, or the user) emits the SAME `ProducedSignal` {signal_id, source_type, confidence, evidence, provenance}, and `normalize_signals` reduces producer- specific signal ids to ONE canonical signal id via a vocabulary (id + aliases) — exactly the Requirement-Source / MCAP / regulation-alias pattern. The Silent Pass then never gets per-scanner logic. A common DATA FORMAT, not a new module/framework. Later a tender (`requires_sbom`) or an OEM spec (`supplier_requires_psirt`) produces the same stream as a website — the Silent Pass cannot tell the difference. Pure, deterministic, no I/O. Python 3.9 compatible. """ from __future__ import annotations from typing import Dict, List, Optional, Sequence from pydantic import BaseModel, Field from .silent_intake import IntakeSignal class ProducedSignal(BaseModel): """What ANY signal producer emits — the common interface every source agrees on.""" signal_id: str # raw or canonical id the producer used source_type: str = "" # website / repository / document / product / tender / oem / user / api confidence: float = 1.0 evidence: Optional[str] = None # the artifact found (already in hand) provenance: str = "" # url / filename / tender clause / "customer statement" class SignalVocabularyEntry(BaseModel): """One canonical signal + the producer-specific aliases that mean the same thing.""" id: str aliases: List[str] = Field(default_factory=list) def normalize_signals( produced: Sequence[ProducedSignal], vocabulary: Sequence[SignalVocabularyEntry] ) -> List[IntakeSignal]: """Reduce heterogeneous producer signals to the canonical IntakeSignal stream (alias resolution). Unknown signal ids pass through unchanged (a new producer's signal stays visible, not silently dropped). Deterministic; carries confidence/evidence/provenance for the audit trail. """ alias: Dict[str, str] = {} for v in vocabulary: alias[v.id] = v.id for a in v.aliases: alias[a] = v.id out: List[IntakeSignal] = [] for p in produced: canonical = alias.get(p.signal_id, p.signal_id) out.append(IntakeSignal( source=p.source_type, signal=canonical, confidence=p.confidence, evidence=p.evidence, provenance=p.provenance)) return out