# breakpilot-compliance/backend-compliance/compliance/journey_matcher/engine.py

"""Journey Matcher — the Delta -> Journey function of the Capability Delta Engine.

Three INDEPENDENT functions now compose the pipeline, each a different problem, all interchangeable:
  1. Evidence   -> Capability   (Company 2A)
  2. Capability -> Delta        (RS-005, transition_reasoning)
  3. Delta      -> Journey      (THIS module)

The paradigm shift: a Journey is no longer the CAUSE (Goal -> Journey -> Delta) but the EXPLANATION
(Goal -> Required -> Delta -> Journey). The matcher does NOT look at certifications, regulations,
tenders, OEM specs or the goal — it looks ONLY at the Capability Delta and asks: which known journeys
describe exactly this delta? Output is a ranked, auditable explanation ("Journey A explains 82% of the
delta, because 8 of 10 missing capabilities are identical, same target type, ...").

Deliberately DUMB and deterministic: pure set overlap, NO ML, NO embeddings, NO LLM. A learning ranker
can be layered ON TOP later; this core stays auditable. Journey signatures are INJECTED (certificate-
agnostic capability clusters), never loaded here — the engine stays hermetic. No new corpus, no
graph/meta-model class (freeze v1.0). Python 3.9 compatible.

Honesty: `score` is the share of the DELTA a journey explains (recall over the customer's missing
capabilities), never a "fit" or a compliance verdict. `journey_only` documents where a journey reaches
BEYOND this delta, so a broad journey that explains everything is not silently preferred.
"""

from __future__ import annotations

from typing import List, Optional, Sequence

from .schemas import (
    JourneyMatch,
    JourneyMatchReason,
    JourneyMatchResult,
    JourneySignature,
    MatchContext,
)


def _context_signals(journey: JourneySignature, context: Optional[MatchContext]) -> List[str]:
    """Collect the context attributes on which the journey and the request agree.

    The returned labels are corroborating evidence only; this helper produces
    text and never a number, so it cannot influence a score.

    Args:
        journey: Signature whose ``target_type``, ``industry`` and
            ``product_type`` are compared.
        context: Optional request context carrying the same three attributes.

    Returns:
        One label per attribute that is set (truthy) on both sides and equal,
        in the fixed order target type, industry, product type. Empty when
        ``context`` is ``None``.
    """
    if context is None:
        return []

    # (attribute name, label emitted when both sides agree) — order fixes the output order.
    comparisons = (
        ("target_type", "gleiche Zielart"),
        ("industry", "gleiche Branche"),
        ("product_type", "gleicher Produkttyp"),
    )

    agreed: List[str] = []
    for attr, label in comparisons:
        wanted = getattr(context, attr)
        if not wanted:
            # Unset on the request side: nothing to corroborate.
            continue
        offered = getattr(journey, attr)
        if not offered:
            # Unset on the journey side: nothing to corroborate.
            continue
        if wanted == offered:
            agreed.append(label)
    return agreed


def match_journeys(
    delta: Sequence[str],
    journeys: Sequence[JourneySignature],
    context: Optional[MatchContext] = None,
) -> JourneyMatchResult:
    """Rank known journeys by the share of the Capability Delta they explain.

    For every journey the overlap between the (de-duplicated) delta and the
    journey's capability pattern is computed; the reported score is
    ``|delta INTERSECT pattern| / |delta|`` rounded to two decimals, or 0.0
    for an empty delta.

    Ranking is deterministic: exact overlap count descending (equivalent to the
    unrounded score, since every journey shares the same denominator), then
    number of context signals descending (corroboration only), then
    ``journey_id`` ascending. Ranking and the ``best`` selection use the exact
    overlap rather than the rounded score, so a small but real overlap on a
    large delta is neither dropped nor reordered by rounding.

    Args:
        delta: The customer's missing capabilities; duplicates are ignored.
        journeys: Injected journey signatures to compare against the delta.
        context: Optional context used only to document corroborating signals;
            it never changes a score.

    Returns:
        A ``JourneyMatchResult`` with the de-duplicated delta size, all matches
        in rank order, the top match (``None`` if no journey shares any
        capability with the delta) and a human-readable headline.
    """
    delta_set = set(delta)
    n = len(delta_set)
    matches: List[JourneyMatch] = []
    for j in journeys:
        pattern = set(j.capability_pattern)
        matched = sorted(delta_set & pattern)
        # Guard the division: an empty delta cannot be explained by anything.
        score = (len(matched) / n) if n else 0.0
        signals = _context_signals(j, context)
        reason = JourneyMatchReason(
            matched_capabilities=matched,
            unexplained_delta=sorted(delta_set - pattern),
            # Where the journey reaches beyond this delta (breadth is documented, not rewarded).
            journey_only=sorted(pattern - delta_set),
            context_signals=signals,
        )
        matches.append(
            JourneyMatch(
                journey_id=j.journey_id,
                label=j.label,
                # Rounded for presentation only; ranking below uses the exact overlap.
                score=round(score, 2),
                explains="%d von %d fehlenden Capabilities" % (len(matched), n),
                reason=reason,
            )
        )

    # Sort on the exact overlap count, not the rounded score: with more than 100
    # delta capabilities different overlaps can round to the same two-decimal
    # value, which would let context signals decide a ranking that the overlap
    # itself should decide.
    matches.sort(
        key=lambda m: (
            -len(m.reason.matched_capabilities),
            -len(m.reason.context_signals),
            m.journey_id,
        )
    )

    # A journey "explains" the delta as soon as it shares one capability, even
    # when that share rounds to 0.00.
    best = matches[0] if matches and matches[0].reason.matched_capabilities else None
    if best is not None:
        explaining = sum(1 for m in matches if m.reason.matched_capabilities)
        headline = "%d Journeys erklaeren das Delta; beste: %s (%d%% des Deltas)" % (
            explaining,
            best.label,
            round(best.score * 100),
        )
    else:
        headline = "Keine bekannte Journey erklaert dieses Delta (neue Journey-Kandidatin)"
    return JourneyMatchResult(delta_size=n, matches=matches, best=best, headline=headline)