# breakpilot-compliance/backend-compliance/compliance/api/agent_check/_state.py

"""Shared state for the compliance-check pipeline.

The 7-step pipeline accumulates ~60 named values that flow across
phases (doc_entries, profile, results, banner_result, cmp_vendors,
scorecard, HTML blocks, …). Rather than threading 60 parameters
through each function, we pass one mutable `CheckState` dict.

Phases read what they need with `state[key]` and write their outputs
with `state[key] = value`. This is intentionally untyped: enforcing
strict typing would require freezing the schema before all phases
landed, and the report-building phase routinely adds new optional
keys (P1, P10, P50, P59b, P82, P103, P104, P106, …).

`new_state(check_id, req)` initialises the dict with the few
keys that must exist from the start.
"""

from __future__ import annotations


def new_state(check_id: str, req) -> dict:
    """Build the initial state dict for one check run.

    Each call assembles a brand-new dict whose container values
    (lists, dicts, the set) are created on the spot, so two runs never
    share a mutable default. The keys seeded here are the ones that
    downstream phases assume exist (e.g. `cmp_vendors` starts as `[]`).

    `check_id` and `req` are stored unchanged under the keys of the
    same name; every other key starts at an empty or neutral value.
    """
    # Identity of the run: what is being checked and the request itself.
    run_identity = {"check_id": check_id, "req": req}

    # Phase-1 outputs
    phase_1 = {
        "doc_texts": {},
        "doc_entries": [],
        "url_text_cache": {},
        "pasted_table_vendors": [],
        "placement_findings": [],
    }

    # Phase-2/3/4 outputs
    phase_2_to_4 = {
        "profile": None,
        "profile_dict": {},
        "results": [],
        "total_findings": 0,
        "business_scope": set(),
        "banner_result": None,
        "banner_url": "",
        "tcf_vendors": [],
        "vvt_entries": [],
        "extracted_profile": {},
    }

    # Phase-5 outputs
    phase_5 = {
        "cmp_vendors": [],
        "cookie_audit": {},
        "cookie_evidence_slices": None,
        "cookie_evidence_meta": None,
        "scorecard": {},
        "full_html": "",
        "audit_quality_findings": [],
    }

    # Phase-6/7 outputs
    phase_6_to_7 = {
        "email_result": {"status": "skipped"},
        "site_name": "",
    }

    # Merge in phase order so key insertion order stays stable.
    return {
        **run_identity,
        **phase_1,
        **phase_2_to_4,
        **phase_5,
        **phase_6_to_7,
    }