c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
119 lines
5.1 KiB
Python
119 lines
5.1 KiB
Python
"""Per-document regex + MC + LLM checks for the compliance-check route.

Each document goes through:
  1. regex completeness/correctness checklist
  2. Master Control evaluation (all MCs for this doc_type)
  3. LLM verification of failed, non-skipped checks that carry a hint
     (overturns a failure where the evidence was missed)
  4. Cookie-only: opt-out + privacy-policy URL health-check
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _l2_correctness_pct(checks: list, fallback: int) -> int:
    """Return the pass rate (0-100) of the active level-2 checks.

    A check is "active" when ``level == 2`` and it is not ``skipped``.
    When no active level-2 check exists there is nothing to measure, so
    *fallback* (the score computed so far) is returned unchanged instead
    of a misleading 0.
    """
    active = [c for c in checks if c.level == 2 and not c.skipped]
    if not active:
        return fallback
    passed = sum(1 for c in active if c.passed)
    return round(passed / len(active) * 100)


async def _check_single(
    text: str, doc_type: str, label: str, url: str,
    word_count: int, use_agent: bool,
    business_scope: set[str] | None = None,
    business_profile: dict | None = None,
):
    """Run the regex, Master Control, LLM and cookie-link checks on one document.

    Args:
        text: Document text that every check runs against.
        doc_type: Document type; forwarded to the regex and MC checkers.
            The value ``"cookie"`` additionally enables the link probe.
        label: Document label, forwarded to the checkers and used in logs.
        url: Document URL, forwarded to the regex checker and the result.
        word_count: Pre-computed word count; when falsy, the number of
            whitespace-separated tokens in *text* is reported instead.
        use_agent: Forwarded to ``check_document_with_controls``.
        business_scope: Forwarded to ``check_document_with_controls``.
        business_profile: Forwarded to ``check_document_completeness``.

    Returns:
        A ``DocCheckResult`` with all collected ``CheckItem`` entries, the
        completeness/correctness percentages and the number of findings
        whose code does not contain ``"SCORE"``.

    The MC, LLM and cookie-link steps are best-effort: any exception they
    raise is logged as a warning and the step is skipped. Each of those
    steps builds its changes completely before applying them, so a failure
    midway never leaves ``checks`` and ``correctness_pct`` out of sync.
    Errors from the regex checklist itself propagate to the caller.
    """
    from compliance.services.doc_checks.runner import check_document_completeness
    from compliance.services.rag_document_checker import check_document_with_controls
    from ..agent_doc_check_routes import CheckItem, DocCheckResult

    # Regex checklist
    findings = check_document_completeness(text, doc_type, label, url,
                                           business_profile=business_profile)

    all_checks: list[CheckItem] = []
    completeness = 0
    correctness = 0

    for f in findings:
        # Findings whose code contains "SCORE" carry the per-check breakdown
        # and the two percentages; the last such finding wins.
        if "SCORE" in f.get("code", ""):
            all_checks.extend(
                CheckItem(
                    id=c["id"], label=c["label"], passed=c["passed"],
                    severity=c["severity"], matched_text=c.get("matched_text", ""),
                    level=c.get("level", 1), parent=c.get("parent"),
                    skipped=c.get("skipped", False), hint=c.get("hint", ""),
                )
                for c in f.get("all_checks", [])
            )
            completeness = f.get("completeness_pct", 0)
            correctness = f.get("correctness_pct", 0)

    # Master Control checks (all controls for this doc_type)
    try:
        # max_controls=0 -> evaluate ALL MCs for this doc_type (DB has
        # 1874 across 8 types; regex matching is cheap and dominates
        # well under 1s per doc). Caps remain on the LLM-enrich step
        # (top-10 FAILs) so cost stays bounded.
        mc_results = await check_document_with_controls(
            text, doc_type, label, max_controls=0, use_agent=use_agent,
            business_scope=business_scope,
        )
        if mc_results:
            # Build every item first so one malformed row cannot leave a
            # partial set of MC checks behind with a stale score.
            mc_items = [CheckItem(**mc) for mc in mc_results]
            all_checks.extend(mc_items)
            correctness = _l2_correctness_pct(all_checks, correctness)
    except Exception as e:
        logger.warning("MC check skipped for %s: %s", label, e)

    # LLM verification of failed, non-skipped checks that carry a hint
    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
    if failed:
        try:
            from compliance.services.doc_checks.llm_verify import verify_failed_checks
            overturns = await verify_failed_checks(
                text,
                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
                label,
            )
            # Resolve all verdicts before mutating any check, so a malformed
            # verdict skips the whole step instead of applying half of it.
            overturned = [
                (c, overturns[c.id]["evidence"])
                for c in all_checks
                if c.id in overturns and overturns[c.id]["overturned"]
            ]
            for c, evidence in overturned:
                c.passed = True
                c.matched_text = f"[LLM] {evidence}"
            correctness = _l2_correctness_pct(all_checks, correctness)
        except Exception as e:
            logger.warning("LLM verification skipped: %s", e)

    # Cookie-policy only: actively HTTP-probe the Opt-Out + Privacy-Policy
    # URLs the document advertises. Broken links make individual provider
    # entries non-compliant under Art. 7(3) DSGVO.
    if doc_type == "cookie":
        try:
            from compliance.services.cookie_link_validator import (
                extract_links, validate_links, build_check_items,
            )
            links = extract_links(text)
            if links:
                logger.info("Cookie-link validator: %d urls extracted from %s",
                            len(links), label)
                validated = await validate_links(links)
                link_items = [CheckItem(**item) for item in build_check_items(validated)]
                all_checks.extend(link_items)
                # Re-compute correctness with the new L2 items
                correctness = _l2_correctness_pct(all_checks, correctness)
        except Exception as e:
            logger.warning("Cookie-link validation skipped for %s: %s", label, e)

    non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
    return DocCheckResult(
        label=label, url=url, doc_type=doc_type,
        word_count=word_count or len(text.split()),
        completeness_pct=completeness, correctness_pct=correctness,
        checks=all_checks, findings_count=len(non_score),
    )
|