# breakpilot-compliance/backend-compliance/scripts/audit_diagnose.py

#!/usr/bin/env python3
"""Diagnose helper: for each failing template + missing check,
show the patterns and the closest substring in the rendered template.
Helps decide whether to fix the Template content or the regex pattern."""
from __future__ import annotations

import json
import os
import re
import sys
from typing import Optional

import psycopg2
from psycopg2.extras import RealDictCursor

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from compliance.services.doc_checks.runner import _CHECKLIST_MAP  # noqa: E402

# Re-use the same rendering as the audit script
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from audit_template_completeness import (  # noqa: E402
    TEMPLATE_TO_DOCTYPE, DEMO_PLACEHOLDERS,
    render_placeholders, strip_handlebars_blocks,
)


def keyword_hits(
    text: str,
    keywords: list[str],
    window: int = 80,
    max_hits: int = 3,
) -> list[str]:
    """Return short context snippets where any keyword appears (case-insensitive).

    Args:
        text: The text to search.
        keywords: Literal strings to look for; they are escaped, not treated
            as regular expressions. Empty keywords are skipped.
        window: Total number of context characters to include around a match
            (half before, half after).
        max_hits: Maximum number of snippets to return across all keywords.

    Returns:
        Up to ``max_hits`` snippets of the form ``"… <context> …"``, in keyword
        order and then match order. Newlines inside a snippet are replaced by
        spaces.
    """
    hits: list[str] = []
    if max_hits <= 0:
        return hits
    half = window // 2
    for kw in keywords:
        if not kw:
            # An empty pattern matches at every position and yields only noise.
            continue
        # Search the original text with IGNORECASE instead of a lowercased
        # copy: str.lower() can change the string length (e.g. "İ" becomes two
        # code points), which would make match offsets invalid for slicing.
        for m in re.finditer(re.escape(kw), text, re.IGNORECASE):
            start = max(0, m.start() - half)
            end = min(len(text), m.end() + half)
            snippet = text[start:end].replace("\n", " ").strip()
            hits.append(f"… {snippet} …")
            if len(hits) >= max_hits:
                return hits
    return hits


def _pattern_keywords(patterns: list[str], per_pattern: int = 3, limit: int = 8) -> list[str]:
    """Extract rough literal search words from regex pattern strings.

    Takes the first ``per_pattern`` runs of four or more letters from each
    pattern, de-duplicates them while keeping first-seen order, and returns at
    most ``limit`` words.
    """
    words: list[str] = []
    for pattern in patterns:
        words.extend(re.findall(r"[a-zÀ-ž]{4,}", pattern, re.IGNORECASE)[:per_pattern])
    return list(dict.fromkeys(words))[:limit]


def diagnose_template(tpl_id: str, json_path: str = "/tmp/template_audit_report.json") -> None:
    """Print a diagnosis of every missing L1 check for one audited template.

    Reads the audit report at ``json_path``, finds the entry whose
    ``template_id`` equals ``tpl_id``, loads the template content from the
    database, renders it, and for each missing check prints its patterns plus
    up to three snippets of the rendered template that contain words taken
    from those patterns.

    Args:
        tpl_id: Template id as stored in the audit report and the database.
        json_path: Path to the JSON audit report (a list of entries).

    Raises:
        KeyError: If the ``DATABASE_URL`` environment variable is not set.
    """
    with open(json_path, encoding="utf-8") as f:
        audit = json.load(f)
    entry = next((a for a in audit if a["template_id"] == tpl_id), None)
    if not entry or not entry.get("doc_type"):
        print("Not found or no doc_type")
        return
    print(f"\n=== {entry['template_type']} ({entry['language']}) — {entry['title']} ===")
    print(f"doc_type: {entry['doc_type']} | L1: {entry['l1_passed']}/{entry['l1_total']}")
    print(f"Missing: {len(entry['l1_missing'])}")

    # Load template content. The connection is closed on every exit path so
    # that diagnosing many templates in one run does not leak connections.
    dsn = os.environ["DATABASE_URL"]
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(
                "SELECT content FROM compliance.compliance_legal_templates WHERE id=%s",
                (tpl_id,),
            )
            row = cur.fetchone()
    finally:
        conn.close()
    if not row:
        print("Template not in DB")
        return
    rendered = render_placeholders(strip_handlebars_blocks(row["content"]))

    # Look up the checklist for this doc_type and index its checks by id.
    checklist, _label = _CHECKLIST_MAP.get(entry["doc_type"], ([], ""))
    by_id = {c["id"]: c for c in checklist}

    for miss in entry["l1_missing"]:
        chk = by_id.get(miss["id"])
        print(f"\n  ✗ {miss['label']} (id={miss['id']})")
        if not chk:
            print("    Pattern: (not found in checklist)")
            continue
        patterns = chk.get("patterns", [])
        print(f"    Patterns ({len(patterns)}):")
        for p in patterns[:5]:
            print(f"      {p}")
        # Heuristic keywords: literal-looking words pulled out of the patterns.
        keywords = _pattern_keywords(patterns)
        if keywords:
            print(f"    Searched keywords: {keywords}")
        hits = keyword_hits(rendered, keywords)
        if hits:
            print("    Closest template snippets:")
            for h in hits[:3]:
                print(f"      • {h[:160]}")
        else:
            print("    No keyword hits — likely genuinely missing content.")


if __name__ == "__main__":
    # Usage: audit_diagnose.py [<template_id> | all] [<audit_report.json>]
    # With no template id (or the literal "all"), every report entry that has
    # a doc_type and at least one missing L1 check is diagnosed.
    cli_args = sys.argv[1:]
    json_path = cli_args[1] if len(cli_args) >= 2 else "/tmp/template_audit_report.json"
    target = cli_args[0] if cli_args else "all"
    if target == "all":
        with open(json_path) as report_file:
            report = json.load(report_file)
        for item in report:
            if not (item.get("doc_type") and item.get("l1_missing")):
                continue
            diagnose_template(item["template_id"], json_path)
    else:
        diagnose_template(target, json_path)