#!/usr/bin/env python3
"""Diagnose helper: for each failing template + missing check, show the patterns
and the closest substring in the rendered template. Helps decide whether to fix
the Template content or the regex pattern.

Command line (see ``main``):
    argv[1]  template id to diagnose, or "all" / omitted to diagnose every
             audit entry that has a doc_type and at least one missing check.
    argv[2]  path to the audit report JSON
             (default: /tmp/template_audit_report.json).

The template content is read from the database named by the ``DATABASE_URL``
environment variable.
"""
from __future__ import annotations

import json
import os
import re
import sys
from typing import Optional

import psycopg2
from psycopg2.extras import RealDictCursor

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from compliance.services.doc_checks.runner import _CHECKLIST_MAP  # noqa: E402

# Re-use the same rendering as the audit script
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from audit_template_completeness import (  # noqa: E402
    TEMPLATE_TO_DOCTYPE,
    DEMO_PLACEHOLDERS,
    render_placeholders,
    strip_handlebars_blocks,
)


def keyword_hits(
    text: str,
    keywords: list[str],
    window: int = 80,
    max_hits: int = 3,
) -> list[str]:
    """Return short context snippets where any keyword appears (case-insensitive).

    Args:
        text: The text to search.
        keywords: Literal keywords (not regexes); each one is escaped before
            matching. Empty keywords are skipped because they would match at
            every position.
        window: Total number of context characters around a match; half of it
            is taken before the match and half after.
        max_hits: Stop as soon as this many snippets have been collected.

    Returns:
        Up to ``max_hits`` strings of the form ``"… <snippet> …"``, with
        newlines inside the snippet replaced by spaces.
    """
    hits: list[str] = []
    if max_hits <= 0:
        return hits

    half = window // 2
    for kw in keywords:
        if not kw:
            continue
        # Match on the original text with IGNORECASE instead of on a lowercased
        # copy: lowercasing can change the length of a string for some Unicode
        # characters, which would misalign the offsets used to slice `text`.
        for m in re.finditer(re.escape(kw), text, re.IGNORECASE):
            start = max(0, m.start() - half)
            end = min(len(text), m.end() + half)
            snippet = text[start:end].replace("\n", " ").strip()
            hits.append(f"… {snippet} …")
            if len(hits) >= max_hits:
                return hits
    return hits


def _load_audit(json_path: str) -> list:
    """Load the audit report (a JSON list of per-template entries)."""
    # JSON interchange is UTF-8; do not depend on the platform default encoding.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)


def _fetch_template_row(tpl_id: str) -> Optional[dict]:
    """Fetch the template row (with a ``content`` column) or None if absent.

    Raises:
        KeyError: if the ``DATABASE_URL`` environment variable is not set.
    """
    dsn = os.environ["DATABASE_URL"]
    conn = psycopg2.connect(dsn)
    # Always release the cursor and the connection, including on errors;
    # in "all" mode this function runs once per template.
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        try:
            cur.execute(
                "SELECT content FROM "
                "compliance.compliance_legal_templates WHERE id=%s",
                (tpl_id,),
            )
            return cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()


def _pattern_keywords(patterns: list[str]) -> list[str]:
    """Derive rough search keywords from the literal words inside regex patterns.

    Takes at most the first three words (4+ letters) of each pattern,
    de-duplicates them preserving order, and keeps at most eight.
    """
    keywords: list[str] = []
    for p in patterns:
        # Extract literal words from pattern (rough)
        words = re.findall(r"[a-zÀ-ž]{4,}", p, re.IGNORECASE)
        keywords.extend(words[:3])
    return list(dict.fromkeys(keywords))[:8]


def diagnose_template(
    tpl_id: str, json_path: str = "/tmp/template_audit_report.json"
) -> None:
    """Print a diagnosis for every missing L1 check of one audited template.

    Looks up ``tpl_id`` in the audit report, loads the template content from
    the database, passes it through ``strip_handlebars_blocks`` and
    ``render_placeholders``, and for each missing check prints its patterns
    plus the closest keyword snippets found in the rendered text.

    Args:
        tpl_id: Value matched against each audit entry's ``template_id``.
        json_path: Path to the audit report JSON.

    Raises:
        KeyError: if ``DATABASE_URL`` is not set in the environment.
    """
    audit = _load_audit(json_path)
    entry = next((a for a in audit if a["template_id"] == tpl_id), None)
    if not entry or not entry.get("doc_type"):
        print("Not found or no doc_type")
        return

    print(f"\n=== {entry['template_type']} ({entry['language']}) — {entry['title']} ===")
    print(f"doc_type: {entry['doc_type']} | L1: {entry['l1_passed']}/{entry['l1_total']}")
    print(f"Missing: {len(entry['l1_missing'])}")

    # Load template content
    row = _fetch_template_row(tpl_id)
    if not row:
        print("Template not in DB")
        return
    rendered = render_placeholders(strip_handlebars_blocks(row["content"]))

    # Look up checklist
    checklist, _label = _CHECKLIST_MAP.get(entry["doc_type"], ([], ""))
    by_id = {c["id"]: c for c in checklist}

    for miss in entry["l1_missing"]:
        chk = by_id.get(miss["id"])
        print(f"\n ✗ {miss['label']} (id={miss['id']})")
        if not chk:
            print(" Pattern: (not found in checklist)")
            continue

        patterns = chk.get("patterns", [])
        print(f" Patterns ({len(patterns)}):")
        for p in patterns[:5]:
            print(f" {p}")

        # Heuristic keywords taken from the patterns' literal words
        keywords = _pattern_keywords(patterns)
        if keywords:
            print(f" Searched keywords: {keywords}")

        hits = keyword_hits(rendered, keywords)
        if hits:
            print(" Closest template snippets:")
            for h in hits[:3]:
                print(f" • {h[:160]}")
        else:
            print(" No keyword hits — likely genuinely missing content.")


def main(argv: list[str]) -> None:
    """Run the diagnosis for one template id, or for all failing templates."""
    json_path = argv[2] if len(argv) > 2 else "/tmp/template_audit_report.json"
    if len(argv) > 1 and argv[1] != "all":
        diagnose_template(argv[1], json_path)
        return

    for a in _load_audit(json_path):
        if a.get("doc_type") and a.get("l1_missing"):
            diagnose_template(a["template_id"], json_path)


if __name__ == "__main__":
    main(sys.argv)