"""
Document Checker with Master Controls — deterministic keyword verification.

Checks ALL doc_check_controls for the given doc_type using keyword
extraction from pass_criteria/fail_criteria. No LLM needed for the
primary check — results are 100% deterministic and reproducible.

Flow:
  Document text + doc_type
    → Load ALL MCs from compliance.doc_check_controls WHERE doc_type = ?
    → For each MC: extract keywords from pass_criteria
    → Match keywords against the lowercased document text (plain substring match)
    → PASS if enough pass_criteria met AND no fail_criteria triggered
    → Returns structured results compatible with CheckItem format
"""

import logging
import os
import re
from typing import Optional

import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Ollama endpoint and model used by the LLM enrichment of failed checks
# (_enrich_fails_with_llm). Both can be overridden via environment variables.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b")

# Minimum keyword match ratio to consider a criterion "met"
PASS_THRESHOLD = 0.5  # At least 50% of extracted keywords must match


async def check_document_with_controls(
    text: str,
    doc_type: str,
    doc_title: str,
    db_url: str = "",
    max_controls: int = 0,  # 0 = no limit, check ALL
    use_agent: bool = False,  # Use LLM agent for intelligent evaluation
    business_scope: set[str] | None = None,
) -> list[dict]:
    """Check document against ALL doc_check_controls for this doc_type.

    Two modes:
    - use_agent=False (default): Deterministic keyword matching. Fast, reproducible.
    - use_agent=True: LLM agent with tool calling. Intelligent, contextual.
    """
    if use_agent:
        try:
            from compliance.services.compliance_agent import run_compliance_check
            return await run_compliance_check(text, doc_type, doc_title)
        except Exception as e:
            logger.warning("Agent mode failed, falling back to regex: %s", e)
    if not text or len(text) < 100:
        return []

    mapped_type = _map_doc_type(doc_type)

    # Load ALL controls for this doc_type
    controls = await _load_controls(mapped_type, db_url, max_controls, business_scope)
    if not controls:
        logger.info("No MCs for doc_type '%s' (%s)", mapped_type, doc_title)
        return []

    logger.info("Checking %d MCs for '%s' (%s)", len(controls), doc_title, mapped_type)

    text_lower = text.lower().replace("\xad", "")  # Strip soft hyphens
    results = []

    for mc in controls:
        result = _check_mc_deterministic(text_lower, mc)
        if result:
            results.append(result)

    # Semantic fallback (Phase 3): MCs that failed via regex get a second
    # chance via BGE-M3 cosine similarity. BMW writes "Speicherdauer 2
    # Jahre" — the regex misses, embedding catches it.
    failed_ids = {r.get("control_id") for r in results
                  if not r.get("passed") and r.get("control_id")}
    if failed_ids:
        try:
            from compliance.services.mc_embedding_matcher import (
                ensure_mc_embeddings, embedding_match,
            )
            await ensure_mc_embeddings()  # idempotent: only embeds new MCs
            failed_mcs = [c for c in controls if c.get("control_id") in failed_ids]
            semantic_passes = await embedding_match(
                text, failed_mcs, doc_type=mapped_type,
            )
            if semantic_passes:
                for r in results:
                    cid = r.get("control_id")
                    if cid and cid in semantic_passes and not r.get("passed"):
                        r["passed"] = True
                        r["matched_text"] = "[semantischer Treffer via Embedding]"
                        r["hint"] = (r.get("hint") or "") + " (passed via Embedding-Match, BGE-M3 cosine)"
        except Exception as e:
            logger.warning("Embedding fallback skipped: %s", e, exc_info=True)

    passed = sum(1 for r in results if r["passed"])
    failed_results = [r for r in results if not r["passed"]]
    logger.info("MC results: %d passed, %d failed out of %d for '%s'",
                passed, len(failed_results), len(results), doc_title)

    # LLM Interpretation: enrich FAILs with context-specific recommendations
    if failed_results:
        try:
            await _enrich_fails_with_llm(text, failed_results, doc_title)
        except Exception as e:
            logger.warning("LLM interpretation skipped: %s", e)

    return results


def _check_mc_deterministic(text_lower: str, mc: dict) -> Optional[dict]:
    """Evaluate one master control against the document by keyword matching.

    Each pass criterion is turned into keywords; it counts as met when at
    least ``PASS_THRESHOLD`` of those keywords occur as substrings of the
    document. A fail criterion triggers when at least 70% of its keywords
    occur. The control passes when at least 60% of the pass criteria are
    met and no fail criterion triggered.

    Args:
        text_lower: Document text, already lowercased by the caller.
        mc: Control row. Reads ``check_question``, ``pass_criteria``,
            ``fail_criteria``, ``severity``, ``control_id``/``id``,
            ``title``, ``regulation`` and ``article``. The criteria may be
            lists, JSON-encoded strings, plain strings or ``None``.

    Returns:
        A result dict in the CheckItem-compatible shape, or ``None`` when
        the control has no check question or no usable pass criteria.
    """
    question = mc.get("check_question", "")
    if not question:
        return None

    pass_crit = _normalize_criteria(mc.get("pass_criteria"))
    if not pass_crit:
        return None
    # A NULL column arrives as an explicit None (the key exists), so a
    # .get() default would not help here; normalisation handles it.
    fail_crit = _normalize_criteria(mc.get("fail_criteria"))

    # Check how many pass_criteria are met
    criteria_met = 0
    total_criteria = len(pass_crit)
    evidence = ""

    for criterion in pass_crit:
        keywords = _extract_keywords(criterion)
        if not keywords:
            criteria_met += 1  # Empty criterion = auto-pass
            continue

        matched = sum(1 for kw in keywords if kw in text_lower)
        if matched / len(keywords) < PASS_THRESHOLD:
            continue

        criteria_met += 1
        if evidence:
            continue
        # Keep a short excerpt around the first matching keyword as evidence.
        for kw in keywords:
            idx = text_lower.find(kw)
            if idx >= 0:
                start = max(0, idx - 30)
                end = min(len(text_lower), idx + len(kw) + 30)
                evidence = text_lower[start:end].strip()
                break

    # Check fail_criteria (any triggered criterion vetoes the pass)
    fail_triggered = False
    for criterion in fail_crit:
        keywords = _extract_keywords(criterion)
        if not keywords:
            continue
        matched = sum(1 for kw in keywords if kw in text_lower)
        if matched >= len(keywords) * 0.7:  # 70% of fail keywords match
            fail_triggered = True
            break

    # Decision: PASS if majority of criteria met and no fail triggered
    passed = (criteria_met >= total_criteria * 0.6) and not fail_triggered

    severity = (mc.get("severity") or "MEDIUM").upper()
    control_id = mc.get("control_id", str(mc.get("id", ""))[:8])

    return {
        "id": f"mc-{control_id}",
        "control_id": control_id,
        # ``or ""`` guards against a NULL title, which would break slicing.
        "label": (mc.get("title") or "")[:80],
        "passed": passed,
        "severity": severity,
        "matched_text": evidence[:100] if passed else "",
        "level": 2,
        "parent": None,
        "skipped": False,
        "hint": question if not passed else "",
        "source": "master_control",
        "criteria_met": f"{criteria_met}/{total_criteria}",
        "regulation": mc.get("regulation") or "",
        "article": mc.get("article") or "",
    }


def _normalize_criteria(raw: object) -> list[str]:
    """Coerce a criteria value into a list of criterion strings.

    Accepts a list/tuple, a JSON-encoded value, a plain (non-JSON) string
    or ``None``. A string that is not valid JSON is treated as one single
    criterion. Non-string list items and any other shape (numbers, dicts,
    ``None``) are dropped, so callers can always iterate the result and
    pass each item to the keyword extractor.
    """
    import json

    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except (ValueError, RecursionError):
            return [raw] if raw else []
    if isinstance(raw, str):
        # JSON-encoded single string: one criterion, not a char sequence.
        return [raw] if raw else []
    if isinstance(raw, (list, tuple)):
        return [item for item in raw if isinstance(item, str)]
    return []


# Tokens with fewer characters than this are too generic to be useful.
_MIN_KEYWORD_LEN = 4

# Common German function words that are never used as keywords.
_STOP_WORDS = {
    "oder", "und", "der", "die", "das", "ein", "eine", "einer", "eines",
    "von", "vom", "zur", "zum", "mit", "auf", "aus", "fuer", "für",
    "bei", "nach", "ueber", "über", "unter", "nicht", "kein", "keine",
    "wird", "werden", "kann", "muss", "soll", "ist", "sind", "hat",
    "dass", "wenn", "ohne", "nur", "auch", "noch", "alle",
    "wie", "was", "wer", "den", "dem", "des", "als", "bis", "vor",
    "sein", "sich", "durch", "damit", "davon", "dazu", "dies", "diese",
    "dieser", "dieses", "jede", "jeder", "jedes", "andere", "anderen",
    "solche", "solcher", "welche", "welcher", "etwa", "bereits",
    "sowie", "soweit", "sofern", "falls", "hierzu", "hierbei",
    "insbesondere", "beispielsweise", "gegebenenfalls",
}


def _extract_keywords(criterion: str) -> list[str]:
    """Turn a pass/fail criterion sentence into a list of search keywords.

    The criterion is lowercased, a fixed set of punctuation characters is
    replaced by spaces, and the text is split on whitespace. Single words
    come first (skipping short words, stop words and pure numbers),
    followed by adjacent word pairs of at least 8 characters in which
    neither word is a stop word. At most 15 keywords are returned.
    """
    cleaned = re.sub(r"[()'\"\[\],;:!?]", " ", criterion.lower())
    tokens = re.sub(r"\s+", " ", cleaned).strip().split()

    singles = [
        tok
        for tok in tokens
        if len(tok) >= _MIN_KEYWORD_LEN
        and tok not in _STOP_WORDS
        and not tok.isdigit()
    ]

    # Pairs are built from the raw token stream, so short words and numbers
    # may appear inside a pair even though they are skipped as single words.
    pairs = [
        f"{left} {right}"
        for left, right in zip(tokens, tokens[1:])
        if len(left) + 1 + len(right) >= 8
        and left not in _STOP_WORDS
        and right not in _STOP_WORDS
    ]

    return (singles + pairs)[:15]  # Cap at 15 keywords per criterion


# Alias -> canonical doc_type, built from a canonical -> aliases grouping.
_DOC_TYPE_MAP = {
    alias: canonical
    for canonical, aliases in {
        "dse": ("dse", "datenschutz", "privacy", "social_media"),
        "cookie": ("cookie",),
        "impressum": ("impressum", "imprint"),
        "widerruf": ("widerruf", "withdrawal"),
        "agb": ("agb", "terms"),
        "dsfa": ("dsfa",),
        "avv": ("avv",),
        "loeschkonzept": ("loeschkonzept",),
    }.items()
    for alias in aliases
}


def _map_doc_type(doc_type: str) -> str:
    """Return the canonical doc_type for an alias; unknown values pass through unchanged."""
    try:
        return _DOC_TYPE_MAP[doc_type]
    except KeyError:
        return doc_type


# Doc types that have no MCs of their own but can borrow from a related set.
# Consulted by _load_controls only when the query for the requested doc_type
# returns no rows.
# (DB currently covers: dse, cookie, loeschkonzept, widerruf, dsfa,
#  avv, agb, impressum — total 1874 MCs across these.)
_MC_ALIAS_FALLBACK = {
    "nutzungsbedingungen": "agb",        # T&C overlap
    "terms": "agb",
    "terms_of_use": "agb",
    "social_media": "dse",               # Joint-controller / Art. 26 is in DSE area
    "joint_controller": "dse",
    "sub_processor": "avv",              # Subprocessor list = AVV annex
    "sub_processor_list": "avv",
    "scc": "avv",                        # SCC = AVV contract clauses
    "standardvertragsklauseln": "avv",
    "tom_annex": "avv",                  # TOM annex is usually an AVV attachment
    "tom": "avv",
    "dpa": "avv",
    "loeschung": "loeschkonzept",
    "loeschfristen": "loeschkonzept",
    "eu_institution": "dse",             # EU institution = DSE under VO 2018/1725
    "dsb": "dse",                        # DSB info is part of the DSE
}


# P72 — scope_doc_type values considered compatible with each operational
# doc_type. 'other' / NULL / 'process' are always kept (the backfill is a
# v1 heuristic and not reliable enough for hard filtering).
_SCOPE_COMPATIBLE: dict[str, set[str]] = {
    "dse": {"dse", "jc", "process", "tom", "accounting"},
    "cookie": {"cookie_richtlinie", "banner_implementation", "cmp_audit", "dse"},
    "cookie_policy": {"cookie_richtlinie", "banner_implementation", "cmp_audit", "dse"},
    "impressum": {"impressum", "agb"},
    "agb": {"agb", "widerruf", "impressum"},
    "nutzungsbedingungen": {"agb", "widerruf", "impressum"},
    "widerruf": {"widerruf", "agb"},
    "avv": {"avv", "tom", "jc", "process"},
    "tom": {"tom", "avv", "process"},
    "loeschkonzept": {"process", "dse", "accounting"},
    "dsfa": {"process", "tom", "dse"},
    "social_media": {"jc", "dse"},
    "dsa": {"dse", "impressum"},
    "legal_notice": {"impressum", "agb"},
    "lizenzhinweise": {"agb", "impressum"},
}
# Scope values that never cause a control to be dropped.
_PERMISSIVE_SCOPES = {"other", "process", None, "", "null"}


def _filter_by_canonical_scope(
    controls: list[dict],
    doc_type: str,
) -> list[dict]:
    """P72 — drop MCs whose canonical scope points at an incompatible doc type.

    A control is kept when its ``canonical_scope`` (trimmed, lowercased) is
    one of the permissive values ('other', 'process', NULL/empty, 'null') or
    is listed as compatible with ``doc_type``. If ``doc_type`` has no entry
    in the compatibility table, the input list is returned unchanged.
    """
    allowed = _SCOPE_COMPATIBLE.get(doc_type)
    if not allowed:
        return controls

    def _in_scope(control: dict) -> bool:
        normalized = (control.get("canonical_scope") or "").strip().lower() or None
        return normalized in _PERMISSIVE_SCOPES or normalized in allowed

    kept: list[dict] = [control for control in controls if _in_scope(control)]
    dropped = len(controls) - len(kept)
    if dropped:
        logger.info(
            "P72 scope-filter: %d/%d MCs out-of-scope fuer doc_type=%s",
            dropped, len(controls), doc_type,
        )
    return kept


def _load_text_only_ids(
    doc_type: str | None = None,
    business_scope: set[str] | None = None,
) -> set[str]:
    """Return control_ids that the Sonnet-classifier flagged as 'text'.

    Filters applied:
    1. check_type='text' (only doc-text-matchable MCs)
    2. doc_type matches (per-doc-type variant from v2-Sidecar)
    3. fits_doc_type=1 (LLM auditor approved this MC for this doc_type)
    4. scope_requires NULL or contained in business_scope
       (e.g. MCs with scope_requires='biometric_processing' are skipped
       on sites that don't do biometric processing — Art. 22 FRT-MC bei
       BMW falsch-positiv)

    `business_scope` comes from the business_profiler (set of detected
    site characteristics like 'b2c', 'shop', 'biometric_processing',
    'ai_decision_making', 'child_targeting').

    Returns empty set if the sidecar doesn't exist yet.
    """
    import sqlite3
    db_path = os.getenv("MC_CLASS_DB", "/data/mc_classification.db")
    try:
        with sqlite3.connect(db_path) as c:
            cols = [r[1] for r in c.execute("PRAGMA table_info(mc_classification)")]
            has_fit = "fits_doc_type" in cols
            has_scope = "scope_requires" in cols
            fit_clause = " AND (fits_doc_type IS NULL OR fits_doc_type = 1)" if has_fit else ""
            base = ("SELECT control_id, scope_requires FROM mc_classification "
                    "WHERE check_type = 'text'" + fit_clause) if has_scope else (
                   "SELECT control_id, NULL FROM mc_classification "
                   "WHERE check_type = 'text'" + fit_clause)
            params: list = []
            if doc_type:
                base += " AND doc_type = ?"
                params.append(doc_type)
            rows = c.execute(base, params).fetchall()
            scope = business_scope or set()
            keep: set[str] = set()
            for cid, req in rows:
                if not req:
                    keep.add(cid)
                else:
                    # Multiple requirements separated by '|' — ALL must
                    # be in scope to include. Empty req tokens are skipped.
                    needed = {r.strip().lower() for r in req.split("|") if r.strip()}
                    if needed.issubset({s.lower() for s in scope}):
                        keep.add(cid)
            return keep
    except sqlite3.OperationalError:
        return set()
    except Exception as e:
        logger.warning("MC classification lookup failed: %s", e)
        return set()


async def _load_controls(doc_type: str, db_url: str, limit: int,
                         business_scope: set[str] | None = None) -> list[dict]:
    """Load all doc_check_controls for a doc_type from PostgreSQL.

    Falls back via _MC_ALIAS_FALLBACK when no MCs exist for the requested
    type (e.g. 'nutzungsbedingungen' -> 'agb').

    Filters to only check_type='text' MCs when the classification sidecar
    yields entries — process/review MCs are routed to other modules.

    Args:
        doc_type: Operational doc type to load controls for.
        db_url: PostgreSQL DSN; when empty, ``$DATABASE_URL`` or the
            built-in default is used.
        limit: Maximum number of rows to fetch; values <= 0 mean no limit.
            The limit is applied in SQL, i.e. before the scope and
            text-only filters below.
        business_scope: Site characteristics forwarded to the sidecar
            lookup (see _load_text_only_ids).

    Returns:
        The control rows as dicts (with an extra ``canonical_scope`` key),
        or an empty list when the connection or the query fails.
    """
    try:
        import asyncpg
        db = db_url or os.getenv(
            "DATABASE_URL",
            "postgresql://breakpilot:breakpilot@bp-core-postgres:5432/breakpilot",
        )
        conn = await asyncpg.connect(db)
    except Exception as e:
        logger.warning("DB connection failed: %s", e)
        return []

    try:
        # P72: LEFT JOIN canonical_controls.scope_doc_type to carry the
        # scope information along. An MC that is explicitly classified for
        # a different doc type (e.g. 'tom' instead of 'dse') is filtered
        # out below; 'other' / NULL stay in (backfill not reliable yet).
        # NOTE(review): if dc.severity is a text column, DESC sorts it
        # alphabetically (MEDIUM, LOW, HIGH, CRITICAL), so a LIMIT would
        # cut CRITICAL controls first — confirm the column type.
        query = """SELECT dc.id, dc.control_id, dc.title, dc.regulation,
                          dc.article, dc.check_question, dc.pass_criteria,
                          dc.fail_criteria, dc.severity,
                          cc.scope_doc_type AS canonical_scope
                   FROM compliance.doc_check_controls dc
                   LEFT JOIN compliance.canonical_controls cc
                          ON cc.id = dc.control_uuid
                   WHERE dc.doc_type = $1
                   ORDER BY dc.severity DESC, dc.title"""
        # Bind the limit as a query parameter instead of formatting it
        # into the SQL text.
        extra_args: list[int] = []
        if limit > 0:
            query += " LIMIT $2"
            extra_args.append(int(limit))

        rows = await conn.fetch(query, doc_type, *extra_args)
        effective_type = doc_type
        if not rows and doc_type in _MC_ALIAS_FALLBACK:
            fallback = _MC_ALIAS_FALLBACK[doc_type]
            logger.info("No MCs for %s -> falling back to %s", doc_type, fallback)
            rows = await conn.fetch(query, fallback, *extra_args)
            effective_type = fallback

        controls = [dict(r) for r in rows]

        # P72: scope filter — drop MCs whose canonical scope_doc_type
        # explicitly points at another doc type. Conservative:
        # other/NULL/process stay in (classification too uncertain).
        # Keyed by the requested doc_type, which has its own entries in
        # the compatibility table.
        controls = _filter_by_canonical_scope(controls, doc_type)

        text_only = _load_text_only_ids(doc_type, business_scope)
        if not text_only and effective_type != doc_type:
            # The rows came from the fallback type, so look the sidecar up
            # under that type too; otherwise the text-only filter would be
            # silently skipped for every aliased doc type.
            text_only = _load_text_only_ids(effective_type, business_scope)
        if text_only:
            before = len(controls)
            controls = [c for c in controls if c.get("control_id") in text_only]
            logger.info(
                "MC filter (text only) for %s: %d/%d MCs after Sonnet check_type filter",
                doc_type, len(controls), before,
            )
        return controls
    except Exception as e:
        logger.warning("MC query failed: %s", e)
        return []
    finally:
        await conn.close()


async def _enrich_fails_with_llm(
    doc_text: str,
    failed_results: list[dict],
    doc_title: str,
) -> None:
    """Enrich failed MC results with LLM-generated context-specific advice.

    Does NOT change pass/fail (deterministic result stays). Only extends
    the 'hint' of the affected result dicts in place with a concrete
    recommendation based on the actual document content.

    Uses ONE batched LLM call for up to 10 top-severity FAILs and sends
    the first 3000 characters of the document as context.

    Args:
        doc_text: Full document text.
        failed_results: Result dicts of failed checks; each needs the
            'severity' and 'label' keys. Mutated in place.
        doc_title: Document title, used in the prompt and in log messages.

    Errors from the HTTP call or from parsing the answer are logged and
    swallowed: enrichment is best-effort.
    """
    # Select top failures by severity (max 10 to fit context window)
    sev_order = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3}
    top_fails = sorted(
        failed_results,
        key=lambda r: sev_order.get(r.get("severity", "MEDIUM"), 2),
    )[:10]

    fail_list = "\n".join(
        f"{i+1}. [{r['severity']}] {r['label']} — {(r.get('hint') or '')[:100]}"
        for i, r in enumerate(top_fails)
    )

    # Truncate document for context
    excerpt = doc_text[:3000]

    prompt = (
        "/no_think\n"
        f"Du bist ein Datenschutz-Experte. Analysiere das Dokument '{doc_title}' "
        f"und gib fuer JEDEN der folgenden fehlgeschlagenen Pruefpunkte eine "
        f"konkrete, umsetzbare Empfehlung (1-2 Saetze).\n\n"
        f"Beruecksichtige dabei den Inhalt des Dokuments — welche Dienste werden "
        f"genutzt? Welche Rechtsgrundlagen sind genannt? Was fehlt konkret?\n\n"
        f"FEHLGESCHLAGENE PRUEFPUNKTE:\n{fail_list}\n\n"
        f"DOKUMENT (Auszug):\n{excerpt}\n\n"
        f"Antworte als JSON-Array: [\n"
        f'  {{"nr": 1, "empfehlung": "Konkreter Hinweis..."}},\n'
        f'  {{"nr": 2, "empfehlung": "..."}}\n'
        f"]\n"
        f"Nur die Empfehlungen, kein anderer Text."
    )

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.3, "num_predict": 1500},
            })

        if resp.status_code != 200:
            logger.warning("LLM enrichment skipped: HTTP %s", resp.status_code)
            return

        raw = resp.json().get("response", "")
        # Drop any reasoning block the model emitted despite /no_think.
        raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()

        # Parse JSON array
        import json
        arr_match = re.search(r"\[[\s\S]*\]", raw)
        if not arr_match:
            return

        try:
            recommendations = json.loads(arr_match.group())
        except json.JSONDecodeError:
            return

        applied = _apply_llm_recommendations(recommendations, top_fails)
        logger.info("LLM enriched %d/%d fails for '%s'",
                    applied, len(top_fails), doc_title)

    except Exception as e:
        logger.warning("LLM enrichment failed: %s", e)


def _apply_llm_recommendations(recommendations: list, top_fails: list[dict]) -> int:
    """Merge parsed LLM recommendations into the hints of ``top_fails``.

    Model output is untrusted: entries that are not dicts, whose 'nr'
    cannot be read as a 1-based index into ``top_fails``, or whose
    'empfehlung' is not a non-empty string are skipped individually, so
    one malformed entry does not discard the remaining ones.

    Returns:
        The number of recommendations that were applied.
    """
    applied = 0
    for rec in recommendations:
        if not isinstance(rec, dict):
            continue
        try:
            # Models often return the number as a string ("1") or float.
            nr = int(rec.get("nr", 0))
        except (TypeError, ValueError):
            continue
        advice = rec.get("empfehlung", "")
        if not isinstance(advice, str) or not advice.strip():
            continue
        if not 1 <= nr <= len(top_fails):
            continue

        target = top_fails[nr - 1]
        existing_hint = target.get("hint", "")
        # Append LLM advice after the deterministic hint
        target["hint"] = (
            f"{existing_hint}\n\n"
            f"Empfehlung: {advice}"
        ).strip() if existing_hint else advice
        applied += 1
    return applied