"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.

Contains:
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): Demo data for testing
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
"""

import base64
import json
import logging
import os
import re
import uuid
from typing import List, Optional

import httpx

from vocab_worksheet_models import VocabularyEntry

# Module-level logger; all extraction and parsing diagnostics go through it.
logger = logging.getLogger(__name__)

# Ollama configuration. Both values can be overridden via environment variables.
# OLLAMA_URL is the base URL of the Ollama server; this module calls
# "<OLLAMA_URL>/api/tags" as a health check and "<OLLAMA_URL>/api/chat" for extraction.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# VISION_MODEL is the model name sent in the chat payload (env: OLLAMA_VISION_MODEL).
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

# Prompt (written in German) that is sent as the user message together with the
# base64-encoded page image. It instructs the model to answer with JSON only, shaped as
# {"vocabulary": [{"english": ..., "german": ..., "example": ..., "word_type": ...}]}.
# These are the same keys that parse_vocabulary_json() reads, so any change to the
# requested format here must be mirrored in the parser.
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary entries from a single page image.

    By default the image is sent to the Ollama Vision LLM. With
    ``use_hybrid=True`` the hybrid extractor (PaddleOCR + LLM) is tried first;
    the Vision LLM is then used as a fallback when the hybrid extractor cannot
    be imported, raises, reports an error, or returns no entries.

    Args:
        image_data: Image bytes
        filename: Original filename, used for logging only
        page_number: 0-indexed page number; reported 1-based in
                     ``source_page`` and in error messages
        use_hybrid: If True, try the hybrid PaddleOCR + LLM extractor first.
                    If False (default), go straight to the Vision LLM.

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message).
        On success error_message is an empty string. On failure the list is
        empty, confidence is 0.0 and error_message is a German message
        prefixed with the 1-based page number. For the Vision LLM path the
        confidence is a fixed estimate: 0.85 if any entry was parsed, else 0.1.
    """
    page_label = page_number + 1  # pages are reported 1-based to the user

    # ==========================================================================
    # OPTIONAL: hybrid approach (PaddleOCR + LLM), only when use_hybrid=True
    # ==========================================================================
    if use_hybrid:
        try:
            # Imported lazily so the module still loads where PaddleOCR is missing.
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to Vision LLM fallback
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects, dropping incomplete pairs
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        # Keep the word type when the hybrid extractor provides one,
                        # consistent with the Vision LLM path.
                        word_type=v.get("word_type"),
                        source_page=page_label
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""
            # No error but no entries either: fall through to the Vision LLM.

        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            # Best effort: any hybrid failure degrades to the Vision LLM path.
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            logger.debug("Hybrid extraction traceback", exc_info=True)

    # ==========================================================================
    # DEFAULT / FALLBACK: Vision LLM via Ollama (model configured in VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    # Vision models can be slow: allow 5 minutes per page. This is the timeout
    # that was effectively in force before (the per-request value overrode the
    # larger client-level one).
    request_timeout = 300.0

    try:
        # First check if Ollama is reachable, with a short timeout, so an
        # unavailable server is reported quickly instead of after minutes.
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_label}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_label}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        async with httpx.AsyncClient(timeout=request_timeout) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            response.raise_for_status()

            data = response.json()
            # Tolerate "message": null / "content": null instead of crashing on them.
            message = data.get("message") or {}
            extracted_text = message.get("content") or ""

        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)

        # The parser does not know the page; stamp it on each entry here.
        for v in vocabulary:
            v.source_page = page_label

        # Fixed heuristic: no real confidence is available from the model.
        confidence = 0.85 if vocabulary else 0.1

        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

        return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_label}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        # Top-level boundary: report the failure to the caller as an error tuple.
        logger.exception(f"Vocabulary extraction failed for {filename}: {e}")
        return [], 0.0, f"Seite {page_label}: Fehler - {str(e)[:50]}"


def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a small fixed vocabulary list for testing without a Vision LLM.

    Returns:
        Eight entries (English term, German translation, example sentence),
        each with a freshly generated UUID string as its id.
    """
    # (english, german, example sentence)
    samples = (
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    )

    entries = []
    for english_term, german_term, sentence in samples:
        entry = VocabularyEntry(
            id=str(uuid.uuid4()),
            english=english_term,
            german=german_term,
            example_sentence=sentence,
        )
        entries.append(entry)
    return entries


def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of a raw LLM response.

    The response is expected to contain a JSON object shaped like
    ``{"vocabulary": [{"english": ..., "german": ..., ...}, ...]}``, possibly
    surrounded by extra text. The span from the first ``{`` to the last ``}``
    is parsed with increasingly lenient strategies:

    1. Strict ``json.loads``.
    2. Lenient parse: stray control characters are removed and literal
       newlines/tabs inside string values are accepted.
    3. Trailing commas and unquoted keys are repaired, then parsed leniently.
    4. If all of the above fail, individual entries are pulled out of the
       full text with a regex (``word_type`` is not recovered in this mode).

    Args:
        text: Raw text returned by the LLM.

    Returns:
        Entries that have both a non-empty English and a non-empty German
        term. ``source_page`` is not set here. An empty list is returned when
        nothing usable is found; unexpected errors are logged and also yield
        an empty list instead of propagating.
    """

    def load_lenient(candidate: str) -> Optional[dict]:
        """Parse *candidate*, allowing raw control characters inside strings."""
        try:
            # strict=False makes json accept literal newlines/tabs inside
            # string values. Blindly escaping every newline in the text would
            # also escape the structural ones between tokens and break the
            # document, so the decoder is told to tolerate them instead.
            loaded = json.loads(candidate, strict=False)
        except json.JSONDecodeError:
            return None
        return loaded if isinstance(loaded, dict) else None

    def try_parse_json(json_str: str) -> Optional[dict]:
        """Run strategies 1-3; return the parsed object or None if all fail."""
        # Strategy 1: Direct, strict parse
        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # Strategy 2: Drop control characters (except tab/newline/CR) and parse leniently
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', json_str)
        parsed = load_lenient(cleaned)
        if parsed is not None:
            return parsed

        # Strategy 3: Fix common LLM mistakes on top of the cleaned text
        # Remove trailing commas before } or ]
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        # Quote bare keys
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
        return load_lenient(fixed)

    def text_field(entry: dict, key: str) -> str:
        """Return entry[key] stripped, or "" when it is missing or not a string."""
        value = entry.get(key)
        return value.strip() if isinstance(value, str) else ""

    try:
        # Find JSON in response (may have extra text around it)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = try_parse_json(text[start:end])

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'

            # findall yields one (english, german, example) tuple per match;
            # an unmatched optional group comes back as "".
            for english_raw, german_raw, example_raw in re.findall(
                pattern, text, re.IGNORECASE | re.DOTALL
            ):
                english = english_raw.strip()
                german = german_raw.strip()
                example = example_raw.strip() if example_raw else None

                if english and german:
                    vocabulary.append(
                        VocabularyEntry(
                            id=str(uuid.uuid4()),
                            english=english,
                            german=german,
                            example_sentence=example,
                        )
                    )

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        entries = data.get("vocabulary", [])
        if not isinstance(entries, list):
            logger.warning("'vocabulary' in response is not a list")
            return []

        vocabulary = []
        for entry in entries:
            # Skip a malformed entry on its own rather than letting it raise
            # and throw away every valid entry on the page.
            if not isinstance(entry, dict):
                continue

            english = text_field(entry, "english")
            german = text_field(entry, "german")

            # Skip entries that look like hallucinations (implausibly long terms)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocabulary.append(
                VocabularyEntry(
                    id=str(uuid.uuid4()),
                    english=english,
                    german=german,
                    example_sentence=entry.get("example"),
                    word_type=entry.get("word_type"),
                )
            )

        return vocabulary

    except Exception as e:
        # Parsing is best effort: log with traceback and report "nothing found".
        logger.exception(f"Failed to parse vocabulary JSON: {e}")
        return []