# breakpilot-pwa/backend/claude_vision.py

"""
Claude Vision API Integration for Worksheet Analysis

Uses Anthropic's Claude 3.5 Sonnet for superior OCR and layout understanding.
"""

import os
import base64
import json
from pathlib import Path
from typing import Dict, Optional
import logging

logger = logging.getLogger(__name__)

# Optional dependency guard: try to import the Anthropic SDK.
# ANTHROPIC_AVAILABLE records whether the import succeeded so that the
# functions below can fail with a clear message instead of a NameError.
try:
    from anthropic import Anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    # The module stays importable without the SDK; a warning is logged once
    # at import time.
    ANTHROPIC_AVAILABLE = False
    logger.warning("Anthropic SDK not installed. Run: pip install anthropic")


def _get_anthropic_api_key() -> str:
    """
    Read the Anthropic API key from the ANTHROPIC_API_KEY environment variable.

    Surrounding whitespace (for example a trailing newline left over when the
    variable is populated from a file) is stripped, and a value that is empty
    after stripping is treated the same as an unset variable.

    Returns:
        The API key without leading/trailing whitespace.

    Raises:
        RuntimeError: If the variable is unset, empty or whitespace-only.
    """
    api_key = os.getenv("ANTHROPIC_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "ANTHROPIC_API_KEY ist nicht gesetzt. "
            "Bitte API-Schlüssel als Umgebungsvariable setzen:\n"
            "export ANTHROPIC_API_KEY='sk-ant-api03-...'"
        )
    return api_key


def _encode_image_to_base64(image_path: Path) -> tuple[str, str]:
    """
    Encode an image file as base64 and determine its media type.

    The media type is detected from the file's leading magic bytes so that a
    mislabelled file (for example a PNG saved as ``scan.jpg``) is declared
    correctly. If the content is not recognised, the file extension is used,
    and ``image/jpeg`` remains the final fallback.

    Args:
        image_path: Path to the image file to read.

    Returns:
        (base64_string, media_type)
    """
    image_bytes = image_path.read_bytes()
    image_b64 = base64.standard_b64encode(image_bytes).decode("utf-8")

    # Extension-based mapping, used only when the content is not recognised
    media_type_map = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }

    # Content sniffing: the first 12 bytes are enough for all four formats
    header = image_bytes[:12]
    if header.startswith(b"\xff\xd8\xff"):
        media_type = 'image/jpeg'
    elif header.startswith(b"\x89PNG\r\n\x1a\n"):
        media_type = 'image/png'
    elif header.startswith((b"GIF87a", b"GIF89a")):
        media_type = 'image/gif'
    elif header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        # RIFF container: bytes 4-7 hold the chunk size, bytes 8-11 the form type
        media_type = 'image/webp'
    else:
        media_type = media_type_map.get(image_path.suffix.lower(), 'image/jpeg')

    return image_b64, media_type


def analyze_worksheet_with_claude(
    image_path: Path,
    max_tokens: int = 2500,
    model: str = "claude-3-5-sonnet-20241022"
) -> Dict:
    """
    Analyze a worksheet image using the Claude Vision API.

    The image is sent base64-encoded together with a German system prompt and
    a user prompt describing the expected JSON schema. The first text block
    of the response is cleaned of Markdown code fences / surrounding prose
    and parsed as JSON.

    Args:
        image_path: Path to worksheet image (a plain string path is accepted
            as well and converted to ``Path``)
        max_tokens: Maximum tokens in response (default 2500)
        model: Claude model to use (default: Claude 3.5 Sonnet)

    Returns:
        Analysis dict with same structure as OpenAI version

    Raises:
        RuntimeError: If the SDK is not installed, the API key is not set,
            the response is empty, or the response is not a valid JSON object
        FileNotFoundError: If the image file does not exist
        Exception: If the API call itself fails (re-raised after logging)
    """
    if not ANTHROPIC_AVAILABLE:
        raise RuntimeError("Anthropic SDK nicht installiert. Run: pip install anthropic")

    # Tolerate callers that pass a string instead of a Path
    image_path = Path(image_path)
    if not image_path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Get API key
    api_key = _get_anthropic_api_key()

    # Initialize Anthropic client
    client = Anthropic(api_key=api_key)

    # Encode image
    image_b64, media_type = _encode_image_to_base64(image_path)

    # System prompt (instructions)
    system_prompt = """Du bist ein Experte für die Analyse von Schul-Arbeitsblättern.

Deine Aufgabe ist es, das Arbeitsblatt detailliert zu analysieren und strukturierte Informationen zu extrahieren:

1. **Gedruckter Text**: Erkenne den VOLLSTÄNDIGEN gedruckten Text inklusive durchgestrichener Wörter
2. **Handschrift**: Identifiziere alle handschriftlichen Eintragungen (Schülerantworten, Korrekturen, Notizen)
3. **Layout**: Bestimme räumliche Positionen aller Elemente (Bounding Boxes in Pixeln)
4. **Diagramme**: Erkenne gedruckte Illustrationen, Grafiken, Diagramme
5. **Farben**: Klassifiziere Handschrift nach Farbe (blau/schwarz/rot/Bleistift)

WICHTIG: Gib deine Antwort als gültiges JSON zurück, nicht als Markdown Code Block!"""

    # User prompt with JSON schema
    user_prompt = """Analysiere dieses Arbeitsblatt und gib ein JSON mit folgendem Aufbau zurück:

{
  "title": string | null,
  "subject": string | null,
  "grade_level": string | null,
  "instructions": string | null,
  "canonical_text": string | null,
  "printed_blocks": [
    {
      "id": string,
      "role": "title" | "instructions" | "body" | "other",
      "text": string
    }
  ],
  "layout": {
    "page_structure": {
      "has_diagram": boolean,
      "orientation": "portrait" | "landscape"
    },
    "text_regions": [
      {
        "id": string,
        "type": "title" | "paragraph" | "list" | "instruction",
        "text": string,
        "bounding_box": {"x": int, "y": int, "width": int, "height": int},
        "font_characteristics": {
          "is_bold": boolean,
          "approximate_size": "large" | "medium" | "small"
        }
      }
    ],
    "diagram_elements": [
      {
        "id": string,
        "type": "illustration" | "chart" | "graph" | "shape",
        "description": string,
        "bounding_box": {"x": int, "y": int, "width": int, "height": int},
        "preserve": boolean
      }
    ]
  },
  "handwriting_regions": [
    {
      "id": string,
      "text": string,
      "type": "student_answer" | "correction" | "note" | "drawing",
      "bounding_box": {"x": int, "y": int, "width": int, "height": int},
      "color_hint": "blue" | "black" | "red" | "pencil" | "unknown"
    }
  ],
  "handwritten_annotations": [
    {
      "text": string,
      "approx_location": string
    }
  ],
  "struck_through_words": [
    {
      "text": string,
      "context": string
    }
  ],
  "tasks": [
    {
      "id": string,
      "type": "cloze" | "mcq" | "short_answer" | "math" | "other",
      "description": string,
      "text_with_gaps": string | null,
      "gaps": [
        {
          "id": string,
          "solution": string,
          "position_hint": string
        }
      ]
    }
  ]
}

WICHTIGE HINWEISE:
- "canonical_text" enthält den KORRIGIERTEN gedruckten Text OHNE Handschrift und OHNE durchgestrichene Wörter
- "struck_through_words" enthält alle durchgestrichenen Wörter mit Kontext
- Bounding Boxes sind ungefähre Pixel-Positionen (x, y von oben links, width/height in Pixeln)
- "layout.text_regions" sollte alle gedruckten Textbereiche mit genauen Positionen enthalten
- "handwriting_regions" sollte alle handschriftlichen Bereiche mit Farb-Hinweisen enthalten
- Setze "preserve": true für Diagramm-Elemente die erhalten bleiben sollen
- Durchgestrichene Wörter NUR in "struck_through_words", NICHT in "canonical_text"

Gib NUR das JSON zurück, ohne Code-Block-Marker!"""

    logger.info(f"Calling Claude API for analysis of {image_path.name}")

    # Only the network call is wrapped, so that the "API call failed" log line
    # is not emitted for response-parsing problems further down.
    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": image_b64,
                            },
                        },
                        {
                            "type": "text",
                            "text": user_prompt
                        }
                    ],
                }
            ],
        )
    except Exception as e:
        logger.error(f"Claude API call failed: {e}")
        raise

    # Extract text from response
    if not response.content:
        logger.error("Empty response from Claude API")
        raise RuntimeError("Empty response from Claude API")

    # Use the first text block of the response
    text_content = next(
        (block.text for block in response.content if block.type == "text"),
        None,
    )
    if not text_content:
        logger.error("No text content in Claude response")
        raise RuntimeError("No text content in Claude response")

    logger.info(f"Received response from Claude ({len(text_content)} chars)")

    # A response cut off at the token limit almost always yields broken JSON;
    # detect it so the resulting error points at the real cause.
    truncated = getattr(response, "stop_reason", None) == "max_tokens"
    if truncated:
        logger.warning(
            "Claude response was truncated at max_tokens=%s; JSON may be incomplete",
            max_tokens,
        )

    # Claude might wrap JSON in ```json ... ``` or add prose, remove if present
    text_content = _extract_json_text(text_content)

    try:
        analysis_data = json.loads(text_content)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse Claude JSON response: {e}")
        logger.error(f"Response text: {text_content[:500]}...")
        hint = " (response truncated at max_tokens, consider increasing it)" if truncated else ""
        raise RuntimeError(
            f"Invalid JSON from Claude{hint}: {e}\nContent: {text_content[:200]}..."
        ) from e

    # The prompt asks for a JSON object; reject arrays/scalars instead of
    # handing callers something that is not a dict.
    if not isinstance(analysis_data, dict):
        logger.error(
            "Claude returned JSON of type %s instead of an object",
            type(analysis_data).__name__,
        )
        raise RuntimeError(
            f"Invalid JSON from Claude: expected an object, got {type(analysis_data).__name__}"
        )

    logger.info("Successfully parsed Claude analysis")
    return analysis_data


def _extract_json_text(raw_text: str) -> str:
    """Strip Markdown code fences and surrounding prose from a JSON reply."""
    text = raw_text.strip()

    # Leading fence, optionally tagged with a (case-insensitive) "json" label
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    # Trailing fence
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    # If prose still surrounds the payload, keep only the outermost {...} span
    if not text.startswith(("{", "[")):
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

    return text


def test_claude_connection(model: str = "claude-3-5-sonnet-20241022") -> bool:
    """
    Test if Claude API is accessible with current credentials.

    Sends a minimal real request (a single "Test" user message with
    max_tokens=10) and reports whether it completed without raising. All
    failures, including a missing API key, are logged and reported as False
    rather than raised.

    Args:
        model: Claude model to use for the probe request (defaults to the
            same model as the worksheet analysis)

    Returns:
        True if connection successful, False otherwise
    """
    if not ANTHROPIC_AVAILABLE:
        logger.error("Anthropic SDK not installed")
        return False

    try:
        client = Anthropic(api_key=_get_anthropic_api_key())

        # Simple test call; the response content itself is not needed
        client.messages.create(
            model=model,
            max_tokens=10,
            messages=[{"role": "user", "content": "Test"}]
        )
    except Exception as e:
        logger.error(f"❌ Claude API connection failed: {e}")
        return False

    logger.info("✅ Claude API connection successful")
    return True