Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
325
klausur-service/backend/vocab_worksheet_extraction.py
Normal file
325
klausur-service/backend/vocab_worksheet_extraction.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
|
||||
|
||||
Contains:
|
||||
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
|
||||
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
|
||||
- _get_demo_vocabulary(): Demo data for testing
|
||||
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
from vocab_worksheet_models import VocabularyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama Configuration
# Default URL targets the host-side Ollama daemon when running inside Docker
# (host.docker.internal); override via the OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Vision-capable model used for image-based vocabulary extraction.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

# Prompt sent to the Vision LLM alongside the page image.  The prompt is in
# German because the target worksheets come from German school books; it asks
# for strict JSON only, which is then parsed by parse_vocabulary_json().
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""
|
||||
|
||||
|
||||
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED by default: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).

    Args:
        image_data: Raw image bytes.
        filename: Original filename, used for logging only.
        page_number: 0-indexed page number; error messages report it 1-indexed.
        use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text).
                    If False (default), use the Vision LLM (slower, better for complex layouts).

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message);
        error_message is an empty string on success.
    """

    # ==========================================================================
    # HYBRID APPROACH (opt-in via use_hybrid): PaddleOCR + LLM Gateway
    # ==========================================================================
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to Vision LLM fallback
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects; rows missing either
                # the English or German side are dropped.
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ==========================================================================
    # FALLBACK: Vision LLM via Ollama (model configured by OLLAMA_VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is reachable at all (cheap /api/tags probe)
        # so a dead backend fails fast instead of after a long model timeout.
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,  # low temperature: we want deterministic JSON
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow)
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

            logger.info(f"Ollama response received: {len(extracted_text)} chars")

            # Parse JSON from response
            vocabulary = parse_vocabulary_json(extracted_text)

            # Set source_page for each entry (1-indexed for display)
            for v in vocabulary:
                v.source_page = page_number + 1

            # Heuristic confidence: the model returns no score of its own
            confidence = 0.85 if len(vocabulary) > 0 else 0.1

            logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

            return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
|
||||
|
||||
|
||||
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return a fixed set of demo vocabulary entries for testing when the
    Vision LLM backend is not available."""
    samples = [
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    ]
    entries: List[VocabularyEntry] = []
    for english, german, example in samples:
        entries.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example,
            )
        )
    return entries
|
||||
|
||||
|
||||
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary JSON from an LLM response with robust error handling.

    Four strategies are tried in order:
      1. direct json.loads on the outermost ``{...}`` span,
      2. json.loads after escaping raw control characters inside strings,
      3. json.loads after fixing trailing commas and unquoted keys,
      4. regex extraction of individual entries as a last resort.

    Returns an empty list when nothing parseable is found.
    """

    def clean_json_string(s: str) -> str:
        """Escape raw control characters that appear *inside* JSON string
        literals (a common LLM output defect).

        Structural whitespace between tokens is left untouched — replacing
        it would itself make the JSON invalid, which is exactly the bug the
        previous blanket ``replace('\\n', ...)`` approach had.
        """
        out = []
        in_string = False
        escaped = False
        for ch in s:
            if not in_string:
                out.append(ch)
                if ch == '"':
                    in_string = True
                continue
            if escaped:
                # Character after a backslash is already escaped; pass through.
                out.append(ch)
                escaped = False
            elif ch == '\\':
                out.append(ch)
                escaped = True
            elif ch == '"':
                out.append(ch)
                in_string = False
            elif ch == '\n':
                out.append('\\n')
            elif ch == '\r':
                out.append('\\r')
            elif ch == '\t':
                out.append('\\t')
            elif ord(ch) < 0x20:
                pass  # drop other control characters inside strings
            else:
                out.append(ch)
        return ''.join(out)

    def try_parse_json(json_str: str) -> dict:
        """Try multiple strategies to parse JSON; return None if all fail."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Clean control characters inside strings, then parse
        try:
            cleaned = clean_json_string(json_str)
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common structural issues
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
            # Fix unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass

        return None

    try:
        # Find JSON in response (may have extra text around it)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        json_str = text[start:end]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

            for match in matches:
                english = match[0].strip() if match[0] else ""
                german = match[1].strip() if match[1] else ""
                example = match[2].strip() if len(match) > 2 and match[2] else None

                if english and german:
                    vocab_entry = VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        example_sentence=example,
                    )
                    vocabulary.append(vocab_entry)

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        vocabulary = []
        for i, entry in enumerate(data.get("vocabulary", [])):
            english = entry.get("english", "").strip()
            german = entry.get("german", "").strip()

            # Skip entries that look like hallucinations (very long or containing unusual patterns)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocab_entry = VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=entry.get("example"),
                word_type=entry.get("word_type"),
            )
            vocabulary.append(vocab_entry)

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
|
||||
Reference in New Issue
Block a user