Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,325 @@
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
Contains:
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): Demo data for testing
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
"""
import base64
import json
import logging
import os
import re
import uuid
from typing import List
import httpx
from vocab_worksheet_models import VocabularyEntry
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Ollama Configuration
# Both settings are read from the environment once, at import time; the second
# argument is the fallback used when the variable is unset.
# OLLAMA_URL is the base URL to which "/api/tags" (health check) and "/api/chat"
# (extraction request) are appended.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model name sent in the "model" field of the chat request.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================
# German-language instruction sent as the user message together with the page
# image. It asks the model to answer with JSON only: an object whose
# "vocabulary" list holds entries with the keys "english", "german", "example"
# (null when there is no example sentence) and, where the page shows a part of
# speech, "word_type". parse_vocabulary_json() reads exactly these keys, so the
# key names in this prompt and in the parser must stay in sync.
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
{
"vocabulary": [
{
"english": "to improve",
"german": "verbessern",
"example": "I want to improve my English."
}
]
}
REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
Beispiel-Output:
{
"vocabulary": [
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
]
}"""
# Timeouts (seconds) for talking to Ollama. The chat call gets a long budget
# because vision models can take minutes per page. Previously the client was
# created with 600 s while the request itself passed 300 s; the per-request
# value wins in httpx, so 300 s is the behavior being preserved here.
_OLLAMA_HEALTH_TIMEOUT_S = 10.0
_OLLAMA_CHAT_TIMEOUT_S = 300.0


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """Extract vocabulary entries from a single page image.

    By default the image is sent to the Ollama vision model configured in
    ``VISION_MODEL``. When ``use_hybrid`` is true, the PaddleOCR + LLM
    extractor is tried first, and the vision model is used only if that
    extractor cannot be imported, raises, reports an error, or returns no
    entries.

    Args:
        image_data: Raw image bytes.
        filename: Original filename; used in log messages only.
        page_number: 0-indexed page number. It is reported 1-indexed in
            ``source_page`` and in error messages.
        use_hybrid: Try the hybrid OCR + LLM extractor before the vision model.

    Returns:
        Tuple ``(vocabulary_entries, confidence, error_message)``.
        ``error_message`` is an empty string on success. On failure the list
        is empty, the confidence is 0.0 and the message is a German
        description prefixed with the 1-indexed page number.
    """
    # ==========================================================================
    # OPTIONAL FIRST ATTEMPT: PaddleOCR + LLM Gateway (only when use_hybrid=True)
    # ==========================================================================
    if use_hybrid:
        try:
            # Imported lazily; an ImportError is handled below by falling
            # through to the vision model.
            from hybrid_vocab_extractor import extract_vocabulary_hybrid

            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)
            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to the vision model.
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects, dropping rows that
                # lack either side of the translation.
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1,
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""
            # No error but also no entries: fall through to the vision model.
        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            # Best effort: any hybrid failure degrades to the vision model.
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            logger.debug("Hybrid extraction traceback", exc_info=True)

    # ==========================================================================
    # Vision LLM via Ollama (model name comes from VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")
    try:
        # Cheap reachability probe first, so an unreachable server fails in
        # seconds instead of after the long chat timeout.
        async with httpx.AsyncClient(timeout=_OLLAMA_HEALTH_TIMEOUT_S) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
            if health_response.status_code != 200:
                logger.error(f"Ollama not available at {OLLAMA_URL}")
                return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"

        image_base64 = base64.b64encode(image_data).decode("utf-8")
        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64],
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            },
        }
        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        async with httpx.AsyncClient(timeout=_OLLAMA_CHAT_TIMEOUT_S) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            response.raise_for_status()
            data = response.json()

        # Tolerate an explicit null for "message" or "content" in the payload.
        extracted_text = (data.get("message") or {}).get("content") or ""
        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)
        # The parser does not know the page, so stamp it here.
        for v in vocabulary:
            v.source_page = page_number + 1

        # Fixed heuristic, not a model-reported score.
        confidence = 0.85 if vocabulary else 0.1
        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
        return vocabulary, confidence, ""
    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        # logger.exception records the traceback together with the message.
        logger.exception(f"Vocabulary extraction failed for {filename}: {e}")
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Build a fixed set of sample vocabulary entries.

    Stand-in data for testing when the Vision LLM is not available. Every call
    creates new ``VocabularyEntry`` objects with freshly generated UUIDs.
    """
    samples = [
        {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
        {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
        {"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
        {"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
        {"english": "success", "german": "Erfolg", "example": "The project was a success."},
        {"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
        {"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
        {"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
    ]
    result = []
    for sample in samples:
        result.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=sample["english"],
                german=sample["german"],
                example_sentence=sample.get("example"),
            )
        )
    return result
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary entries out of a raw LLM response.

    The outermost ``{ ... }`` span of ``text`` is decoded with increasingly
    lenient strategies:

    1. Strict ``json.loads``.
    2. Control characters stripped and raw newlines/tabs inside string values
       tolerated (``strict=False``).
    3. Additionally, trailing commas removed and bare keys quoted.
    4. If no strategy yields a JSON object, entries are pulled out of the
       text with a regular expression.

    Entries are validated one by one. An entry whose ``english`` or ``german``
    value is missing, null, not a string or empty is skipped, as is an entry
    whose values are implausibly long; the remaining entries are kept.

    Args:
        text: Raw model output; may contain prose around the JSON.

    Returns:
        The parsed entries, or an empty list when nothing usable was found.
        Unexpected errors are logged and also result in an empty list.
    """

    def try_parse_json(json_str: str) -> "dict | None":
        """Return the decoded JSON value, or None if every strategy fails."""
        # Strategy 1: the span is already valid JSON.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # Strategy 2: drop control characters (tab, newline and carriage
        # return are kept) and let the decoder accept raw newlines/tabs inside
        # string values. Escaping every newline by hand would also hit the
        # structural newlines between tokens and break multi-line JSON.
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', json_str)
        try:
            return json.loads(cleaned, strict=False)
        except json.JSONDecodeError:
            pass
        # Strategy 3: repair common LLM slips on top of the cleaned text.
        # Remove trailing commas before } or ]
        fixed = re.sub(r',(\s*[}\]])', r'\1', cleaned)
        # Fix unquoted keys
        fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
        try:
            return json.loads(fixed, strict=False)
        except json.JSONDecodeError:
            return None

    def as_text(value) -> str:
        """Return a stripped string, or '' for null / non-string values."""
        return value.strip() if isinstance(value, str) else ""

    def build_entry(english, german, example, word_type=None):
        """Return a VocabularyEntry, or None if the pair fails the sanity checks."""
        # Very long values are treated as hallucinated output rather than vocabulary.
        if len(english) > 100 or len(german) > 200:
            logger.warning(f"Skipping suspicious entry: {english[:50]}...")
            return None
        if not english or not german:
            return None
        return VocabularyEntry(
            id=str(uuid.uuid4()),
            english=english,
            german=german,
            example_sentence=example,
            word_type=word_type,
        )

    try:
        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        data = try_parse_json(text[start:end])
        if not isinstance(data, dict):
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            vocabulary = []
            for english, german, example in re.findall(pattern, text, re.IGNORECASE | re.DOTALL):
                vocab_entry = build_entry(
                    english.strip(),
                    german.strip(),
                    example.strip() if example else None,
                )
                if vocab_entry is not None:
                    vocabulary.append(vocab_entry)
            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        raw_entries = data.get("vocabulary")
        if not isinstance(raw_entries, list):
            # Missing, null or wrongly typed "vocabulary": nothing to extract.
            return []

        vocabulary = []
        for raw in raw_entries:
            if not isinstance(raw, dict):
                continue
            example = raw.get("example")
            word_type = raw.get("word_type")
            vocab_entry = build_entry(
                as_text(raw.get("english")),
                as_text(raw.get("german")),
                example if isinstance(example, str) else None,
                word_type if isinstance(word_type, str) else None,
            )
            if vocab_entry is not None:
                vocabulary.append(vocab_entry)
        return vocabulary
    except Exception as e:
        # Last-resort guard: a parsing problem must not abort the caller.
        logger.exception(f"Failed to parse vocabulary JSON: {e}")
        return []