Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
325
klausur-service/backend/vocab_worksheet_extraction.py
Normal file
325
klausur-service/backend/vocab_worksheet_extraction.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
|
||||
|
||||
Contains:
|
||||
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
|
||||
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
|
||||
- _get_demo_vocabulary(): Demo data for testing
|
||||
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
import httpx
|
||||
|
||||
from vocab_worksheet_models import VocabularyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama Configuration
# Default URL targets the host-side Ollama daemon when running inside Docker
# (host.docker.internal); override via the OLLAMA_URL environment variable.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Vision-capable model used for image-based vocabulary extraction.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

# Prompt sent to the Vision LLM alongside the page image.  The prompt is in
# German because the target worksheets come from German school books; it asks
# for strict JSON only, which is then parsed by parse_vocabulary_json().
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""
|
||||
|
||||
|
||||
async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED by default: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).

    Args:
        image_data: Raw image bytes.
        filename: Original filename, used for logging only.
        page_number: 0-indexed page number; error messages report it 1-indexed.
        use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text).
                    If False (default), use the Vision LLM (slower, better for complex layouts).

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message);
        error_message is an empty string on success.
    """

    # ==========================================================================
    # HYBRID APPROACH (opt-in via use_hybrid): PaddleOCR + LLM Gateway
    # ==========================================================================
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to Vision LLM fallback
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects; rows missing either
                # the English or German side are dropped.
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ==========================================================================
    # FALLBACK: Vision LLM via Ollama (model configured by OLLAMA_VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is reachable at all (cheap /api/tags probe)
        # so a dead backend fails fast instead of after a long model timeout.
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,  # low temperature: we want deterministic JSON
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow)
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

            logger.info(f"Ollama response received: {len(extracted_text)} chars")

            # Parse JSON from response
            vocabulary = parse_vocabulary_json(extracted_text)

            # Set source_page for each entry (1-indexed for display)
            for v in vocabulary:
                v.source_page = page_number + 1

            # Heuristic confidence: the model returns no score of its own
            confidence = 0.85 if len(vocabulary) > 0 else 0.1

            logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

            return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
|
||||
|
||||
|
||||
def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return a fixed set of demo vocabulary entries for testing when the
    Vision LLM backend is not available."""
    samples = [
        ("to achieve", "erreichen, erzielen", "She achieved her goals."),
        ("achievement", "Leistung, Errungenschaft", "That was a great achievement."),
        ("improve", "verbessern", "I want to improve my English."),
        ("improvement", "Verbesserung", "There has been a lot of improvement."),
        ("success", "Erfolg", "The project was a success."),
        ("successful", "erfolgreich", "She is a successful businesswoman."),
        ("fail", "scheitern, durchfallen", "Don't be afraid to fail."),
        ("failure", "Misserfolg, Versagen", "Failure is part of learning."),
    ]
    entries: List[VocabularyEntry] = []
    for english, german, example in samples:
        entries.append(
            VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=example,
            )
        )
    return entries
|
||||
|
||||
|
||||
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary JSON from an LLM response with robust error handling.

    Four strategies are tried in order:
      1. direct json.loads on the outermost ``{...}`` span,
      2. json.loads after escaping raw control characters inside strings,
      3. json.loads after fixing trailing commas and unquoted keys,
      4. regex extraction of individual entries as a last resort.

    Returns an empty list when nothing parseable is found.
    """

    def clean_json_string(s: str) -> str:
        """Escape raw control characters that appear *inside* JSON string
        literals (a common LLM output defect).

        Structural whitespace between tokens is left untouched — replacing
        it would itself make the JSON invalid, which is exactly the bug the
        previous blanket ``replace('\\n', ...)`` approach had.
        """
        out = []
        in_string = False
        escaped = False
        for ch in s:
            if not in_string:
                out.append(ch)
                if ch == '"':
                    in_string = True
                continue
            if escaped:
                # Character after a backslash is already escaped; pass through.
                out.append(ch)
                escaped = False
            elif ch == '\\':
                out.append(ch)
                escaped = True
            elif ch == '"':
                out.append(ch)
                in_string = False
            elif ch == '\n':
                out.append('\\n')
            elif ch == '\r':
                out.append('\\r')
            elif ch == '\t':
                out.append('\\t')
            elif ord(ch) < 0x20:
                pass  # drop other control characters inside strings
            else:
                out.append(ch)
        return ''.join(out)

    def try_parse_json(json_str: str) -> dict:
        """Try multiple strategies to parse JSON; return None if all fail."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Clean control characters inside strings, then parse
        try:
            cleaned = clean_json_string(json_str)
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common structural issues
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
            # Fix unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass

        return None

    try:
        # Find JSON in response (may have extra text around it)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        json_str = text[start:end]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)

            for match in matches:
                english = match[0].strip() if match[0] else ""
                german = match[1].strip() if match[1] else ""
                example = match[2].strip() if len(match) > 2 and match[2] else None

                if english and german:
                    vocab_entry = VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        example_sentence=example,
                    )
                    vocabulary.append(vocab_entry)

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
            return vocabulary

        # Normal JSON parsing succeeded
        vocabulary = []
        for i, entry in enumerate(data.get("vocabulary", [])):
            english = entry.get("english", "").strip()
            german = entry.get("german", "").strip()

            # Skip entries that look like hallucinations (very long or containing unusual patterns)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocab_entry = VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=entry.get("example"),
                word_type=entry.get("word_type"),
            )
            vocabulary.append(vocab_entry)

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
|
||||
Reference in New Issue
Block a user