Files
breakpilot-lehrer/klausur-service/backend/hybrid_vocab_extractor.py
Benjamin Admin 451365a312 [split-required] Split remaining 500-680 LOC files (final batch)
website (17 pages + 3 components):
- multiplayer/wizard, middleware/wizard+test-wizard, communication
- builds/wizard, staff-search, voice, sbom/wizard
- foerderantrag, mail/tasks, tools/communication, sbom
- compliance/evidence, uni-crawler, brandbook (already done)
- CollectionsTab, IngestionTab, RiskHeatmap

backend-lehrer (5 files):
- letters_api (641 → 2), certificates_api (636 → 2)
- alerts_agent/db/models (636 → 3)
- llm_gateway/communication_service (614 → 2)
- game/database already done in prior batch

klausur-service (2 files):
- hybrid_vocab_extractor (664 → 2)
- klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2)

voice-service (3 files):
- bqas/rag_judge (618 → 3), runner (529 → 2)
- enhanced_task_orchestrator (519 → 2)

studio-v2 (6 files):
- korrektur/[klausurId] (578 → 4), fairness (569 → 2)
- AlertsWizard (552 → 2), OnboardingWizard (513 → 2)
- korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:56:45 +02:00

229 lines
7.6 KiB
Python

"""
Hybrid OCR + LLM Vocabulary Extractor
Split into:
- hybrid_vocab_ocr.py: PaddleOCR integration, parsing, row/column detection
- hybrid_vocab_extractor.py (this file): LLM structuring, public API, barrel re-exports
All symbols re-exported for backward compatibility.
"""
import os
import json
import logging
import re
from typing import List, Dict, Any, Tuple
import httpx
# Re-export everything from ocr module for backward compatibility
from hybrid_vocab_ocr import (
OCRRegion,
get_paddle_ocr,
preprocess_image,
run_paddle_ocr,
group_regions_by_rows,
detect_columns,
format_ocr_for_llm,
)
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
LLM_MODEL = os.getenv("LLM_MODEL", "qwen2.5:14b")
# =============================================================================
# LLM structuring
# =============================================================================
# Prompt template sent to the Ollama chat endpoint by
# structure_vocabulary_with_llm(). The {ocr_text} placeholder is filled via
# str.format(); literal JSON braces in the example output are escaped as {{ }}.
# The prompt itself is intentionally German (matches the target textbooks) and
# must not be translated — it is runtime data, not a comment.
STRUCTURE_PROMPT = """Du erhaeltst OCR-Output einer Vokabelliste aus einem englischen Schulbuch.
Die Zeilen sind Tab-separiert und enthalten typischerweise:
- 2 Spalten: Englisch | Deutsch
- 3 Spalten: Englisch | Deutsch | Beispielsatz
OCR-Text:
{ocr_text}
AUFGABE: Strukturiere die Vokabeln als JSON-Array.
AUSGABE-FORMAT (nur JSON, keine Erklaerungen):
{{
"vocabulary": [
{{"english": "to improve", "german": "verbessern", "example": "I want to improve my English."}},
{{"english": "achievement", "german": "Leistung", "example": null}}
]
}}
REGELN:
1. Erkenne das Spalten-Layout aus den Tab-Trennungen
2. Korrigiere offensichtliche OCR-Fehler kontextuell (z.B. "vereessern" -> "verbessern", "0" -> "o")
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ueberspringe Ueberschriften, Seitenzahlen, Kapitelnummern
5. Behalte Wortarten bei wenn vorhanden (n, v, adj am Ende des englischen Worts)
6. Gib NUR valides JSON zurueck"""
async def structure_vocabulary_with_llm(ocr_text: str) -> List[Dict[str, Any]]:
    """Ask the Ollama LLM to structure raw OCR text into vocabulary entries.

    Sends STRUCTURE_PROMPT (with *ocr_text* filled in) to the /api/chat
    endpoint and parses the model's reply. Never raises: timeouts, HTTP
    errors, and any other failure are logged and yield an empty list.
    """
    request_body = {
        "model": LLM_MODEL,
        "messages": [
            {"role": "user", "content": STRUCTURE_PROMPT.format(ocr_text=ocr_text)}
        ],
        "stream": False,
        # Low temperature for deterministic structuring; generous token budget
        # so long vocabulary pages are not truncated.
        "options": {"temperature": 0.1, "num_predict": 4096},
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            response.raise_for_status()
            payload = response.json()
            answer = payload.get("message", {}).get("content", "")
            logger.info(f"Ollama LLM response received: {len(answer)} chars")
            return parse_llm_vocabulary_json(answer)
    except httpx.TimeoutException:
        logger.error("Ollama LLM request timed out")
        return []
    except httpx.HTTPStatusError as e:
        logger.error(f"Ollama LLM HTTP error: {e}")
        return []
    except Exception as e:
        logger.error(f"LLM structuring failed: {e}")
        return []
def parse_llm_vocabulary_json(text: str) -> List[Dict[str, Any]]:
    """Robust JSON parsing of the LLM output.

    Extracts the outermost ``{...}`` span from *text* (tolerating prose
    around the JSON), parses it, and returns only entries that have both a
    non-empty "english" and "german" field. Falls back to regex extraction
    when the JSON is syntactically invalid; returns [] on any other failure.
    """
    try:
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end == 0:
            logger.warning("No JSON found in LLM response")
            return []
        data = json.loads(text[start:end])
        valid_entries = []
        for entry in data.get("vocabulary", []):
            # Skip malformed items instead of aborting the whole page.
            if not isinstance(entry, dict):
                continue
            # The LLM may emit explicit JSON nulls ("german": null); guard
            # with `or ""` so .strip() never runs on None — previously that
            # raised AttributeError and the broad except discarded ALL
            # entries for the page.
            english = (entry.get("english") or "").strip()
            german = (entry.get("german") or "").strip()
            if english and german:
                valid_entries.append({
                    "english": english, "german": german,
                    "example": entry.get("example")
                })
        return valid_entries
    except json.JSONDecodeError as e:
        logger.error(f"JSON parse error: {e}")
        return extract_vocabulary_regex(text)
    except Exception as e:
        logger.error(f"Vocabulary parsing failed: {e}")
        return []
def extract_vocabulary_regex(text: str) -> List[Dict[str, Any]]:
    """Last-resort fallback: pull ("english", "german") pairs out of broken
    JSON text with a regex. Examples are always None on this path."""
    pair_pattern = r'"english"\s*:\s*"([^"]+)"\s*,\s*"german"\s*:\s*"([^"]+)"'
    vocabulary = [
        {"english": eng.strip(), "german": ger.strip(), "example": None}
        for eng, ger in re.findall(pair_pattern, text)
    ]
    logger.info(f"Regex fallback extracted {len(vocabulary)} entries")
    return vocabulary
# =============================================================================
# Public API
# =============================================================================
async def extract_vocabulary_hybrid(
    image_bytes: bytes, page_number: int = 0
) -> Tuple[List[Dict[str, Any]], float, str]:
    """Hybrid extraction: PaddleOCR + LLM structuring.

    Runs OCR on *image_bytes*, asks the LLM to structure the result, and
    falls back to direct row extraction if the LLM yields nothing.

    Returns a (vocabulary, mean OCR confidence, error message) tuple; the
    error message is "" on success and a German user-facing text otherwise.
    """
    try:
        logger.info(f"Starting hybrid extraction for page {page_number + 1}")
        regions, raw_text = run_paddle_ocr(image_bytes)
        if not regions:
            return [], 0.0, f"Seite {page_number + 1}: Kein Text erkannt (OCR)"
        formatted_text = format_ocr_for_llm(regions)
        logger.info(f"Formatted OCR text: {len(formatted_text)} chars")
        # Primary path: LLM structuring; secondary: geometric row layout.
        vocabulary = await structure_vocabulary_with_llm(formatted_text)
        if not vocabulary:
            vocabulary = extract_from_rows_directly(regions)
        if not vocabulary:
            return [], 0.0, f"Seite {page_number + 1}: Keine Vokabeln erkannt"
        # regions is guaranteed non-empty here (checked above).
        avg_confidence = sum(r.confidence for r in regions) / len(regions)
        logger.info(f"Hybrid extraction completed: {len(vocabulary)} entries, {avg_confidence:.2f} confidence")
        return vocabulary, avg_confidence, ""
    except Exception as e:
        logger.error(f"Hybrid extraction failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
def extract_from_rows_directly(regions: List[OCRRegion]) -> List[Dict[str, Any]]:
    """LLM-free fallback: build vocabulary straight from detected OCR rows.

    Column 0 is treated as English, column 1 as German, and an optional
    column 2 as the example sentence. Single-character cells are discarded
    as OCR noise.
    """
    vocabulary: List[Dict[str, Any]] = []
    for row in group_regions_by_rows(regions):
        if len(row) < 2:
            continue
        english = row[0].text.strip()
        german = row[1].text.strip()
        example = row[2].text.strip() if len(row) >= 3 else None
        # len > 1 also implies non-empty, so this matches the original
        # "non-empty AND longer than one char" filter.
        if len(english) > 1 and len(german) > 1:
            vocabulary.append({
                "english": english, "german": german, "example": example
            })
    logger.info(f"Direct row extraction: {len(vocabulary)} entries")
    return vocabulary
# =============================================================================
# Test/Debug
# =============================================================================
async def test_hybrid_extraction(image_path: str):
    """Development helper: run the hybrid extractor on a local image file
    and print a short summary (confidence, error, first 10 entries)."""
    with open(image_path, "rb") as handle:
        payload = handle.read()
    vocab, confidence, error = await extract_vocabulary_hybrid(payload)
    print(f"\n=== Hybrid OCR Test ===")
    print(f"Confidence: {confidence:.2f}")
    print(f"Error: {error or 'None'}")
    print(f"Vocabulary ({len(vocab)} entries):")
    for entry in vocab[:10]:
        print(f" - {entry['english']} = {entry['german']}")
    return vocab
# CLI entry point: run the hybrid extractor once against a single image file
# passed as the first argument.
if __name__ == "__main__":
    import asyncio
    import sys
    if len(sys.argv) > 1:
        asyncio.run(test_hybrid_extraction(sys.argv[1]))
    else:
        print("Usage: python hybrid_vocab_extractor.py <image_path>")