website (17 pages + 3 components): - multiplayer/wizard, middleware/wizard+test-wizard, communication - builds/wizard, staff-search, voice, sbom/wizard - foerderantrag, mail/tasks, tools/communication, sbom - compliance/evidence, uni-crawler, brandbook (already done) - CollectionsTab, IngestionTab, RiskHeatmap backend-lehrer (5 files): - letters_api (641 → 2), certificates_api (636 → 2) - alerts_agent/db/models (636 → 3) - llm_gateway/communication_service (614 → 2) - game/database already done in prior batch klausur-service (2 files): - hybrid_vocab_extractor (664 → 2) - klausur-service/frontend: api.ts (620 → 3), EHUploadWizard (591 → 2) voice-service (3 files): - bqas/rag_judge (618 → 3), runner (529 → 2) - enhanced_task_orchestrator (519 → 2) studio-v2 (6 files): - korrektur/[klausurId] (578 → 4), fairness (569 → 2) - AlertsWizard (552 → 2), OnboardingWizard (513 → 2) - korrektur/api.ts (506 → 3), geo-lernwelt (501 → 2) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
229 lines
7.6 KiB
Python
229 lines
7.6 KiB
Python
"""
|
|
Hybrid OCR + LLM Vocabulary Extractor
|
|
|
|
Split into:
|
|
- hybrid_vocab_ocr.py: PaddleOCR integration, parsing, row/column detection
|
|
- hybrid_vocab_extractor.py (this file): LLM structuring, public API, barrel re-exports
|
|
|
|
All symbols re-exported for backward compatibility.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import List, Dict, Any, Tuple
|
|
|
|
import httpx
|
|
|
|
# Re-export everything from ocr module for backward compatibility
|
|
from hybrid_vocab_ocr import (
|
|
OCRRegion,
|
|
get_paddle_ocr,
|
|
preprocess_image,
|
|
run_paddle_ocr,
|
|
group_regions_by_rows,
|
|
detect_columns,
|
|
format_ocr_for_llm,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Ollama endpoint and model are configurable via environment variables; the
# defaults target a host-local Ollama instance reachable from inside Docker.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
LLM_MODEL = os.getenv("LLM_MODEL", "qwen2.5:14b")

# =============================================================================
# LLM structuring
# =============================================================================

# Prompt template (German) that asks the LLM to turn raw tab-separated OCR
# lines into a {"vocabulary": [...]} JSON object. `{ocr_text}` is filled via
# str.format(); the literal braces of the JSON example are escaped as {{ / }}.
STRUCTURE_PROMPT = """Du erhaeltst OCR-Output einer Vokabelliste aus einem englischen Schulbuch.
Die Zeilen sind Tab-separiert und enthalten typischerweise:
- 2 Spalten: Englisch | Deutsch
- 3 Spalten: Englisch | Deutsch | Beispielsatz

OCR-Text:
{ocr_text}

AUFGABE: Strukturiere die Vokabeln als JSON-Array.

AUSGABE-FORMAT (nur JSON, keine Erklaerungen):
{{
  "vocabulary": [
    {{"english": "to improve", "german": "verbessern", "example": "I want to improve my English."}},
    {{"english": "achievement", "german": "Leistung", "example": null}}
  ]
}}

REGELN:
1. Erkenne das Spalten-Layout aus den Tab-Trennungen
2. Korrigiere offensichtliche OCR-Fehler kontextuell (z.B. "vereessern" -> "verbessern", "0" -> "o")
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ueberspringe Ueberschriften, Seitenzahlen, Kapitelnummern
5. Behalte Wortarten bei wenn vorhanden (n, v, adj am Ende des englischen Worts)
6. Gib NUR valides JSON zurueck"""
|
|
|
|
|
|
async def structure_vocabulary_with_llm(ocr_text: str) -> List[Dict[str, Any]]:
    """Ask the Ollama chat endpoint to structure raw OCR text into vocabulary.

    Embeds ``ocr_text`` into STRUCTURE_PROMPT, posts it to
    ``{OLLAMA_URL}/api/chat`` and parses the model's JSON answer.

    Args:
        ocr_text: Tab-separated OCR lines (output of format_ocr_for_llm).

    Returns:
        List of ``{"english", "german", "example"}`` dicts; an empty list on
        timeout, HTTP error, or any parsing failure, so callers can fall back
        to direct row extraction.
    """
    prompt = STRUCTURE_PROMPT.format(ocr_text=ocr_text)

    try:
        # Generous timeout: a 14B model on modest hardware can take minutes.
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json={
                    "model": LLM_MODEL,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    # Low temperature for deterministic structuring; cap output length.
                    "options": {"temperature": 0.1, "num_predict": 4096},
                },
            )
            response.raise_for_status()
            data = response.json()
            content = data.get("message", {}).get("content", "")
            # Lazy %-style args: formatting is skipped if INFO is disabled.
            logger.info("Ollama LLM response received: %d chars", len(content))
            return parse_llm_vocabulary_json(content)

    except httpx.TimeoutException:
        logger.error("Ollama LLM request timed out")
        return []
    except httpx.HTTPStatusError as e:
        logger.error("Ollama LLM HTTP error: %s", e)
        return []
    except Exception:
        # Defensive catch-all: an LLM failure must never break the pipeline.
        # logger.exception records the full traceback.
        logger.exception("LLM structuring failed")
        return []
|
|
|
|
|
|
def parse_llm_vocabulary_json(text: str) -> List[Dict[str, Any]]:
    """Robustly parse the LLM response into validated vocabulary entries.

    Extracts the outermost ``{...}`` span from ``text`` (LLMs often wrap JSON
    in prose or code fences), decodes it, and validates each entry
    individually: a single malformed entry (non-dict, or null/non-string
    english/german) is skipped instead of aborting the whole list, which the
    previous implementation did via the broad except.

    Args:
        text: Raw LLM response text.

    Returns:
        List of ``{"english", "german", "example"}`` dicts with whitespace
        stripped. Falls back to regex extraction on a JSON decode error;
        returns [] on any other failure.
    """
    try:
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end == 0:
            logger.warning("No JSON found in LLM response")
            return []

        data = json.loads(text[start:end])
        vocabulary = data.get("vocabulary", [])

        valid_entries: List[Dict[str, Any]] = []
        for entry in vocabulary:
            # Validate per entry so one bad item does not discard valid ones.
            if not isinstance(entry, dict):
                continue
            english = entry.get("english")
            german = entry.get("german")
            if not isinstance(english, str) or not isinstance(german, str):
                continue
            english, german = english.strip(), german.strip()
            if english and german:
                valid_entries.append({
                    "english": english, "german": german,
                    "example": entry.get("example"),
                })
        return valid_entries

    except json.JSONDecodeError as e:
        logger.error("JSON parse error: %s", e)
        return extract_vocabulary_regex(text)
    except Exception:
        logger.exception("Vocabulary parsing failed")
        return []
|
|
|
|
|
|
def extract_vocabulary_regex(text: str) -> List[Dict[str, Any]]:
    """Fallback: pull english/german pairs out of malformed LLM output via regex."""
    # Matches adjacent "english": "...", "german": "..." value pairs even when
    # the surrounding JSON is structurally broken.
    pair_re = r'"english"\s*:\s*"([^"]+)"\s*,\s*"german"\s*:\s*"([^"]+)"'
    vocabulary = [
        {"english": eng.strip(), "german": ger.strip(), "example": None}
        for eng, ger in re.findall(pair_re, text)
    ]

    logger.info(f"Regex fallback extracted {len(vocabulary)} entries")
    return vocabulary
|
|
|
|
|
|
# =============================================================================
|
|
# Public API
|
|
# =============================================================================
|
|
|
|
async def extract_vocabulary_hybrid(
    image_bytes: bytes, page_number: int = 0
) -> Tuple[List[Dict[str, Any]], float, str]:
    """Hybrid extraction: PaddleOCR + LLM structuring, with a direct fallback.

    Pipeline: OCR the page, format the regions for the LLM, ask the LLM to
    structure them, and fall back to positional row extraction if the LLM
    yields nothing.

    Args:
        image_bytes: Raw image bytes of a single vocabulary-book page.
        page_number: Zero-based page index, used only in user-facing messages.

    Returns:
        Tuple ``(vocabulary, avg_confidence, error)``. ``error`` is "" on
        success; on failure the result is ``([], 0.0, message)``.
    """
    try:
        logger.info("Starting hybrid extraction for page %d", page_number + 1)
        regions, raw_text = run_paddle_ocr(image_bytes)

        if not regions:
            return [], 0.0, f"Seite {page_number + 1}: Kein Text erkannt (OCR)"

        formatted_text = format_ocr_for_llm(regions)
        logger.info("Formatted OCR text: %d chars", len(formatted_text))

        vocabulary = await structure_vocabulary_with_llm(formatted_text)

        # LLM failed or returned nothing: fall back to positional row parsing.
        if not vocabulary:
            vocabulary = extract_from_rows_directly(regions)

        if not vocabulary:
            return [], 0.0, f"Seite {page_number + 1}: Keine Vokabeln erkannt"

        # regions is guaranteed non-empty here (early return above).
        avg_confidence = sum(r.confidence for r in regions) / len(regions)
        logger.info(
            "Hybrid extraction completed: %d entries, %.2f confidence",
            len(vocabulary), avg_confidence,
        )
        return vocabulary, avg_confidence, ""

    except Exception as e:
        # logger.exception records the traceback, replacing the previous
        # manual `import traceback; format_exc()` dance.
        logger.exception("Hybrid extraction failed")
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
|
|
|
|
|
|
def extract_from_rows_directly(regions: List[OCRRegion]) -> List[Dict[str, Any]]:
    """Direct fallback: build vocabulary entries from OCR rows without the LLM."""
    vocabulary: List[Dict[str, Any]] = []

    for row in group_regions_by_rows(regions):
        if len(row) < 2:
            continue  # need at least an english and a german cell
        english = row[0].text.strip()
        german = row[1].text.strip()
        example = row[2].text.strip() if len(row) >= 3 else None
        # Require more than one character per side: single characters are
        # almost always OCR noise rather than real vocabulary.
        if english and german and len(english) > 1 and len(german) > 1:
            vocabulary.append(
                {"english": english, "german": german, "example": example}
            )

    logger.info(f"Direct row extraction: {len(vocabulary)} entries")
    return vocabulary
|
|
|
|
|
|
# =============================================================================
|
|
# Test/Debug
|
|
# =============================================================================
|
|
|
|
async def test_hybrid_extraction(image_path: str):
    """Development helper: run the hybrid pipeline on one image and print a summary."""
    with open(image_path, "rb") as image_file:
        payload = image_file.read()

    entries, score, error = await extract_vocabulary_hybrid(payload)

    header_lines = [
        f"\n=== Hybrid OCR Test ===",
        f"Confidence: {score:.2f}",
        f"Error: {error or 'None'}",
        f"Vocabulary ({len(entries)} entries):",
    ]
    for line in header_lines:
        print(line)
    # Show at most the first ten entries to keep console output short.
    for item in entries[:10]:
        print(f"  - {item['english']} = {item['german']}")

    return entries
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import asyncio
|
|
import sys
|
|
|
|
if len(sys.argv) > 1:
|
|
asyncio.run(test_hybrid_extraction(sys.argv[1]))
|
|
else:
|
|
print("Usage: python hybrid_vocab_extractor.py <image_path>")
|