Restructure: Move final 12 root files into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s

ocr/spell/  (3): smart_spell, core, text
upload/     (3): api, chunked, mobile
crawler/    (3): github, github_core, github_parsers
+ unified_grid → grid/, tesseract_extractor → ocr/engines/, htr_api → ocr/pipeline/

12 shims added. Only main.py, config.py, storage + RAG files remain at root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-04-25 23:19:11 +02:00
parent cba877c65a
commit d093a4d388
27 changed files with 3116 additions and 3049 deletions
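The "12 shims" from the commit message follow the same pattern as the ocr/spell/__init__.py shim shown further down: the old flat module simply star-re-exports the new package module. A minimal sketch of one such shim — the exact target path is an assumption derived from the tesseract_extractor → ocr/engines/ mapping above:

# tesseract_extractor.py at the old flat location (hypothetical shim sketch)
from ocr.engines.tesseract_extractor import *  # noqa: F401,F403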


@@ -0,0 +1,346 @@
"""
Tesseract-based OCR extraction with word-level bounding boxes.
Uses Tesseract for spatial information (WHERE text is) while
the Vision LLM handles semantic understanding (WHAT the text means).
Tesseract runs natively on ARM64 via Debian's apt package.
License: Apache 2.0 (commercially usable)
"""
import io
import logging
from typing import List, Dict, Any, Optional
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
try:
import pytesseract
from PIL import Image
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
logger.warning("pytesseract or Pillow not installed - Tesseract OCR unavailable")
async def extract_bounding_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Run Tesseract OCR and return word-level bounding boxes.
Args:
image_bytes: PNG/JPEG image as bytes.
lang: Tesseract language string (e.g. "eng+deu").
Returns:
Dict with 'words' list and 'image_width'/'image_height'.
"""
if not TESSERACT_AVAILABLE:
return {"words": [], "image_width": 0, "image_height": 0, "error": "Tesseract not available"}
image = Image.open(io.BytesIO(image_bytes))
data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)
words = []
for i in range(len(data['text'])):
text = data['text'][i].strip()
        conf = int(float(data['conf'][i]))  # Tesseract 5 may report float confidences
if not text or conf < 20:
continue
words.append({
"text": text,
"left": data['left'][i],
"top": data['top'][i],
"width": data['width'][i],
"height": data['height'][i],
"conf": conf,
"block_num": data['block_num'][i],
"par_num": data['par_num'][i],
"line_num": data['line_num'][i],
"word_num": data['word_num'][i],
})
return {
"words": words,
"image_width": image.width,
"image_height": image.height,
}
def group_words_into_lines(words: List[dict], y_tolerance_px: int = 15) -> List[List[dict]]:
"""Group words by their Y position into lines.
Args:
words: List of word dicts from extract_bounding_boxes.
y_tolerance_px: Max pixel distance to consider words on the same line.
Returns:
List of lines, each line is a list of words sorted by X position.
"""
if not words:
return []
# Sort by Y then X
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[dict]] = []
current_line: List[dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
def detect_columns(lines: List[List[dict]], image_width: int) -> Dict[str, Any]:
"""Detect column boundaries from word positions.
Typical vocab table: Left=English, Middle=German, Right=Example sentences.
Returns:
Dict with column boundaries and type assignments.
"""
if not lines or image_width == 0:
return {"columns": [], "column_types": []}
# Collect all word X positions
all_x_positions = []
for line in lines:
for word in line:
all_x_positions.append(word['left'])
if not all_x_positions:
return {"columns": [], "column_types": []}
# Find X-position clusters (column starts)
all_x_positions.sort()
# Simple gap-based column detection
min_gap = image_width * 0.08 # 8% of page width = column gap
clusters = []
current_cluster = [all_x_positions[0]]
for x in all_x_positions[1:]:
if x - current_cluster[-1] > min_gap:
clusters.append(current_cluster)
current_cluster = [x]
else:
current_cluster.append(x)
if current_cluster:
clusters.append(current_cluster)
# Each cluster represents a column start
columns = []
for cluster in clusters:
col_start = min(cluster)
columns.append({
"x_start": col_start,
"x_start_pct": col_start / image_width * 100,
"word_count": len(cluster),
})
# Assign column types based on position (left→right: EN, DE, Example)
type_map = ["english", "german", "example"]
column_types = []
for i, col in enumerate(columns):
if i < len(type_map):
column_types.append(type_map[i])
else:
column_types.append("unknown")
return {
"columns": columns,
"column_types": column_types,
}
def words_to_vocab_entries(lines: List[List[dict]], columns: List[dict],
column_types: List[str], image_width: int,
image_height: int) -> List[dict]:
"""Convert grouped words into vocabulary entries using column positions.
Args:
lines: Grouped word lines from group_words_into_lines.
columns: Column boundaries from detect_columns.
column_types: Column type assignments.
image_width: Image width in pixels.
image_height: Image height in pixels.
Returns:
List of vocabulary entry dicts with english/german/example fields.
"""
if not columns or not lines:
return []
# Build column boundaries for word assignment
col_boundaries = []
for i, col in enumerate(columns):
start = col['x_start']
if i + 1 < len(columns):
end = columns[i + 1]['x_start']
else:
end = image_width
col_boundaries.append((start, end, column_types[i] if i < len(column_types) else "unknown"))
entries = []
for line in lines:
entry = {"english": "", "german": "", "example": ""}
line_words_by_col: Dict[str, List[str]] = {"english": [], "german": [], "example": []}
line_bbox: Dict[str, Optional[dict]] = {}
for word in line:
word_center_x = word['left'] + word['width'] / 2
assigned_type = "unknown"
for start, end, col_type in col_boundaries:
if start <= word_center_x < end:
assigned_type = col_type
break
if assigned_type in line_words_by_col:
line_words_by_col[assigned_type].append(word['text'])
# Track bounding box for the column
if assigned_type not in line_bbox or line_bbox[assigned_type] is None:
line_bbox[assigned_type] = {
"left": word['left'],
"top": word['top'],
"right": word['left'] + word['width'],
"bottom": word['top'] + word['height'],
}
else:
bb = line_bbox[assigned_type]
bb['left'] = min(bb['left'], word['left'])
bb['top'] = min(bb['top'], word['top'])
bb['right'] = max(bb['right'], word['left'] + word['width'])
bb['bottom'] = max(bb['bottom'], word['top'] + word['height'])
for col_type in ["english", "german", "example"]:
if line_words_by_col[col_type]:
entry[col_type] = " ".join(line_words_by_col[col_type])
if line_bbox.get(col_type):
bb = line_bbox[col_type]
entry[f"{col_type}_bbox"] = {
"x_pct": bb['left'] / image_width * 100,
"y_pct": bb['top'] / image_height * 100,
"w_pct": (bb['right'] - bb['left']) / image_width * 100,
"h_pct": (bb['bottom'] - bb['top']) / image_height * 100,
}
# Only add if at least one column has content
if entry["english"] or entry["german"]:
entries.append(entry)
return entries
def match_positions_to_vocab(tess_words: List[dict], llm_vocab: List[dict],
image_w: int, image_h: int,
threshold: float = 0.6) -> List[dict]:
"""Match Tesseract bounding boxes to LLM vocabulary entries.
For each LLM vocab entry, find the best-matching Tesseract word
and attach its bounding box coordinates.
Args:
tess_words: Word list from Tesseract with pixel coordinates.
llm_vocab: Vocabulary list from Vision LLM.
image_w: Image width in pixels.
image_h: Image height in pixels.
threshold: Minimum similarity ratio for a match.
Returns:
llm_vocab list with bbox_x_pct/bbox_y_pct/bbox_w_pct/bbox_h_pct added.
"""
if not tess_words or not llm_vocab or image_w == 0 or image_h == 0:
return llm_vocab
for entry in llm_vocab:
english = entry.get("english", "").lower().strip()
german = entry.get("german", "").lower().strip()
if not english and not german:
continue
# Try to match English word first, then German
for field in ["english", "german"]:
search_text = entry.get(field, "").lower().strip()
if not search_text:
continue
best_word = None
best_ratio = 0.0
for word in tess_words:
ratio = SequenceMatcher(None, search_text, word['text'].lower()).ratio()
if ratio > best_ratio:
best_ratio = ratio
best_word = word
if best_word and best_ratio >= threshold:
entry[f"bbox_x_pct"] = best_word['left'] / image_w * 100
entry[f"bbox_y_pct"] = best_word['top'] / image_h * 100
entry[f"bbox_w_pct"] = best_word['width'] / image_w * 100
entry[f"bbox_h_pct"] = best_word['height'] / image_h * 100
entry["bbox_match_field"] = field
entry["bbox_match_ratio"] = round(best_ratio, 3)
break # Found a match, no need to try the other field
return llm_vocab
async def run_tesseract_pipeline(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Full Tesseract pipeline: extract words, group lines, detect columns, build vocab.
Args:
image_bytes: PNG/JPEG image as bytes.
lang: Tesseract language string.
Returns:
Dict with 'vocabulary', 'words', 'lines', 'columns', 'image_width', 'image_height'.
"""
# Step 1: Extract bounding boxes
bbox_data = await extract_bounding_boxes(image_bytes, lang=lang)
if bbox_data.get("error"):
return bbox_data
words = bbox_data["words"]
image_w = bbox_data["image_width"]
image_h = bbox_data["image_height"]
# Step 2: Group into lines
lines = group_words_into_lines(words)
# Step 3: Detect columns
col_info = detect_columns(lines, image_w)
# Step 4: Build vocabulary entries
vocab = words_to_vocab_entries(
lines,
col_info["columns"],
col_info["column_types"],
image_w,
image_h,
)
return {
"vocabulary": vocab,
"words": words,
"lines_count": len(lines),
"columns": col_info["columns"],
"column_types": col_info["column_types"],
"image_width": image_w,
"image_height": image_h,
"word_count": len(words),
}
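A minimal driver sketch for the pipeline above — the import path is an assumption based on the tesseract_extractor → ocr/engines/ move in the commit message, and the input file name is illustrative:

import asyncio

from ocr.engines.tesseract_extractor import run_tesseract_pipeline  # assumed path

async def main() -> None:
    # Read a scanned vocabulary page and run the spatial OCR pipeline on it.
    with open("vocab_page.png", "rb") as fh:
        image_bytes = fh.read()
    result = await run_tesseract_pipeline(image_bytes, lang="eng+deu")
    print(result.get("word_count"), "words in", result.get("lines_count"), "lines")
    for entry in result.get("vocabulary", [])[:5]:
        print(entry["english"], "|", entry["german"], "|", entry["example"])

asyncio.run(main())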


@@ -0,0 +1,276 @@
"""
Handwriting HTR API - High-quality handwritten text recognition (HTR) for exam correction.
Endpoints:
- POST /api/v1/htr/recognize - upload an image → handwritten text
- POST /api/v1/htr/recognize-session - use an OCR pipeline session as the image source
Model strategy:
1. qwen2.5vl:32b via Ollama (primary, highest quality as a VLM)
2. microsoft/trocr-large-handwritten (fallback, offline, no Ollama required)
PRIVACY: All processing happens locally on the Mac Mini.
"""
import io
import os
import logging
import time
import base64
from typing import Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from pydantic import BaseModel
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")
# ---------------------------------------------------------------------------
# Pydantic Models
# ---------------------------------------------------------------------------
class HTRSessionRequest(BaseModel):
session_id: str
model: str = "auto" # "auto" | "qwen2.5vl" | "trocr-large"
use_clean: bool = True # Prefer clean_png (after handwriting removal)
# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray:
"""
CLAHE contrast enhancement + upscale to improve HTR accuracy.
Returns grayscale enhanced image.
"""
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# Upscale if image is too small
h, w = enhanced.shape
if min(h, w) < 800:
scale = 800 / min(h, w)
enhanced = cv2.resize(
enhanced, None, fx=scale, fy=scale,
interpolation=cv2.INTER_CUBIC
)
return enhanced
def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
"""Convert BGR ndarray to PNG bytes."""
success, buf = cv2.imencode(".png", img_bgr)
if not success:
raise RuntimeError("Failed to encode image to PNG")
return buf.tobytes()
def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
"""Load image, apply HTR preprocessing, return PNG bytes."""
arr = np.frombuffer(image_bytes, dtype=np.uint8)
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is None:
raise ValueError("Could not decode image")
enhanced = _preprocess_for_htr(img_bgr)
# Convert grayscale back to BGR for encoding
enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
return _bgr_to_png_bytes(enhanced_bgr)
# ---------------------------------------------------------------------------
# Backend: Ollama qwen2.5vl
# ---------------------------------------------------------------------------
async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
"""
Send image to Ollama qwen2.5vl:32b for HTR.
Returns extracted text or None on error.
"""
import httpx
lang_hint = {
"de": "Deutsch",
"en": "Englisch",
"de+en": "Deutsch und Englisch",
}.get(language, "Deutsch")
prompt = (
f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
"Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
"Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
)
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
payload = {
"model": OLLAMA_HTR_MODEL,
"prompt": prompt,
"images": [img_b64],
"stream": False,
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
resp.raise_for_status()
data = resp.json()
return data.get("response", "").strip()
except Exception as e:
logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
return None
# ---------------------------------------------------------------------------
# Backend: TrOCR-large fallback
# ---------------------------------------------------------------------------
async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
"""
Use microsoft/trocr-large-handwritten via trocr_service.py.
Returns extracted text or None on error.
"""
try:
from services.trocr_service import run_trocr_ocr, _check_trocr_available
if not _check_trocr_available():
logger.warning("TrOCR not available for HTR fallback")
return None
text, confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
return text.strip() if text else None
except Exception as e:
logger.warning(f"TrOCR-large HTR failed: {e}")
return None
# ---------------------------------------------------------------------------
# Core recognition logic
# ---------------------------------------------------------------------------
async def _do_recognize(
image_bytes: bytes,
model: str = "auto",
preprocess: bool = True,
language: str = "de",
) -> dict:
"""
Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large.
Returns dict with text, model_used, processing_time_ms.
"""
t0 = time.monotonic()
if preprocess:
try:
image_bytes = _preprocess_image_bytes(image_bytes)
except Exception as e:
logger.warning(f"HTR preprocessing failed, using raw image: {e}")
    text: Optional[str] = None
    model_used: str = "none"
    use_qwen = model in ("auto", "qwen2.5vl")
    if use_qwen:
        text = await _recognize_with_qwen_vl(image_bytes, language)
        if text is not None:
            model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"
    # TrOCR serves both as the explicit "trocr-large" backend and as the
    # fallback whenever the Ollama backend was tried but returned nothing.
    if text is None and (use_qwen or model == "trocr-large"):
        text = await _recognize_with_trocr_large(image_bytes)
        if text is not None:
            model_used = "trocr-large-handwritten"
    if text is None:
        text = ""
        model_used = "none (all backends failed)"
elapsed_ms = int((time.monotonic() - t0) * 1000)
return {
"text": text,
"model_used": model_used,
"processing_time_ms": elapsed_ms,
"language": language,
"preprocessed": preprocess,
}
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/recognize")
async def recognize_handwriting(
file: UploadFile = File(...),
model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
language: str = Query("de", description="de | en | de+en"),
):
"""
Upload an image and get back the handwritten text as plain text.
Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten.
"""
if model not in ("auto", "qwen2.5vl", "trocr-large"):
raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
if language not in ("de", "en", "de+en"):
raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")
image_bytes = await file.read()
if not image_bytes:
raise HTTPException(status_code=400, detail="Empty file")
return await _do_recognize(image_bytes, model=model, preprocess=preprocess, language=language)
@router.post("/recognize-session")
async def recognize_from_session(req: HTRSessionRequest):
"""
Use an OCR-Pipeline session as image source for HTR.
Set use_clean=true to prefer the clean image (after handwriting removal step).
This is useful when you want to do HTR on isolated handwriting regions.
"""
from ocr_pipeline_session_store import get_session_db, get_session_image
session = await get_session_db(req.session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")
# Choose source image
image_bytes: Optional[bytes] = None
source_used: str = ""
if req.use_clean:
image_bytes = await get_session_image(req.session_id, "clean")
if image_bytes:
source_used = "clean"
if not image_bytes:
image_bytes = await get_session_image(req.session_id, "deskewed")
if image_bytes:
source_used = "deskewed"
if not image_bytes:
image_bytes = await get_session_image(req.session_id, "original")
source_used = "original"
if not image_bytes:
raise HTTPException(status_code=404, detail="No image available in session")
result = await _do_recognize(image_bytes, model=req.model)
result["session_id"] = req.session_id
result["source_image"] = source_used
return result
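A client-side sketch for the /recognize endpoint — host and port are assumptions, and the file name is illustrative:

import httpx

with open("exam_page.jpg", "rb") as fh:
    resp = httpx.post(
        "http://localhost:8000/api/v1/htr/recognize",  # assumed host/port
        params={"model": "auto", "preprocess": True, "language": "de"},
        files={"file": ("exam_page.jpg", fh, "image/jpeg")},
        timeout=180.0,  # qwen2.5vl:32b can take a while on large pages
    )
resp.raise_for_status()
result = resp.json()
print(result["model_used"], f"({result['processing_time_ms']} ms)")
print(result["text"])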


@@ -0,0 +1,7 @@
"""
OCR spell-checking sub-package — language-aware OCR correction.
Moved from backend/ flat modules (smart_spell*.py).
Backward-compatible shim files remain at the old locations.
"""
from .smart_spell import * # noqa: F401,F403


@@ -0,0 +1,298 @@
"""
SmartSpellChecker Core — init, data types, language detection, word correction.
Extracted from smart_spell.py for modularity.
License: Apache 2.0 (commercially usable)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------
try:
from spellchecker import SpellChecker as _SpellChecker
_en_spell = _SpellChecker(language='en', distance=1)
_de_spell = _SpellChecker(language='de', distance=1)
_AVAILABLE = True
except ImportError:
_AVAILABLE = False
logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")
Lang = Literal["en", "de", "both", "unknown"]
# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------
# Words that commonly follow "I" (subject pronoun -> verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
"am", "was", "have", "had", "do", "did", "will", "would", "can",
"could", "should", "shall", "may", "might", "must",
"think", "know", "see", "want", "need", "like", "love", "hate",
"go", "went", "come", "came", "say", "said", "get", "got",
"make", "made", "take", "took", "give", "gave", "tell", "told",
"feel", "felt", "find", "found", "believe", "hope", "wish",
"remember", "forget", "understand", "mean", "meant",
"don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
"shouldn't", "haven't", "hadn't", "isn't", "wasn't",
"really", "just", "also", "always", "never", "often", "sometimes",
})
# Words that commonly follow "a" (article -> noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
"lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
"long", "short", "big", "small", "large", "huge", "tiny",
"nice", "beautiful", "wonderful", "terrible", "horrible",
"man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
"book", "car", "house", "room", "school", "teacher", "student",
"day", "week", "month", "year", "time", "place", "way",
"friend", "family", "person", "problem", "question", "story",
"very", "really", "quite", "rather", "pretty", "single",
})
# Digit->letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
'0': ['o', 'O'],
'1': ['l', 'I'],
'5': ['s', 'S'],
'6': ['g', 'G'],
'8': ['b', 'B'],
'|': ['I', 'l'],
'/': ['l'], # italic 'l' misread as slash (e.g. "p/" -> "pl")
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
# Umlaut confusion: OCR drops the dots (ä->a, ö->o, ü->u)
_UMLAUT_MAP = {
'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc',
}
# Tokenizer -- includes | and / so OCR artifacts like "p/" are treated as words
_TOKEN_RE = re.compile(r"([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]*)")
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class CorrectionResult:
original: str
corrected: str
lang_detected: Lang
changed: bool
changes: List[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Core class — language detection and word-level correction
# ---------------------------------------------------------------------------
class _SmartSpellCoreBase:
"""Base class with language detection and single-word correction.
Not intended for direct use — SmartSpellChecker inherits from this.
"""
def __init__(self):
if not _AVAILABLE:
raise RuntimeError("pyspellchecker not installed")
self.en = _en_spell
self.de = _de_spell
# --- Language detection ---
def detect_word_lang(self, word: str) -> Lang:
"""Detect language of a single word using dual-dict heuristic."""
w = word.lower().strip(".,;:!?\"'()")
if not w:
return "unknown"
in_en = bool(self.en.known([w]))
in_de = bool(self.de.known([w]))
if in_en and in_de:
return "both"
if in_en:
return "en"
if in_de:
return "de"
return "unknown"
def detect_text_lang(self, text: str) -> Lang:
"""Detect dominant language of a text string (sentence/phrase)."""
words = re.findall(r"[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df]+", text)
if not words:
return "unknown"
en_count = 0
de_count = 0
for w in words:
lang = self.detect_word_lang(w)
if lang == "en":
en_count += 1
elif lang == "de":
de_count += 1
# "both" doesn't count for either
if en_count > de_count:
return "en"
if de_count > en_count:
return "de"
if en_count == de_count and en_count > 0:
return "both"
return "unknown"
# --- Single-word correction ---
def _known(self, word: str) -> bool:
"""True if word is known in EN or DE dictionary, or is a known abbreviation."""
w = word.lower()
if bool(self.en.known([w])) or bool(self.de.known([w])):
return True
# Also accept known abbreviations (sth, sb, adj, etc.)
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS
if w in _KNOWN_ABBREVIATIONS:
return True
except ImportError:
pass
return False
def _word_freq(self, word: str) -> float:
"""Get word frequency (max of EN and DE)."""
w = word.lower()
return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))
def _known_in(self, word: str, lang: str) -> bool:
"""True if word is known in a specific language dictionary."""
w = word.lower()
spell = self.en if lang == "en" else self.de
return bool(spell.known([w]))
def correct_word(self, word: str, lang: str = "en",
prev_word: str = "", next_word: str = "") -> Optional[str]:
"""Correct a single word for the given language.
Returns None if no correction needed, or the corrected string.
"""
if not word or not word.strip():
return None
# Skip numbers, abbreviations with dots, very short tokens
if word.isdigit() or '.' in word:
return None
# Skip IPA/phonetic content in brackets
if '[' in word or ']' in word:
return None
has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
# 1. Already known -> no fix
if self._known(word):
# But check a/I disambiguation for single-char words
if word.lower() in ('l', '|') and next_word:
return self._disambiguate_a_I(word, next_word)
return None
# 2. Digit/pipe substitution
if has_suspicious:
if word == '|':
return 'I'
# Try single-char substitutions
for i, ch in enumerate(word):
if ch not in _DIGIT_SUBS:
continue
for replacement in _DIGIT_SUBS[ch]:
candidate = word[:i] + replacement + word[i + 1:]
if self._known(candidate):
return candidate
# Try multi-char substitution (e.g., "sch00l" -> "school")
multi = self._try_multi_digit_sub(word)
if multi:
return multi
# 3. Umlaut correction (German)
if lang == "de" and len(word) >= 3 and word.isalpha():
umlaut_fix = self._try_umlaut_fix(word)
if umlaut_fix:
return umlaut_fix
# 4. General spell correction
if not has_suspicious and len(word) >= 3 and word.isalpha():
# Safety: don't correct if the word is valid in the OTHER language
other_lang = "de" if lang == "en" else "en"
if self._known_in(word, other_lang):
return None
if other_lang == "de" and self._try_umlaut_fix(word):
return None # has a valid DE umlaut variant -> don't touch
spell = self.en if lang == "en" else self.de
correction = spell.correction(word.lower())
if correction and correction != word.lower():
if word[0].isupper():
correction = correction[0].upper() + correction[1:]
if self._known(correction):
return correction
return None
# --- Multi-digit substitution ---
def _try_multi_digit_sub(self, word: str) -> Optional[str]:
"""Try replacing multiple digits simultaneously using BFS."""
positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
if not positions or len(positions) > 4:
return None
# BFS over substitution combinations
queue = [list(word)]
for pos, ch in positions:
next_queue = []
for current in queue:
# Keep original
next_queue.append(current[:])
# Try each substitution
for repl in _DIGIT_SUBS[ch]:
variant = current[:]
variant[pos] = repl
next_queue.append(variant)
queue = next_queue
# Check which combinations produce known words
for combo in queue:
candidate = "".join(combo)
if candidate != word and self._known(candidate):
return candidate
return None
# --- Umlaut fix ---
def _try_umlaut_fix(self, word: str) -> Optional[str]:
"""Try single-char umlaut substitutions for German words."""
for i, ch in enumerate(word):
if ch in _UMLAUT_MAP:
candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
if self._known(candidate):
return candidate
return None
# --- a/I disambiguation ---
def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
"""Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|')."""
nw = next_word.lower().strip(".,;:!?")
if nw in _I_FOLLOWERS:
return "I"
if nw in _A_FOLLOWERS:
return "a"
return None # uncertain, don't change
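To make the word-level passes concrete, a small sketch (assuming pyspellchecker and its EN/DE dictionaries are installed; SmartSpellChecker is defined in the text module below and inherits these methods):

from ocr.spell import SmartSpellChecker  # package path per the commit message

checker = SmartSpellChecker()
# Multi-digit substitution: swapping both '0' characters for 'o' yields a known word.
print(checker.correct_word("sch00l", lang="en"))  # -> school
# A lone pipe is treated as an OCR misread of "I".
print(checker.correct_word("|", lang="en"))       # -> I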


@@ -0,0 +1,25 @@
"""
SmartSpellChecker — barrel re-export.
All implementation is split into:
    core — init, data types, language detection, word correction
    text — full text correction, boundary repair, context split
License: Apache 2.0 (commercially usable)
"""
# Core: data types, lang detection (re-exported for tests)
from .core import ( # noqa: F401
_AVAILABLE,
_DIGIT_SUBS,
_SUSPICIOUS_CHARS,
_UMLAUT_MAP,
_TOKEN_RE,
_I_FOLLOWERS,
_A_FOLLOWERS,
CorrectionResult,
Lang,
)
# Text: SmartSpellChecker class (the main public API)
from .text import SmartSpellChecker # noqa: F401
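Because the package __init__ (see the ocr/spell/__init__.py diff above) star-re-exports this barrel, both import paths resolve to the same class — a quick check, assuming the service root is on sys.path:

from ocr.spell import SmartSpellChecker as ViaPackage
from ocr.spell.smart_spell import SmartSpellChecker as ViaBarrel

assert ViaPackage is ViaBarrel  # one class object, re-exported twice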


@@ -0,0 +1,289 @@
"""
SmartSpellChecker Text — full text correction, boundary repair, context split.
Extracted from smart_spell.py for modularity.
License: Apache 2.0 (commercially usable)
"""
import re
from typing import Dict, List, Optional, Tuple
from .core import (
_SmartSpellCoreBase,
_TOKEN_RE,
CorrectionResult,
Lang,
)
class SmartSpellChecker(_SmartSpellCoreBase):
"""Language-aware OCR spell checker using pyspellchecker (no LLM).
Inherits single-word correction from _SmartSpellCoreBase.
Adds text-level passes: boundary repair, context split, full correction.
"""
# --- Boundary repair (shifted word boundaries) ---
def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
"""Fix shifted word boundaries between adjacent tokens.
OCR sometimes shifts the boundary: "at sth." -> "ats th."
Try moving 1-2 chars from end of word1 to start of word2 and vice versa.
Returns (fixed_word1, fixed_word2) or None.
"""
# Import known abbreviations for vocabulary context
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS
except ImportError:
_KNOWN_ABBREVIATIONS = set()
# Strip trailing punctuation for checking, preserve for result
w2_stripped = word2.rstrip(".,;:!?")
w2_punct = word2[len(w2_stripped):]
# Try shifting 1-2 chars from word1 -> word2
for shift in (1, 2):
if len(word1) <= shift:
continue
new_w1 = word1[:-shift]
new_w2_base = word1[-shift:] + w2_stripped
w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
if w1_ok and w2_ok:
return (new_w1, new_w2_base + w2_punct)
# Try shifting 1-2 chars from word2 -> word1
for shift in (1, 2):
if len(w2_stripped) <= shift:
continue
new_w1 = word1 + w2_stripped[:shift]
new_w2_base = w2_stripped[shift:]
w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
if w1_ok and w2_ok:
return (new_w1, new_w2_base + w2_punct)
return None
# --- Context-based word split for ambiguous merges ---
# Patterns where a valid word is actually "a" + adjective/noun
_ARTICLE_SPLIT_CANDIDATES = {
# word -> (article, remainder) -- only when followed by a compatible word
"anew": ("a", "new"),
"areal": ("a", "real"),
"alive": None, # genuinely one word, never split
"alone": None,
"aware": None,
"alike": None,
"apart": None,
"aside": None,
"above": None,
"about": None,
"among": None,
"along": None,
}
def _try_context_split(self, word: str, next_word: str,
prev_word: str) -> Optional[str]:
"""Split words like 'anew' -> 'a new' when context indicates a merge.
Only splits when:
- The word is in the split candidates list
- The following word makes sense as a noun (for "a + adj + noun" pattern)
- OR the word is unknown and can be split into article + known word
"""
w_lower = word.lower()
# Check explicit candidates
if w_lower in self._ARTICLE_SPLIT_CANDIDATES:
split = self._ARTICLE_SPLIT_CANDIDATES[w_lower]
if split is None:
return None # explicitly marked as "don't split"
article, remainder = split
# Only split if followed by a word (noun pattern)
if next_word and next_word[0].islower():
return f"{article} {remainder}"
# Also split if remainder + next_word makes a common phrase
if next_word and self._known(next_word):
return f"{article} {remainder}"
# Generic: if word starts with 'a' and rest is a known adjective/word
if (len(word) >= 4 and word[0].lower() == 'a'
and not self._known(word) # only for UNKNOWN words
and self._known(word[1:])):
return f"a {word[1:]}"
return None
# --- Full text correction ---
def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
"""Correct a full text string (field value).
Three passes:
1. Boundary repair -- fix shifted word boundaries between adjacent tokens
2. Context split -- split ambiguous merges (anew -> a new)
3. Per-word correction -- spell check individual words
"""
if not text or not text.strip():
return CorrectionResult(text, text, "unknown", False)
detected = self.detect_text_lang(text) if lang == "auto" else lang
effective_lang = detected if detected in ("en", "de") else "en"
changes: List[str] = []
tokens = list(_TOKEN_RE.finditer(text))
# Extract token list: [(word, separator), ...]
token_list: List[List[str]] = [] # [[word, sep], ...]
for m in tokens:
token_list.append([m.group(1), m.group(2)])
# --- Pass 1: Boundary repair between adjacent unknown words ---
# Import abbreviations for the heuristic below
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS as _ABBREVS
except ImportError:
_ABBREVS = set()
for i in range(len(token_list) - 1):
w1 = token_list[i][0]
w2_raw = token_list[i + 1][0]
# Skip boundary repair for IPA/bracket content
# Brackets may be in the token OR in the adjacent separators
sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
sep_after_w1 = token_list[i][1]
sep_after_w2 = token_list[i + 1][1]
has_bracket = (
'[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
or ']' in sep_after_w1 # w1 text was inside [brackets]
or '[' in sep_after_w1 # w2 starts a bracket
or ']' in sep_after_w2 # w2 text was inside [brackets]
or '[' in sep_before_w1 # w1 starts a bracket
)
if has_bracket:
continue
# Include trailing punct from separator in w2 for abbreviation matching
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
# Try boundary repair -- always, even if both words are valid.
# Use word-frequency scoring to decide if repair is better.
repair = self._try_boundary_repair(w1, w2_with_punct)
if not repair and w2_with_punct != w2_raw:
repair = self._try_boundary_repair(w1, w2_raw)
if repair:
new_w1, new_w2_full = repair
new_w2_base = new_w2_full.rstrip(".,;:!?")
# Frequency-based scoring: product of word frequencies
# Higher product = more common word pair = better
old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
# Abbreviation bonus: if repair produces a known abbreviation
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
if has_abbrev:
# Accept abbreviation repair ONLY if at least one of the
# original words is rare/unknown (prevents "Can I" -> "Ca nI"
# where both original words are common and correct).
RARE_THRESHOLD = 1e-6
orig_both_common = (
self._word_freq(w1) > RARE_THRESHOLD
and self._word_freq(w2_raw) > RARE_THRESHOLD
)
if not orig_both_common:
new_freq = max(new_freq, old_freq * 10)
else:
has_abbrev = False # both originals common -> don't trust
# Accept if repair produces a more frequent word pair
# (threshold: at least 5x more frequent to avoid false positives)
if new_freq > old_freq * 5:
new_w2_punct = new_w2_full[len(new_w2_base):]
changes.append(f"{w1} {w2_raw}\u2192{new_w1} {new_w2_base}")
token_list[i][0] = new_w1
token_list[i + 1][0] = new_w2_base
if new_w2_punct:
token_list[i + 1][1] = new_w2_punct + token_list[i + 1][1].lstrip(".,;:!?")
# --- Pass 2: Context split (anew -> a new) ---
expanded: List[List[str]] = []
for i, (word, sep) in enumerate(token_list):
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
prev_word = token_list[i - 1][0] if i > 0 else ""
split = self._try_context_split(word, next_word, prev_word)
if split and split != word:
changes.append(f"{word}\u2192{split}")
expanded.append([split, sep])
else:
expanded.append([word, sep])
token_list = expanded
# --- Pass 3: Per-word correction ---
parts: List[str] = []
# Preserve any leading text before the first token match
first_start = tokens[0].start() if tokens else 0
if first_start > 0:
parts.append(text[:first_start])
for i, (word, sep) in enumerate(token_list):
# Skip words inside IPA brackets (brackets land in separators)
prev_sep = token_list[i - 1][1] if i > 0 else ""
if '[' in prev_sep or ']' in sep:
parts.append(word)
parts.append(sep)
continue
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
prev_word = token_list[i - 1][0] if i > 0 else ""
correction = self.correct_word(
word, lang=effective_lang,
prev_word=prev_word, next_word=next_word,
)
if correction and correction != word:
changes.append(f"{word}\u2192{correction}")
parts.append(correction)
else:
parts.append(word)
parts.append(sep)
# Append any trailing text
last_end = tokens[-1].end() if tokens else 0
if last_end < len(text):
parts.append(text[last_end:])
corrected = "".join(parts)
return CorrectionResult(
original=text,
corrected=corrected,
lang_detected=detected,
changed=corrected != text,
changes=changes,
)
# --- Vocabulary entry correction ---
def correct_vocab_entry(self, english: str, german: str,
example: str = "") -> Dict[str, CorrectionResult]:
"""Correct a full vocabulary entry (EN + DE + example).
Uses column position to determine language -- the most reliable signal.
"""
results = {}
results["english"] = self.correct_text(english, lang="en")
results["german"] = self.correct_text(german, lang="de")
if example:
# For examples, auto-detect language
results["example"] = self.correct_text(example, lang="auto")
return results
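An end-to-end sketch of correct_text — the context-split pass shown here is deterministic, so the example does not depend on dictionary frequencies (only on pyspellchecker being installed):

from ocr.spell import SmartSpellChecker

checker = SmartSpellChecker()
result = checker.correct_text("anew house", lang="en")
print(result.corrected)  # -> "a new house"   (context split: anew -> a new)
print(result.changes)    # -> ['anew→a new']
print(result.changed)    # -> True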