Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
381
klausur-service/backend/ocr/engines/engines.py
Normal file
381
klausur-service/backend/ocr/engines/engines.py
Normal file
@@ -0,0 +1,381 @@
|
||||
"""
|
||||
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
|
||||
|
||||
This module contains the OCR engine wrappers and re-exports all functions
|
||||
from the split sub-modules for backward compatibility.
|
||||
|
||||
Sub-modules:
|
||||
- cv_ocr_word_assembly: Word grouping and text assembly
|
||||
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
|
||||
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
|
||||
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
|
||||
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
|
||||
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
|
||||
|
||||
from cv_ocr_word_assembly import ( # noqa: F401
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_lines,
|
||||
_rejoin_hyphenated,
|
||||
_words_to_reading_order_text,
|
||||
_words_to_spaced_text,
|
||||
)
|
||||
|
||||
from cv_ocr_vocab_postprocess import ( # noqa: F401
|
||||
_CHAR_CONFUSION_RULES,
|
||||
_DE_INDICATORS_FOR_EN_I,
|
||||
_fix_character_confusion,
|
||||
_is_singular_plural_pair,
|
||||
_split_comma_entries,
|
||||
_split_by_comma,
|
||||
_find_best_vocab_match,
|
||||
_attach_example_sentences,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_lookup import ( # noqa: F401
|
||||
_PHONETIC_BRACKET_RE,
|
||||
_IPA_CHARS,
|
||||
_MIN_WORD_CONF,
|
||||
_GRAMMAR_BRACKET_WORDS,
|
||||
_lookup_ipa,
|
||||
_fix_phonetic_brackets,
|
||||
_is_grammar_bracket_content,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
_decompose_compound,
|
||||
_insert_missing_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_repair import ( # noqa: F401
|
||||
_has_non_dict_trailing,
|
||||
_strip_post_bracket_garbled,
|
||||
fix_ipa_continuation_cell,
|
||||
_insert_headword_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_phonetics import ( # noqa: F401
|
||||
fix_cell_phonetics,
|
||||
_has_ipa_gap,
|
||||
_sync_word_boxes_after_ipa_insert,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_filter import ( # noqa: F401
|
||||
_RE_REAL_WORD,
|
||||
_RE_ALPHA,
|
||||
_COMMON_SHORT_WORDS,
|
||||
_KNOWN_ABBREVIATIONS,
|
||||
_assign_row_words_to_columns,
|
||||
_is_noise_tail_token,
|
||||
_is_garbage_text,
|
||||
_clean_cell_text,
|
||||
_clean_cell_text_lite,
|
||||
_measure_stroke_width,
|
||||
_classify_bold_cells,
|
||||
)
|
||||
|
||||
|
||||
# ── OCR Engine Wrappers ───────────────────────────────────────────────────

# Lazily-initialized module-wide RapidOCR engine singleton
# (created on first use by _get_rapid_engine below).
_rapid_engine = None
# True only when the optional `rapidocr` package imported successfully.
RAPIDOCR_AVAILABLE = False

try:
    # Aliased with leading underscores to keep the optional dependency
    # out of this module's public namespace.
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    # Soft dependency: absence is informational, not an error.
    logger.info("RapidOCR not installed — using Tesseract only")
|
||||
|
||||
|
||||
def _get_rapid_engine():
    """Lazy-init the shared RapidOCR engine (PP-OCRv5 Latin server model).

    The Latin recognition model is selected for German-language support;
    detector parameters are tuned for dense printed layouts.

    Returns:
        The module-wide RapidOCR instance, created on first call.

    Raises:
        RuntimeError: If the optional `rapidocr` package is not installed.
    """
    global _rapid_engine
    if not RAPIDOCR_AVAILABLE:
        # Fail with a clear message instead of a NameError on _RapidOCRClass.
        raise RuntimeError(
            "RapidOCR is not installed — cannot initialize RapidOCR engine"
        )
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            "Det.unclip_ratio": 1.3,       # wider boxes → fewer clipped glyphs
            "Det.box_thresh": 0.4,         # lower threshold → keep faint print
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||||
|
||||
|
||||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on one page region, returning Tesseract-style word dicts.

    Each dict carries 'text', absolute page coordinates 'left'/'top',
    'width'/'height', 'conf' (0–100) and 'region_type'.

    Args:
        img_bgr: Full page image (BGR, as loaded by cv2).
        region: Region of the page to OCR.

    Returns:
        List of word dicts; empty list for empty crops or no detections.
    """
    engine = _get_rapid_engine()

    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    # Guard all three result members — RapidOCR returns None members when
    # nothing is detected; an unguarded `scores` would make zip() raise.
    if (result is None or result.boxes is None
            or result.txts is None or result.scores is None):
        return []

    words = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # box is a 4-point polygon; reduce to an axis-aligned bounding box.
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            # Translate crop-local coordinates back to page coordinates.
            'left': left + region.x,
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),
            'region_type': region.type,
        })

    return words
|
||||
|
||||
|
||||
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts.

    TrOCR yields no word-level geometry, so every recognized text line is
    emitted as one "word" dict spanning the full region width, with lines
    stacked evenly down the region height.

    Args:
        img_bgr: Full page image (BGR).
        region: Region of the page to OCR.
        handwritten: If True, select the handwriting-tuned TrOCR model.

    Returns:
        List of line-level word dicts; [] on failure or empty input.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        # cv2 may be None (its import is try/except-guarded at module top) —
        # check it explicitly to avoid an AttributeError in the fallback.
        if (region.height > 0 and region.width > 0
                and cv2 is not None and img_bgr is not None):
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            # NOTE(review): `ocr_region` is not imported in this module —
            # presumably re-exported elsewhere; verify it resolves at runtime.
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            if cv2 is None:
                # Cannot convert to grayscale without cv2 — give up gracefully.
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # Fall back to recognizing the whole crop as a single line.
            lines = [pil_crop]

        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: longer lines are more trustworthy.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Approximate vertical layout: divide the region evenly across lines.
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
||||
|
||||
|
||||
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts.

    The VLM emits plain text, so each output line becomes one "word" dict
    spanning the full region width, stacked evenly down the region height.
    Falls back to RapidOCR, then Tesseract, when LightOnOCR is unavailable.

    Args:
        img_bgr: Full page image (BGR).
        region: Region of the page to OCR.

    Returns:
        List of line-level word dicts; [] on failure or empty input.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        # cv2 may be None (its import is try/except-guarded at module top) —
        # check it explicitly to avoid an AttributeError in the fallback.
        if cv2 is None or img_bgr is None:
            return []
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # NOTE(review): `ocr_region` is not imported in this module —
        # presumably re-exported elsewhere; verify it resolves at runtime.
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # (Removed unused local `import io` — it shadowed the module-level io.)
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            if cv2 is None:
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Single-image chat prompt in the shape the LightOn processor expects.
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        lines = [l.strip() for l in text.split("\n") if l.strip()]
        # Approximate vertical layout: divide the region evenly across lines.
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed heuristic — the model provides no score
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
||||
|
||||
|
||||
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    Order of attempts:
      1. Unless FORCE_REMOTE_PADDLE=1 is set, try local RapidOCR on the
         region (or the whole page when region is None).
      2. If RapidOCR fails or finds nothing, send a JPEG of the crop to the
         remote PaddleOCR service and map its boxes back to page coordinates.

    Args:
        img_bgr: Full page image (BGR).
        region: Optional region to OCR; None means the entire page.

    Returns:
        List of word dicts with absolute page coordinates; [] for empty crops
        or encoding failures.
    """
    force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"

    if not force_remote:
        try:
            if region is None:
                # Treat the whole page as a single synthetic region.
                h, w = img_bgr.shape[:2]
                _region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
            else:
                _region = region

            words = ocr_region_rapid(img_bgr, _region)
            if words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
                return words
            # Zero words is treated as a soft failure — fall through to remote.
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)

    from services.paddleocr_remote import ocr_remote_paddle

    # Remote path: crop locally so only the region's pixels go over the wire;
    # remember the offsets to translate results back to page coordinates.
    if region is not None:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    # Downscale large crops before upload to bound payload size.
    # NOTE(review): cv2 is assumed available here despite the guarded import
    # at module top — confirm the remote path is never hit without cv2.
    h, w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    if max(h, w) > _MAX_DIM:
        scale = _MAX_DIM / max(h, w)
        new_w, new_h = int(w * scale), int(h * scale)
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        logger.info("ocr_region_paddle: downscaled %dx%d → %dx%d (scale=%.2f)",
                    w, h, new_w, new_h, scale)

    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))

    # Undo the downscale and re-apply the crop offset so every box is in
    # original page coordinates.
    inv_scale = 1.0 / scale if scale != 1.0 else 1.0
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type

    return words
|
||||
Reference in New Issue
Block a user