Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
381
klausur-service/backend/ocr/engines/engines.py
Normal file
381
klausur-service/backend/ocr/engines/engines.py
Normal file
@@ -0,0 +1,381 @@
|
||||
"""
|
||||
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
|
||||
|
||||
This module contains the OCR engine wrappers and re-exports all functions
|
||||
from the split sub-modules for backward compatibility.
|
||||
|
||||
Sub-modules:
|
||||
- cv_ocr_word_assembly: Word grouping and text assembly
|
||||
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
|
||||
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
|
||||
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
|
||||
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
|
||||
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
|
||||
|
||||
from cv_ocr_word_assembly import ( # noqa: F401
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_lines,
|
||||
_rejoin_hyphenated,
|
||||
_words_to_reading_order_text,
|
||||
_words_to_spaced_text,
|
||||
)
|
||||
|
||||
from cv_ocr_vocab_postprocess import ( # noqa: F401
|
||||
_CHAR_CONFUSION_RULES,
|
||||
_DE_INDICATORS_FOR_EN_I,
|
||||
_fix_character_confusion,
|
||||
_is_singular_plural_pair,
|
||||
_split_comma_entries,
|
||||
_split_by_comma,
|
||||
_find_best_vocab_match,
|
||||
_attach_example_sentences,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_lookup import ( # noqa: F401
|
||||
_PHONETIC_BRACKET_RE,
|
||||
_IPA_CHARS,
|
||||
_MIN_WORD_CONF,
|
||||
_GRAMMAR_BRACKET_WORDS,
|
||||
_lookup_ipa,
|
||||
_fix_phonetic_brackets,
|
||||
_is_grammar_bracket_content,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
_decompose_compound,
|
||||
_insert_missing_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_repair import ( # noqa: F401
|
||||
_has_non_dict_trailing,
|
||||
_strip_post_bracket_garbled,
|
||||
fix_ipa_continuation_cell,
|
||||
_insert_headword_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_phonetics import ( # noqa: F401
|
||||
fix_cell_phonetics,
|
||||
_has_ipa_gap,
|
||||
_sync_word_boxes_after_ipa_insert,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_filter import ( # noqa: F401
|
||||
_RE_REAL_WORD,
|
||||
_RE_ALPHA,
|
||||
_COMMON_SHORT_WORDS,
|
||||
_KNOWN_ABBREVIATIONS,
|
||||
_assign_row_words_to_columns,
|
||||
_is_noise_tail_token,
|
||||
_is_garbage_text,
|
||||
_clean_cell_text,
|
||||
_clean_cell_text_lite,
|
||||
_measure_stroke_width,
|
||||
_classify_bold_cells,
|
||||
)
|
||||
|
||||
|
||||
# ── OCR Engine Wrappers ───────────────────────────────────────────────────

# Lazily-initialized module-wide RapidOCR engine singleton
# (created on first use by _get_rapid_engine below).
_rapid_engine = None
# True only when the optional `rapidocr` package imported successfully.
RAPIDOCR_AVAILABLE = False

try:
    # Aliased with leading underscores to keep the optional dependency
    # out of this module's public namespace.
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    # Soft dependency: absence is informational, not an error.
    logger.info("RapidOCR not installed — using Tesseract only")
|
||||
|
||||
|
||||
def _get_rapid_engine():
    """Lazy-init the shared RapidOCR engine (PP-OCRv5 Latin server model).

    The Latin recognition model is selected for German-language support;
    detector parameters are tuned for dense printed layouts.

    Returns:
        The module-wide RapidOCR instance, created on first call.

    Raises:
        RuntimeError: If the optional `rapidocr` package is not installed.
    """
    global _rapid_engine
    if not RAPIDOCR_AVAILABLE:
        # Fail with a clear message instead of a NameError on _RapidOCRClass.
        raise RuntimeError(
            "RapidOCR is not installed — cannot initialize RapidOCR engine"
        )
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            "Det.unclip_ratio": 1.3,       # wider boxes → fewer clipped glyphs
            "Det.box_thresh": 0.4,         # lower threshold → keep faint print
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||||
|
||||
|
||||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on one page region, returning Tesseract-style word dicts.

    Each dict carries 'text', absolute page coordinates 'left'/'top',
    'width'/'height', 'conf' (0–100) and 'region_type'.

    Args:
        img_bgr: Full page image (BGR, as loaded by cv2).
        region: Region of the page to OCR.

    Returns:
        List of word dicts; empty list for empty crops or no detections.
    """
    engine = _get_rapid_engine()

    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    # Guard all three result members — RapidOCR returns None members when
    # nothing is detected; an unguarded `scores` would make zip() raise.
    if (result is None or result.boxes is None
            or result.txts is None or result.scores is None):
        return []

    words = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # box is a 4-point polygon; reduce to an axis-aligned bounding box.
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            # Translate crop-local coordinates back to page coordinates.
            'left': left + region.x,
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),
            'region_type': region.type,
        })

    return words
|
||||
|
||||
|
||||
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts.

    TrOCR yields no word-level geometry, so every recognized text line is
    emitted as one "word" dict spanning the full region width, with lines
    stacked evenly down the region height.

    Args:
        img_bgr: Full page image (BGR).
        region: Region of the page to OCR.
        handwritten: If True, select the handwriting-tuned TrOCR model.

    Returns:
        List of line-level word dicts; [] on failure or empty input.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        # cv2 may be None (its import is try/except-guarded at module top) —
        # check it explicitly to avoid an AttributeError in the fallback.
        if (region.height > 0 and region.width > 0
                and cv2 is not None and img_bgr is not None):
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            # NOTE(review): `ocr_region` is not imported in this module —
            # presumably re-exported elsewhere; verify it resolves at runtime.
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            if cv2 is None:
                # Cannot convert to grayscale without cv2 — give up gracefully.
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # Fall back to recognizing the whole crop as a single line.
            lines = [pil_crop]

        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: longer lines are more trustworthy.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Approximate vertical layout: divide the region evenly across lines.
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
||||
|
||||
|
||||
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts.

    The VLM emits plain text, so each output line becomes one "word" dict
    spanning the full region width, stacked evenly down the region height.
    Falls back to RapidOCR, then Tesseract, when LightOnOCR is unavailable.

    Args:
        img_bgr: Full page image (BGR).
        region: Region of the page to OCR.

    Returns:
        List of line-level word dicts; [] on failure or empty input.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        # cv2 may be None (its import is try/except-guarded at module top) —
        # check it explicitly to avoid an AttributeError in the fallback.
        if cv2 is None or img_bgr is None:
            return []
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # NOTE(review): `ocr_region` is not imported in this module —
        # presumably re-exported elsewhere; verify it resolves at runtime.
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # (Removed unused local `import io` — it shadowed the module-level io.)
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            if cv2 is None:
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Single-image chat prompt in the shape the LightOn processor expects.
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        lines = [l.strip() for l in text.split("\n") if l.strip()]
        # Approximate vertical layout: divide the region evenly across lines.
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed heuristic — the model provides no score
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
||||
|
||||
|
||||
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    Order of attempts:
      1. Unless FORCE_REMOTE_PADDLE=1 is set, try local RapidOCR on the
         region (or the whole page when region is None).
      2. If RapidOCR fails or finds nothing, send a JPEG of the crop to the
         remote PaddleOCR service and map its boxes back to page coordinates.

    Args:
        img_bgr: Full page image (BGR).
        region: Optional region to OCR; None means the entire page.

    Returns:
        List of word dicts with absolute page coordinates; [] for empty crops
        or encoding failures.
    """
    force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"

    if not force_remote:
        try:
            if region is None:
                # Treat the whole page as a single synthetic region.
                h, w = img_bgr.shape[:2]
                _region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
            else:
                _region = region

            words = ocr_region_rapid(img_bgr, _region)
            if words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
                return words
            # Zero words is treated as a soft failure — fall through to remote.
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)

    from services.paddleocr_remote import ocr_remote_paddle

    # Remote path: crop locally so only the region's pixels go over the wire;
    # remember the offsets to translate results back to page coordinates.
    if region is not None:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    # Downscale large crops before upload to bound payload size.
    # NOTE(review): cv2 is assumed available here despite the guarded import
    # at module top — confirm the remote path is never hit without cv2.
    h, w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    if max(h, w) > _MAX_DIM:
        scale = _MAX_DIM / max(h, w)
        new_w, new_h = int(w * scale), int(h * scale)
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        logger.info("ocr_region_paddle: downscaled %dx%d → %dx%d (scale=%.2f)",
                    w, h, new_w, new_h, scale)

    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))

    # Undo the downscale and re-apply the crop offset so every box is in
    # original page coordinates.
    inv_scale = 1.0 / scale if scale != 1.0 else 1.0
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type

    return words
|
||||
Reference in New Issue
Block a user