Files
breakpilot-lehrer/klausur-service/backend/cv_ocr_engines.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

382 lines
12 KiB
Python

"""
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
This module contains the OCR engine wrappers and re-exports all functions
from the split sub-modules for backward compatibility.
Sub-modules:
- cv_ocr_word_assembly: Word grouping and text assembly
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import os
import re
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
IPA_AVAILABLE,
PageRegion,
RowGeometry,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment,misc]
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
from cv_ocr_word_assembly import ( # noqa: F401
_group_words_into_lines,
_words_to_reading_order_lines,
_rejoin_hyphenated,
_words_to_reading_order_text,
_words_to_spaced_text,
)
from cv_ocr_vocab_postprocess import ( # noqa: F401
_CHAR_CONFUSION_RULES,
_DE_INDICATORS_FOR_EN_I,
_fix_character_confusion,
_is_singular_plural_pair,
_split_comma_entries,
_split_by_comma,
_find_best_vocab_match,
_attach_example_sentences,
)
from cv_ocr_ipa_lookup import ( # noqa: F401
_PHONETIC_BRACKET_RE,
_IPA_CHARS,
_MIN_WORD_CONF,
_GRAMMAR_BRACKET_WORDS,
_lookup_ipa,
_fix_phonetic_brackets,
_is_grammar_bracket_content,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
_decompose_compound,
_insert_missing_ipa,
)
from cv_ocr_ipa_repair import ( # noqa: F401
_has_non_dict_trailing,
_strip_post_bracket_garbled,
fix_ipa_continuation_cell,
_insert_headword_ipa,
)
from cv_ocr_cell_phonetics import ( # noqa: F401
fix_cell_phonetics,
_has_ipa_gap,
_sync_word_boxes_after_ipa_insert,
)
from cv_ocr_cell_filter import ( # noqa: F401
_RE_REAL_WORD,
_RE_ALPHA,
_COMMON_SHORT_WORDS,
_KNOWN_ABBREVIATIONS,
_assign_row_words_to_columns,
_is_noise_tail_token,
_is_garbage_text,
_clean_cell_text,
_clean_cell_text_lite,
_measure_stroke_width,
_classify_bold_cells,
)
# ── OCR Engine Wrappers ───────────────────────────────────────────────────
# Cached RapidOCR engine instance; created lazily by _get_rapid_engine().
_rapid_engine = None
# True only when the optional `rapidocr` package imported successfully.
RAPIDOCR_AVAILABLE: bool = False
try:
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    # Optional dependency: absence is expected and non-fatal.
    logger.info("RapidOCR not installed — using Tesseract only")
def _get_rapid_engine():
    """Lazy-init RapidOCR engine with PP-OCRv5 Latin model for German support.

    Returns:
        The module-level cached RapidOCR instance, created on first call.

    Raises:
        RuntimeError: if the optional ``rapidocr`` package is not installed
            (previously this surfaced as an opaque NameError on the class).
    """
    global _rapid_engine
    if not RAPIDOCR_AVAILABLE:
        # Explicit guard: without it, _RapidOCRClass is undefined and the
        # call below would raise a confusing NameError.
        raise RuntimeError("RapidOCR is not installed; cannot initialize OCR engine")
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            "Rec.lang_type": _LangRec.LATIN,       # Latin script covers German diacritics
            "Rec.model_type": _ModelType.SERVER,   # larger, more accurate server model
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            "Det.unclip_ratio": 1.3,               # expand detected boxes slightly
            "Det.box_thresh": 0.4,
            "Global.log_level": "critical",        # silence RapidOCR's own logging
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.

    Args:
        img_bgr: Full page image in BGR channel order (cv2 convention).
        region: Region of interest in absolute page-pixel coordinates.

    Returns:
        List of dicts with keys text/left/top/width/height/conf/region_type;
        coordinates are translated back into full-page space. Empty list when
        the crop is empty or OCR finds nothing.
    """
    engine = _get_rapid_engine()
    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []
    result = engine(crop)
    # RapidOCR returns parallel sequences; any may be None on failure.
    # `scores` was previously unchecked and would make zip() raise TypeError.
    if result is None or result.boxes is None or result.txts is None or result.scores is None:
        return []
    words: List[Dict[str, Any]] = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue
        # Box is a quadrilateral of (x, y) points; take the axis-aligned bounds.
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)
        words.append({
            'text': txt.strip(),
            # Shift crop-local coordinates back into full-page space.
            'left': left + region.x,
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),  # normalize 0..1 score to Tesseract-style 0..100
            'region_type': region.type,
        })
    return words
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts.

    Args:
        img_bgr: Full page image in BGR channel order (cv2 convention).
        region: Region of interest in absolute page-pixel coordinates.
        handwritten: When True, request the handwriting-tuned TrOCR model.

    Returns:
        One dict per recognized text line. TrOCR yields no word geometry,
        so each line spans the full region width and an equal share of the
        region height. Empty list on failure or empty crop.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available
    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0:
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
            if ocr_img_crop is not None:
                # NOTE(review): `ocr_region` is not defined or imported in this
                # module chunk — presumably defined elsewhere in the file; verify.
                return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []
    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []
    try:
        import torch
        from PIL import Image as _PILImage
        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        # TrOCR expects RGB PIL images; split the crop into single text lines
        # first because the model is trained on line-level input.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            lines = [pil_crop]
        # Run inference on whatever device the model weights live on.
        device = next(model.parameters()).device
        all_text: List[str] = []
        confidences: List[float] = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic: TrOCR emits no confidence score, so assume longer
                # decoded lines are more reliable than very short ones.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)
        if not all_text:
            return []
        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Approximate geometry: divide region height evenly across the lines.
        line_h = region.height // max(len(all_text), 1)
        words: List[Dict[str, Any]] = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words
    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts.

    Falls back to RapidOCR (if installed) or Tesseract when the LightOn
    model is unavailable or fails to load.

    Args:
        img_bgr: Full page image in BGR channel order (cv2 convention).
        region: Region of interest in absolute page-pixel coordinates.

    Returns:
        One dict per text line in the model's output; each line spans the
        full region width with a fixed confidence of 85. Empty list on
        failure or empty crop.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available
    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []
    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []
    try:
        # Removed a dead `import io` here — it was never used in this function
        # (and the module already imports io at top level).
        import torch
        from PIL import Image as _PILImage
        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        # Vision-language model: prompt with the image via its chat template.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)
        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        # Approximate geometry: divide region height evenly across the lines.
        line_h = region.height // max(len(lines), 1)
        words: List[Dict[str, Any]] = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # model emits no score; fixed heuristic confidence
                "region_type": region.type,
            })
        return words
    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    Args:
        img_bgr: Full page image in BGR channel order (cv2 convention).
        region: Optional region of interest; when None the whole page is used.

    Returns:
        Word dicts in full-page coordinates. Remote results are rescaled
        (after any upload downscaling) and offset back into page space.
    """
    # Set FORCE_REMOTE_PADDLE=1 to bypass the local engine entirely.
    force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"
    if not force_remote:
        try:
            if region is None:
                h, w = img_bgr.shape[:2]
                _region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
            else:
                _region = region
            words = ocr_region_rapid(img_bgr, _region)
            if words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
                return words
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)
    from services.paddleocr_remote import ocr_remote_paddle
    if region is not None:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0
    if crop.size == 0:
        return []
    # Downscale large crops before upload to bound the payload size.
    h, w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    if max(h, w) > _MAX_DIM:
        scale = _MAX_DIM / max(h, w)
        new_w, new_h = int(w * scale), int(h * scale)
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        # BUGFIX: format string was garbled ("%dx%d%dx%d") — the separator
        # between the old and new dimensions had been lost.
        logger.info("ocr_region_paddle: downscaled %dx%d -> %dx%d (scale=%.2f)",
                    w, h, new_w, new_h, scale)
    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []
    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))
    # Map remote coordinates (possibly downscaled) back to full-page space.
    inv_scale = 1.0 / scale  # scale == 1.0 when no downscaling happened
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type
    return words