Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
382 lines
12 KiB
Python
382 lines
12 KiB
Python
"""
|
|
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
|
|
|
|
This module contains the OCR engine wrappers and re-exports all functions
|
|
from the split sub-modules for backward compatibility.
|
|
|
|
Sub-modules:
|
|
- cv_ocr_word_assembly: Word grouping and text assembly
|
|
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
|
|
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
|
|
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
|
|
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
|
|
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
# Standard library.
import io
import logging
import os
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

# Project-local shared types and IPA helpers used by the wrappers below
# and by the re-exported sub-modules.
from cv_vocab_types import (
    IPA_AVAILABLE,
    PageRegion,
    RowGeometry,
    _britfone_dict,
    _ipa_convert_american,
)

logger = logging.getLogger(__name__)
|
|
|
|
# Optional dependencies: the module must stay importable when OpenCV or
# Pillow are missing (presumably so the text-only re-exports remain usable
# without image libraries — verify against callers).  Functions that need
# them must check for None before use.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment,misc]
|
|
|
|
|
|
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
# NOTE: these lists must stay in sync with the sub-modules; external code
# still imports the underscore-prefixed helpers from this module.

from cv_ocr_word_assembly import ( # noqa: F401
    _group_words_into_lines,
    _words_to_reading_order_lines,
    _rejoin_hyphenated,
    _words_to_reading_order_text,
    _words_to_spaced_text,
)

from cv_ocr_vocab_postprocess import ( # noqa: F401
    _CHAR_CONFUSION_RULES,
    _DE_INDICATORS_FOR_EN_I,
    _fix_character_confusion,
    _is_singular_plural_pair,
    _split_comma_entries,
    _split_by_comma,
    _find_best_vocab_match,
    _attach_example_sentences,
)

from cv_ocr_ipa_lookup import ( # noqa: F401
    _PHONETIC_BRACKET_RE,
    _IPA_CHARS,
    _MIN_WORD_CONF,
    _GRAMMAR_BRACKET_WORDS,
    _lookup_ipa,
    _fix_phonetic_brackets,
    _is_grammar_bracket_content,
    _replace_phonetics_in_text,
    _text_has_garbled_ipa,
    _decompose_compound,
    _insert_missing_ipa,
)

from cv_ocr_ipa_repair import ( # noqa: F401
    _has_non_dict_trailing,
    _strip_post_bracket_garbled,
    fix_ipa_continuation_cell,
    _insert_headword_ipa,
)

from cv_ocr_cell_phonetics import ( # noqa: F401
    fix_cell_phonetics,
    _has_ipa_gap,
    _sync_word_boxes_after_ipa_insert,
)

from cv_ocr_cell_filter import ( # noqa: F401
    _RE_REAL_WORD,
    _RE_ALPHA,
    _COMMON_SHORT_WORDS,
    _KNOWN_ABBREVIATIONS,
    _assign_row_words_to_columns,
    _is_noise_tail_token,
    _is_garbage_text,
    _clean_cell_text,
    _clean_cell_text_lite,
    _measure_stroke_width,
    _classify_bold_cells,
)
|
|
|
|
|
|
# ── OCR Engine Wrappers ───────────────────────────────────────────────────

# Lazily initialized shared RapidOCR engine (see _get_rapid_engine).
_rapid_engine = None
RAPIDOCR_AVAILABLE = False

try:
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    logger.info("RapidOCR not installed — using Tesseract only")
|
|
|
|
|
|
def _get_rapid_engine():
    """Lazy-init the shared RapidOCR engine (PP-OCRv5 Latin model for German support).

    Returns:
        The module-wide RapidOCR instance, created on first call and cached
        in the ``_rapid_engine`` global afterwards.

    Raises:
        RuntimeError: If the ``rapidocr`` package is not installed.
    """
    global _rapid_engine
    # Fail with a clear message instead of a NameError on _RapidOCRClass
    # when the rapidocr import at module load time failed.
    if not RAPIDOCR_AVAILABLE:
        raise RuntimeError("RapidOCR is not installed — cannot create engine")
    if _rapid_engine is None:
        _rapid_engine = _RapidOCRClass(params={
            # Latin recognition model chosen for German support.
            "Rec.lang_type": _LangRec.LATIN,
            "Rec.model_type": _ModelType.SERVER,
            "Rec.ocr_version": _OCRVersion.PPOCRV5,
            # Detection tuning — presumably adjusted for narrow table-cell
            # text; confirm against the scan corpus before changing.
            "Det.unclip_ratio": 1.3,
            "Det.box_thresh": 0.4,
            "Global.log_level": "critical",
        })
        logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
|
|
|
|
|
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on a specific region, returning word dicts compatible with Tesseract format.

    Args:
        img_bgr: Full page image in BGR channel order.
        region: Region to OCR, in absolute page pixel coordinates.

    Returns:
        A list of word dicts with keys text/left/top/width/height/conf/
        region_type; coordinates are translated back into page space.
    """
    engine = _get_rapid_engine()

    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    result = engine(crop)

    # Also guard result.scores: the zip() below would raise a TypeError on
    # a None scores attribute, which the original check did not cover.
    if result is None or result.boxes is None or result.txts is None or result.scores is None:
        return []

    words = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # Each box is a quadrilateral (4 points); collapse it to an
        # axis-aligned bounding box.
        xs = [p[0] for p in box]
        ys = [p[1] for p in box]
        left = int(min(xs))
        top = int(min(ys))
        w = int(max(xs) - left)
        h = int(max(ys) - top)

        words.append({
            'text': txt.strip(),
            # Translate crop-local coordinates back to full-page space.
            'left': left + region.x,
            'top': top + region.y,
            'width': w,
            'height': h,
            'conf': int(score * 100),  # RapidOCR score is 0..1; Tesseract uses 0..100
            'region_type': region.type,
        })

    return words
|
|
|
|
|
|
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts.

    Args:
        img_bgr: Full page image in BGR channel order.
        region: Region to OCR, in absolute page pixel coordinates.
        handwritten: Select the handwriting TrOCR model variant.

    Returns:
        One Tesseract-compatible word dict per recognized text line.
        Falls back to Tesseract (``ocr_region``) when TrOCR is unavailable,
        and returns ``[]`` when nothing can be recognized.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        # cv2 is None when OpenCV failed to import; without this guard the
        # fallback would crash with AttributeError instead of returning [].
        if region.height > 0 and region.width > 0 and cv2 is not None and img_bgr is not None:
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            if cv2 is None:  # same guard as above: no OpenCV, no fallback
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            # Line splitting found nothing; recognize the whole crop at once.
            lines = [pil_crop]

        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic confidence: very short lines are less trustworthy.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)

        if not all_text:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # TrOCR yields no per-line geometry; distribute the lines evenly
        # over the region's height.
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
|
|
|
|
|
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts.

    Args:
        img_bgr: Full page image in BGR channel order.
        region: Region to OCR, in absolute page pixel coordinates.

    Returns:
        One Tesseract-compatible word dict per recognized text line.
        Falls back to RapidOCR, then Tesseract (``ocr_region``) when the
        LightOn model is unavailable; ``[]`` when nothing is recognized.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        # cv2 is None when OpenCV failed to import; without this guard the
        # Tesseract fallback would crash with AttributeError.
        if cv2 is None or img_bgr is None:
            return []
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        # NOTE: the original function-local `import io` was unused and has
        # been removed; `io` is also imported at module level.
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            if cv2 is None:  # same OpenCV guard as above
                return []
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Single-image chat prompt; the model returns the transcription text.
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        # The model emits free text; split into lines and distribute them
        # evenly over the region (no per-line geometry is available).
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed confidence: the model reports no scores
                "region_type": region.type,
            })
        return words

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
|
|
|
|
|
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    The local RapidOCR engine is tried first unless the environment variable
    FORCE_REMOTE_PADDLE is set to "1".  When the local pass fails or yields
    no words, the crop (downscaled to at most 1500 px on its longest side)
    is JPEG-encoded and sent to the remote PaddleOCR service.  All returned
    coordinates are mapped back to full-page space at original resolution.
    """
    remote_only = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"

    if not remote_only:
        try:
            if region is not None:
                local_region = region
            else:
                page_h, page_w = img_bgr.shape[:2]
                local_region = PageRegion(type="full_page", x=0, y=0, width=page_w, height=page_h)

            local_words = ocr_region_rapid(img_bgr, local_region)
            if local_words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(local_words))
                return local_words
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)

    from services.paddleocr_remote import ocr_remote_paddle

    # Determine the crop and its offset within the full page.
    if region is None:
        crop = img_bgr
        offset_x, offset_y = 0, 0
    else:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y

    if crop.size == 0:
        return []

    # Cap the longest side to keep the remote upload small.
    crop_h, crop_w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    longest_side = max(crop_h, crop_w)
    if longest_side > _MAX_DIM:
        scale = _MAX_DIM / longest_side
        new_w, new_h = int(crop_w * scale), int(crop_h * scale)
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        logger.info("ocr_region_paddle: downscaled %dx%d → %dx%d (scale=%.2f)",
                    crop_w, crop_h, new_w, new_h, scale)

    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))

    # Undo the downscale and shift crop-local coordinates into page space.
    inv_scale = 1.0 / scale if scale != 1.0 else 1.0
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type

    return words
|