feat(ocr-pipeline): line grouping fix + RapidOCR integration
Fix A: Use _group_words_into_lines() with adaptive Y-tolerance to correctly order words in multi-line cells (fixes word reordering bug). RapidOCR: Add as alternative OCR engine (PaddleOCR models on ONNX Runtime, native ARM64). Engine selectable via dropdown in UI or ?engine= query param. Auto mode prefers RapidOCR when available. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2173,6 +2173,101 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
# Pipeline Step 5: Word Grid from Columns × Rows
|
||||
# =============================================================================
|
||||
|
||||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Return the text of OCR words joined in reading order.

    The words are passed to ``_group_words_into_lines`` together with the
    given Y-tolerance. The texts within each returned line are joined with
    a single space, and the resulting line strings are joined with a single
    space as well.

    Args:
        words: OCR word dicts; each must provide a ``'text'`` key.
        y_tolerance_px: Vertical tolerance in pixels, forwarded unchanged to
            ``_group_words_into_lines``.

    Returns:
        The joined text, or ``''`` when ``words`` is empty.
    """
    if not words:
        return ''

    grouped = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    # NOTE(review): the left-to-right order inside each line is presumably
    # established by _group_words_into_lines — confirm there.
    return ' '.join(
        ' '.join(word['text'] for word in visual_line)
        for visual_line in grouped
    )
|
||||
|
||||
|
||||
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---

# Flag read by the OCR code to decide whether RapidOCR may be selected.
RAPIDOCR_AVAILABLE = False
# Shared engine instance; stays None until _get_rapid_engine() creates it.
_rapid_engine = None

# rapidocr is an optional dependency: probe for it once at import time and
# only flip the flag when the import actually succeeded.
try:
    from rapidocr import RapidOCR as _RapidOCRClass
except ImportError:
    logger.info("RapidOCR not installed — using Tesseract only")
else:
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
||||
|
||||
|
||||
def _get_rapid_engine():
    """Return the shared RapidOCR engine, creating it on first use.

    The instance is stored in the module-level ``_rapid_engine``; later
    calls return that same object without constructing a new one.
    NOTE(review): constructing the engine reportedly downloads model files
    on first use — confirm against the installed rapidocr version.

    Returns:
        The cached RapidOCR engine instance.

    Raises:
        RuntimeError: If the ``rapidocr`` package could not be imported.
    """
    global _rapid_engine

    if _rapid_engine is not None:
        return _rapid_engine

    if not RAPIDOCR_AVAILABLE:
        # _RapidOCRClass is only bound when the import succeeded; without
        # this guard the constructor call below fails with a bare NameError.
        raise RuntimeError(
            "RapidOCR engine requested but the 'rapidocr' package is not installed"
        )

    _rapid_engine = _RapidOCRClass()
    logger.info("RapidOCR engine initialized")
    return _rapid_engine
|
||||
|
||||
|
||||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on one page region and return Tesseract-style word dicts.

    The region is clamped to the image bounds, cropped, and passed to the
    shared RapidOCR engine. Every non-blank recognized text becomes one
    dict whose ``left``/``top`` are absolute page coordinates.

    Note:
        The engine result is treated as one 4-corner box per detected text
        line, so a single returned dict may contain several words.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on
            color/gray). Must not be ``None``.
        region: Region to crop and OCR; its ``x``, ``y``, ``width``,
            ``height`` and ``type`` attributes are read.

    Returns:
        List of dicts with the keys ``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf`` (0-100) and ``region_type``. The list is empty
        when the clamped crop has no pixels or the engine returns no result.
    """
    engine = _get_rapid_engine()

    # Clamp the region to the image. A negative start index would otherwise
    # wrap around under NumPy slicing and silently crop the wrong area.
    img_h, img_w = img_bgr.shape[:2]
    x0 = min(max(region.x, 0), img_w)
    y0 = min(max(region.y, 0), img_h)
    x1 = min(max(region.x + region.width, x0), img_w)
    y1 = min(max(region.y + region.height, y0), img_h)

    crop = img_bgr[y0:y1, x0:x1]
    if crop.size == 0:
        return []

    result = engine(crop)
    if result is None or result.boxes is None or result.txts is None:
        return []

    words: List[Dict[str, Any]] = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        text = txt.strip() if txt else ''
        if not text:
            continue

        # Reduce the 4 corner points to an axis-aligned bounding box.
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        left = int(min(xs))
        top = int(min(ys))

        words.append({
            'text': text,
            # Box coordinates are relative to the crop; shift them by the
            # (clamped) crop origin to get absolute page coordinates.
            'left': left + x0,
            'top': top + y0,
            'width': int(max(xs) - left),
            'height': int(max(ys) - top),
            # Round instead of truncating so e.g. 0.29 maps to 29, not 28.
            'conf': int(round(score * 100)),  # 0-100 like Tesseract
            'region_type': region.type,
        })

    return words
|
||||
|
||||
|
||||
def build_word_grid(
|
||||
ocr_img: np.ndarray,
|
||||
column_regions: List[PageRegion],
|
||||
@@ -2180,20 +2275,37 @@ def build_word_grid(
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
lang: str = "eng+deu",
|
||||
ocr_engine: str = "auto",
|
||||
img_bgr: Optional[np.ndarray] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Build a word grid by intersecting columns and rows, then OCR each cell.
|
||||
|
||||
Args:
|
||||
ocr_img: Binarized full-page image.
|
||||
ocr_img: Binarized full-page image (for Tesseract).
|
||||
column_regions: Classified columns from Step 3 (PageRegion list).
|
||||
row_geometries: Rows from Step 4 (RowGeometry list).
|
||||
img_w: Image width in pixels.
|
||||
img_h: Image height in pixels.
|
||||
lang: Default Tesseract language.
|
||||
ocr_engine: 'tesseract', 'rapid', or 'auto' (rapid if available, else tesseract).
|
||||
img_bgr: BGR color image (required for RapidOCR).
|
||||
|
||||
Returns:
|
||||
List of entry dicts with english/german/example text and bbox info (percent).
|
||||
"""
|
||||
# Resolve engine choice
|
||||
use_rapid = False
|
||||
if ocr_engine == "auto":
|
||||
use_rapid = RAPIDOCR_AVAILABLE and img_bgr is not None
|
||||
elif ocr_engine == "rapid":
|
||||
if not RAPIDOCR_AVAILABLE:
|
||||
logger.warning("RapidOCR requested but not available, falling back to Tesseract")
|
||||
else:
|
||||
use_rapid = True
|
||||
|
||||
engine_name = "rapid" if use_rapid else "tesseract"
|
||||
logger.info(f"build_word_grid: using OCR engine '{engine_name}'")
|
||||
|
||||
# Filter to content rows only (skip header/footer)
|
||||
content_rows = [r for r in row_geometries if r.row_type == 'content']
|
||||
if not content_rows:
|
||||
@@ -2210,7 +2322,7 @@ def build_word_grid(
|
||||
# Sort columns left-to-right
|
||||
relevant_cols.sort(key=lambda c: c.x)
|
||||
|
||||
# Choose OCR language per column type
|
||||
# Choose OCR language per column type (Tesseract only)
|
||||
lang_map = {
|
||||
'column_en': 'eng',
|
||||
'column_de': 'deu',
|
||||
@@ -2235,6 +2347,7 @@ def build_word_grid(
|
||||
'bbox_en': None,
|
||||
'bbox_de': None,
|
||||
'bbox_ex': None,
|
||||
'ocr_engine': engine_name,
|
||||
}
|
||||
|
||||
confidences: List[float] = []
|
||||
@@ -2263,12 +2376,22 @@ def build_word_grid(
|
||||
width=cell_w, height=cell_h,
|
||||
)
|
||||
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
# OCR the cell
|
||||
if use_rapid:
|
||||
words = ocr_region_rapid(img_bgr, cell_region)
|
||||
else:
|
||||
cell_lang = lang_map.get(col.type, lang)
|
||||
words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=6)
|
||||
|
||||
# Group into lines, then join in reading order (Fix A)
|
||||
# Use half of average word height as Y-tolerance
|
||||
if words:
|
||||
avg_h = sum(w['height'] for w in words) / len(words)
|
||||
y_tol = max(10, int(avg_h * 0.5))
|
||||
else:
|
||||
y_tol = 15
|
||||
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
|
||||
|
||||
# Sort words by Y then X (reading order for multi-line cells)
|
||||
words.sort(key=lambda w: (w['top'], w['left']))
|
||||
text = ' '.join(w['text'] for w in words)
|
||||
if words:
|
||||
avg_conf = sum(w['conf'] for w in words) / len(words)
|
||||
confidences.append(avg_conf)
|
||||
@@ -2300,7 +2423,8 @@ def build_word_grid(
|
||||
entries.append(entry)
|
||||
|
||||
logger.info(f"build_word_grid: {len(entries)} entries from "
|
||||
f"{len(content_rows)} content rows × {len(relevant_cols)} columns")
|
||||
f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
|
||||
f"(engine={engine_name})")
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
Reference in New Issue
Block a user