fix(ocr-pipeline): exclusive word-to-column assignment prevents duplicates

Replace per-cell word filtering (which allowed the same word to appear in
multiple columns due to padded overlap) with exclusive nearest-center
assignment. Each word is assigned to exactly one column per row.

Also use row height as Y-tolerance for text assembly so words within
the same row (e.g. "Maus, Mäuse") are always grouped on one line.

Fixes: words leaking into wrong columns, missing words, duplicate words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 07:54:45 +01:00
parent 9bbde1c03e
commit 2c4160e4c4

View File

@@ -3009,46 +3009,52 @@ def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str
return _PHONETIC_BRACKET_RE.sub(replacer, text) return _PHONETIC_BRACKET_RE.sub(replacer, text)
def _lookup_cell_words( def _assign_row_words_to_columns(
row: RowGeometry, row: RowGeometry,
col: PageRegion, columns: List[PageRegion],
pad: int = 8, ) -> Dict[int, List[Dict]]:
) -> Tuple[List[Dict], float]: """Assign each word in a row to exactly one column (nearest center).
"""Look up pre-existing Tesseract words that fall within a cell region.
Instead of re-running OCR on a cell crop, this filters the full-page This prevents the same word from appearing in multiple cells when column
Tesseract words (stored in row.words) by X-overlap with the column. boundaries are close together. Each word is assigned to the column whose
horizontal center is closest to the word's horizontal center.
Words use coordinates relative to the content ROI; columns use absolute Args:
coordinates. row.x equals the content-ROI left_x, so we convert with: row: Row with words (relative coordinates).
col_left_rel = col.x - row.x columns: Sorted list of columns (absolute coordinates).
Returns: Returns:
(words_in_cell, avg_confidence) where words_in_cell keep their Dict mapping col_index → list of words assigned to that column.
original relative coordinates (compatible with
_words_to_reading_order_text).
""" """
if not row.words: result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
return [], 0.0
left_x = row.x # content ROI offset (absolute) if not row.words or not columns:
col_left_rel = col.x - left_x - pad return result
col_right_rel = col.x - left_x + col.width + pad
left_x = row.x # content ROI left (absolute)
# Pre-compute column centers in relative coordinates
col_centers_rel = []
for col in columns:
col_left_rel = col.x - left_x
col_center_rel = col_left_rel + col.width / 2
col_centers_rel.append(col_center_rel)
words_in_cell = []
for w in row.words: for w in row.words:
w_left = w['left'] w_center_x = w['left'] + w['width'] / 2
w_right = w_left + w['width']
# Word center must be within column bounds
w_center_x = (w_left + w_right) / 2
if col_left_rel <= w_center_x <= col_right_rel:
words_in_cell.append(w)
avg_conf = 0.0 # Find nearest column by center distance
if words_in_cell: best_col = 0
avg_conf = round(sum(w['conf'] for w in words_in_cell) / len(words_in_cell), 1) best_dist = abs(w_center_x - col_centers_rel[0])
for ci in range(1, len(columns)):
dist = abs(w_center_x - col_centers_rel[ci])
if dist < best_dist:
best_dist = dist
best_col = ci
return words_in_cell, avg_conf result[best_col].append(w)
return result
def _ocr_single_cell( def _ocr_single_cell(
@@ -3064,6 +3070,7 @@ def _ocr_single_cell(
engine_name: str, engine_name: str,
lang: str, lang: str,
lang_map: Dict[str, str], lang_map: Dict[str, str],
preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Populate a single cell (column x row intersection) via word lookup.""" """Populate a single cell (column x row intersection) via word lookup."""
pad = 8 # pixels pad = 8 # pixels
@@ -3096,19 +3103,21 @@ def _ocr_single_cell(
'ocr_engine': 'word_lookup', 'ocr_engine': 'word_lookup',
} }
# --- PRIMARY: Word-lookup from full-page Tesseract --- # Use pre-assigned words (exclusive per column) if provided
# Use pre-existing words from row.words (Step 4) instead of words = preassigned_words if preassigned_words is not None else []
# re-running OCR on a small crop. This is more reliable because
# full-page Tesseract has better context for recognition.
words, avg_conf = _lookup_cell_words(row, col, pad=pad)
if words: if words:
avg_h = sum(w['height'] for w in words) / len(words) # Use row height as Y-tolerance so all words within a single row
y_tol = max(10, int(avg_h * 0.5)) # are grouped onto one line (avoids splitting e.g. "Maus, Mäuse"
# across two lines due to slight vertical offset).
y_tol = max(15, row.height)
text = _words_to_reading_order_text(words, y_tolerance_px=y_tol) text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
else: else:
text = '' text = ''
avg_conf = 0.0
avg_conf = 0.0
if words:
avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
return { return {
'cell_id': f"R{row_idx:02d}_C{col_idx}", 'cell_id': f"R{row_idx:02d}_C{col_idx}",
@@ -3218,11 +3227,14 @@ def build_cell_grid(
cells: List[Dict[str, Any]] = [] cells: List[Dict[str, Any]] = []
for row_idx, row in enumerate(content_rows): for row_idx, row in enumerate(content_rows):
# Pre-assign each word to exactly one column (nearest center)
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols): for col_idx, col in enumerate(relevant_cols):
cell = _ocr_single_cell( cell = _ocr_single_cell(
row_idx, col_idx, row, col, row_idx, col_idx, row, col,
ocr_img, img_bgr, img_w, img_h, ocr_img, img_bgr, img_w, img_h,
use_rapid, engine_name, lang, lang_map, use_rapid, engine_name, lang, lang_map,
preassigned_words=col_words[col_idx],
) )
cells.append(cell) cells.append(cell)
@@ -3300,11 +3312,14 @@ def build_cell_grid_streaming(
total_cells = len(content_rows) * len(relevant_cols) total_cells = len(content_rows) * len(relevant_cols)
for row_idx, row in enumerate(content_rows): for row_idx, row in enumerate(content_rows):
# Pre-assign each word to exactly one column (nearest center)
col_words = _assign_row_words_to_columns(row, relevant_cols)
for col_idx, col in enumerate(relevant_cols): for col_idx, col in enumerate(relevant_cols):
cell = _ocr_single_cell( cell = _ocr_single_cell(
row_idx, col_idx, row, col, row_idx, col_idx, row, col,
ocr_img, img_bgr, img_w, img_h, ocr_img, img_bgr, img_w, img_h,
use_rapid, engine_name, lang, lang_map, use_rapid, engine_name, lang, lang_map,
preassigned_words=col_words[col_idx],
) )
yield cell, columns_meta, total_cells yield cell, columns_meta, total_cells