fix(ocr-pipeline): overlap-based word assignment and empty row filtering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 1m14s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 17s

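1. Word-to-column assignment now uses overlap-based matching as the
   primary strategy instead of center-point matching: each word goes to
   the column it overlaps most. Center-point range matching and
   nearest-column-center are kept as fallbacks for words that overlap
   no column. This fixes narrow page_ref columns losing their last
   digit (e.g. "p.59" → "p.5") when the digit's center falls slightly
   past the midpoint boundary into the next column.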

2. Empty row filter: after OCR, rows where ALL cells have empty or
   whitespace-only text are removed. This catches inter-row gaps that
   had stray Tesseract artifacts giving word_count > 0 but no actual
   content. The filter is applied in both build_cell_grid and
   _word_stream_generator.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-03 11:00:29 +01:00
parent ccba2bb887
commit 606bef0591
2 changed files with 61 additions and 19 deletions

View File

@@ -3519,28 +3519,46 @@ def _assign_row_words_to_columns(
col_ranges_rel.append((assign_left, assign_right))
for w in row.words:
w_center_x = w['left'] + w['width'] / 2
w_left = w['left']
w_right = w_left + w['width']
w_center_x = w_left + w['width'] / 2
# Find which column range contains this word
assigned = False
for ci, (al, ar) in enumerate(col_ranges_rel):
if al <= w_center_x < ar:
result[ci].append(w)
assigned = True
break
# Primary: overlap-based matching — assign to column with most overlap.
# This is more robust than center-based for narrow columns (page_ref)
# where the last character's center may fall into the next column.
best_col = -1
best_overlap = 0
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
if overlap > best_overlap:
best_overlap = overlap
best_col = ci
if not assigned:
# Fallback: nearest column center
best_col = 0
col_left_0 = columns[0].x - left_x
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
for ci in range(1, n):
col_left = columns[ci].x - left_x
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
if dist < best_dist:
best_dist = dist
best_col = ci
if best_col >= 0 and best_overlap > 0:
result[best_col].append(w)
else:
# Fallback: center-based range matching
assigned = False
for ci, (al, ar) in enumerate(col_ranges_rel):
if al <= w_center_x < ar:
result[ci].append(w)
assigned = True
break
if not assigned:
# Last resort: nearest column center
best_col = 0
col_left_0 = columns[0].x - left_x
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
for ci in range(1, n):
col_left = columns[ci].x - left_x
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
if dist < best_dist:
best_dist = dist
best_col = ci
result[best_col].append(w)
return result
@@ -4115,6 +4133,18 @@ def build_cell_grid(
f"empty cells in column {col_idx}"
)
# Post-OCR: remove rows where ALL cells are empty (inter-row gaps
# that had stray Tesseract artifacts giving word_count > 0).
rows_with_text: set = set()
for cell in cells:
if cell['text'].strip():
rows_with_text.add(cell['row_index'])
before_filter = len(cells)
cells = [c for c in cells if c['row_index'] in rows_with_text]
empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
if empty_rows_removed > 0:
logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")
logger.info(f"build_cell_grid: {len(cells)} cells from "
f"{len(content_rows)} rows × {len(relevant_cols)} columns, "
f"engine={engine_name}")

View File

@@ -1291,6 +1291,18 @@ async def _word_stream_generator(
if columns_meta is None:
columns_meta = []
# Post-OCR: remove rows where ALL cells are empty (inter-row gaps
# that had stray Tesseract artifacts giving word_count > 0).
rows_with_text: set = set()
for c in all_cells:
if c.get("text", "").strip():
rows_with_text.add(c["row_index"])
before_filter = len(all_cells)
all_cells = [c for c in all_cells if c["row_index"] in rows_with_text]
empty_rows_removed = (before_filter - len(all_cells)) // max(n_cols, 1)
if empty_rows_removed > 0:
logger.info(f"SSE: removed {empty_rows_removed} all-empty rows after OCR")
used_engine = all_cells[0].get("ocr_engine", "tesseract") if all_cells else engine
word_result = {