fix: border strip pre-filter + 3-column detection for vocabulary tables

The border strip filter (Step 4e) used the LARGEST x-gap which incorrectly
removed base words along with edge artifacts. Now uses a two-stage approach:
1. _filter_border_strip_words() pre-filters raw words BEFORE column detection,
   scanning from the page edge inward to find the FIRST significant gap (>30px)
2. Step 4e runs as fallback only when pre-filter didn't apply

Session 4233 now correctly detects 3 columns (base word | oder | synonyms)
instead of 2. The threshold was raised from 15% to 20% to handle pages with
many edge artifacts. All 4 ground-truth sessions pass the regression suite.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 21:01:43 +01:00
parent 4000110501
commit 46c8c28d34
2 changed files with 212 additions and 117 deletions

View File

@@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import time import time
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional, Tuple
import cv2 import cv2
import numpy as np import numpy as np
@@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
# Helpers # Helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
Scans from each page edge inward to find the first significant x-gap
(>30 px). If the edge cluster contains <15 % of total words, those
words are removed as border-strip artifacts (alphabet letters,
illustration fragments).
Must run BEFORE ``_build_zone_grid`` so that column detection only
sees real content words and doesn't produce inflated row counts.
"""
if len(words) < 10:
return words, 0
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
total = len(sorted_words)
# -- Left-edge scan (running max right-edge) --
left_count = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
)
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
left_count = gi + 1
break
# -- Right-edge scan (running min left) --
right_count = 0
running_left = sorted_words[-1].get("left", 0)
for gi in range(total - 1, 0, -1):
running_left = min(running_left, sorted_words[gi].get("left", 0))
prev_right = (
sorted_words[gi - 1].get("left", 0)
+ sorted_words[gi - 1].get("width", 0)
)
if running_left - prev_right > 30:
right_count = total - gi
break
strip_ids: set = set()
if left_count > 0 and left_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[:left_count]}
elif right_count > 0 and right_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[total - right_count :]}
if not strip_ids:
return words, 0
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
def _cluster_columns_by_alignment( def _cluster_columns_by_alignment(
words: List[Dict], words: List[Dict],
zone_w: int, zone_w: int,
@@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
zones_data: List[Dict[str, Any]] = [] zones_data: List[Dict[str, Any]] = []
boxes_detected = 0 boxes_detected = 0
recovered_count = 0 recovered_count = 0
border_prefiltered = False
img_bgr = None img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words) content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
@@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid: filtered %d words inside image overlays from zone %d", "build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index, ov_removed, pz.index,
) )
zone_words, bs_removed = _filter_border_strip_words(zone_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid( grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height, zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h, pz.index, img_w, img_h,
@@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid session %s: filtered %d recovered artifacts (fallback zone)", "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed, session_id, removed,
) )
# Pre-filter border-strip words so column detection is not
# confused by edge artifacts. When this removes words, Step 4e
# is skipped (it would otherwise re-detect content as a "strip").
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid session %s: pre-filtered %d border-strip words",
session_id, bs_removed,
)
grid = _build_zone_grid( grid = _build_zone_grid(
filtered_words, content_x, content_y, content_w, content_h, filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h, 0, img_w, img_h,
@@ -1895,64 +1967,93 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
cell["text"] = cleaned cell["text"] = cleaned
# 4e. Detect and remove page-border decoration strips. # 4e. Detect and remove page-border decoration strips.
# Some textbooks have decorative alphabet strips along the page edge # Skipped when the pre-filter already removed border words BEFORE
# (coloured letters, illustrations). OCR picks up scattered letters # column detection — re-running would incorrectly detect the
# from these as artifacts. Detection: find a significant x-gap # leftmost content column as a "strip".
# (>30 px) between a small cluster of word_boxes near the page edge
# and the main content block.
border_strip_removed = 0 border_strip_removed = 0
for z in zones_data: if border_prefiltered:
cells = z.get("cells", []) logger.info("Step 4e: skipped (border pre-filter already applied)")
if not cells: else:
continue # Some textbooks have decorative alphabet strips along the page
# Collect all word_boxes with their cell reference # edge. OCR picks up scattered letters from these as artifacts.
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) # Detection: find the first significant x-gap (>30 px) from each
for cell in cells: # page edge between a small cluster (<20 %) and the main content.
for wb in cell.get("word_boxes") or []: for z in zones_data:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) cells = z.get("cells", [])
if len(all_wbs_with_cell) < 10: if not cells:
continue continue
# Sort by x and find the largest gap all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
all_wbs_with_cell.sort(key=lambda t: t[0]) for cell in cells:
best_gap = 0 for wb in cell.get("word_boxes") or []:
best_gap_idx = -1 all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
for gi in range(len(all_wbs_with_cell) - 1): if len(all_wbs_with_cell) < 10:
right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0) continue
gap = all_wbs_with_cell[gi + 1][0] - right_edge all_wbs_with_cell.sort(key=lambda t: t[0])
if gap > best_gap: total = len(all_wbs_with_cell)
best_gap = gap
best_gap_idx = gi # -- Left-edge scan --
if best_gap < 30 or best_gap_idx < 0: left_strip_count = 0
continue left_gap = 0
left_count = best_gap_idx + 1 running_right = 0
right_count = len(all_wbs_with_cell) - left_count for gi in range(total - 1):
total = len(all_wbs_with_cell) running_right = max(
# The border strip is the SMALLER side with < 15% of total running_right,
if left_count < right_count and left_count / total < 0.15: all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
strip_side = "left" )
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]} gap = all_wbs_with_cell[gi + 1][0] - running_right
elif right_count < left_count and right_count / total < 0.15: if gap > 30:
strip_side = "right" left_strip_count = gi + 1
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]} left_gap = gap
else: break
continue
# Remove strip word_boxes from cells # -- Right-edge scan --
for cell in cells: right_strip_count = 0
wbs = cell.get("word_boxes") or [] right_gap = 0
filtered = [wb for wb in wbs if id(wb) not in strip_wbs] running_left = all_wbs_with_cell[-1][0]
if len(filtered) < len(wbs): for gi in range(total - 1, 0, -1):
border_strip_removed += len(wbs) - len(filtered) running_left = min(running_left, all_wbs_with_cell[gi][0])
cell["word_boxes"] = filtered prev_right = (
cell["text"] = _words_to_reading_order_text(filtered) all_wbs_with_cell[gi - 1][0]
# Remove cells that became empty + all_wbs_with_cell[gi - 1][1].get("width", 0)
z["cells"] = [c for c in cells )
if (c.get("word_boxes") or c.get("text", "").strip())] gap = running_left - prev_right
logger.info( if gap > 30:
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d " right_strip_count = total - gi
"(gap=%dpx, strip=%d/%d wbs)", right_gap = gap
border_strip_removed, strip_side, z.get("zone_index", 0), break
best_gap, left_count if strip_side == "left" else right_count, total,
) strip_wbs: set = set()
strip_side = ""
strip_gap = 0
strip_count = 0
if left_strip_count > 0 and left_strip_count / total < 0.20:
strip_side = "left"
strip_count = left_strip_count
strip_gap = left_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
elif right_strip_count > 0 and right_strip_count / total < 0.20:
strip_side = "right"
strip_count = right_strip_count
strip_gap = right_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
if not strip_wbs:
continue
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
if len(filtered) < len(wbs):
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
strip_gap, strip_count, total,
)
# 5. Color annotation on final word_boxes in cells # 5. Color annotation on final word_boxes in cells
if img_bgr is not None: if img_bgr is not None:

View File

@@ -1109,64 +1109,56 @@ class TestBorderStripFilter:
def test_left_border_strip_removed(self): def test_left_border_strip_removed(self):
"""Word_boxes at x<120 with 45px gap to content at x>=179 are removed.""" """Word_boxes at x<120 with 45px gap to content at x>=179 are removed."""
# Simulate border strip (11 wbs) + real content (20 wbs) # Simulate border strip (3 wbs) + base words (7 wbs) + oder (7 wbs)
# + synonyms (20 wbs). The old "largest gap" algorithm would pick
# the 67px gap between base words and "oder", removing base words.
# The new "first gap from edge" algorithm picks the 45px gap between
# border artifacts and base words.
border_wbs = [ border_wbs = [
self._make_wb("M", 49, 436, 46, 44), self._make_wb("M", 49, 436, 46, 44), # right=95
self._make_wb("x", 113, 610, 21, 38), self._make_wb("x", 113, 610, 21, 38), # right=134
self._make_wb("Er", 45, 998, 62, 37), self._make_wb("Er", 45, 998, 62, 37), # right=107
] ]
content_wbs = [] base_wbs = [self._make_wb(f"base{i}", 179, 100 + i * 60, 100, 20) for i in range(7)]
for i in range(20): oder_wbs = [self._make_wb("oder", 379, 100 + i * 60, 68, 20) for i in range(7)]
# Place content words at x=179 and x=280 (gap=1px between them, synonym_wbs = [self._make_wb(f"syn{i}", 452 + (i % 5) * 30, 100 + (i // 5) * 60, 80, 20) for i in range(20)]
# much smaller than the 45px border-to-content gap)
content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40)) all_wbs = border_wbs + base_wbs + oder_wbs + synonym_wbs
# Build zone with cells all_left = sorted([(wb["left"], wb) for wb in all_wbs], key=lambda t: t[0])
cells = []
# Border-only cells
for i, wb in enumerate(border_wbs):
cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
"word_boxes": [wb], "text": wb["text"]})
# Content cells
for i, wb in enumerate(content_wbs):
ri = len(border_wbs) + i
cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
"word_boxes": [wb], "text": wb["text"]})
zone = {"zone_index": 0, "zone_type": "content", "cells": cells,
"columns": [], "rows": []}
# The filter runs inside _build_grid_core, but we can test the
# pattern detection logic: 3 border wbs + 20 content wbs,
# border right edge = 113+21=134, content left = 179, gap = 45px
# 3/23 = 13% < 15% threshold
from cv_ocr_engines import _group_words_into_lines
all_left = sorted(
[(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])],
key=lambda t: t[0]
)
# Find largest gap
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
left_count = best_idx + 1
total = len(all_left) total = len(all_left)
assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
# New algorithm: scan from left edge, find FIRST gap >30px
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# Should find the 45px gap between border (right=134) and base (left=179)
assert left_strip_count == len(border_wbs), (
f"Expected {len(border_wbs)} border wbs, got {left_strip_count}"
)
assert left_strip_count / total < 0.20, (
f"Border ratio {left_strip_count}/{total} should be <20%"
)
def test_no_removal_when_no_gap(self): def test_no_removal_when_no_gap(self):
"""No gap > 30px between word_boxes → nothing removed.""" """No gap > 30px between word_boxes → nothing removed."""
# Words spaced 20px apart with width 50 → overlap, no gap >30px
wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)] wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0]) all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0])
best_gap = 0 running_right = 0
found_gap = False
for gi in range(len(all_left) - 1): for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"] running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - right_edge gap = all_left[gi + 1][0] - running_right
if gap > best_gap: if gap > 30:
best_gap = gap found_gap = True
assert best_gap < 30, f"No significant gap expected, got {best_gap}" break
assert not found_gap, "No significant gap expected"
def test_equal_sides_not_removed(self): def test_equal_sides_not_removed(self):
"""Two roughly equal groups (50/50) are NOT treated as border strip.""" """Two roughly equal groups (50/50) are NOT treated as border strip."""
@@ -1176,15 +1168,17 @@ class TestBorderStripFilter:
[(wb["left"], wb) for wb in left_wbs + right_wbs], [(wb["left"], wb) for wb in left_wbs + right_wbs],
key=lambda t: t[0] key=lambda t: t[0]
) )
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
left_count = best_idx + 1
total = len(all_left) total = len(all_left)
# Left scan: first gap >30px from left
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# 10/20 = 50% — NOT below 15% threshold, so no removal # 10/20 = 50% — NOT below 15% threshold, so no removal
assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal" assert left_strip_count == 0 or left_strip_count / total >= 0.20, (
"Equal groups should NOT trigger border removal"
)