fix: border strip pre-filter + 3-column detection for vocabulary tables

The border strip filter (Step 4e) used the LARGEST x-gap which incorrectly
removed base words along with edge artifacts. Now uses a two-stage approach:
1. _filter_border_strip_words() pre-filters raw words BEFORE column detection,
   scanning from the page edge inward to find the FIRST significant gap (>30px)
2. Step 4e runs as fallback only when pre-filter didn't apply

Session 4233 now correctly detects 3 columns (base word | oder | synonyms)
instead of 2. The threshold was raised from 15% to 20% to handle pages with
many edge artifacts. All 4 ground-truth sessions pass the regression suite.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 21:01:43 +01:00
parent 4000110501
commit 46c8c28d34
2 changed files with 212 additions and 117 deletions

View File

@@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import time import time
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional, Tuple
import cv2 import cv2
import numpy as np import numpy as np
@@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
# Helpers # Helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
Scans from each page edge inward to find the first significant x-gap
(>30 px). If the edge cluster contains <15 % of total words, those
words are removed as border-strip artifacts (alphabet letters,
illustration fragments).
Must run BEFORE ``_build_zone_grid`` so that column detection only
sees real content words and doesn't produce inflated row counts.
"""
if len(words) < 10:
return words, 0
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
total = len(sorted_words)
# -- Left-edge scan (running max right-edge) --
left_count = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
)
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
left_count = gi + 1
break
# -- Right-edge scan (running min left) --
right_count = 0
running_left = sorted_words[-1].get("left", 0)
for gi in range(total - 1, 0, -1):
running_left = min(running_left, sorted_words[gi].get("left", 0))
prev_right = (
sorted_words[gi - 1].get("left", 0)
+ sorted_words[gi - 1].get("width", 0)
)
if running_left - prev_right > 30:
right_count = total - gi
break
strip_ids: set = set()
if left_count > 0 and left_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[:left_count]}
elif right_count > 0 and right_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[total - right_count :]}
if not strip_ids:
return words, 0
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
def _cluster_columns_by_alignment( def _cluster_columns_by_alignment(
words: List[Dict], words: List[Dict],
zone_w: int, zone_w: int,
@@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
zones_data: List[Dict[str, Any]] = [] zones_data: List[Dict[str, Any]] = []
boxes_detected = 0 boxes_detected = 0
recovered_count = 0 recovered_count = 0
border_prefiltered = False
img_bgr = None img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words) content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
@@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid: filtered %d words inside image overlays from zone %d", "build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index, ov_removed, pz.index,
) )
zone_words, bs_removed = _filter_border_strip_words(zone_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid( grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height, zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h, pz.index, img_w, img_h,
@@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid session %s: filtered %d recovered artifacts (fallback zone)", "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed, session_id, removed,
) )
# Pre-filter border-strip words so column detection is not
# confused by edge artifacts. When this removes words, Step 4e
# is skipped (it would otherwise re-detect content as a "strip").
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid session %s: pre-filtered %d border-strip words",
session_id, bs_removed,
)
grid = _build_zone_grid( grid = _build_zone_grid(
filtered_words, content_x, content_y, content_w, content_h, filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h, 0, img_w, img_h,
@@ -1895,64 +1967,93 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
cell["text"] = cleaned cell["text"] = cleaned
# 4e. Detect and remove page-border decoration strips. # 4e. Detect and remove page-border decoration strips.
# Some textbooks have decorative alphabet strips along the page edge # Skipped when the pre-filter already removed border words BEFORE
# (coloured letters, illustrations). OCR picks up scattered letters # column detection — re-running would incorrectly detect the
# from these as artifacts. Detection: find a significant x-gap # leftmost content column as a "strip".
# (>30 px) between a small cluster of word_boxes near the page edge
# and the main content block.
border_strip_removed = 0 border_strip_removed = 0
for z in zones_data: if border_prefiltered:
cells = z.get("cells", []) logger.info("Step 4e: skipped (border pre-filter already applied)")
if not cells: else:
continue # Some textbooks have decorative alphabet strips along the page
# Collect all word_boxes with their cell reference # edge. OCR picks up scattered letters from these as artifacts.
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell) # Detection: find the first significant x-gap (>30 px) from each
for cell in cells: # page edge between a small cluster (<20 %) and the main content.
for wb in cell.get("word_boxes") or []: for z in zones_data:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) cells = z.get("cells", [])
if len(all_wbs_with_cell) < 10: if not cells:
continue continue
# Sort by x and find the largest gap all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
all_wbs_with_cell.sort(key=lambda t: t[0]) for cell in cells:
best_gap = 0 for wb in cell.get("word_boxes") or []:
best_gap_idx = -1 all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
for gi in range(len(all_wbs_with_cell) - 1): if len(all_wbs_with_cell) < 10:
right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0) continue
gap = all_wbs_with_cell[gi + 1][0] - right_edge all_wbs_with_cell.sort(key=lambda t: t[0])
if gap > best_gap: total = len(all_wbs_with_cell)
best_gap = gap
best_gap_idx = gi # -- Left-edge scan --
if best_gap < 30 or best_gap_idx < 0: left_strip_count = 0
continue left_gap = 0
left_count = best_gap_idx + 1 running_right = 0
right_count = len(all_wbs_with_cell) - left_count for gi in range(total - 1):
total = len(all_wbs_with_cell) running_right = max(
# The border strip is the SMALLER side with < 15% of total running_right,
if left_count < right_count and left_count / total < 0.15: all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
strip_side = "left" )
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]} gap = all_wbs_with_cell[gi + 1][0] - running_right
elif right_count < left_count and right_count / total < 0.15: if gap > 30:
strip_side = "right" left_strip_count = gi + 1
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]} left_gap = gap
else: break
continue
# Remove strip word_boxes from cells # -- Right-edge scan --
for cell in cells: right_strip_count = 0
wbs = cell.get("word_boxes") or [] right_gap = 0
filtered = [wb for wb in wbs if id(wb) not in strip_wbs] running_left = all_wbs_with_cell[-1][0]
if len(filtered) < len(wbs): for gi in range(total - 1, 0, -1):
border_strip_removed += len(wbs) - len(filtered) running_left = min(running_left, all_wbs_with_cell[gi][0])
cell["word_boxes"] = filtered prev_right = (
cell["text"] = _words_to_reading_order_text(filtered) all_wbs_with_cell[gi - 1][0]
# Remove cells that became empty + all_wbs_with_cell[gi - 1][1].get("width", 0)
z["cells"] = [c for c in cells )
if (c.get("word_boxes") or c.get("text", "").strip())] gap = running_left - prev_right
logger.info( if gap > 30:
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d " right_strip_count = total - gi
"(gap=%dpx, strip=%d/%d wbs)", right_gap = gap
border_strip_removed, strip_side, z.get("zone_index", 0), break
best_gap, left_count if strip_side == "left" else right_count, total,
) strip_wbs: set = set()
strip_side = ""
strip_gap = 0
strip_count = 0
if left_strip_count > 0 and left_strip_count / total < 0.20:
strip_side = "left"
strip_count = left_strip_count
strip_gap = left_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
elif right_strip_count > 0 and right_strip_count / total < 0.20:
strip_side = "right"
strip_count = right_strip_count
strip_gap = right_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
if not strip_wbs:
continue
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
if len(filtered) < len(wbs):
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
strip_gap, strip_count, total,
)
# 5. Color annotation on final word_boxes in cells # 5. Color annotation on final word_boxes in cells
if img_bgr is not None: if img_bgr is not None:

View File

@@ -1109,64 +1109,56 @@ class TestBorderStripFilter:
def test_left_border_strip_removed(self): def test_left_border_strip_removed(self):
"""Word_boxes at x<120 with 45px gap to content at x>=179 are removed.""" """Word_boxes at x<120 with 45px gap to content at x>=179 are removed."""
# Simulate border strip (11 wbs) + real content (20 wbs) # Simulate border strip (3 wbs) + base words (7 wbs) + oder (7 wbs)
# + synonyms (20 wbs). The old "largest gap" algorithm would pick
# the 67px gap between base words and "oder", removing base words.
# The new "first gap from edge" algorithm picks the 45px gap between
# border artifacts and base words.
border_wbs = [ border_wbs = [
self._make_wb("M", 49, 436, 46, 44), self._make_wb("M", 49, 436, 46, 44), # right=95
self._make_wb("x", 113, 610, 21, 38), self._make_wb("x", 113, 610, 21, 38), # right=134
self._make_wb("Er", 45, 998, 62, 37), self._make_wb("Er", 45, 998, 62, 37), # right=107
] ]
content_wbs = [] base_wbs = [self._make_wb(f"base{i}", 179, 100 + i * 60, 100, 20) for i in range(7)]
for i in range(20): oder_wbs = [self._make_wb("oder", 379, 100 + i * 60, 68, 20) for i in range(7)]
# Place content words at x=179 and x=280 (gap=1px between them, synonym_wbs = [self._make_wb(f"syn{i}", 452 + (i % 5) * 30, 100 + (i // 5) * 60, 80, 20) for i in range(20)]
# much smaller than the 45px border-to-content gap)
content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40)) all_wbs = border_wbs + base_wbs + oder_wbs + synonym_wbs
# Build zone with cells all_left = sorted([(wb["left"], wb) for wb in all_wbs], key=lambda t: t[0])
cells = []
# Border-only cells
for i, wb in enumerate(border_wbs):
cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
"word_boxes": [wb], "text": wb["text"]})
# Content cells
for i, wb in enumerate(content_wbs):
ri = len(border_wbs) + i
cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
"word_boxes": [wb], "text": wb["text"]})
zone = {"zone_index": 0, "zone_type": "content", "cells": cells,
"columns": [], "rows": []}
# The filter runs inside _build_grid_core, but we can test the
# pattern detection logic: 3 border wbs + 20 content wbs,
# border right edge = 113+21=134, content left = 179, gap = 45px
# 3/23 = 13% < 15% threshold
from cv_ocr_engines import _group_words_into_lines
all_left = sorted(
[(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])],
key=lambda t: t[0]
)
# Find largest gap
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
left_count = best_idx + 1
total = len(all_left) total = len(all_left)
assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
# New algorithm: scan from left edge, find FIRST gap >30px
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# Should find the 45px gap between border (right=134) and base (left=179)
assert left_strip_count == len(border_wbs), (
f"Expected {len(border_wbs)} border wbs, got {left_strip_count}"
)
assert left_strip_count / total < 0.20, (
f"Border ratio {left_strip_count}/{total} should be <20%"
)
def test_no_removal_when_no_gap(self): def test_no_removal_when_no_gap(self):
"""No gap > 30px between word_boxes → nothing removed.""" """No gap > 30px between word_boxes → nothing removed."""
# Words spaced 20px apart with width 50 → overlap, no gap >30px
wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)] wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0]) all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0])
best_gap = 0 running_right = 0
found_gap = False
for gi in range(len(all_left) - 1): for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"] running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - right_edge gap = all_left[gi + 1][0] - running_right
if gap > best_gap: if gap > 30:
best_gap = gap found_gap = True
assert best_gap < 30, f"No significant gap expected, got {best_gap}" break
assert not found_gap, "No significant gap expected"
def test_equal_sides_not_removed(self): def test_equal_sides_not_removed(self):
"""Two roughly equal groups (50/50) are NOT treated as border strip.""" """Two roughly equal groups (50/50) are NOT treated as border strip."""
@@ -1176,15 +1168,17 @@ class TestBorderStripFilter:
[(wb["left"], wb) for wb in left_wbs + right_wbs], [(wb["left"], wb) for wb in left_wbs + right_wbs],
key=lambda t: t[0] key=lambda t: t[0]
) )
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
left_count = best_idx + 1
total = len(all_left) total = len(all_left)
# Left scan: first gap >30px from left
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# 10/20 = 50% — NOT below 15% threshold, so no removal # 10/20 = 50% — NOT below 15% threshold, so no removal
assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal" assert left_strip_count == 0 or left_strip_count / total >= 0.20, (
"Equal groups should NOT trigger border removal"
)