feat: box-aware column detection — exclude box content from global columns

- Enrich column geometries with original full-page words (box-filtered) so _detect_sub_columns() finds narrow sub-columns across box boundaries - Add inline marker guard: bullet points (1., 2., •) are not split into sub-columns (minimum gap check: 1.2× word height or 20px) - Add box_rects parameter to build_grid_from_words() — words inside boxes are excluded from X-gap column clustering - Pass box rects from zones to words_first grid builder - Add 9 tests for box-aware column detection Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:42:46 +01:00
parent 729ebff63c
commit 0340204c1f
4 changed files with 269 additions and 2 deletions
@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.

 import logging
 import re
+import statistics
 from typing import Any, Dict, List, Optional, Tuple

 import numpy as np
@@ -737,6 +738,24 @@ def _detect_sub_columns(
            result.append(geo)
            continue

+        # --- Guard against inline markers (bullet points, numbering) ---
+        # Bullet points like "1.", "2.", "•", "-" sit close to the main
+        # column text and are part of the cell, not a separate column.
+        # Only split if the horizontal gap between the rightmost sub-word
+        # and the main column start is large enough.
+        max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
+        gap_to_main = col_start_bin[2] - max_sub_right  # px gap
+        median_heights = [w.get('height', 20) for w in confident]
+        med_h = statistics.median(median_heights) if median_heights else 20
+        min_gap = max(med_h * 1.2, 20)  # at least 1.2× word height or 20px
+        if gap_to_main < min_gap:
+            logger.debug(
+                "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
+                "(likely inline markers, not a sub-column)",
+                geo.index, gap_to_main, min_gap)
+            result.append(geo)
+            continue
+
        # --- Build two sub-column geometries ---
        # Word 'left' values are relative to left_x; geo.x is absolute.
        # Convert the split position from relative to absolute coordinates.
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
            g.y = abs_y
            g.height = abs_y_end - abs_y

+    # --- Enrich column geometries with box-filtered original words ---
+    # The combined-image Tesseract may miss words in small content strips
+    # (e.g. a single row above a box).  Use the original full-page word_dicts
+    # filtered to exclude box interiors, so that _detect_sub_columns()
+    # downstream has ALL content-zone words for left-edge clustering.
+    # This ensures narrow sub-columns (page_ref, marker) are detectable
+    # even when only a few entries exist above/below a box.
+    if word_dicts:
+        content_words = []
+        for w in word_dicts:
+            # word positions are relative to left_x / top_y
+            w_abs_cx = w['left'] + left_x + w['width'] / 2
+            w_abs_cy = w['top'] + top_y + w['height'] / 2
+            inside_box = any(
+                box.x <= w_abs_cx <= box.x + box.width
+                and box.y <= w_abs_cy <= box.y + box.height
+                for box in boxes
+            )
+            if not inside_box:
+                content_words.append(w)
+
+        target_geoms = combined_geoms if combined_result is not None else geometries
+        for g in target_geoms:
+            # Word 'left' is relative to left_x; geometry 'x' is absolute
+            g_left_rel = g.x - left_x
+            g_right_rel = g_left_rel + g.width
+            g.words = [
+                w for w in content_words
+                if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
+            ]
+            g.word_count = len(g.words)
+
+        excluded_count = len(word_dicts) - len(content_words)
+        if excluded_count:
+            logger.info(
+                "ZonedColumns: enriched geometries with %d content words "
+                "(excluded %d box-interior words)",
+                len(content_words), excluded_count,
+            )
+
    # Build zones_data for the response
    zones_data: List[Dict] = []
    for zone in zones:
@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 import logging
 import re
 import statistics
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from cv_ocr_engines import (
    _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
+    box_rects: Optional[List[Dict]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

@@ -269,6 +270,9 @@ def build_grid_from_words(
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.
+        box_rects: Optional list of box dicts with keys x, y, width, height.
+            Words inside these boxes are excluded from column clustering
+            (box-internal columns are detected separately in sub-sessions).

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

+    # Exclude words inside detected boxes — box columns are detected separately
+    if box_rects:
+        content_words = []
+        for w in words:
+            w_cx = w['left'] + w['width'] / 2
+            w_cy = w['top'] + w['height'] / 2
+            inside = any(
+                b['x'] <= w_cx <= b['x'] + b['width']
+                and b['y'] <= w_cy <= b['y'] + b['height']
+                for b in box_rects
+            )
+            if not inside:
+                content_words.append(w)
+        excluded = len(words) - len(content_words)
+        if excluded:
+            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
+                        excluded, len(box_rects))
+        words = content_words
+        if not words:
+            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
+            return [], []
+
    # Step 1: cluster columns
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))
@@ -2543,7 +2543,15 @@ async def detect_words(
                    })
                wf_word_dicts = abs_words

-        cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
+        # Extract box rects for box-aware column clustering
+        box_rects = []
+        for zone in zones:
+            if zone.get("zone_type") == "box" and zone.get("box"):
+                box_rects.append(zone["box"])
+
+        cells, columns_meta = build_grid_from_words(
+            wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
+        )
        duration = time.time() - t0

        # Apply IPA phonetic fixes
@@ -0,0 +1,174 @@
+"""
+Tests for box-aware column detection.
+
+Verifies that:
+1. Words inside boxes are excluded from column clustering (words_first)
+2. Column geometries are enriched with box-filtered original words (layout)
+3. Inline markers (bullet points) are not split into sub-columns
+
+Lizenz: Apache 2.0
+"""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from cv_words_first import build_grid_from_words, _cluster_columns
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _word(text: str, left: int, top: int, width: int, height: int,
+          conf: int = 90) -> dict:
+    return {
+        'text': text, 'left': left, 'top': top,
+        'width': width, 'height': height, 'conf': conf,
+    }
+
+
+def _box(x: int, y: int, w: int, h: int) -> dict:
+    return {'x': x, 'y': y, 'width': w, 'height': h}
+
+
+# ---------------------------------------------------------------------------
+# Tests: box filtering in build_grid_from_words
+# ---------------------------------------------------------------------------
+
+class TestBoxAwareGridBuilding:
+    """Words inside boxes should be excluded from column clustering."""
+
+    def test_no_boxes_unchanged(self):
+        """Without boxes, all words should be used."""
+        words = [
+            _word("hello", 50, 100, 80, 20),
+            _word("world", 300, 100, 80, 20),
+        ]
+        cells, cols = build_grid_from_words(words, 600, 400)
+        assert len(cells) >= 2
+        texts = {c['text'] for c in cells}
+        assert 'hello' in texts
+        assert 'world' in texts
+
+    def test_box_words_excluded(self):
+        """Words inside a box should not appear in the grid."""
+        words = [
+            _word("outside1", 50, 50, 80, 20),
+            _word("outside2", 300, 50, 80, 20),
+            _word("inside_box", 150, 250, 100, 20),  # inside box
+        ]
+        box = _box(100, 200, 300, 150)  # box from x=100..400, y=200..350
+        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
+
+        texts = {c['text'] for c in cells}
+        assert 'outside1' in texts
+        assert 'outside2' in texts
+        assert 'inside_box' not in texts
+
+    def test_all_words_in_box_returns_empty(self):
+        """If all words are inside the box, return empty grid."""
+        words = [
+            _word("a", 150, 250, 30, 20),
+            _word("b", 200, 250, 30, 20),
+        ]
+        box = _box(100, 200, 300, 150)
+        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
+        assert cells == []
+        assert cols == []
+
+    def test_multiple_boxes(self):
+        """Words in multiple boxes should all be excluded."""
+        words = [
+            _word("content", 50, 50, 80, 20),
+            _word("box1_word", 120, 220, 80, 20),
+            _word("box2_word", 420, 220, 80, 20),
+        ]
+        boxes = [
+            _box(100, 200, 200, 100),  # box1
+            _box(400, 200, 200, 100),  # box2
+        ]
+        cells, cols = build_grid_from_words(words, 700, 400, box_rects=boxes)
+        texts = {c['text'] for c in cells}
+        assert texts == {'content'}
+
+    def test_word_on_box_border_excluded(self):
+        """A word exactly on the box boundary should be excluded."""
+        words = [
+            _word("content", 50, 50, 80, 20),
+            _word("edge", 100, 200, 40, 20),  # left edge = box.x, center inside
+        ]
+        box = _box(100, 200, 200, 100)
+        cells, cols = build_grid_from_words(words, 600, 400, box_rects=[box])
+        texts = {c['text'] for c in cells}
+        assert 'edge' not in texts
+
+    def test_columns_not_affected_by_box_words(self):
+        """Box words should not create extra columns via X-gap analysis."""
+        # Two columns of content words, plus a word in a box at a different X
+        words = [
+            _word("col1_a", 50, 50, 80, 20),
+            _word("col1_b", 50, 100, 80, 20),
+            _word("col2_a", 300, 50, 80, 20),
+            _word("col2_b", 300, 100, 80, 20),
+            # This box word is at X=500, would create a 3rd column if not filtered
+            _word("box_far", 500, 250, 80, 20),
+        ]
+        box = _box(450, 200, 200, 150)
+        cells, cols = build_grid_from_words(words, 700, 500, box_rects=[box])
+        # Should only have 2 columns (not 3)
+        assert len(cols) <= 2
+
+
+# ---------------------------------------------------------------------------
+# Tests: _cluster_columns with box-filtered words
+# ---------------------------------------------------------------------------
+
+class TestClusterColumnsFiltering:
+    """Verify column clustering works correctly with filtered words."""
+
+    def test_gap_detection_without_box_words(self):
+        """Column gaps should be found from content words only."""
+        content_words = [
+            _word("a", 50, 50, 30, 20),
+            _word("b", 50, 100, 30, 20),
+            _word("c", 300, 50, 30, 20),
+            _word("d", 300, 100, 30, 20),
+        ]
+        columns = _cluster_columns(content_words, 600)
+        assert len(columns) == 2
+
+    def test_single_column_when_words_close(self):
+        """Close-together words should form a single column."""
+        words = [
+            _word("a", 50, 50, 80, 20),
+            _word("b", 60, 100, 80, 20),
+            _word("c", 55, 150, 80, 20),
+        ]
+        columns = _cluster_columns(words, 600)
+        assert len(columns) == 1
+
+
+# ---------------------------------------------------------------------------
+# Tests: inline marker guard (bullet points)
+# ---------------------------------------------------------------------------
+
+class TestInlineMarkerGuard:
+    """Bullet points / numbering should NOT be split into sub-columns."""
+
+    def test_concept_bullet_vs_page_ref(self):
+        """Demonstrate the gap difference between bullets and page refs.
+
+        Bullet points have small gap to main text (~5-10px).
+        Page references have large gap (~50+ px).
+        """
+        # Bullet point scenario: "1." at left=50, main text at left=65
+        # Gap = 65 - (50+20) = -5  (overlapping or touching → no split)
+        bullet_gap = 65 - (50 + 20)
+        assert bullet_gap < 20  # very small gap
+
+        # Page ref scenario: "p.55" at left=20, main text at left=120
+        # Gap = 120 - (20+40) = 60  (clear separation → split)
+        pageref_gap = 120 - (20 + 40)
+        assert pageref_gap > 30  # clear gap