feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s

- Enrich column geometries with original full-page words (box-filtered)
  so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: inline markers (1., 2., •) are not split into
  sub-columns unless the gap to the main column start is at least
  max(1.2× median word height, 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes
  are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 18:42:46 +01:00
parent 729ebff63c
commit 0340204c1f
4 changed files with 269 additions and 2 deletions

View File

@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
@@ -737,6 +738,24 @@ def _detect_sub_columns(
result.append(geo) result.append(geo)
continue continue
# --- Guard against inline markers (bullet points, numbering) ---
# Bullet points like "1.", "2.", "•", "-" sit close to the main
# column text and are part of the cell, not a separate column.
# Only split if the horizontal gap between the rightmost sub-word
# and the main column start is large enough.
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
gap_to_main = col_start_bin[2] - max_sub_right # px gap
median_heights = [w.get('height', 20) for w in confident]
med_h = statistics.median(median_heights) if median_heights else 20
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
if gap_to_main < min_gap:
logger.debug(
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
"(likely inline markers, not a sub-column)",
geo.index, gap_to_main, min_gap)
result.append(geo)
continue
# --- Build two sub-column geometries --- # --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute. # Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates. # Convert the split position from relative to absolute coordinates.
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
g.y = abs_y g.y = abs_y
g.height = abs_y_end - abs_y g.height = abs_y_end - abs_y
# --- Enrich column geometries with box-filtered original words ---
# The combined-image Tesseract may miss words in small content strips
# (e.g. a single row above a box). Use the original full-page word_dicts
# filtered to exclude box interiors, so that _detect_sub_columns()
# downstream has ALL content-zone words for left-edge clustering.
# This ensures narrow sub-columns (page_ref, marker) are detectable
# even when only a few entries exist above/below a box.
if word_dicts:
content_words = []
for w in word_dicts:
# word positions are relative to left_x / top_y
w_abs_cx = w['left'] + left_x + w['width'] / 2
w_abs_cy = w['top'] + top_y + w['height'] / 2
inside_box = any(
box.x <= w_abs_cx <= box.x + box.width
and box.y <= w_abs_cy <= box.y + box.height
for box in boxes
)
if not inside_box:
content_words.append(w)
target_geoms = combined_geoms if combined_result is not None else geometries
for g in target_geoms:
# Word 'left' is relative to left_x; geometry 'x' is absolute
g_left_rel = g.x - left_x
g_right_rel = g_left_rel + g.width
g.words = [
w for w in content_words
if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
]
g.word_count = len(g.words)
excluded_count = len(word_dicts) - len(content_words)
if excluded_count:
logger.info(
"ZonedColumns: enriched geometries with %d content words "
"(excluded %d box-interior words)",
len(content_words), excluded_count,
)
# Build zones_data for the response # Build zones_data for the response
zones_data: List[Dict] = [] zones_data: List[Dict] = []
for zone in zones: for zone in zones:

View File

@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
import re import re
import statistics import statistics
from typing import Any, Dict, List, Tuple from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import ( from cv_ocr_engines import (
_group_words_into_lines, _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
img_w: int, img_w: int,
img_h: int, img_h: int,
min_confidence: int = 30, min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes. """Build a cell grid bottom-up from Tesseract word boxes.
@@ -269,6 +270,9 @@ def build_grid_from_words(
img_w: Image width in pixels. img_w: Image width in pixels.
img_h: Image height in pixels. img_h: Image height in pixels.
min_confidence: Minimum OCR confidence to keep a word. min_confidence: Minimum OCR confidence to keep a word.
box_rects: Optional list of box dicts with keys x, y, width, height.
Words inside these boxes are excluded from column clustering
(box-internal columns are detected separately in sub-sessions).
Returns: Returns:
(cells, columns_meta) — same format as build_cell_grid_v2(). (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts)) logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
# Exclude words inside detected boxes — box columns are detected separately
if box_rects:
content_words = []
for w in words:
w_cx = w['left'] + w['width'] / 2
w_cy = w['top'] + w['height'] / 2
inside = any(
b['x'] <= w_cx <= b['x'] + b['width']
and b['y'] <= w_cy <= b['y'] + b['height']
for b in box_rects
)
if not inside:
content_words.append(w)
excluded = len(words) - len(content_words)
if excluded:
logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
excluded, len(box_rects))
words = content_words
if not words:
logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
return [], []
# Step 1: cluster columns # Step 1: cluster columns
columns = _cluster_columns(words, img_w) columns = _cluster_columns(words, img_w)
logger.info("build_grid_from_words: %d column(s) detected", len(columns)) logger.info("build_grid_from_words: %d column(s) detected", len(columns))

View File

@@ -2543,7 +2543,15 @@ async def detect_words(
}) })
wf_word_dicts = abs_words wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h) # Extract box rects for box-aware column clustering
box_rects = []
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box_rects.append(zone["box"])
cells, columns_meta = build_grid_from_words(
wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
)
duration = time.time() - t0 duration = time.time() - t0
# Apply IPA phonetic fixes # Apply IPA phonetic fixes

View File

@@ -0,0 +1,174 @@
"""
Tests for box-aware column detection.
Verifies that:
1. Words inside boxes are excluded from column clustering (words_first)
2. Column geometries are enriched with box-filtered original words (layout)
3. Inline markers (bullet points) are not split into sub-columns
Lizenz: Apache 2.0
"""
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from cv_words_first import build_grid_from_words, _cluster_columns
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int, height: int,
conf: int = 90) -> dict:
return {
'text': text, 'left': left, 'top': top,
'width': width, 'height': height, 'conf': conf,
}
def _box(x: int, y: int, w: int, h: int) -> dict:
return {'x': x, 'y': y, 'width': w, 'height': h}
# ---------------------------------------------------------------------------
# Tests: box filtering in build_grid_from_words
# ---------------------------------------------------------------------------
class TestBoxAwareGridBuilding:
    """Grid building must leave out words that sit inside a box rect."""

    @staticmethod
    def _cell_texts(grid_cells):
        """Collect the ``text`` value of every cell into a set."""
        return {cell['text'] for cell in grid_cells}

    def test_no_boxes_unchanged(self):
        """With no ``box_rects`` argument every input word reaches the grid."""
        page_words = [
            _word(label, x, 100, 80, 20)
            for label, x in (("hello", 50), ("world", 300))
        ]
        grid_cells, _columns = build_grid_from_words(page_words, 600, 400)
        assert len(grid_cells) >= 2
        seen = self._cell_texts(grid_cells)
        assert 'hello' in seen
        assert 'world' in seen

    def test_box_words_excluded(self):
        """A word placed inside the box must be absent from the grid."""
        # The box spans x=100..400, y=200..350.
        frame = _box(100, 200, 300, 150)
        page_words = [
            _word("outside1", 50, 50, 80, 20),
            _word("outside2", 300, 50, 80, 20),
            # Centre at (200, 260), i.e. inside the box.
            _word("inside_box", 150, 250, 100, 20),
        ]
        grid_cells, _columns = build_grid_from_words(
            page_words, 600, 500, box_rects=[frame],
        )
        seen = self._cell_texts(grid_cells)
        assert 'outside1' in seen
        assert 'outside2' in seen
        assert 'inside_box' not in seen

    def test_all_words_in_box_returns_empty(self):
        """When the box contains every word, both return values are empty."""
        frame = _box(100, 200, 300, 150)
        page_words = [
            _word(label, x, 250, 30, 20)
            for label, x in (("a", 150), ("b", 200))
        ]
        grid_cells, columns_meta = build_grid_from_words(
            page_words, 600, 500, box_rects=[frame],
        )
        assert grid_cells == []
        assert columns_meta == []

    def test_multiple_boxes(self):
        """Words are dropped no matter which of several boxes holds them."""
        frames = [
            _box(100, 200, 200, 100),  # first box
            _box(400, 200, 200, 100),  # second box
        ]
        page_words = [
            _word("content", 50, 50, 80, 20),
            _word("box1_word", 120, 220, 80, 20),
            _word("box2_word", 420, 220, 80, 20),
        ]
        grid_cells, _columns = build_grid_from_words(
            page_words, 700, 400, box_rects=frames,
        )
        assert self._cell_texts(grid_cells) == {'content'}

    def test_word_on_box_border_excluded(self):
        """A word starting on the box's top-left corner is still excluded."""
        frame = _box(100, 200, 200, 100)
        page_words = [
            _word("content", 50, 50, 80, 20),
            # Left/top edge coincide with the box origin; the centre
            # (120, 210) lies inside the box.
            _word("edge", 100, 200, 40, 20),
        ]
        grid_cells, _columns = build_grid_from_words(
            page_words, 600, 400, box_rects=[frame],
        )
        assert 'edge' not in self._cell_texts(grid_cells)

    def test_columns_not_affected_by_box_words(self):
        """A far-right box word must not add a column of its own."""
        frame = _box(450, 200, 200, 150)
        # Two content columns at x=50 and x=300, two rows each.
        page_words = [
            _word(f"col{col_no}_{row_tag}", x, y, 80, 20)
            for col_no, x in ((1, 50), (2, 300))
            for row_tag, y in (("a", 50), ("b", 100))
        ]
        # Box word at x=500: without filtering it would open a third column.
        page_words.append(_word("box_far", 500, 250, 80, 20))
        _cells, columns_meta = build_grid_from_words(
            page_words, 700, 500, box_rects=[frame],
        )
        # At most the two content columns may remain.
        assert len(columns_meta) <= 2
# ---------------------------------------------------------------------------
# Tests: _cluster_columns with box-filtered words
# ---------------------------------------------------------------------------
class TestClusterColumnsFiltering:
    """Column clustering on word lists that contain no box words."""

    def test_gap_detection_without_box_words(self):
        """Two groups of words far apart in X yield exactly two columns."""
        layout = (
            ("a", 50, 50),
            ("b", 50, 100),
            ("c", 300, 50),
            ("d", 300, 100),
        )
        content_only = [_word(label, x, y, 30, 20) for label, x, y in layout]
        found = _cluster_columns(content_only, 600)
        assert len(found) == 2

    def test_single_column_when_words_close(self):
        """Words whose left edges differ by only a few pixels share a column."""
        layout = (
            ("a", 50, 50),
            ("b", 60, 100),
            ("c", 55, 150),
        )
        nearby = [_word(label, x, y, 80, 20) for label, x, y in layout]
        found = _cluster_columns(nearby, 600)
        assert len(found) == 1
# ---------------------------------------------------------------------------
# Tests: inline marker guard (bullet points)
# ---------------------------------------------------------------------------
class TestInlineMarkerGuard:
    """Bullets and numbering belong to the cell, not to a sub-column."""

    def test_concept_bullet_vs_page_ref(self):
        """Contrast the marker-to-text gap of a bullet with that of a page ref.

        This is a concept check on hand-picked numbers only; it does not
        call the column detection code. A bullet sits almost on top of the
        main text, while a page reference is clearly separated from it.
        """
        # Bullet scenario: "1." at left=50 with width 20, main text at left=65.
        # The marker's right edge (70) passes the text start: gap is -5.
        marker_left, marker_width, text_left = 50, 20, 65
        bullet_gap = text_left - (marker_left + marker_width)
        assert bullet_gap < 20  # very small gap, so no split

        # Page-ref scenario: "p.55" at left=20 with width 40, text at left=120.
        # The ref's right edge (60) is well before the text start: gap is 60.
        ref_left, ref_width, text_left = 20, 40, 120
        pageref_gap = text_left - (ref_left + ref_width)
        assert pageref_gap > 30  # clear gap, so a split is expected