feat: box-aware column detection — exclude box content from global columns
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s

- Enrich column geometries with original full-page words (box-filtered)
  so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into
  sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes
  are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 18:42:46 +01:00
parent 729ebff63c
commit 0340204c1f
4 changed files with 269 additions and 2 deletions

View File

@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging
import re
import statistics
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
img_w: int,
img_h: int,
min_confidence: int = 30,
box_rects: Optional[List[Dict]] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
"""Build a cell grid bottom-up from Tesseract word boxes.
@@ -269,6 +270,9 @@ def build_grid_from_words(
img_w: Image width in pixels.
img_h: Image height in pixels.
min_confidence: Minimum OCR confidence to keep a word.
box_rects: Optional list of box dicts with keys x, y, width, height.
A word is excluded from column clustering when its center point
lies inside any of these boxes (box edges inclusive); box-internal
columns are detected separately in sub-sessions.
Returns:
(cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
# Exclude words whose center point lies inside a detected box — box columns are detected separately
if box_rects:
content_words = []
for w in words:
w_cx = w['left'] + w['width'] / 2
w_cy = w['top'] + w['height'] / 2
inside = any(
b['x'] <= w_cx <= b['x'] + b['width']
and b['y'] <= w_cy <= b['y'] + b['height']
for b in box_rects
)
if not inside:
content_words.append(w)
excluded = len(words) - len(content_words)
if excluded:
logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
excluded, len(box_rects))
words = content_words
if not words:
logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
return [], []
# Step 1: cluster columns
columns = _cluster_columns(words, img_w)
logger.info("build_grid_from_words: %d column(s) detected", len(columns))