feat: box-aware column detection — exclude box content from global columns

- Enrich column geometries with original full-page words (box-filtered) so _detect_sub_columns() finds narrow sub-columns across box boundaries
- Add inline marker guard: bullet points (1., 2., •) are not split into sub-columns (minimum gap check: 1.2× word height or 20px)
- Add box_rects parameter to build_grid_from_words() — words inside boxes are excluded from X-gap column clustering
- Pass box rects from zones to words_first grid builder
- Add 9 tests for box-aware column detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 18:42:46 +01:00
parent 729ebff63c
commit 0340204c1f
4 changed files with 269 additions and 2 deletions
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 import logging
 import re
 import statistics
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from cv_ocr_engines import (
    _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
+    box_rects: Optional[List[Dict]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

@@ -269,6 +270,9 @@ def build_grid_from_words(
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.
+        box_rects: Optional list of box dicts with keys x, y, width, height.
+            Words inside these boxes are excluded from column clustering
+            (box-internal columns are detected separately in sub-sessions).

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

+    # Exclude words inside detected boxes — box columns are detected separately
+    if box_rects:
+        content_words = []
+        for w in words:
+            w_cx = w['left'] + w['width'] / 2
+            w_cy = w['top'] + w['height'] / 2
+            inside = any(
+                b['x'] <= w_cx <= b['x'] + b['width']
+                and b['y'] <= w_cy <= b['y'] + b['height']
+                for b in box_rects
+            )
+            if not inside:
+                content_words.append(w)
+        excluded = len(words) - len(content_words)
+        if excluded:
+            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
+                        excluded, len(box_rects))
+        words = content_words
+        if not words:
+            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
+            return [], []
+
    # Step 1: cluster columns
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))