# Patch summary: box-aware column detection for the klausur-service OCR backend.
#
# cv_layout.py
#   * imports `statistics`.
#   * _detect_sub_columns(): before building two sub-column geometries, the
#     column is kept unsplit when the gap between the rightmost sub-word's
#     right edge and col_start_bin[2] is smaller than
#     max(1.2 * median height of the words in `confident`, 20).
#   * detect_column_geometry_zoned(): when word_dicts is non-empty, every
#     target geometry gets its `words` / `word_count` rebuilt from word_dicts,
#     keeping only words whose centre is not inside any box and whose
#     horizontal centre falls in [g.x - left_x, g.x - left_x + g.width).
# cv_words_first.py
#   * build_grid_from_words() gains an optional `box_rects` parameter
#     (default None). Words whose centre lies inside any rect (bounds
#     inclusive) are dropped before column clustering; if none remain the
#     function returns ([], []).
# ocr_pipeline_api.py
#   * detect_words() collects zone["box"] for every zone with
#     zone_type == "box" and a truthy "box", and passes the list (or None
#     when empty) as box_rects.
# tests/test_box_column_awareness.py (new file, 174 lines)
#   * covers the box filtering in build_grid_from_words and two
#     _cluster_columns cases.
#
# NOTE(review): the source of this patch had its line breaks collapsed. Line
#   breaks were restored so that every hunk matches its @@ old/new counts;
#   the absolute indentation of context lines is inferred from syntax --
#   confirm against the target files before applying.
# NOTE(review): the SubColumnSplit debug message formats min_gap with %d;
#   min_gap can be a float (med_h * 1.2), so the logged value is truncated.
# NOTE(review): the enrichment step reads w['width'] / w['height'] directly
#   while the inline-marker guard uses w.get(...); presumably word_dicts
#   always carry both keys -- verify against the caller.
# NOTE(review): test_columns_not_affected_by_box_words asserts
#   len(cols) <= 2 although its comment says exactly two columns are expected.
# NOTE(review): TestInlineMarkerGuard only checks integer arithmetic; it does
#   not call _detect_sub_columns().
diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py
index d269fac..01daf1c 100644
--- a/klausur-service/backend/cv_layout.py
+++ b/klausur-service/backend/cv_layout.py
@@ -7,6 +7,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 
 import logging
 import re
+import statistics
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -737,6 +738,24 @@ def _detect_sub_columns(
             result.append(geo)
             continue
 
+        # --- Guard against inline markers (bullet points, numbering) ---
+        # Bullet points like "1.", "2.", "•", "-" sit close to the main
+        # column text and are part of the cell, not a separate column.
+        # Only split if the horizontal gap between the rightmost sub-word
+        # and the main column start is large enough.
+        max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
+        gap_to_main = col_start_bin[2] - max_sub_right  # px gap
+        median_heights = [w.get('height', 20) for w in confident]
+        med_h = statistics.median(median_heights) if median_heights else 20
+        min_gap = max(med_h * 1.2, 20)  # at least 1.2× word height or 20px
+        if gap_to_main < min_gap:
+            logger.debug(
+                "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
+                "(likely inline markers, not a sub-column)",
+                geo.index, gap_to_main, min_gap)
+            result.append(geo)
+            continue
+
         # --- Build two sub-column geometries ---
         # Word 'left' values are relative to left_x; geo.x is absolute.
         # Convert the split position from relative to absolute coordinates.
@@ -3221,6 +3240,46 @@ def detect_column_geometry_zoned(
             g.y = abs_y
             g.height = abs_y_end - abs_y
 
+    # --- Enrich column geometries with box-filtered original words ---
+    # The combined-image Tesseract may miss words in small content strips
+    # (e.g. a single row above a box). Use the original full-page word_dicts
+    # filtered to exclude box interiors, so that _detect_sub_columns()
+    # downstream has ALL content-zone words for left-edge clustering.
+    # This ensures narrow sub-columns (page_ref, marker) are detectable
+    # even when only a few entries exist above/below a box.
+    if word_dicts:
+        content_words = []
+        for w in word_dicts:
+            # word positions are relative to left_x / top_y
+            w_abs_cx = w['left'] + left_x + w['width'] / 2
+            w_abs_cy = w['top'] + top_y + w['height'] / 2
+            inside_box = any(
+                box.x <= w_abs_cx <= box.x + box.width
+                and box.y <= w_abs_cy <= box.y + box.height
+                for box in boxes
+            )
+            if not inside_box:
+                content_words.append(w)
+
+        target_geoms = combined_geoms if combined_result is not None else geometries
+        for g in target_geoms:
+            # Word 'left' is relative to left_x; geometry 'x' is absolute
+            g_left_rel = g.x - left_x
+            g_right_rel = g_left_rel + g.width
+            g.words = [
+                w for w in content_words
+                if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
+            ]
+            g.word_count = len(g.words)
+
+        excluded_count = len(word_dicts) - len(content_words)
+        if excluded_count:
+            logger.info(
+                "ZonedColumns: enriched geometries with %d content words "
+                "(excluded %d box-interior words)",
+                len(content_words), excluded_count,
+            )
+
     # Build zones_data for the response
     zones_data: List[Dict] = []
     for zone in zones:
diff --git a/klausur-service/backend/cv_words_first.py b/klausur-service/backend/cv_words_first.py
index e5dd9ed..83dd24f 100644
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -17,7 +17,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 import logging
 import re
 import statistics
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from cv_ocr_engines import (
     _group_words_into_lines,
@@ -259,6 +259,7 @@ def build_grid_from_words(
     img_w: int,
     img_h: int,
     min_confidence: int = 30,
+    box_rects: Optional[List[Dict]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     """Build a cell grid bottom-up from Tesseract word boxes.
 
@@ -269,6 +270,9 @@ def build_grid_from_words(
         img_w: Image width in pixels.
         img_h: Image height in pixels.
         min_confidence: Minimum OCR confidence to keep a word.
+        box_rects: Optional list of box dicts with keys x, y, width, height.
+            Words inside these boxes are excluded from column clustering
+            (box-internal columns are detected separately in sub-sessions).
 
     Returns:
         (cells, columns_meta) — same format as build_cell_grid_v2().
@@ -290,6 +294,28 @@ def build_grid_from_words(
     logger.info("build_grid_from_words: %d words (after confidence filter from %d)",
                 len(words), len(word_dicts))
 
+    # Exclude words inside detected boxes — box columns are detected separately
+    if box_rects:
+        content_words = []
+        for w in words:
+            w_cx = w['left'] + w['width'] / 2
+            w_cy = w['top'] + w['height'] / 2
+            inside = any(
+                b['x'] <= w_cx <= b['x'] + b['width']
+                and b['y'] <= w_cy <= b['y'] + b['height']
+                for b in box_rects
+            )
+            if not inside:
+                content_words.append(w)
+        excluded = len(words) - len(content_words)
+        if excluded:
+            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
+                        excluded, len(box_rects))
+        words = content_words
+        if not words:
+            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
+            return [], []
+
     # Step 1: cluster columns
     columns = _cluster_columns(words, img_w)
     logger.info("build_grid_from_words: %d column(s) detected", len(columns))
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index 3fed1d3..04f4cdb 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -2543,7 +2543,15 @@ async def detect_words(
                 })
             wf_word_dicts = abs_words
 
-            cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
+            # Extract box rects for box-aware column clustering
+            box_rects = []
+            for zone in zones:
+                if zone.get("zone_type") == "box" and zone.get("box"):
+                    box_rects.append(zone["box"])
+
+            cells, columns_meta = build_grid_from_words(
+                wf_word_dicts, img_w, img_h, box_rects=box_rects or None,
+            )
             duration = time.time() - t0
 
             # Apply IPA phonetic fixes
diff --git a/klausur-service/backend/tests/test_box_column_awareness.py b/klausur-service/backend/tests/test_box_column_awareness.py
new file mode 100644
index 0000000..255a4e7
--- /dev/null
+++ b/klausur-service/backend/tests/test_box_column_awareness.py
@@ -0,0 +1,174 @@
+"""
+Tests for box-aware column detection.
+
+Verifies that:
+1. Words inside boxes are excluded from column clustering (words_first)
+2. Column geometries are enriched with box-filtered original words (layout)
+3. Inline markers (bullet points) are not split into sub-columns
+
+Lizenz: Apache 2.0
+"""
+
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from cv_words_first import build_grid_from_words, _cluster_columns
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _word(text: str, left: int, top: int, width: int, height: int,
+          conf: int = 90) -> dict:
+    return {
+        'text': text, 'left': left, 'top': top,
+        'width': width, 'height': height, 'conf': conf,
+    }
+
+
+def _box(x: int, y: int, w: int, h: int) -> dict:
+    return {'x': x, 'y': y, 'width': w, 'height': h}
+
+
+# ---------------------------------------------------------------------------
+# Tests: box filtering in build_grid_from_words
+# ---------------------------------------------------------------------------
+
+class TestBoxAwareGridBuilding:
+    """Words inside boxes should be excluded from column clustering."""
+
+    def test_no_boxes_unchanged(self):
+        """Without boxes, all words should be used."""
+        words = [
+            _word("hello", 50, 100, 80, 20),
+            _word("world", 300, 100, 80, 20),
+        ]
+        cells, cols = build_grid_from_words(words, 600, 400)
+        assert len(cells) >= 2
+        texts = {c['text'] for c in cells}
+        assert 'hello' in texts
+        assert 'world' in texts
+
+    def test_box_words_excluded(self):
+        """Words inside a box should not appear in the grid."""
+        words = [
+            _word("outside1", 50, 50, 80, 20),
+            _word("outside2", 300, 50, 80, 20),
+            _word("inside_box", 150, 250, 100, 20),  # inside box
+        ]
+        box = _box(100, 200, 300, 150)  # box from x=100..400, y=200..350
+        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
+
+        texts = {c['text'] for c in cells}
+        assert 'outside1' in texts
+        assert 'outside2' in texts
+        assert 'inside_box' not in texts
+
+    def test_all_words_in_box_returns_empty(self):
+        """If all words are inside the box, return empty grid."""
+        words = [
+            _word("a", 150, 250, 30, 20),
+            _word("b", 200, 250, 30, 20),
+        ]
+        box = _box(100, 200, 300, 150)
+        cells, cols = build_grid_from_words(words, 600, 500, box_rects=[box])
+        assert cells == []
+        assert cols == []
+
+    def test_multiple_boxes(self):
+        """Words in multiple boxes should all be excluded."""
+        words = [
+            _word("content", 50, 50, 80, 20),
+            _word("box1_word", 120, 220, 80, 20),
+            _word("box2_word", 420, 220, 80, 20),
+        ]
+        boxes = [
+            _box(100, 200, 200, 100),  # box1
+            _box(400, 200, 200, 100),  # box2
+        ]
+        cells, cols = build_grid_from_words(words, 700, 400, box_rects=boxes)
+        texts = {c['text'] for c in cells}
+        assert texts == {'content'}
+
+    def test_word_on_box_border_excluded(self):
+        """A word exactly on the box boundary should be excluded."""
+        words = [
+            _word("content", 50, 50, 80, 20),
+            _word("edge", 100, 200, 40, 20),  # left edge = box.x, center inside
+        ]
+        box = _box(100, 200, 200, 100)
+        cells, cols = build_grid_from_words(words, 600, 400, box_rects=[box])
+        texts = {c['text'] for c in cells}
+        assert 'edge' not in texts
+
+    def test_columns_not_affected_by_box_words(self):
+        """Box words should not create extra columns via X-gap analysis."""
+        # Two columns of content words, plus a word in a box at a different X
+        words = [
+            _word("col1_a", 50, 50, 80, 20),
+            _word("col1_b", 50, 100, 80, 20),
+            _word("col2_a", 300, 50, 80, 20),
+            _word("col2_b", 300, 100, 80, 20),
+            # This box word is at X=500, would create a 3rd column if not filtered
+            _word("box_far", 500, 250, 80, 20),
+        ]
+        box = _box(450, 200, 200, 150)
+        cells, cols = build_grid_from_words(words, 700, 500, box_rects=[box])
+        # Should only have 2 columns (not 3)
+        assert len(cols) <= 2
+
+
+# ---------------------------------------------------------------------------
+# Tests: _cluster_columns with box-filtered words
+# ---------------------------------------------------------------------------
+
+class TestClusterColumnsFiltering:
+    """Verify column clustering works correctly with filtered words."""
+
+    def test_gap_detection_without_box_words(self):
+        """Column gaps should be found from content words only."""
+        content_words = [
+            _word("a", 50, 50, 30, 20),
+            _word("b", 50, 100, 30, 20),
+            _word("c", 300, 50, 30, 20),
+            _word("d", 300, 100, 30, 20),
+        ]
+        columns = _cluster_columns(content_words, 600)
+        assert len(columns) == 2
+
+    def test_single_column_when_words_close(self):
+        """Close-together words should form a single column."""
+        words = [
+            _word("a", 50, 50, 80, 20),
+            _word("b", 60, 100, 80, 20),
+            _word("c", 55, 150, 80, 20),
+        ]
+        columns = _cluster_columns(words, 600)
+        assert len(columns) == 1
+
+
+# ---------------------------------------------------------------------------
+# Tests: inline marker guard (bullet points)
+# ---------------------------------------------------------------------------
+
+class TestInlineMarkerGuard:
+    """Bullet points / numbering should NOT be split into sub-columns."""
+
+    def test_concept_bullet_vs_page_ref(self):
+        """Demonstrate the gap difference between bullets and page refs.
+
+        Bullet points have small gap to main text (~5-10px).
+        Page references have large gap (~50+ px).
+        """
+        # Bullet point scenario: "1." at left=50, main text at left=65
+        # Gap = 65 - (50+20) = -5 (overlapping or touching → no split)
+        bullet_gap = 65 - (50 + 20)
+        assert bullet_gap < 20  # very small gap
+
+        # Page ref scenario: "p.55" at left=20, main text at left=120
+        # Gap = 120 - (20+40) = 60 (clear separation → split)
+        pageref_gap = 120 - (20 + 40)
+        assert pageref_gap > 30  # clear gap