diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 9b79c80..c5dbf4b 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -320,7 +320,7 @@ def _filter_border_ghosts( ) if not on_border: return False - if all(c in _GRID_GHOST_CHARS for c in text): + if len(text) == 1 and text in _GRID_GHOST_CHARS: return True return False @@ -656,6 +656,7 @@ def _detect_header_rows( zone_words: List[Dict], zone_y: int, columns: Optional[List[Dict]] = None, + skip_first_row_header: bool = False, ) -> List[int]: """Detect header rows: first-row heuristic + spanning header detection. @@ -666,27 +667,29 @@ def _detect_header_rows( return [] headers = [] - first_row = rows[0] - second_row = rows[1] - # Gap between first and second row > 0.5x average row height - avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) - gap = second_row["y_min"] - first_row["y_max"] - if gap > avg_h * 0.5: - headers.append(0) + if not skip_first_row_header: + first_row = rows[0] + second_row = rows[1] - # Also check if first row words are taller than average (bold/header text) - all_heights = [w["height"] for w in zone_words] - median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 - first_row_words = [ - w for w in zone_words - if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"] - ] - if first_row_words: - first_h = max(w["height"] for w in first_row_words) - if first_h > median_h * 1.3: - if 0 not in headers: - headers.append(0) + # Gap between first and second row > 0.5x average row height + avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) + gap = second_row["y_min"] - first_row["y_max"] + if gap > avg_h * 0.5: + headers.append(0) + + # Also check if first row words are taller than average (bold/header text) + all_heights = [w["height"] for w in zone_words] + median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 + first_row_words = [ + w for w in zone_words + if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"] + ] + if first_row_words: + first_h = max(w["height"] for w in first_row_words) + if first_h > median_h * 1.3: + if 0 not in headers: + headers.append(0) # Note: Spanning-header detection (rows spanning all columns) has been # disabled because it produces too many false positives on vocabulary @@ -707,6 +710,7 @@ def _build_zone_grid( img_w: int, img_h: int, global_columns: Optional[List[Dict]] = None, + skip_first_row_header: bool = False, ) -> Dict[str, Any]: """Build columns, rows, cells for a single zone from its words. @@ -773,7 +777,8 @@ def _build_zone_grid( cell["zone_index"] = zone_index # Detect header rows (pass columns for spanning header detection) - header_rows = _detect_header_rows(rows, zone_words, zone_y, columns) + header_rows = _detect_header_rows(rows, zone_words, zone_y, columns, + skip_first_row_header=skip_first_row_header) # Merge cells in spanning header rows into a single col-0 cell if header_rows and len(columns) >= 2: @@ -1270,9 +1275,27 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: "build-grid: filtered %d recovered artifacts from %s zone %d", removed, pz.zone_type, pz.index, ) + # Filter words inside image overlay regions (merged box zones) + if pz.image_overlays: + before_ov = len(zone_words) + zone_words = [ + w for w in zone_words + if not any( + ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] + and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] + for ov in pz.image_overlays + ) + ] + ov_removed = before_ov - len(zone_words) + if ov_removed: + logger.info( + "build-grid: filtered %d words inside image overlays from zone %d", + ov_removed, pz.index, + ) grid = _build_zone_grid( zone_words, pz.x, pz.y, pz.width, pz.height, pz.index, img_w, img_h, + skip_first_row_header=bool(pz.image_overlays), ) zone_grids.append({"pz": pz, "words": zone_words, "grid": grid}) @@ -1339,6 +1362,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict: pz.width, pz.height, pz.index, img_w, img_h, global_columns=merged_columns, + skip_first_row_header=bool(pz.image_overlays), ) zg["grid"] = grid logger.info( diff --git a/klausur-service/backend/tests/test_grid_editor_api.py b/klausur-service/backend/tests/test_grid_editor_api.py index 7d9ee84..6e6e342 100644 --- a/klausur-service/backend/tests/test_grid_editor_api.py +++ b/klausur-service/backend/tests/test_grid_editor_api.py @@ -1,9 +1,11 @@ """ -Tests for grid_editor_api zone merging and heading detection. +Tests for grid_editor_api zone merging, heading detection, and ghost filtering. Covers: - _merge_content_zones_across_boxes: zone merging logic - _detect_heading_rows_by_color: heading detection by color + height +- _filter_border_ghosts: single-char ghost detection +- _detect_header_rows: skip_first_row_header flag """ import sys @@ -13,6 +15,8 @@ import pytest from cv_vocab_types import PageZone, DetectedBox from grid_editor_api import ( _merge_content_zones_across_boxes, + _filter_border_ghosts, + _detect_header_rows, _detect_heading_rows_by_color, ) @@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor: zones_data = [self._make_zone(cells, rows, columns)] count = _detect_heading_rows_by_color(zones_data, 800, 1000) assert count == 0 + + +# --------------------------------------------------------------------------- +# _filter_border_ghosts (Fix 2: single-char only) +# --------------------------------------------------------------------------- + +class TestFilterBorderGhosts: + """Test that ghost filtering only removes single-char words.""" + + def test_single_char_ghost_removed(self): + """Single '|' on a box border → filtered as ghost.""" + box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3) + words = [ + {"text": "|", "left": 98, "top": 200, "width": 5, "height": 20}, + {"text": "hello", "left": 150, "top": 150, "width": 80, "height": 20}, + ] + filtered, count = _filter_border_ghosts(words, [box]) + assert count == 1 + assert len(filtered) == 1 + assert filtered[0]["text"] == "hello" + + def test_multi_char_ghost_kept(self): + """Multi-char '(=' on a box border → NOT filtered (real content).""" + box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0) + words = [ + {"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17}, + {"text": "I", "left": 665, "top": 294, "width": 9, "height": 18}, + ] + filtered, count = _filter_border_ghosts(words, [box]) + assert count == 0 + assert len(filtered) == 2 + + def test_single_paren_on_border_removed(self): + """Single ')' on border → filtered.""" + box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2) + words = [ + {"text": ")", "left": 299, "top": 200, "width": 4, "height": 7}, + ] + filtered, count = _filter_border_ghosts(words, [box]) + assert count == 1 + assert len(filtered) == 0 + + +# --------------------------------------------------------------------------- +# _detect_header_rows (Fix 3: skip_first_row_header) +# --------------------------------------------------------------------------- + +class TestDetectHeaderRowsSkipFlag: + """Test skip_first_row_header flag.""" + + def test_first_row_detected_without_flag(self): + """Without flag, first row with big gap → header.""" + rows = [ + {"y_min": 100, "y_max": 120, "index": 0}, + {"y_min": 160, "y_max": 180, "index": 1}, + {"y_min": 185, "y_max": 205, "index": 2}, + ] + words = [ + {"height": 20, "top": 105, "left": 10, "width": 80}, + {"height": 20, "top": 165, "left": 10, "width": 80}, + {"height": 20, "top": 190, "left": 10, "width": 80}, + ] + headers = _detect_header_rows(rows, words, 0) + assert 0 in headers + + def test_first_row_skipped_with_flag(self): + """With skip flag, first row NOT detected even with big gap.""" + rows = [ + {"y_min": 100, "y_max": 120, "index": 0}, + {"y_min": 160, "y_max": 180, "index": 1}, + {"y_min": 185, "y_max": 205, "index": 2}, + ] + words = [ + {"height": 20, "top": 105, "left": 10, "width": 80}, + {"height": 20, "top": 165, "left": 10, "width": 80}, + {"height": 20, "top": 190, "left": 10, "width": 80}, + ] + headers = _detect_header_rows(rows, words, 0, skip_first_row_header=True) + assert 0 not in headers