Fix overlay word leak, ghost filter false positive, merged zone header
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 41s
1. Filter out words that lie inside image_overlays (removes OCR text picked up from images).
2. Ghost filter: only remove single-character border artifacts, not multi-character tokens such as "(=", which are real content.
3. Skip first-row header detection for zones with image_overlays (the merged geometry creates artificial gaps).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,9 +1,11 @@
|
||||
"""
|
||||
Tests for grid_editor_api zone merging and heading detection.
|
||||
Tests for grid_editor_api zone merging, heading detection, and ghost filtering.
|
||||
|
||||
Covers:
|
||||
- _merge_content_zones_across_boxes: zone merging logic
|
||||
- _detect_heading_rows_by_color: heading detection by color + height
|
||||
- _filter_border_ghosts: single-char ghost detection
|
||||
- _detect_header_rows: skip_first_row_header flag
|
||||
"""
|
||||
|
||||
import sys
|
||||
@@ -13,6 +15,8 @@ import pytest
|
||||
from cv_vocab_types import PageZone, DetectedBox
|
||||
from grid_editor_api import (
|
||||
_merge_content_zones_across_boxes,
|
||||
_filter_border_ghosts,
|
||||
_detect_header_rows,
|
||||
_detect_heading_rows_by_color,
|
||||
)
|
||||
|
||||
@@ -358,3 +362,82 @@ class TestDetectHeadingRowsByColor:
|
||||
zones_data = [self._make_zone(cells, rows, columns)]
|
||||
count = _detect_heading_rows_by_color(zones_data, 800, 1000)
|
||||
assert count == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _filter_border_ghosts (Fix 2: single-char only)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFilterBorderGhosts:
    """Exercise ``_filter_border_ghosts`` with single- and multi-character words.

    The expectations encoded here: a one-character word overlapping a box
    edge is counted and dropped, while a multi-character word in a similar
    position is kept.
    """

    @staticmethod
    def _word(text, left, top, width, height):
        """Build an OCR word dict with the keys used throughout these tests."""
        return {"text": text, "left": left, "top": top, "width": width, "height": height}

    def test_single_char_ghost_removed(self):
        """A lone '|' overlapping the box's left edge is dropped; 'hello' stays."""
        frame = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=3)
        border_bar = self._word("|", 98, 200, 5, 20)
        inner_word = self._word("hello", 150, 150, 80, 20)

        kept, removed = _filter_border_ghosts([border_bar, inner_word], [frame])

        assert removed == 1
        assert len(kept) == 1
        assert kept[0]["text"] == "hello"

    def test_multi_char_ghost_kept(self):
        """A two-character '(=' near the box's left x-coordinate is kept as real content."""
        frame = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
        candidates = [
            self._word("(=", 644, 294, 16, 17),
            self._word("I", 665, 294, 9, 18),
        ]

        kept, removed = _filter_border_ghosts(candidates, [frame])

        assert removed == 0
        assert len(kept) == 2

    def test_single_paren_on_border_removed(self):
        """A lone ')' overlapping the box's right edge (x=300) is dropped."""
        frame = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
        candidates = [self._word(")", 299, 200, 4, 7)]

        kept, removed = _filter_border_ghosts(candidates, [frame])

        assert removed == 1
        assert len(kept) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _detect_header_rows (Fix 3: skip_first_row_header)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectHeaderRowsSkipFlag:
    """Exercise the ``skip_first_row_header`` keyword of ``_detect_header_rows``.

    Both tests share one layout: three rows where the first is followed by a
    40-unit vertical gap and the remaining rows are only 5 units apart.
    """

    @staticmethod
    def _rows_with_leading_gap():
        """Return fresh row dicts: row 0 is set apart from rows 1 and 2."""
        spans = ((100, 120), (160, 180), (185, 205))
        return [
            {"y_min": y_min, "y_max": y_max, "index": idx}
            for idx, (y_min, y_max) in enumerate(spans)
        ]

    @staticmethod
    def _one_word_per_row():
        """Return fresh word dicts, one 20-high word placed inside each row."""
        return [
            {"height": 20, "top": top, "left": 10, "width": 80}
            for top in (105, 165, 190)
        ]

    def test_first_row_detected_without_flag(self):
        """By default, the first row followed by a large gap is reported as a header."""
        layout_rows = self._rows_with_leading_gap()
        layout_words = self._one_word_per_row()

        headers = _detect_header_rows(layout_rows, layout_words, 0)

        assert 0 in headers

    def test_first_row_skipped_with_flag(self):
        """With ``skip_first_row_header=True`` the same first row is not reported."""
        layout_rows = self._rows_with_leading_gap()
        layout_words = self._one_word_per_row()

        headers = _detect_header_rows(layout_rows, layout_words, 0, skip_first_row_header=True)

        assert 0 not in headers
|
||||
|
||||
Reference in New Issue
Block a user