Fix ghost filter for borderless boxes + remove oversized graphic artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
1. Skip ghost filtering for boxes with border_thickness=0: images/graphics have no border lines, so they cannot produce border OCR artifacts such as "|" or "I".
2. Remove individual word_boxes whose height exceeds 3x the zone median: these are OCR output from graphics, e.g. a huge "N" from a map image below the text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -384,8 +384,8 @@ class TestFilterBorderGhosts:
|
||||
assert filtered[0]["text"] == "hello"
|
||||
|
||||
def test_multi_char_ghost_kept(self):
|
||||
- """Multi-char '(=' on a box border → NOT filtered (real content)."""
|
||||
- box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
||||
+ """Multi-char '(=' on a bordered box → NOT filtered (real content)."""
|
||||
+ box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=3)
|
||||
words = [
|
||||
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
|
||||
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
|
||||
@@ -394,6 +394,17 @@ class TestFilterBorderGhosts:
|
||||
assert count == 0
|
||||
assert len(filtered) == 2
|
||||
|
||||
def test_borderless_box_no_ghost_filter(self):
|
||||
"""Borderless box (border_thickness=0) → no ghost filtering at all."""
|
||||
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
||||
words = [
|
||||
{"text": "I", "left": 643, "top": 272, "width": 6, "height": 19}, # near box edge
|
||||
{"text": "|", "left": 647, "top": 200, "width": 3, "height": 10}, # even pipe on edge
|
||||
]
|
||||
filtered, count = _filter_border_ghosts(words, [box])
|
||||
assert count == 0 # nothing filtered — borderless box
|
||||
assert len(filtered) == 2
|
||||
|
||||
def test_single_paren_on_border_removed(self):
|
||||
"""Single ')' on border → filtered."""
|
||||
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
|
||||
|
||||
Reference in New Issue
Block a user