Fix ghost filter for borderless boxes + remove oversized graphic artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
1. Skip ghost filtering for boxes with border_thickness=0 (images/graphics have no border lines that could produce OCR artifacts such as "|" or "I").
2. Remove individual word_boxes whose height exceeds 3x the zone median (OCR artifacts from graphics, e.g. a huge "N" from a map image below the text).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -290,15 +290,18 @@ def _filter_border_ghosts(
|
|||||||
x_bands: List[tuple] = []
|
x_bands: List[tuple] = []
|
||||||
y_bands: List[tuple] = []
|
y_bands: List[tuple] = []
|
||||||
for b in boxes:
|
for b in boxes:
|
||||||
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
|
||||||
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
|
||||||
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
|
||||||
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
|
||||||
bt = (
|
bt = (
|
||||||
b.border_thickness
|
b.border_thickness
|
||||||
if hasattr(b, "border_thickness")
|
if hasattr(b, "border_thickness")
|
||||||
else b.get("border_thickness", 3)
|
else b.get("border_thickness", 3)
|
||||||
)
|
)
|
||||||
|
# Skip borderless boxes (images/graphics) — no border line to produce ghosts
|
||||||
|
if bt == 0:
|
||||||
|
continue
|
||||||
|
bx = b.x if hasattr(b, "x") else b.get("x", 0)
|
||||||
|
by = b.y if hasattr(b, "y") else b.get("y", 0)
|
||||||
|
bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0))
|
||||||
|
bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0))
|
||||||
margin = max(bt * 2, 10) + 6
|
margin = max(bt * 2, 10) + 6
|
||||||
x_bands.append((bx - margin, bx + margin))
|
x_bands.append((bx - margin, bx + margin))
|
||||||
x_bands.append((bx + bw - margin, bx + bw + margin))
|
x_bands.append((bx + bw - margin, bx + bw + margin))
|
||||||
@@ -1518,6 +1521,44 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
sorted(junk_row_indices),
|
sorted(junk_row_indices),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 4c. Remove oversized word_boxes from individual cells.
|
||||||
|
# OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
|
||||||
|
# have word heights 3-5x the median. Remove them per-word so they don't
|
||||||
|
# pollute cells that also contain valid text in other columns.
|
||||||
|
for z in zones_data:
|
||||||
|
cells = z.get("cells", [])
|
||||||
|
if not cells:
|
||||||
|
continue
|
||||||
|
all_wh = [
|
||||||
|
wb["height"]
|
||||||
|
for cell in cells
|
||||||
|
for wb in cell.get("word_boxes") or []
|
||||||
|
if wb.get("height", 0) > 0
|
||||||
|
]
|
||||||
|
if not all_wh:
|
||||||
|
continue
|
||||||
|
med_h = sorted(all_wh)[len(all_wh) // 2]
|
||||||
|
oversized_threshold = med_h * 3
|
||||||
|
removed_oversized = 0
|
||||||
|
for cell in cells:
|
||||||
|
wbs = cell.get("word_boxes") or []
|
||||||
|
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
|
||||||
|
if len(filtered) < len(wbs):
|
||||||
|
removed_oversized += len(wbs) - len(filtered)
|
||||||
|
cell["word_boxes"] = filtered
|
||||||
|
cell["text"] = " ".join(
|
||||||
|
wb.get("text", "").strip()
|
||||||
|
for wb in sorted(filtered, key=lambda w: (w.get("top", 0), w.get("left", 0)))
|
||||||
|
if wb.get("text", "").strip()
|
||||||
|
)
|
||||||
|
if removed_oversized:
|
||||||
|
# Remove cells that became empty after oversized removal
|
||||||
|
z["cells"] = [c for c in cells if c.get("word_boxes")]
|
||||||
|
logger.info(
|
||||||
|
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
|
||||||
|
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
||||||
|
)
|
||||||
|
|
||||||
# 5. Color annotation on final word_boxes in cells
|
# 5. Color annotation on final word_boxes in cells
|
||||||
if img_bgr is not None:
|
if img_bgr is not None:
|
||||||
all_wb: List[Dict] = []
|
all_wb: List[Dict] = []
|
||||||
|
|||||||
@@ -384,8 +384,8 @@ class TestFilterBorderGhosts:
|
|||||||
assert filtered[0]["text"] == "hello"
|
assert filtered[0]["text"] == "hello"
|
||||||
|
|
||||||
def test_multi_char_ghost_kept(self):
|
def test_multi_char_ghost_kept(self):
|
||||||
"""Multi-char '(=' on a box border → NOT filtered (real content)."""
|
"""Multi-char '(=' on a bordered box → NOT filtered (real content)."""
|
||||||
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=3)
|
||||||
words = [
|
words = [
|
||||||
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
|
{"text": "(=", "left": 644, "top": 294, "width": 16, "height": 17},
|
||||||
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
|
{"text": "I", "left": 665, "top": 294, "width": 9, "height": 18},
|
||||||
@@ -394,6 +394,17 @@ class TestFilterBorderGhosts:
|
|||||||
assert count == 0
|
assert count == 0
|
||||||
assert len(filtered) == 2
|
assert len(filtered) == 2
|
||||||
|
|
||||||
|
def test_borderless_box_no_ghost_filter(self):
|
||||||
|
"""Borderless box (border_thickness=0) → no ghost filtering at all."""
|
||||||
|
box = DetectedBox(x=648, y=129, width=338, height=125, confidence=0.7, border_thickness=0)
|
||||||
|
words = [
|
||||||
|
{"text": "I", "left": 643, "top": 272, "width": 6, "height": 19}, # near box edge
|
||||||
|
{"text": "|", "left": 647, "top": 200, "width": 3, "height": 10}, # even pipe on edge
|
||||||
|
]
|
||||||
|
filtered, count = _filter_border_ghosts(words, [box])
|
||||||
|
assert count == 0 # nothing filtered — borderless box
|
||||||
|
assert len(filtered) == 2
|
||||||
|
|
||||||
def test_single_paren_on_border_removed(self):
|
def test_single_paren_on_border_removed(self):
|
||||||
"""Single ')' on border → filtered."""
|
"""Single ')' on border → filtered."""
|
||||||
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
|
box = DetectedBox(x=100, y=100, width=200, height=150, confidence=0.9, border_thickness=2)
|
||||||
|
|||||||
Reference in New Issue
Block a user