fix: filter words and color recoveries inside graphic/image regions
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

- Load structure_result from session to get detected graphic bounds
- Exclude OCR words whose center falls inside a graphic region
- Exclude recovered colored text whose center falls inside a graphic region
- Reject color recovery regions wider than 4x median word height (too wide to be a single character)

Fixes garbage characters (!, ?, •) in box zones and false OCR
detections (N, ?) in image areas.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-17 11:20:07 +01:00
parent bbf0a5720e
commit 872b47f691
2 changed files with 43 additions and 0 deletions

View File

@@ -256,6 +256,9 @@ def recover_colored_text(
bx, by, bw, bh = cv2.boundingRect(cnt)
if bh < 6:
continue
# Reject regions too wide to be single characters
if bw > median_h * 4:
continue
candidates.append((area, bx, by, bw, bh))
# Keep largest first, limited count

View File

@@ -613,6 +613,36 @@ async def build_grid(session_id: str):
logger.info("build-grid session %s: %d words from %d cells",
session_id, len(all_words), len(word_result["cells"]))
# 2b. Filter words inside detected graphic/image regions
structure_result = session.get("structure_result")
graphic_rects = []
if structure_result:
for g in structure_result.get("graphics", []):
graphic_rects.append({
"x": g["x"], "y": g["y"],
"w": g["w"], "h": g["h"],
})
if graphic_rects:
before = len(all_words)
filtered = []
for w in all_words:
w_cx = w["left"] + w.get("width", 0) / 2
w_cy = w["top"] + w.get("height", 0) / 2
inside = any(
gr["x"] <= w_cx <= gr["x"] + gr["w"]
and gr["y"] <= w_cy <= gr["y"] + gr["h"]
for gr in graphic_rects
)
if not inside:
filtered.append(w)
removed = before - len(filtered)
if removed:
all_words = filtered
logger.info(
"build-grid session %s: removed %d words inside %d graphic region(s)",
session_id, removed, len(graphic_rects),
)
# 3. Load image for box detection
img_png = await get_session_image(session_id, "cropped")
if not img_png:
@@ -635,6 +665,16 @@ async def build_grid(session_id: str):
if img_bgr is not None:
# --- Recover colored text that OCR missed (before grid building) ---
recovered = recover_colored_text(img_bgr, all_words)
if recovered and graphic_rects:
# Filter recovered chars inside graphic regions
recovered = [
r for r in recovered
if not any(
gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
for gr in graphic_rects
)
]
if recovered:
recovered_count = len(recovered)
all_words.extend(recovered)