fix: filter words and color recoveries inside graphic/image regions
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 30s
CI / test-go-edu-search (push) Successful in 31s
CI / test-python-klausur (push) Failing after 2m8s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
- Load structure_result from session to get detected graphic bounds
- Exclude OCR words whose center falls inside a graphic region
- Exclude recovered colored text inside graphic regions
- Reject color recovery regions wider than 4x median word height

Fixes garbage characters (!, ?, •) in box zones and false OCR
detections (N, ?) in image areas.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -256,6 +256,9 @@ def recover_colored_text(
         bx, by, bw, bh = cv2.boundingRect(cnt)
         if bh < 6:
             continue
+        # Reject regions too wide to be single characters
+        if bw > median_h * 4:
+            continue
         candidates.append((area, bx, by, bw, bh))

     # Keep largest first, limited count
@@ -613,6 +613,36 @@ async def build_grid(session_id: str):
     logger.info("build-grid session %s: %d words from %d cells",
                 session_id, len(all_words), len(word_result["cells"]))

+    # 2b. Filter words inside detected graphic/image regions
+    structure_result = session.get("structure_result")
+    graphic_rects = []
+    if structure_result:
+        for g in structure_result.get("graphics", []):
+            graphic_rects.append({
+                "x": g["x"], "y": g["y"],
+                "w": g["w"], "h": g["h"],
+            })
+    if graphic_rects:
+        before = len(all_words)
+        filtered = []
+        for w in all_words:
+            w_cx = w["left"] + w.get("width", 0) / 2
+            w_cy = w["top"] + w.get("height", 0) / 2
+            inside = any(
+                gr["x"] <= w_cx <= gr["x"] + gr["w"]
+                and gr["y"] <= w_cy <= gr["y"] + gr["h"]
+                for gr in graphic_rects
+            )
+            if not inside:
+                filtered.append(w)
+        removed = before - len(filtered)
+        if removed:
+            all_words = filtered
+            logger.info(
+                "build-grid session %s: removed %d words inside %d graphic region(s)",
+                session_id, removed, len(graphic_rects),
+            )
+
     # 3. Load image for box detection
     img_png = await get_session_image(session_id, "cropped")
     if not img_png:
@@ -635,6 +665,16 @@ async def build_grid(session_id: str):
     if img_bgr is not None:
         # --- Recover colored text that OCR missed (before grid building) ---
         recovered = recover_colored_text(img_bgr, all_words)
+        if recovered and graphic_rects:
+            # Filter recovered chars inside graphic regions
+            recovered = [
+                r for r in recovered
+                if not any(
+                    gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                    and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                    for gr in graphic_rects
+                )
+            ]
         if recovered:
             recovered_count = len(recovered)
             all_words.extend(recovered)
Reference in New Issue
Block a user