fix: filter words and color recoveries inside graphic/image regions

- Load structure_result from session to get detected graphic bounds
- Exclude OCR words whose center falls inside a graphic region
- Exclude recovered colored text inside graphic regions
- Reject color recovery regions wider than 4x median word height

Fixes garbage characters (!, ?, •) in box zones and false OCR detections (N, ?) in image areas.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 11:20:07 +01:00
parent bbf0a5720e
commit 872b47f691
2 changed files with 43 additions and 0 deletions
@@ -256,6 +256,9 @@ def recover_colored_text(
            bx, by, bw, bh = cv2.boundingRect(cnt)
            if bh < 6:
                continue
+            # Reject regions too wide to be single characters
+            if bw > median_h * 4:
+                continue
            candidates.append((area, bx, by, bw, bh))

        # Keep largest first, limited count
@@ -613,6 +613,36 @@ async def build_grid(session_id: str):
    logger.info("build-grid session %s: %d words from %d cells",
                session_id, len(all_words), len(word_result["cells"]))

+    # 2b. Filter words inside detected graphic/image regions
+    structure_result = session.get("structure_result")
+    graphic_rects = []
+    if structure_result:
+        for g in structure_result.get("graphics", []):
+            graphic_rects.append({
+                "x": g["x"], "y": g["y"],
+                "w": g["w"], "h": g["h"],
+            })
+    if graphic_rects:
+        before = len(all_words)
+        filtered = []
+        for w in all_words:
+            w_cx = w["left"] + w.get("width", 0) / 2
+            w_cy = w["top"] + w.get("height", 0) / 2
+            inside = any(
+                gr["x"] <= w_cx <= gr["x"] + gr["w"]
+                and gr["y"] <= w_cy <= gr["y"] + gr["h"]
+                for gr in graphic_rects
+            )
+            if not inside:
+                filtered.append(w)
+        removed = before - len(filtered)
+        if removed:
+            all_words = filtered
+            logger.info(
+                "build-grid session %s: removed %d words inside %d graphic region(s)",
+                session_id, removed, len(graphic_rects),
+            )
+
    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
@@ -635,6 +665,16 @@ async def build_grid(session_id: str):
        if img_bgr is not None:
            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
+            if recovered and graphic_rects:
+                # Filter recovered chars inside graphic regions
+                recovered = [
+                    r for r in recovered
+                    if not any(
+                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                        for gr in graphic_rects
+                    )
+                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)