fix: filter words and color recoveries inside graphic/image regions

- Load structure_result from the session to get the detected graphic bounds
- Exclude OCR words whose center falls inside a graphic region
- Exclude recovered colored text inside graphic regions
- Reject color recovery regions wider than 4x the median word height

Fixes garbage characters (!, ?, •) in box zones and false OCR detections (N, ?) in image areas.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 11:20:07 +01:00
parent bbf0a5720e
commit 872b47f691
2 changed files with 43 additions and 0 deletions
@@ -256,6 +256,9 @@ def recover_colored_text(
            bx, by, bw, bh = cv2.boundingRect(cnt)
            if bh < 6:
                continue
            # Reject regions too wide to be single characters
            if bw > median_h * 4:
                continue
            candidates.append((area, bx, by, bw, bh))
        # Keep largest first, limited count
@@ -613,6 +613,36 @@ async def build_grid(session_id: str):
    logger.info("build-grid session %s: %d words from %d cells",
                session_id, len(all_words), len(word_result["cells"]))
    # 2b. Filter words inside detected graphic/image regions
    structure_result = session.get("structure_result")
    graphic_rects = []
    if structure_result:
        for g in structure_result.get("graphics", []):
            graphic_rects.append({
                "x": g["x"], "y": g["y"],
                "w": g["w"], "h": g["h"],
            })
    if graphic_rects:
        before = len(all_words)
        filtered = []
        for w in all_words:
            w_cx = w["left"] + w.get("width", 0) / 2
            w_cy = w["top"] + w.get("height", 0) / 2
            inside = any(
                gr["x"] <= w_cx <= gr["x"] + gr["w"]
                and gr["y"] <= w_cy <= gr["y"] + gr["h"]
                for gr in graphic_rects
            )
            if not inside:
                filtered.append(w)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
                "build-grid session %s: removed %d words inside %d graphic region(s)",
                session_id, removed, len(graphic_rects),
            )
    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
@@ -635,6 +665,16 @@ async def build_grid(session_id: str):
        if img_bgr is not None:
            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)