From 872b47f69116a917844e0c376db347dda05e002f Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Tue, 17 Mar 2026 11:20:07 +0100 Subject: [PATCH] fix: filter words and color recoveries inside graphic/image regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Load structure_result from session to get detected graphic bounds - Exclude OCR words whose center falls inside a graphic region - Exclude recovered colored text inside graphic regions - Reject color recovery regions wider than 4x median word height Fixes garbage characters (!, ?, •) in box zones and false OCR detections (N, ?) in image areas. Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_color_detect.py | 3 ++ klausur-service/backend/grid_editor_api.py | 40 ++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/klausur-service/backend/cv_color_detect.py b/klausur-service/backend/cv_color_detect.py index beaa55a..bfcc5f0 100644 --- a/klausur-service/backend/cv_color_detect.py +++ b/klausur-service/backend/cv_color_detect.py @@ -256,6 +256,9 @@ def recover_colored_text( bx, by, bw, bh = cv2.boundingRect(cnt) if bh < 6: continue + # Reject regions too wide to be single characters + if bw > median_h * 4: + continue candidates.append((area, bx, by, bw, bh)) # Keep largest first, limited count diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 03926c6..181a72f 100644 --- a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -613,6 +613,36 @@ async def build_grid(session_id: str): logger.info("build-grid session %s: %d words from %d cells", session_id, len(all_words), len(word_result["cells"])) + # 2b. Filter words inside detected graphic/image regions + structure_result = session.get("structure_result") + graphic_rects = [] + if structure_result: + for g in structure_result.get("graphics", []): + graphic_rects.append({ + "x": g["x"], "y": g["y"], + "w": g["w"], "h": g["h"], + }) + if graphic_rects: + before = len(all_words) + filtered = [] + for w in all_words: + w_cx = w["left"] + w.get("width", 0) / 2 + w_cy = w["top"] + w.get("height", 0) / 2 + inside = any( + gr["x"] <= w_cx <= gr["x"] + gr["w"] + and gr["y"] <= w_cy <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + if not inside: + filtered.append(w) + removed = before - len(filtered) + if removed: + all_words = filtered + logger.info( + "build-grid session %s: removed %d words inside %d graphic region(s)", + session_id, removed, len(graphic_rects), + ) + # 3. Load image for box detection img_png = await get_session_image(session_id, "cropped") if not img_png: @@ -635,6 +665,16 @@ async def build_grid(session_id: str): if img_bgr is not None: # --- Recover colored text that OCR missed (before grid building) --- recovered = recover_colored_text(img_bgr, all_words) + if recovered and graphic_rects: + # Filter recovered chars inside graphic regions + recovered = [ + r for r in recovered + if not any( + gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + ] if recovered: recovered_count = len(recovered) all_words.extend(recovered)