diff --git a/klausur-service/backend/cv_graphic_detect.py b/klausur-service/backend/cv_graphic_detect.py
index 2f66efd..fb9f5c3 100644
--- a/klausur-service/backend/cv_graphic_detect.py
+++ b/klausur-service/backend/cv_graphic_detect.py
@@ -170,7 +170,7 @@ def detect_graphic_elements(
             continue

         # Skip page-spanning regions
-        if bw > w * 0.5 or bh > h * 0.5:
+        if bw > w * 0.6 or bh > h * 0.6:
             logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d",
                          bx, by, bw, bh)
             continue
@@ -232,12 +232,16 @@ def detect_graphic_elements(
         if color_pixel_count < 200:
             continue

-        # (d) Very low density → thin strokes, almost certainly text
-        if density < 0.20:
+        # (d) Very low density → thin strokes, almost certainly text.
+        # Large regions (photos/illustrations) can have low color density
+        # because most pixels are grayscale ink. Use a lower threshold
+        # for regions bigger than 100×80 px.
+        _min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
+        if density < _min_density:
             logger.info(
                 "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
-                "density=%.0f%% (likely colored text)",
-                bx, by, bw, bh, density * 100,
+                "density=%.0f%% (min=%.0f%%, likely colored text)",
+                bx, by, bw, bh, density * 100, _min_density * 100,
             )
             continue

diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index ae1d063..bc34694 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -21,6 +21,7 @@ import numpy as np
 from fastapi import APIRouter, HTTPException, Request

 from cv_box_detect import detect_boxes, split_page_into_zones
+from cv_graphic_detect import detect_graphic_elements
 from cv_vocab_types import PageZone
 from cv_color_detect import detect_word_colors, recover_colored_text
 from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
@@ -1469,13 +1470,12 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 session_id, removed, len(exclude_rects),
             )

-    # 2e. Filter words inside detected graphic/image regions
-    # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
-    # High-confidence words are real text even if they overlap a detected
-    # graphic region (e.g. colored text that graphic detection couldn't
-    # fully distinguish from an image).
-    _GRAPHIC_CONF_THRESHOLD = 50  # keep words with conf >= 50
-    graphic_rects = []
+    # 2e. Hard-filter words inside graphic/image regions from structure step.
+    # ALL words inside graphic regions are removed regardless of confidence —
+    # images cannot contain real text; any OCR words inside are artifacts.
+    # After image loading (Step 3a) we augment these with freshly detected
+    # graphic regions from cv_graphic_detect.
+    graphic_rects: List[Dict[str, int]] = []
     if structure_result:
         for g in structure_result.get("graphics", []):
             graphic_rects.append({
@@ -1484,23 +1484,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
             })
     if graphic_rects:
         before = len(all_words)
-        filtered = []
-        for w in all_words:
-            w_cx = w["left"] + w.get("width", 0) / 2
-            w_cy = w["top"] + w.get("height", 0) / 2
-            inside = any(
-                gr["x"] <= w_cx <= gr["x"] + gr["w"]
-                and gr["y"] <= w_cy <= gr["y"] + gr["h"]
+        all_words = [
+            w for w in all_words
+            if not any(
+                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                 for gr in graphic_rects
             )
-            if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
-                continue  # remove low-confidence artifact
-            filtered.append(w)
-        removed = before - len(filtered)
+        ]
+        removed = before - len(all_words)
         if removed:
-            all_words = filtered
             logger.info(
-                "build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
+                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                 session_id, removed, len(graphic_rects),
             )

@@ -1525,6 +1520,39 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:

         img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
         if img_bgr is not None:
+            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
+            # Pass only significant words (len >= 3) to the detector so that
+            # short OCR artifacts inside images don't fool the text-vs-graphic
+            # heuristic (it counts word centroids to distinguish text from images).
+            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
+            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
+            if fresh_graphics:
+                fresh_rects = [
+                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
+                    for g in fresh_graphics
+                ]
+                graphic_rects.extend(fresh_rects)
+                logger.info(
+                    "build-grid session %s: detected %d graphic region(s) via CV",
+                    session_id, len(fresh_graphics),
+                )
+                # Hard-filter words inside newly detected graphic regions
+                before = len(all_words)
+                all_words = [
+                    w for w in all_words
+                    if not any(
+                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
+                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
+                        for gr in fresh_rects
+                    )
+                ]
+                removed = before - len(all_words)
+                if removed:
+                    logger.info(
+                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
+                        session_id, removed, len(fresh_rects),
+                    )
+
             # --- Recover colored text that OCR missed (before grid building) ---
             recovered = recover_colored_text(img_bgr, all_words)
             if recovered and graphic_rects: