diff --git a/klausur-service/backend/cv_graphic_detect.py b/klausur-service/backend/cv_graphic_detect.py index 911ee5e..773966a 100644 --- a/klausur-service/backend/cv_graphic_detect.py +++ b/klausur-service/backend/cv_graphic_detect.py @@ -1,14 +1,13 @@ """ Graphical element detection for OCR pages. -Two-pass approach: - Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored - arrows, icons) on the saturation channel alone. Black text has - zero saturation and is invisible on this channel, so no word - exclusion is needed. - Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting - OCR word boxes from the full ink mask and keeping only very large - remaining contours. +Region-based approach: + 1. Build a color mask (saturation channel — black text is invisible). + 2. Dilate heavily to merge nearby colored pixels into regions. + 3. For each region, check overlap with OCR word boxes: + - High word overlap → colored text (skip) + - Low word overlap → colored graphic / image (keep) + 4. Separately detect large black-ink illustrations via ink mask. Boxes and text colors are handled by cv_box_detect / cv_color_detect. @@ -36,7 +35,7 @@ class GraphicElement: width: int height: int area: int - shape: str # circle, illustration + shape: str # image, illustration color_name: str # dominant color or 'black' color_hex: str confidence: float @@ -59,7 +58,7 @@ _COLOR_HEX = { } -def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple: +def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple: """Return (color_name, color_hex) for an HSV region.""" if hsv_roi.size == 0: return "black", _COLOR_HEX["black"] @@ -104,13 +103,10 @@ def detect_graphic_elements( detected_boxes: Optional[List[Dict]] = None, max_elements: int = 50, ) -> List[GraphicElement]: - """Find non-text graphical elements on the page. + """Find non-text graphical regions on the page. 
- Two-pass approach: - Pass 1 (color): Find colored elements via saturation channel. - No word exclusion needed — black text is invisible. - Pass 2 (ink): Find large black illustrations via ink mask minus - word exclusion. + Region-based: dilate color mask to form regions, then check word + overlap to distinguish colored text from colored graphics. Args: img_bgr: BGR color image. @@ -133,89 +129,104 @@ def detect_graphic_elements( hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) candidates: List[GraphicElement] = [] + # --- Build word mask (for overlap checking) --- + word_mask = np.zeros((h, w), dtype=np.uint8) + for wb in word_boxes: + x1 = max(0, int(wb.get("left", 0))) + y1 = max(0, int(wb.get("top", 0))) + x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0))) + y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0))) + word_mask[y1:y2, x1:x2] = 255 + # ===================================================================== - # PASS 1 — COLOR CHANNEL (no word exclusion needed) + # PASS 1 — COLORED IMAGE REGIONS # ===================================================================== - # Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible. + # Color mask: saturated pixels (black text has sat ≈ 0 → invisible) sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 - # Exclude very bright backgrounds (white/near-white with color cast) val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255 - color_mask = cv2.bitwise_and(sat_mask, val_mask) + color_pixels = cv2.bitwise_and(sat_mask, val_mask) - # Only remove tiny speckle — NO closing, which would merge nearby - # colored elements into one giant blob spanning half the page. 
+ # Remove tiny speckle kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) - color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open) + color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open) - contours_color, _ = cv2.findContours( - color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, + # Keep a copy of the undilated color mask (used later to count real colored pixels per region) + color_pixel_raw = color_pixels.copy() + + # Heavy dilation to merge nearby colored elements into regions. + # A 25x25 kernel grows each element by ~12px, so elements up to ~24px apart merge. + kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25)) + region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1) + + contours_regions, _ = cv2.findContours( + region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) - logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color)) - - for cnt in contours_color: - area = cv2.contourArea(cnt) - if area < 80: - continue + logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions)) + for cnt in contours_regions: bx, by, bw, bh = cv2.boundingRect(cnt) - if bw < 8 or bh < 8: + + # Skip tiny regions + if bw < 15 or bh < 15: continue - # Skip page-spanning contours (background color cast / merged blobs) - if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10: + # Skip page-spanning regions + if bw > w * 0.5 or bh > h * 0.5: + logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh) continue - perimeter = cv2.arcLength(cnt, True) - circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0 - aspect = bw / bh if bh > 0 else 1.0 - min_dim = min(bw, bh) + bbox_area = bw * bh - # Colored circle / balloon - if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12: - # Determine color - roi_hsv = hsv[by:by + bh, bx:bx + bw] - cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8) - cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, 
by])], -1, 255, -1) - masked_hsv = roi_hsv[cnt_mask_roi > 0] - color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30) + # Check: how much of this region's bounding box overlaps with words? + roi_words = word_mask[by:by + bh, bx:bx + bw] + word_pixel_count = int(np.sum(roi_words > 0)) + word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0 - conf = min(0.95, circularity) - logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s", - bx, by, bw, bh, int(area), circularity, color_name) - candidates.append(GraphicElement( - x=bx, y=by, width=bw, height=bh, - area=int(area), shape="circle", - color_name=color_name, color_hex=color_hex, - confidence=conf, contour=cnt, - )) + # Check: how many actual colored pixels are in this region? + roi_color = color_pixel_raw[by:by + bh, bx:bx + bw] + color_pixel_count = int(np.sum(roi_color > 0)) + + # If most of the region is covered by word boxes → colored text, skip + if word_overlap > 0.5: + logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%", + bx, by, bw, bh, word_overlap * 100) continue - # Colored illustration (large colored region) - if area > 2000 and min_dim > 20: - roi_hsv = hsv[by:by + bh, bx:bx + bw] - cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8) - cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1) - masked_hsv = roi_hsv[cnt_mask_roi > 0] - color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30) - - logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s", - bx, by, bw, bh, int(area), color_name) - candidates.append(GraphicElement( - x=bx, y=by, width=bw, height=bh, - area=int(area), shape="illustration", - color_name=color_name, color_hex=color_hex, - confidence=0.6, contour=cnt, - )) + # Need a minimum number of colored pixels (not just dilated area) + if color_pixel_count < 200: continue + # Determine dominant color from the actual colored pixels + 
roi_hsv = hsv[by:by + bh, bx:bx + bw] + color_px_mask = roi_color > 0 + if np.sum(color_px_mask) > 0: + masked_hsv = roi_hsv[color_px_mask] + color_name, color_hex = _dominant_color(masked_hsv) + else: + color_name, color_hex = "black", _COLOR_HEX["black"] + + # Confidence from color density only (regions with high word overlap were already skipped) + density = color_pixel_count / bbox_area if bbox_area > 0 else 0 + conf = min(0.95, 0.5 + density * 0.5) + + logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d " + "color_px=%d word_overlap=%.0f%% color=%s", + bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name) + candidates.append(GraphicElement( + x=bx, y=by, width=bw, height=bh, + area=color_pixel_count, + shape="image", + color_name=color_name, color_hex=color_hex, + confidence=round(conf, 2), contour=cnt, + )) + # ===================================================================== - # PASS 2 — INK (dark pixels) with word exclusion - # Only for large black illustrations (drawings in black ink). 
+ # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS # ===================================================================== gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - # Build exclusion mask from words + # Exclude words and colored regions already found exclusion = np.zeros((h, w), dtype=np.uint8) word_pad = 5 for wb in word_boxes: @@ -225,7 +236,6 @@ def detect_graphic_elements( y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) exclusion[y1:y2, x1:x2] = 255 - # Also exclude detected box regions if detected_boxes: for box in detected_boxes: bbx = int(box.get("x", 0)) @@ -241,11 +251,8 @@ def detect_graphic_elements( exclusion[y1:y2, x1:x2] = 255 ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion)) + ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels)) - # Remove colored regions already found in pass 1 - ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask)) - - # Only look for LARGE remaining regions (black illustrations) contours_ink, _ = cv2.findContours( ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) @@ -254,14 +261,10 @@ def detect_graphic_elements( for cnt in contours_ink: area = cv2.contourArea(cnt) bx, by, bw, bh = cv2.boundingRect(cnt) - min_dim = min(bw, bh) - # Only large illustrations survive (area > 5000, min_dim > 40) - if area < 5000 or min_dim < 40: + if area < 5000 or min(bw, bh) < 40: continue - - # Skip page-spanning contours - if bw > w * 0.8 and bh > h * 0.8: + if bw > w * 0.8 or bh > h * 0.8: continue logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d", @@ -274,16 +277,14 @@ def detect_graphic_elements( )) # ===================================================================== - # Deduplicate overlapping results and return + # Deduplicate and return # ===================================================================== candidates.sort(key=lambda g: g.area, reverse=True) - 
# Remove duplicates where bounding boxes overlap > 50% final: List[GraphicElement] = [] for c in candidates: overlap = False for f in final: - # Intersection ix1 = max(c.x, f.x) iy1 = max(c.y, f.y) ix2 = min(c.x + c.width, f.x + f.width)