diff --git a/klausur-service/backend/cv_graphic_detect.py b/klausur-service/backend/cv_graphic_detect.py index 69c1d96..0997a01 100644 --- a/klausur-service/backend/cv_graphic_detect.py +++ b/klausur-service/backend/cv_graphic_detect.py @@ -1,11 +1,16 @@ """ Graphical element detection for OCR pages. -Finds non-text visual elements (arrows, balloons, icons, illustrations) -by subtracting known OCR word regions from the page ink and analysing -remaining connected components via contour shape metrics. +Two-pass approach: + Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored + arrows, icons) on the saturation channel alone. Black text has + zero saturation and is invisible on this channel, so no word + exclusion is needed. + Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting + OCR word boxes from the full ink mask and keeping only very large + remaining contours. -Works on both color and grayscale scans. +Boxes and text colors are handled by cv_box_detect / cv_color_detect. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. @@ -31,11 +36,11 @@ class GraphicElement: width: int height: int area: int - shape: str # arrow, circle, line, icon, illustration + shape: str # circle, illustration color_name: str # dominant color or 'black' color_hex: str confidence: float - contour: Any = field(default=None, repr=False) # numpy contour, excluded from repr + contour: Any = field(default=None, repr=False) # --------------------------------------------------------------------------- @@ -89,46 +94,6 @@ def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple: return name, _COLOR_HEX.get(name, _COLOR_HEX["black"]) -# --------------------------------------------------------------------------- -# Shape classification via contour analysis -# --------------------------------------------------------------------------- - -def _classify_shape( - contour: np.ndarray, - bw: int, - bh: int, - area: float, -) -> tuple: - """Classify contour shape → (shape_name, confidence). - - Only detects high-confidence shapes that are clearly non-text: - - circle/balloon: high circularity (very reliable) - - illustration: large area (clearly a drawing/image) - - Text fragments are classified as 'noise' and filtered out. - Boxes and colors are detected by separate modules. - """ - perimeter = cv2.arcLength(contour, True) - circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0 - aspect = bw / bh if bh > 0 else 1.0 - min_dim = min(bw, bh) - - # --- Circle / balloon --- - # High circularity is the most reliable non-text indicator. - # Text characters rarely have circularity > 0.55. - if circularity > 0.55 and 0.5 < aspect < 2.0 and min_dim > 15: - conf = min(0.95, circularity) - return "circle", conf - - # --- Illustration (drawing, image, large graphic) --- - # Large connected regions that survived word exclusion = genuine graphics. - if area > 3000 and min_dim > 30: - return "illustration", 0.6 - - # Everything else is likely a text fragment — skip - return "noise", 0.0 - - # --------------------------------------------------------------------------- # Main detection # --------------------------------------------------------------------------- @@ -137,24 +102,20 @@ def detect_graphic_elements( img_bgr: np.ndarray, word_boxes: List[Dict], detected_boxes: Optional[List[Dict]] = None, - min_area: int = 80, - max_area_ratio: float = 0.25, - word_pad: int = 5, max_elements: int = 50, ) -> List[GraphicElement]: """Find non-text graphical elements on the page. - 1. Build ink mask (dark + colored pixels). - 2. Subtract OCR word regions and detected boxes. - 3. Find connected components and classify shapes. + Two-pass approach: + Pass 1 (color): Find colored elements via saturation channel. + No word exclusion needed — black text is invisible. + Pass 2 (ink): Find large black illustrations via ink mask minus + word exclusion. Args: img_bgr: BGR color image. word_boxes: List of OCR word dicts with left/top/width/height. detected_boxes: Optional list of detected box dicts (x/y/w/h). - min_area: Minimum contour area to keep (80 filters tiny noise). - max_area_ratio: Maximum area as fraction of image area. - word_pad: Padding around word boxes for exclusion (5px). max_elements: Maximum number of elements to return. Returns: @@ -164,27 +125,100 @@ def detect_graphic_elements( return [] h, w = img_bgr.shape[:2] - max_area = int(h * w * max_area_ratio) + img_area = h * w logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", w, h, len(word_boxes), len(detected_boxes or [])) - gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) + candidates: List[GraphicElement] = [] - # --- 1. Build ink mask: dark pixels + saturated colored pixels --- + # ===================================================================== + # PASS 1 — COLOR CHANNEL (no word exclusion needed) + # ===================================================================== + # Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible. + sat_mask = (hsv[:, :, 1] > 50).astype(np.uint8) * 255 + # Exclude very bright backgrounds (white/near-white with color cast) + val_mask = (hsv[:, :, 2] < 235).astype(np.uint8) * 255 + color_mask = cv2.bitwise_and(sat_mask, val_mask) + + # Morphological cleanup: close small gaps, remove speckle + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5)) + color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel) + kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open) + + contours_color, _ = cv2.findContours( + color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, + ) + logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color)) + + for cnt in contours_color: + area = cv2.contourArea(cnt) + if area < 80: + continue + + bx, by, bw, bh = cv2.boundingRect(cnt) + if bw < 8 or bh < 8: + continue + + # Skip page-spanning contours (background color cast) + if bw > w * 0.8 and bh > h * 0.8: + continue + + perimeter = cv2.arcLength(cnt, True) + circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0 + aspect = bw / bh if bh > 0 else 1.0 + min_dim = min(bw, bh) + + # Colored circle / balloon + if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12: + # Determine color + roi_hsv = hsv[by:by + bh, bx:bx + bw] + cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8) + cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1) + masked_hsv = roi_hsv[cnt_mask_roi > 0] + color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30) + + conf = min(0.95, circularity) + logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s", + bx, by, bw, bh, int(area), circularity, color_name) + candidates.append(GraphicElement( + x=bx, y=by, width=bw, height=bh, + area=int(area), shape="circle", + color_name=color_name, color_hex=color_hex, + confidence=conf, contour=cnt, + )) + continue + + # Colored illustration (large colored region) + if area > 2000 and min_dim > 20: + roi_hsv = hsv[by:by + bh, bx:bx + bw] + cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8) + cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1) + masked_hsv = roi_hsv[cnt_mask_roi > 0] + color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30) + + logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s", + bx, by, bw, bh, int(area), color_name) + candidates.append(GraphicElement( + x=bx, y=by, width=bw, height=bh, + area=int(area), shape="illustration", + color_name=color_name, color_hex=color_hex, + confidence=0.6, contour=cnt, + )) + continue + + # ===================================================================== + # PASS 2 — INK (dark pixels) with word exclusion + # Only for large black illustrations (drawings in black ink). + # ===================================================================== + gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) - # Saturated colored pixels (catches colored arrows, markers) - sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 - val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255 - color_ink = cv2.bitwise_and(sat_mask, val_mask) - - ink_mask = cv2.bitwise_or(dark_mask, color_ink) - - # --- 2. Build exclusion mask from OCR words --- + # Build exclusion mask from words exclusion = np.zeros((h, w), dtype=np.uint8) - + word_pad = 5 for wb in word_boxes: x1 = max(0, int(wb.get("left", 0)) - word_pad) y1 = max(0, int(wb.get("top", 0)) - word_pad) @@ -192,110 +226,82 @@ def detect_graphic_elements( y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) exclusion[y1:y2, x1:x2] = 255 - # Also exclude detected box interiors (they contain text, not graphics) - # But keep a border strip so arrows/icons at box edges are found + # Also exclude detected box regions if detected_boxes: - box_inset = 8 for box in detected_boxes: - bx = int(box.get("x", 0)) - by = int(box.get("y", 0)) + bbx = int(box.get("x", 0)) + bby = int(box.get("y", 0)) bbw = int(box.get("w", box.get("width", 0))) bbh = int(box.get("h", box.get("height", 0))) - x1 = max(0, bx + box_inset) - y1 = max(0, by + box_inset) - x2 = min(w, bx + bbw - box_inset) - y2 = min(h, by + bbh - box_inset) + inset = 8 + x1 = max(0, bbx + inset) + y1 = max(0, bby + inset) + x2 = min(w, bbx + bbw - inset) + y2 = min(h, bby + bbh - inset) if x2 > x1 and y2 > y1: exclusion[y1:y2, x1:x2] = 255 - excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0 - logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct) + ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion)) - # Subtract exclusion from ink - graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion)) + # Remove colored regions already found in pass 1 + ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask)) - # --- 3. Morphological cleanup --- - # Close small gaps (connects arrow stroke + head) — but not too large - # to avoid reconnecting text fragments - kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) - graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close) - # Remove small noise - kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) - graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open) - - # --- 4. Find contours --- - contours, _ = cv2.findContours( - graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, + # Only look for LARGE remaining regions (black illustrations) + contours_ink, _ = cv2.findContours( + ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) + logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink)) - logger.info("GraphicDetect: %d raw contours after exclusion", len(contours)) - - # --- 5. Analyse and classify --- - candidates: List[GraphicElement] = [] - skip_reasons: Dict[str, int] = {} - for cnt in contours: + for cnt in contours_ink: area = cv2.contourArea(cnt) - if area < min_area or area > max_area: - bx, by, bw, bh = cv2.boundingRect(cnt) - reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}" - logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh) - skip_reasons[f"area_filter"] = skip_reasons.get("area_filter", 0) + 1 - continue - bx, by, bw, bh = cv2.boundingRect(cnt) - if bw < 8 or bh < 8: - skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1 + min_dim = min(bw, bh) + + # Only large illustrations survive (area > 5000, min_dim > 40) + if area < 5000 or min_dim < 40: continue - # Skip elements that overlap significantly with the exclusion zone - roi_excl = exclusion[by:by + bh, bx:bx + bw] - excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0 - if excl_ratio > 0.4: - logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d", - excl_ratio, bx, by, bw, bh, int(area)) - skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1 + # Skip page-spanning contours + if bw > w * 0.8 and bh > h * 0.8: continue - # Classify shape - shape, conf = _classify_shape(cnt, bw, bh, area) - - # Skip noise (too small or text-like) - if shape == "noise": - logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d", - bx, by, bw, bh, int(area)) - skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1 - continue - - # Determine dominant color - roi_hsv = hsv[by:by + bh, bx:bx + bw] - cnt_mask = np.zeros((bh, bw), dtype=np.uint8) - shifted_cnt = cnt - np.array([bx, by]) - cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1) - masked_hsv = roi_hsv[cnt_mask > 0] - color_name, color_hex = _dominant_color(masked_hsv) - - logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f", - shape, bx, by, bw, bh, int(area), color_name, conf) + logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d", + bx, by, bw, bh, int(area)) candidates.append(GraphicElement( x=bx, y=by, width=bw, height=bh, - area=int(area), - shape=shape, - color_name=color_name, - color_hex=color_hex, - confidence=conf, - contour=cnt, + area=int(area), shape="illustration", + color_name="black", color_hex="#000000", + confidence=0.5, contour=cnt, )) - if skip_reasons: - logger.info("GraphicDetect: skipped contours: %s", - ", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items()))) - - # Sort by area descending, limit count + # ===================================================================== + # Deduplicate overlapping results and return + # ===================================================================== candidates.sort(key=lambda g: g.area, reverse=True) - result = candidates[:max_elements] + + # Remove duplicates where bounding boxes overlap > 50% + final: List[GraphicElement] = [] + for c in candidates: + overlap = False + for f in final: + # Intersection + ix1 = max(c.x, f.x) + iy1 = max(c.y, f.y) + ix2 = min(c.x + c.width, f.x + f.width) + iy2 = min(c.y + c.height, f.y + f.height) + if ix2 > ix1 and iy2 > iy1: + inter = (ix2 - ix1) * (iy2 - iy1) + smaller = min(c.width * c.height, f.width * f.height) + if smaller > 0 and inter / smaller > 0.5: + overlap = True + break + if not overlap: + final.append(c) + + result = final[:max_elements] if result: - shape_counts = {} + shape_counts: Dict[str, int] = {} for g in result: shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1 logger.info(