""" Graphical element detection for OCR pages. Region-based approach: 1. Build a color mask (saturation channel — black text is invisible). 2. Dilate heavily to merge nearby colored pixels into regions. 3. For each region, check overlap with OCR word boxes: - High word overlap → colored text (skip) - Low word overlap → colored graphic / image (keep) 4. Separately detect large black-ink illustrations via ink mask. Boxes and text colors are handled by cv_box_detect / cv_color_detect. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from dataclasses import dataclass, field from typing import Any, Dict, List, Optional import cv2 import numpy as np logger = logging.getLogger(__name__) __all__ = ["detect_graphic_elements", "GraphicElement"] @dataclass class GraphicElement: """A detected non-text graphical element.""" x: int y: int width: int height: int area: int shape: str # image, illustration color_name: str # dominant color or 'black' color_hex: str confidence: float contour: Any = field(default=None, repr=False) # --------------------------------------------------------------------------- # Color helpers # --------------------------------------------------------------------------- _COLOR_HEX = { "black": "#000000", "gray": "#6b7280", "red": "#dc2626", "orange": "#ea580c", "yellow": "#ca8a04", "green": "#16a34a", "blue": "#2563eb", "purple": "#9333ea", } def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple: """Return (color_name, color_hex) for an HSV region.""" if hsv_roi.size == 0: return "black", _COLOR_HEX["black"] pixels = hsv_roi.reshape(-1, 3) sat = pixels[:, 1] sat_mask = sat > sat_threshold sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0 if sat_ratio < 0.15: return "black", _COLOR_HEX["black"] sat_pixels = pixels[sat_mask] if len(sat_pixels) < 3: return "black", _COLOR_HEX["black"] med_hue = float(np.median(sat_pixels[:, 0])) if med_hue < 10 or med_hue > 170: name = "red" elif med_hue < 25: name = "orange" elif med_hue < 35: name = "yellow" elif med_hue < 85: name = "green" elif med_hue < 130: name = "blue" else: name = "purple" return name, _COLOR_HEX.get(name, _COLOR_HEX["black"]) # --------------------------------------------------------------------------- # Main detection # --------------------------------------------------------------------------- def detect_graphic_elements( img_bgr: np.ndarray, word_boxes: List[Dict], detected_boxes: Optional[List[Dict]] = None, max_elements: int = 50, ) -> List[GraphicElement]: """Find non-text graphical regions on the page. Region-based: dilate color mask to form regions, then check word overlap to distinguish colored text from colored graphics. Args: img_bgr: BGR color image. word_boxes: List of OCR word dicts with left/top/width/height. detected_boxes: Optional list of detected box dicts (x/y/w/h). max_elements: Maximum number of elements to return. Returns: List of GraphicElement, sorted by area descending. """ if img_bgr is None: return [] h, w = img_bgr.shape[:2] logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", w, h, len(word_boxes), len(detected_boxes or [])) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) candidates: List[GraphicElement] = [] # --- Build word mask (for overlap checking) --- word_mask = np.zeros((h, w), dtype=np.uint8) for wb in word_boxes: x1 = max(0, int(wb.get("left", 0))) y1 = max(0, int(wb.get("top", 0))) x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0))) y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0))) word_mask[y1:y2, x1:x2] = 255 # ===================================================================== # PASS 1 — COLORED IMAGE REGIONS # ===================================================================== # Color mask: saturated pixels (black text has sat ≈ 0 → invisible) sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255 color_pixels = cv2.bitwise_and(sat_mask, val_mask) # Remove tiny speckle kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open) # Count raw colored pixels before dilation (for density check later) color_pixel_raw = color_pixels.copy() # Heavy dilation to merge nearby colored elements into regions. # A 25x25 kernel merges elements within ~12px of each other. kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25)) region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1) contours_regions, _ = cv2.findContours( region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions)) for cnt in contours_regions: bx, by, bw, bh = cv2.boundingRect(cnt) # Skip tiny regions if bw < 15 or bh < 15: continue # Skip page-spanning regions if bw > w * 0.5 or bh > h * 0.5: logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh) continue bbox_area = bw * bh # Check: how much of this region's bounding box overlaps with words? roi_words = word_mask[by:by + bh, bx:bx + bw] word_pixel_count = int(np.sum(roi_words > 0)) word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0 # Check: how many OCR word centroids fall inside this region? # Colored text that OCR detected will have multiple centroids inside. # Actual images may have 0-1 spurious OCR artifacts. word_centroid_count = sum( 1 for wb in word_boxes if (bx <= int(wb.get("left", 0) + wb.get("width", 0) / 2) <= bx + bw and by <= int(wb.get("top", 0) + wb.get("height", 0) / 2) <= by + bh) ) # Check: how many actual colored pixels are in this region? roi_color = color_pixel_raw[by:by + bh, bx:bx + bw] color_pixel_count = int(np.sum(roi_color > 0)) # Color pixel density (before any skip checks so we can log it) density = color_pixel_count / bbox_area if bbox_area > 0 else 0 # --- Skip heuristics for colored TEXT (not images) --- # (a) High word-box pixel overlap → clearly text if word_overlap > 0.40: logger.info( "GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d " "overlap=%.0f%% centroids=%d", bx, by, bw, bh, word_overlap * 100, word_centroid_count, ) continue # (b) Multiple OCR words detected inside → colored text # (images rarely produce 2+ confident word detections) if word_centroid_count >= 2: logger.info( "GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d " "centroids=%d overlap=%.0f%% density=%.0f%%", bx, by, bw, bh, word_centroid_count, word_overlap * 100, density * 100, ) continue # (c) Even 1 word + some pixel overlap → likely text if word_centroid_count >= 1 and word_overlap > 0.10: logger.info( "GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d " "centroids=%d overlap=%.0f%%", bx, by, bw, bh, word_centroid_count, word_overlap * 100, ) continue # Need a minimum number of colored pixels (not just dilated area) if color_pixel_count < 200: continue # (d) Very low density → thin strokes, almost certainly text if density < 0.20: logger.info( "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d " "density=%.0f%% (likely colored text)", bx, by, bw, bh, density * 100, ) continue # (e) Moderate density + small height → colored text line if density < 0.35 and bh < h * 0.05: logger.info( "GraphicDetect PASS1 skip text-height (%d,%d) %dx%d " "density=%.0f%% height=%.1f%%", bx, by, bw, bh, density * 100, 100.0 * bh / h, ) continue # Determine dominant color from the actual colored pixels roi_hsv = hsv[by:by + bh, bx:bx + bw] color_px_mask = roi_color > 0 if np.sum(color_px_mask) > 0: masked_hsv = roi_hsv[color_px_mask] color_name, color_hex = _dominant_color(masked_hsv) else: color_name, color_hex = "black", _COLOR_HEX["black"] # Confidence based on color density and low word overlap conf = min(0.95, 0.5 + density * 0.5) logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s", bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name) candidates.append(GraphicElement( x=bx, y=by, width=bw, height=bh, area=color_pixel_count, shape="image", color_name=color_name, color_hex=color_hex, confidence=round(conf, 2), contour=cnt, )) # ===================================================================== # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS # ===================================================================== gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Exclude words and colored regions already found exclusion = np.zeros((h, w), dtype=np.uint8) word_pad = 5 for wb in word_boxes: x1 = max(0, int(wb.get("left", 0)) - word_pad) y1 = max(0, int(wb.get("top", 0)) - word_pad) x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad) y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) exclusion[y1:y2, x1:x2] = 255 if detected_boxes: for box in detected_boxes: bbx = int(box.get("x", 0)) bby = int(box.get("y", 0)) bbw = int(box.get("w", box.get("width", 0))) bbh = int(box.get("h", box.get("height", 0))) inset = 8 x1 = max(0, bbx + inset) y1 = max(0, bby + inset) x2 = min(w, bbx + bbw - inset) y2 = min(h, bby + bbh - inset) if x2 > x1 and y2 > y1: exclusion[y1:y2, x1:x2] = 255 ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion)) ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels)) contours_ink, _ = cv2.findContours( ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink)) for cnt in contours_ink: area = cv2.contourArea(cnt) bx, by, bw, bh = cv2.boundingRect(cnt) if area < 5000 or min(bw, bh) < 40: continue if bw > w * 0.8 or bh > h * 0.8: continue logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d", bx, by, bw, bh, int(area)) candidates.append(GraphicElement( x=bx, y=by, width=bw, height=bh, area=int(area), shape="illustration", color_name="black", color_hex="#000000", confidence=0.5, contour=cnt, )) # ===================================================================== # Deduplicate and return # ===================================================================== candidates.sort(key=lambda g: g.area, reverse=True) final: List[GraphicElement] = [] for c in candidates: overlap = False for f in final: ix1 = max(c.x, f.x) iy1 = max(c.y, f.y) ix2 = min(c.x + c.width, f.x + f.width) iy2 = min(c.y + c.height, f.y + f.height) if ix2 > ix1 and iy2 > iy1: inter = (ix2 - ix1) * (iy2 - iy1) smaller = min(c.width * c.height, f.width * f.height) if smaller > 0 and inter / smaller > 0.5: overlap = True break if not overlap: final.append(c) result = final[:max_elements] if result: shape_counts: Dict[str, int] = {} for g in result: shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1 logger.info( "GraphicDetect: %d elements found (%s)", len(result), ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())), ) else: logger.info("GraphicDetect: no graphic elements found") return result