""" Graphical element detection for OCR pages. Finds non-text visual elements (arrows, balloons, icons, illustrations) by subtracting known OCR word regions from the page ink and analysing remaining connected components via contour shape metrics. Works on both color and grayscale scans. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging from dataclasses import dataclass, field from typing import Any, Dict, List, Optional import cv2 import numpy as np logger = logging.getLogger(__name__) __all__ = ["detect_graphic_elements", "GraphicElement"] @dataclass class GraphicElement: """A detected non-text graphical element.""" x: int y: int width: int height: int area: int shape: str # arrow, circle, line, icon, illustration color_name: str # dominant color or 'black' color_hex: str confidence: float contour: Any = field(default=None, repr=False) # numpy contour, excluded from repr # --------------------------------------------------------------------------- # Color helpers # --------------------------------------------------------------------------- _COLOR_HEX = { "black": "#000000", "gray": "#6b7280", "red": "#dc2626", "orange": "#ea580c", "yellow": "#ca8a04", "green": "#16a34a", "blue": "#2563eb", "purple": "#9333ea", } def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple: """Return (color_name, color_hex) for an HSV region.""" if hsv_roi.size == 0: return "black", _COLOR_HEX["black"] pixels = hsv_roi.reshape(-1, 3) sat = pixels[:, 1] sat_mask = sat > sat_threshold sat_ratio = np.sum(sat_mask) / len(pixels) if len(pixels) > 0 else 0 if sat_ratio < 0.15: return "black", _COLOR_HEX["black"] sat_pixels = pixels[sat_mask] if len(sat_pixels) < 3: return "black", _COLOR_HEX["black"] med_hue = float(np.median(sat_pixels[:, 0])) if med_hue < 10 or med_hue > 170: name = "red" elif med_hue < 25: name = "orange" elif med_hue < 35: name = "yellow" elif med_hue < 85: name = "green" elif med_hue < 130: name = "blue" else: name = "purple" return name, _COLOR_HEX.get(name, _COLOR_HEX["black"]) # --------------------------------------------------------------------------- # Shape classification via contour analysis # --------------------------------------------------------------------------- def _classify_shape( contour: np.ndarray, bw: int, bh: int, area: float, ) -> tuple: """Classify contour shape → (shape_name, confidence). Uses circularity, aspect ratio, solidity, and vertex count. """ aspect = bw / bh if bh > 0 else 1.0 perimeter = cv2.arcLength(contour, True) circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0 hull = cv2.convexHull(contour) hull_area = cv2.contourArea(hull) solidity = area / hull_area if hull_area > 0 else 0 # Approximate polygon epsilon = 0.03 * perimeter approx = cv2.approxPolyDP(contour, epsilon, True) vertices = len(approx) # --- Arrow detection --- # Arrows typically have: vertices 5-8, moderate solidity (0.4-0.8), # moderate aspect ratio, low circularity if 4 <= vertices <= 9 and 0.3 < solidity < 0.85 and circularity < 0.5: # Check for a pointed tip via convexity defects hull_idx = cv2.convexHull(contour, returnPoints=False) if len(hull_idx) >= 4: try: defects = cv2.convexityDefects(contour, hull_idx) if defects is not None and len(defects) >= 1: # Significant defect = pointed shape max_depth = max(d[0][3] for d in defects) / 256.0 if max_depth > min(bw, bh) * 0.15: return "arrow", min(0.75, 0.5 + max_depth / max(bw, bh)) except cv2.error: pass # --- Circle / balloon --- if circularity > 0.65 and 0.5 < aspect < 2.0: conf = min(0.95, circularity) return "circle", conf # --- Line --- if aspect > 4.0 or aspect < 0.25: return "line", 0.7 # --- Exclamation mark (tall narrow + high solidity) --- if aspect < 0.45 and bh > 12 and solidity > 0.5: return "exclamation", 0.7 # --- Dot / bullet (small, roughly square, high solidity) --- if max(bw, bh) < 20 and 0.5 < aspect < 2.0 and solidity > 0.6: return "dot", 0.6 # --- Larger illustration --- if area > 2000: return "illustration", 0.5 # --- Generic icon --- return "icon", 0.4 # --------------------------------------------------------------------------- # Main detection # --------------------------------------------------------------------------- def detect_graphic_elements( img_bgr: np.ndarray, word_boxes: List[Dict], detected_boxes: Optional[List[Dict]] = None, min_area: int = 30, max_area_ratio: float = 0.05, word_pad: int = 3, max_elements: int = 80, ) -> List[GraphicElement]: """Find non-text graphical elements on the page. 1. Build ink mask (dark + colored pixels). 2. Subtract OCR word regions and detected boxes. 3. Find connected components and classify shapes. Args: img_bgr: BGR color image. word_boxes: List of OCR word dicts with left/top/width/height. detected_boxes: Optional list of detected box dicts (x/y/w/h). min_area: Minimum contour area to keep. max_area_ratio: Maximum area as fraction of image area. word_pad: Padding around word boxes for exclusion. max_elements: Maximum number of elements to return. Returns: List of GraphicElement, sorted by area descending. """ if img_bgr is None: return [] h, w = img_bgr.shape[:2] max_area = int(h * w * max_area_ratio) gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) # --- 1. Build ink mask: dark pixels + saturated colored pixels --- # Adaptive threshold for dark ink _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Saturated colored pixels (catches colored arrows, markers) sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 # Only include saturated pixels that are also reasonably dark (not background) val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255 color_ink = cv2.bitwise_and(sat_mask, val_mask) ink_mask = cv2.bitwise_or(dark_mask, color_ink) # --- 2. Build exclusion mask from OCR words --- exclusion = np.zeros((h, w), dtype=np.uint8) for wb in word_boxes: x1 = max(0, int(wb.get("left", 0)) - word_pad) y1 = max(0, int(wb.get("top", 0)) - word_pad) x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)) + word_pad) y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) exclusion[y1:y2, x1:x2] = 255 # Also exclude detected box interiors (they contain text, not graphics) # But keep a border strip so arrows/icons at box edges are found if detected_boxes: box_inset = 8 for box in detected_boxes: bx = int(box.get("x", 0)) by = int(box.get("y", 0)) bbw = int(box.get("w", box.get("width", 0))) bbh = int(box.get("h", box.get("height", 0))) x1 = max(0, bx + box_inset) y1 = max(0, by + box_inset) x2 = min(w, bx + bbw - box_inset) y2 = min(h, by + bbh - box_inset) if x2 > x1 and y2 > y1: exclusion[y1:y2, x1:x2] = 255 # Subtract exclusion from ink graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion)) # --- 3. Morphological cleanup --- # Close small gaps (connects arrow stroke + head) kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close) # Remove tiny noise kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open) # --- 4. Find contours --- contours, _ = cv2.findContours( graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) # --- 5. Analyse and classify --- candidates: List[GraphicElement] = [] for cnt in contours: area = cv2.contourArea(cnt) if area < min_area or area > max_area: continue bx, by, bw, bh = cv2.boundingRect(cnt) if bw < 4 or bh < 4: continue # Skip elements that are mostly inside the exclusion zone # (partial overlap with a word) roi_excl = exclusion[by:by + bh, bx:bx + bw] excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0 if excl_ratio > 0.6: continue # Classify shape shape, conf = _classify_shape(cnt, bw, bh, area) # Determine dominant color roi_hsv = hsv[by:by + bh, bx:bx + bw] # Only sample pixels that are actually in the contour cnt_mask = np.zeros((bh, bw), dtype=np.uint8) shifted_cnt = cnt - np.array([bx, by]) cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1) masked_hsv = roi_hsv[cnt_mask > 0] color_name, color_hex = _dominant_color(masked_hsv) candidates.append(GraphicElement( x=bx, y=by, width=bw, height=bh, area=int(area), shape=shape, color_name=color_name, color_hex=color_hex, confidence=conf, contour=cnt, )) # Sort by area descending, limit count candidates.sort(key=lambda g: g.area, reverse=True) result = candidates[:max_elements] if result: shape_counts = {} for g in result: shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1 logger.info( "GraphicDetect: %d elements found (%s)", len(result), ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())), ) return result