From fe7339c7a139ba4660b882836fecfc7636a513ce Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Mon, 16 Mar 2026 13:51:02 +0100 Subject: [PATCH] fix: suppress text fragments in graphic detection - Raise min_area from 30 to 200 (text fragments are small) - Raise word_pad from 3 to 10px (OCR bboxes are tight) - Reduce morph close kernel from 5x5 to 3x3 (avoid reconnecting text) - Enlarge morph open kernel from 2x2 to 3x3 (remove more speckle) - Tighten arrow detection: short side >20px, long side >30px, circularity<0.35, >=2 defects - Add 'noise' category for too-small elements, filter them out - Drop 'exclamation' and 'dot' categories - Raise min dimension from 4 to 8px - Tighten exclusion-overlap cutoff from 0.6 to 0.4 - Lower max_elements default from 80 to 50 - Add info-level logging for word count, exclusion coverage and raw contour count - Raise max_area_ratio from 0.05 to 0.25 (allow larger illustrations) Co-Authored-By: Claude Opus 4.6 --- klausur-service/backend/cv_graphic_detect.py | 104 +++++++++++-------- klausur-service/backend/ocr_pipeline_api.py | 4 + 2 files changed, 63 insertions(+), 45 deletions(-) diff --git a/klausur-service/backend/cv_graphic_detect.py b/klausur-service/backend/cv_graphic_detect.py index 5fadf9d..06da34f 100644 --- a/klausur-service/backend/cv_graphic_detect.py +++ b/klausur-service/backend/cv_graphic_detect.py @@ -102,6 +102,8 @@ def _classify_shape( """Classify contour shape → (shape_name, confidence). Uses circularity, aspect ratio, solidity, and vertex count. + Only classifies as arrow/circle/line if the element is large enough + to be a genuine graphic (not a text fragment). 
""" aspect = bw / bh if bh > 0 else 1.0 perimeter = cv2.arcLength(contour, True) @@ -116,46 +118,47 @@ def _classify_shape( approx = cv2.approxPolyDP(contour, epsilon, True) vertices = len(approx) - # --- Arrow detection --- - # Arrows typically have: vertices 5-8, moderate solidity (0.4-0.8), - # moderate aspect ratio, low circularity - if 4 <= vertices <= 9 and 0.3 < solidity < 0.85 and circularity < 0.5: - # Check for a pointed tip via convexity defects + min_dim = min(bw, bh) + max_dim = max(bw, bh) + + # --- Circle / balloon --- (check first, most reliable) + # Must be reasonably large (not a dot/period) + if circularity > 0.70 and 0.6 < aspect < 1.7 and min_dim > 25: + conf = min(0.95, circularity) + return "circle", conf + + # --- Arrow detection --- (strict: must be sizable, distinct shape) + # Shorter side must exceed 20px and longer side must exceed 30px + if (min_dim > 20 and max_dim > 30 + and 5 <= vertices <= 9 + and 0.35 < solidity < 0.80 + and circularity < 0.35): hull_idx = cv2.convexHull(contour, returnPoints=False) if len(hull_idx) >= 4: try: defects = cv2.convexityDefects(contour, hull_idx) - if defects is not None and len(defects) >= 1: - # Significant defect = pointed shape + if defects is not None and len(defects) >= 2: max_depth = max(d[0][3] for d in defects) / 256.0 - if max_depth > min(bw, bh) * 0.15: - return "arrow", min(0.75, 0.5 + max_depth / max(bw, bh)) + if max_depth > min_dim * 0.25: + return "arrow", min(0.75, 0.5 + max_depth / max_dim) except cv2.error: pass - # --- Circle / balloon --- - if circularity > 0.65 and 0.5 < aspect < 2.0: - conf = min(0.95, circularity) - return "circle", conf - - # --- Line --- - if aspect > 4.0 or aspect < 0.25: + # --- Line (decorative rule, separator) --- + # Must be long enough to not be a dash/hyphen + if (aspect > 6.0 or aspect < 1 / 6.0) and max_dim > 40: return "line", 0.7 - # --- Exclamation mark (tall narrow + high solidity) --- - if aspect < 0.45 and bh > 12 and solidity > 0.5: - return "exclamation", 
0.7 + # --- Larger illustration (drawing, image) --- + if area > 3000 and min_dim > 30: + return "illustration", 0.6 - # --- Dot / bullet (small, roughly square, high solidity) --- - if max(bw, bh) < 20 and 0.5 < aspect < 2.0 and solidity > 0.6: - return "dot", 0.6 + # --- Generic icon (moderate size, non-text shape) --- + if area > 500 and min_dim > 15: + return "icon", 0.4 - # --- Larger illustration --- - if area > 2000: - return "illustration", 0.5 - - # --- Generic icon --- - return "icon", 0.4 + # Everything else is too small or text-like — skip + return "noise", 0.0 # --------------------------------------------------------------------------- @@ -166,10 +169,10 @@ def detect_graphic_elements( img_bgr: np.ndarray, word_boxes: List[Dict], detected_boxes: Optional[List[Dict]] = None, - min_area: int = 30, - max_area_ratio: float = 0.05, - word_pad: int = 3, - max_elements: int = 80, + min_area: int = 200, + max_area_ratio: float = 0.25, + word_pad: int = 10, + max_elements: int = 50, ) -> List[GraphicElement]: """Find non-text graphical elements on the page. @@ -181,9 +184,9 @@ def detect_graphic_elements( img_bgr: BGR color image. word_boxes: List of OCR word dicts with left/top/width/height. detected_boxes: Optional list of detected box dicts (x/y/w/h). - min_area: Minimum contour area to keep. + min_area: Minimum contour area to keep (200 filters text fragments). max_area_ratio: Maximum area as fraction of image area. - word_pad: Padding around word boxes for exclusion. + word_pad: Padding around word boxes for exclusion (10px covers font edges). max_elements: Maximum number of elements to return. Returns: @@ -195,16 +198,17 @@ def detect_graphic_elements( h, w = img_bgr.shape[:2] max_area = int(h * w * max_area_ratio) + logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes", + w, h, len(word_boxes), len(detected_boxes or [])) + gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) # --- 1. 
Build ink mask: dark pixels + saturated colored pixels --- - # Adaptive threshold for dark ink _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Saturated colored pixels (catches colored arrows, markers) sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 - # Only include saturated pixels that are also reasonably dark (not background) val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255 color_ink = cv2.bitwise_and(sat_mask, val_mask) @@ -236,15 +240,19 @@ def detect_graphic_elements( if x2 > x1 and y2 > y1: exclusion[y1:y2, x1:x2] = 255 + excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0 + logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct) + # Subtract exclusion from ink graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion)) # --- 3. Morphological cleanup --- - # Close small gaps (connects arrow stroke + head) - kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + # Close small gaps (connects arrow stroke + head) — but not too large + # to avoid reconnecting text fragments + kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close) - # Remove tiny noise - kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) + # Remove small noise + kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open) # --- 4. Find contours --- @@ -252,6 +260,8 @@ def detect_graphic_elements( graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ) + logger.info("GraphicDetect: %d raw contours after exclusion", len(contours)) + # --- 5. 
Analyse and classify --- candidates: List[GraphicElement] = [] for cnt in contours: @@ -260,22 +270,24 @@ def detect_graphic_elements( continue bx, by, bw, bh = cv2.boundingRect(cnt) - if bw < 4 or bh < 4: + if bw < 8 or bh < 8: continue - # Skip elements that are mostly inside the exclusion zone - # (partial overlap with a word) + # Skip elements that overlap significantly with the exclusion zone roi_excl = exclusion[by:by + bh, bx:bx + bw] excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0 - if excl_ratio > 0.6: + if excl_ratio > 0.4: continue # Classify shape shape, conf = _classify_shape(cnt, bw, bh, area) + # Skip noise (too small or text-like) + if shape == "noise": + continue + # Determine dominant color roi_hsv = hsv[by:by + bh, bx:bx + bw] - # Only sample pixels that are actually in the contour cnt_mask = np.zeros((bh, bw), dtype=np.uint8) shifted_cnt = cnt - np.array([bx, by]) cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1) @@ -305,5 +317,7 @@ def detect_graphic_elements( len(result), ", ".join(f"{s}: {c}" for s, c in sorted(shape_counts.items())), ) + else: + logger.info("GraphicDetect: no graphic elements found") return result diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index f03f44d..6ba8b91 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1236,6 +1236,10 @@ async def detect_structure(session_id: str): for cell in word_result["cells"]: for wb in (cell.get("word_boxes") or []): words.append(wb) + logger.info("detect-structure: word_result present=%s, cells=%d, word_boxes extracted=%d", + word_result is not None, + len(word_result.get("cells", [])) if word_result else 0, + len(words)) # If no words yet, use image dimensions with small margin if words: content_x = max(0, min(int(wb["left"]) for wb in words))