feat: region-based graphic detection with word-overlap filtering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s

New approach: dilate the color mask heavily (25x25 kernel) to merge nearby
colored pixels into regions, then check how much of each region's bounding
box is covered by OCR word boxes:
- >50% overlap with OCR word boxes → colored text → skip
- ≤50% overlap → colored image/graphic → keep

This detects balloon clusters as one "image" region instead of trying
to classify individual shapes. Red words like "borrow/lend" are filtered
because they overlap with their word boxes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 14:49:15 +01:00
parent eeee61108a
commit 6668661895

View File

@@ -1,14 +1,13 @@
""" """
Graphical element detection for OCR pages. Graphical element detection for OCR pages.
Two-pass approach: Region-based approach:
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored 1. Build a color mask (saturation channel — black text is invisible).
arrows, icons) on the saturation channel alone. Black text has 2. Dilate heavily to merge nearby colored pixels into regions.
zero saturation and is invisible on this channel, so no word 3. For each region, check overlap with OCR word boxes:
exclusion is needed. - High word overlap → colored text (skip)
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting - Low word overlap → colored graphic / image (keep)
OCR word boxes from the full ink mask and keeping only very large 4. Separately detect large black-ink illustrations via ink mask.
remaining contours.
Boxes and text colors are handled by cv_box_detect / cv_color_detect. Boxes and text colors are handled by cv_box_detect / cv_color_detect.
@@ -36,7 +35,7 @@ class GraphicElement:
width: int width: int
height: int height: int
area: int area: int
shape: str # circle, illustration shape: str # image, illustration
color_name: str # dominant color or 'black' color_name: str # dominant color or 'black'
color_hex: str color_hex: str
confidence: float confidence: float
@@ -59,7 +58,7 @@ _COLOR_HEX = {
} }
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple: def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
"""Return (color_name, color_hex) for an HSV region.""" """Return (color_name, color_hex) for an HSV region."""
if hsv_roi.size == 0: if hsv_roi.size == 0:
return "black", _COLOR_HEX["black"] return "black", _COLOR_HEX["black"]
@@ -104,13 +103,10 @@ def detect_graphic_elements(
detected_boxes: Optional[List[Dict]] = None, detected_boxes: Optional[List[Dict]] = None,
max_elements: int = 50, max_elements: int = 50,
) -> List[GraphicElement]: ) -> List[GraphicElement]:
"""Find non-text graphical elements on the page. """Find non-text graphical regions on the page.
Two-pass approach: Region-based: dilate color mask to form regions, then check word
Pass 1 (color): Find colored elements via saturation channel. overlap to distinguish colored text from colored graphics.
No word exclusion needed — black text is invisible.
Pass 2 (ink): Find large black illustrations via ink mask minus
word exclusion.
Args: Args:
img_bgr: BGR color image. img_bgr: BGR color image.
@@ -133,89 +129,104 @@ def detect_graphic_elements(
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = [] candidates: List[GraphicElement] = []
# --- Build word mask (for overlap checking) ---
word_mask = np.zeros((h, w), dtype=np.uint8)
for wb in word_boxes:
x1 = max(0, int(wb.get("left", 0)))
y1 = max(0, int(wb.get("top", 0)))
x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
word_mask[y1:y2, x1:x2] = 255
# ===================================================================== # =====================================================================
# PASS 1 — COLOR CHANNEL (no word exclusion needed) # PASS 1 — COLORED IMAGE REGIONS
# ===================================================================== # =====================================================================
# Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible. # Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255 sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
# Exclude very bright backgrounds (white/near-white with color cast)
val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255 val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
color_mask = cv2.bitwise_and(sat_mask, val_mask) color_pixels = cv2.bitwise_and(sat_mask, val_mask)
# Only remove tiny speckle — NO closing, which would merge nearby # Remove tiny speckle
# colored elements into one giant blob spanning half the page.
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)) kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open) color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
contours_color, _ = cv2.findContours( # Count raw colored pixels before dilation (for density check later)
color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, color_pixel_raw = color_pixels.copy()
# Heavy dilation to merge nearby colored elements into regions.
# A 25x25 kernel grows each colored pixel by ~12px on every side, so
# elements separated by gaps of up to ~24px end up in the same region.
kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
contours_regions, _ = cv2.findContours(
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color)) logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
for cnt in contours_color:
area = cv2.contourArea(cnt)
if area < 80:
continue
for cnt in contours_regions:
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
if bw < 8 or bh < 8:
# Skip tiny regions
if bw < 15 or bh < 15:
continue continue
# Skip page-spanning contours (background color cast / merged blobs) # Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10: if bw > w * 0.5 or bh > h * 0.5:
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue continue
perimeter = cv2.arcLength(cnt, True) bbox_area = bw * bh
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
aspect = bw / bh if bh > 0 else 1.0
min_dim = min(bw, bh)
# Colored circle / balloon # Check: how much of this region's bounding box overlaps with words?
if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12: roi_words = word_mask[by:by + bh, bx:bx + bw]
# Determine color word_pixel_count = int(np.sum(roi_words > 0))
roi_hsv = hsv[by:by + bh, bx:bx + bw] word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
conf = min(0.95, circularity) # Check: how many actual colored pixels are in this region?
logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s", roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
bx, by, bw, bh, int(area), circularity, color_name) color_pixel_count = int(np.sum(roi_color > 0))
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh, # If most of the region is covered by word boxes → colored text, skip
area=int(area), shape="circle", if word_overlap > 0.5:
color_name=color_name, color_hex=color_hex, logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
confidence=conf, contour=cnt, bx, by, bw, bh, word_overlap * 100)
))
continue continue
# Colored illustration (large colored region) # Need a minimum number of colored pixels (not just dilated area)
if area > 2000 and min_dim > 20: if color_pixel_count < 200:
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
bx, by, bw, bh, int(area), color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration",
color_name=color_name, color_hex=color_hex,
confidence=0.6, contour=cnt,
))
continue continue
# Determine dominant color from the actual colored pixels
roi_hsv = hsv[by:by + bh, bx:bx + bw]
color_px_mask = roi_color > 0
if np.sum(color_px_mask) > 0:
masked_hsv = roi_hsv[color_px_mask]
color_name, color_hex = _dominant_color(masked_hsv)
else:
color_name, color_hex = "black", _COLOR_HEX["black"]
# Confidence based on color density only (regions with >50% word overlap were already skipped above)
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, 0.5 + density * 0.5)
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
"color_px=%d word_overlap=%.0f%% color=%s",
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=color_pixel_count,
shape="image",
color_name=color_name, color_hex=color_hex,
confidence=round(conf, 2), contour=cnt,
))
# ===================================================================== # =====================================================================
# PASS 2 — INK (dark pixels) with word exclusion # PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
# Only for large black illustrations (drawings in black ink).
# ===================================================================== # =====================================================================
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Build exclusion mask from words # Exclude words and colored regions already found
exclusion = np.zeros((h, w), dtype=np.uint8) exclusion = np.zeros((h, w), dtype=np.uint8)
word_pad = 5 word_pad = 5
for wb in word_boxes: for wb in word_boxes:
@@ -225,7 +236,6 @@ def detect_graphic_elements(
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad) y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
exclusion[y1:y2, x1:x2] = 255 exclusion[y1:y2, x1:x2] = 255
# Also exclude detected box regions
if detected_boxes: if detected_boxes:
for box in detected_boxes: for box in detected_boxes:
bbx = int(box.get("x", 0)) bbx = int(box.get("x", 0))
@@ -241,11 +251,8 @@ def detect_graphic_elements(
exclusion[y1:y2, x1:x2] = 255 exclusion[y1:y2, x1:x2] = 255
ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion)) ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
# Remove colored regions already found in pass 1
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
# Only look for LARGE remaining regions (black illustrations)
contours_ink, _ = cv2.findContours( contours_ink, _ = cv2.findContours(
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE, ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
) )
@@ -254,14 +261,10 @@ def detect_graphic_elements(
for cnt in contours_ink: for cnt in contours_ink:
area = cv2.contourArea(cnt) area = cv2.contourArea(cnt)
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
min_dim = min(bw, bh)
# Only large illustrations survive (area > 5000, min_dim > 40) if area < 5000 or min(bw, bh) < 40:
if area < 5000 or min_dim < 40:
continue continue
if bw > w * 0.8 or bh > h * 0.8:
# Skip page-spanning contours
if bw > w * 0.8 and bh > h * 0.8:
continue continue
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d", logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
@@ -274,16 +277,14 @@ def detect_graphic_elements(
)) ))
# ===================================================================== # =====================================================================
# Deduplicate overlapping results and return # Deduplicate and return
# ===================================================================== # =====================================================================
candidates.sort(key=lambda g: g.area, reverse=True) candidates.sort(key=lambda g: g.area, reverse=True)
# Remove duplicates where bounding boxes overlap > 50%
final: List[GraphicElement] = [] final: List[GraphicElement] = []
for c in candidates: for c in candidates:
overlap = False overlap = False
for f in final: for f in final:
# Intersection
ix1 = max(c.x, f.x) ix1 = max(c.x, f.x)
iy1 = max(c.y, f.y) iy1 = max(c.y, f.y)
ix2 = min(c.x + c.width, f.x + f.width) ix2 = min(c.x + c.width, f.x + f.width)