feat: region-based graphic detection with word-overlap filtering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s

New approach: dilate color mask heavily (25x25) to merge nearby colored
pixels into regions, then check word overlap:
- >50% of the region's bounding box covered by OCR word boxes → colored text → skip
- ≤50% of the bounding box covered → colored image/graphic → keep

This detects balloon clusters as one "image" region instead of trying
to classify individual shapes. Red words like "borrow/lend" are filtered
because they overlap with their word boxes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 14:49:15 +01:00
parent eeee61108a
commit 6668661895

View File

@@ -1,14 +1,13 @@
"""
Graphical element detection for OCR pages.
Two-pass approach:
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
arrows, icons) on the saturation channel alone. Black text has
zero saturation and is invisible on this channel, so no word
exclusion is needed.
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
OCR word boxes from the full ink mask and keeping only very large
remaining contours.
Region-based approach:
1. Build a color mask (saturation channel — black text is invisible).
2. Dilate heavily to merge nearby colored pixels into regions.
3. For each region, check overlap with OCR word boxes:
- High word overlap → colored text (skip)
- Low word overlap → colored graphic / image (keep)
4. Separately detect large black-ink illustrations via ink mask.
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
@@ -36,7 +35,7 @@ class GraphicElement:
width: int
height: int
area: int
shape: str # circle, illustration
shape: str # image, illustration
color_name: str # dominant color or 'black'
color_hex: str
confidence: float
@@ -59,7 +58,7 @@ _COLOR_HEX = {
}
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
"""Return (color_name, color_hex) for an HSV region."""
if hsv_roi.size == 0:
return "black", _COLOR_HEX["black"]
@@ -104,13 +103,10 @@ def detect_graphic_elements(
detected_boxes: Optional[List[Dict]] = None,
max_elements: int = 50,
) -> List[GraphicElement]:
"""Find non-text graphical elements on the page.
"""Find non-text graphical regions on the page.
Two-pass approach:
Pass 1 (color): Find colored elements via saturation channel.
No word exclusion needed — black text is invisible.
Pass 2 (ink): Find large black illustrations via ink mask minus
word exclusion.
Region-based: dilate color mask to form regions, then check word
overlap to distinguish colored text from colored graphics.
Args:
img_bgr: BGR color image.
@@ -133,89 +129,104 @@ def detect_graphic_elements(
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
candidates: List[GraphicElement] = []
# --- Build word mask (for overlap checking) ---
word_mask = np.zeros((h, w), dtype=np.uint8)
for wb in word_boxes:
x1 = max(0, int(wb.get("left", 0)))
y1 = max(0, int(wb.get("top", 0)))
x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
word_mask[y1:y2, x1:x2] = 255
# =====================================================================
# PASS 1 — COLOR CHANNEL (no word exclusion needed)
# PASS 1 — COLORED IMAGE REGIONS
# =====================================================================
# Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible.
# Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
# Exclude very bright backgrounds (white/near-white with color cast)
val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
color_mask = cv2.bitwise_and(sat_mask, val_mask)
color_pixels = cv2.bitwise_and(sat_mask, val_mask)
# Only remove tiny speckle — NO closing, which would merge nearby
# colored elements into one giant blob spanning half the page.
# Remove tiny speckle
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
contours_color, _ = cv2.findContours(
color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
# Keep a copy of the un-dilated color mask (used later for the per-region colored-pixel count and dominant color)
color_pixel_raw = color_pixels.copy()
# Heavy dilation to merge nearby colored elements into regions.
# A 25x25 kernel grows every colored blob by ~12px on each side, so blobs up to ~24px apart merge.
kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
contours_regions, _ = cv2.findContours(
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
for cnt in contours_color:
area = cv2.contourArea(cnt)
if area < 80:
continue
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
for cnt in contours_regions:
bx, by, bw, bh = cv2.boundingRect(cnt)
if bw < 8 or bh < 8:
# Skip tiny regions
if bw < 15 or bh < 15:
continue
# Skip page-spanning contours (background color cast / merged blobs)
if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10:
# Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5:
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue
perimeter = cv2.arcLength(cnt, True)
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
aspect = bw / bh if bh > 0 else 1.0
min_dim = min(bw, bh)
bbox_area = bw * bh
# Colored circle / balloon
if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
# Determine color
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
# Check: how much of this region's bounding box overlaps with words?
roi_words = word_mask[by:by + bh, bx:bx + bw]
word_pixel_count = int(np.sum(roi_words > 0))
word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, circularity)
logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
bx, by, bw, bh, int(area), circularity, color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="circle",
color_name=color_name, color_hex=color_hex,
confidence=conf, contour=cnt,
))
# Check: how many actual colored pixels are in this region?
roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
color_pixel_count = int(np.sum(roi_color > 0))
# If most of the region is covered by word boxes → colored text, skip
if word_overlap > 0.5:
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
bx, by, bw, bh, word_overlap * 100)
continue
# Colored illustration (large colored region)
if area > 2000 and min_dim > 20:
roi_hsv = hsv[by:by + bh, bx:bx + bw]
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
masked_hsv = roi_hsv[cnt_mask_roi > 0]
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
bx, by, bw, bh, int(area), color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=int(area), shape="illustration",
color_name=color_name, color_hex=color_hex,
confidence=0.6, contour=cnt,
))
# Need a minimum number of colored pixels (not just dilated area)
if color_pixel_count < 200:
continue
# Determine dominant color from the actual colored pixels
roi_hsv = hsv[by:by + bh, bx:bx + bw]
color_px_mask = roi_color > 0
if np.sum(color_px_mask) > 0:
masked_hsv = roi_hsv[color_px_mask]
color_name, color_hex = _dominant_color(masked_hsv)
else:
color_name, color_hex = "black", _COLOR_HEX["black"]
# Confidence based on color density and low word overlap
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
conf = min(0.95, 0.5 + density * 0.5)
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
"color_px=%d word_overlap=%.0f%% color=%s",
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
candidates.append(GraphicElement(
x=bx, y=by, width=bw, height=bh,
area=color_pixel_count,
shape="image",
color_name=color_name, color_hex=color_hex,
confidence=round(conf, 2), contour=cnt,
))
# =====================================================================
# PASS 2 — INK (dark pixels) with word exclusion
# Only for large black illustrations (drawings in black ink).
# PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
# =====================================================================
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Build exclusion mask from words
# Exclude words and colored regions already found
exclusion = np.zeros((h, w), dtype=np.uint8)
word_pad = 5
for wb in word_boxes:
@@ -225,7 +236,6 @@ def detect_graphic_elements(
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
exclusion[y1:y2, x1:x2] = 255
# Also exclude detected box regions
if detected_boxes:
for box in detected_boxes:
bbx = int(box.get("x", 0))
@@ -241,11 +251,8 @@ def detect_graphic_elements(
exclusion[y1:y2, x1:x2] = 255
ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
# Remove colored regions already found in pass 1
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
# Only look for LARGE remaining regions (black illustrations)
contours_ink, _ = cv2.findContours(
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
@@ -254,14 +261,10 @@ def detect_graphic_elements(
for cnt in contours_ink:
area = cv2.contourArea(cnt)
bx, by, bw, bh = cv2.boundingRect(cnt)
min_dim = min(bw, bh)
# Only large illustrations survive (area > 5000, min_dim > 40)
if area < 5000 or min_dim < 40:
if area < 5000 or min(bw, bh) < 40:
continue
# Skip page-spanning contours
if bw > w * 0.8 and bh > h * 0.8:
if bw > w * 0.8 or bh > h * 0.8:
continue
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
@@ -274,16 +277,14 @@ def detect_graphic_elements(
))
# =====================================================================
# Deduplicate overlapping results and return
# Deduplicate and return
# =====================================================================
candidates.sort(key=lambda g: g.area, reverse=True)
# Remove duplicates where bounding boxes overlap > 50%
final: List[GraphicElement] = []
for c in candidates:
overlap = False
for f in final:
# Intersection
ix1 = max(c.x, f.x)
iy1 = max(c.y, f.y)
ix2 = min(c.x + c.width, f.x + f.width)