feat: region-based graphic detection with word-overlap filtering
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m3s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 19s
New approach: dilate the color mask heavily (25x25) to merge nearby colored pixels into regions, then check word overlap:
- >50% overlap with OCR word boxes → colored text → skip
- ≤50% overlap → colored image/graphic → keep

This detects balloon clusters as one "image" region instead of trying to classify individual shapes. Red words like "borrow/lend" are filtered because they overlap with their word boxes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,13 @@
|
||||
"""
|
||||
Graphical element detection for OCR pages.
|
||||
|
||||
Two-pass approach:
|
||||
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
|
||||
arrows, icons) on the saturation channel alone. Black text has
|
||||
zero saturation and is invisible on this channel, so no word
|
||||
exclusion is needed.
|
||||
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
|
||||
OCR word boxes from the full ink mask and keeping only very large
|
||||
remaining contours.
|
||||
Region-based approach:
|
||||
1. Build a color mask (saturation channel — black text is invisible).
|
||||
2. Dilate heavily to merge nearby colored pixels into regions.
|
||||
3. For each region, check overlap with OCR word boxes:
|
||||
- High word overlap → colored text (skip)
|
||||
- Low word overlap → colored graphic / image (keep)
|
||||
4. Separately detect large black-ink illustrations via ink mask.
|
||||
|
||||
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
|
||||
|
||||
@@ -36,7 +35,7 @@ class GraphicElement:
|
||||
width: int
|
||||
height: int
|
||||
area: int
|
||||
shape: str # circle, illustration
|
||||
shape: str # image, illustration
|
||||
color_name: str # dominant color or 'black'
|
||||
color_hex: str
|
||||
confidence: float
|
||||
@@ -59,7 +58,7 @@ _COLOR_HEX = {
|
||||
}
|
||||
|
||||
|
||||
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
|
||||
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
|
||||
"""Return (color_name, color_hex) for an HSV region."""
|
||||
if hsv_roi.size == 0:
|
||||
return "black", _COLOR_HEX["black"]
|
||||
@@ -104,13 +103,10 @@ def detect_graphic_elements(
|
||||
detected_boxes: Optional[List[Dict]] = None,
|
||||
max_elements: int = 50,
|
||||
) -> List[GraphicElement]:
|
||||
"""Find non-text graphical elements on the page.
|
||||
"""Find non-text graphical regions on the page.
|
||||
|
||||
Two-pass approach:
|
||||
Pass 1 (color): Find colored elements via saturation channel.
|
||||
No word exclusion needed — black text is invisible.
|
||||
Pass 2 (ink): Find large black illustrations via ink mask minus
|
||||
word exclusion.
|
||||
Region-based: dilate color mask to form regions, then check word
|
||||
overlap to distinguish colored text from colored graphics.
|
||||
|
||||
Args:
|
||||
img_bgr: BGR color image.
|
||||
@@ -133,89 +129,104 @@ def detect_graphic_elements(
|
||||
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
||||
candidates: List[GraphicElement] = []
|
||||
|
||||
# --- Build word mask (for overlap checking) ---
|
||||
word_mask = np.zeros((h, w), dtype=np.uint8)
|
||||
for wb in word_boxes:
|
||||
x1 = max(0, int(wb.get("left", 0)))
|
||||
y1 = max(0, int(wb.get("top", 0)))
|
||||
x2 = min(w, int(wb.get("left", 0) + wb.get("width", 0)))
|
||||
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)))
|
||||
word_mask[y1:y2, x1:x2] = 255
|
||||
|
||||
# =====================================================================
|
||||
# PASS 1 — COLOR CHANNEL (no word exclusion needed)
|
||||
# PASS 1 — COLORED IMAGE REGIONS
|
||||
# =====================================================================
|
||||
# Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible.
|
||||
# Color mask: saturated pixels (black text has sat ≈ 0 → invisible)
|
||||
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
|
||||
# Exclude very bright backgrounds (white/near-white with color cast)
|
||||
val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
|
||||
color_mask = cv2.bitwise_and(sat_mask, val_mask)
|
||||
color_pixels = cv2.bitwise_and(sat_mask, val_mask)
|
||||
|
||||
# Only remove tiny speckle — NO closing, which would merge nearby
|
||||
# colored elements into one giant blob spanning half the page.
|
||||
# Remove tiny speckle
|
||||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
|
||||
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
|
||||
color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)
|
||||
|
||||
contours_color, _ = cv2.findContours(
|
||||
color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
# Count raw colored pixels before dilation (for density check later)
|
||||
color_pixel_raw = color_pixels.copy()
|
||||
|
||||
# Heavy dilation to merge nearby colored elements into regions.
|
||||
# A 25x25 kernel grows each blob by ~12px per side, so elements up to ~24px apart merge.
|
||||
kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
|
||||
region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)
|
||||
|
||||
contours_regions, _ = cv2.findContours(
|
||||
region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
|
||||
|
||||
for cnt in contours_color:
|
||||
area = cv2.contourArea(cnt)
|
||||
if area < 80:
|
||||
continue
|
||||
logger.info("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))
|
||||
|
||||
for cnt in contours_regions:
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
if bw < 8 or bh < 8:
|
||||
|
||||
# Skip tiny regions
|
||||
if bw < 15 or bh < 15:
|
||||
continue
|
||||
|
||||
# Skip page-spanning contours (background color cast / merged blobs)
|
||||
if bw > w * 0.5 or bh > h * 0.5 or area > img_area * 0.10:
|
||||
# Skip page-spanning regions
|
||||
if bw > w * 0.5 or bh > h * 0.5:
|
||||
logger.info("GraphicDetect PASS1 SKIP page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
|
||||
continue
|
||||
|
||||
perimeter = cv2.arcLength(cnt, True)
|
||||
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
|
||||
aspect = bw / bh if bh > 0 else 1.0
|
||||
min_dim = min(bw, bh)
|
||||
bbox_area = bw * bh
|
||||
|
||||
# Colored circle / balloon
|
||||
if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
|
||||
# Determine color
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
|
||||
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
|
||||
masked_hsv = roi_hsv[cnt_mask_roi > 0]
|
||||
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
|
||||
# Check: how much of this region's bounding box overlaps with words?
|
||||
roi_words = word_mask[by:by + bh, bx:bx + bw]
|
||||
word_pixel_count = int(np.sum(roi_words > 0))
|
||||
word_overlap = word_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
|
||||
conf = min(0.95, circularity)
|
||||
logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
|
||||
bx, by, bw, bh, int(area), circularity, color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area), shape="circle",
|
||||
color_name=color_name, color_hex=color_hex,
|
||||
confidence=conf, contour=cnt,
|
||||
))
|
||||
# Check: how many actual colored pixels are in this region?
|
||||
roi_color = color_pixel_raw[by:by + bh, bx:bx + bw]
|
||||
color_pixel_count = int(np.sum(roi_color > 0))
|
||||
|
||||
# If most of the region is covered by word boxes → colored text, skip
|
||||
if word_overlap > 0.5:
|
||||
logger.info("GraphicDetect PASS1 SKIP text region (%d,%d) %dx%d word_overlap=%.0f%%",
|
||||
bx, by, bw, bh, word_overlap * 100)
|
||||
continue
|
||||
|
||||
# Colored illustration (large colored region)
|
||||
if area > 2000 and min_dim > 20:
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
|
||||
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
|
||||
masked_hsv = roi_hsv[cnt_mask_roi > 0]
|
||||
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
|
||||
|
||||
logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
|
||||
bx, by, bw, bh, int(area), color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area), shape="illustration",
|
||||
color_name=color_name, color_hex=color_hex,
|
||||
confidence=0.6, contour=cnt,
|
||||
))
|
||||
# Need a minimum number of colored pixels (not just dilated area)
|
||||
if color_pixel_count < 200:
|
||||
continue
|
||||
|
||||
# Determine dominant color from the actual colored pixels
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
color_px_mask = roi_color > 0
|
||||
if np.sum(color_px_mask) > 0:
|
||||
masked_hsv = roi_hsv[color_px_mask]
|
||||
color_name, color_hex = _dominant_color(masked_hsv)
|
||||
else:
|
||||
color_name, color_hex = "black", _COLOR_HEX["black"]
|
||||
|
||||
# Confidence based on color density and low word overlap
|
||||
density = color_pixel_count / bbox_area if bbox_area > 0 else 0
|
||||
conf = min(0.95, 0.5 + density * 0.5)
|
||||
|
||||
logger.info("GraphicDetect PASS1 ACCEPT image at (%d,%d) %dx%d "
|
||||
"color_px=%d word_overlap=%.0f%% color=%s",
|
||||
bx, by, bw, bh, color_pixel_count, word_overlap * 100, color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=color_pixel_count,
|
||||
shape="image",
|
||||
color_name=color_name, color_hex=color_hex,
|
||||
confidence=round(conf, 2), contour=cnt,
|
||||
))
|
||||
|
||||
# =====================================================================
|
||||
# PASS 2 — INK (dark pixels) with word exclusion
|
||||
# Only for large black illustrations (drawings in black ink).
|
||||
# PASS 2 — LARGE BLACK-INK ILLUSTRATIONS
|
||||
# =====================================================================
|
||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
||||
|
||||
# Build exclusion mask from words
|
||||
# Exclude words and colored regions already found
|
||||
exclusion = np.zeros((h, w), dtype=np.uint8)
|
||||
word_pad = 5
|
||||
for wb in word_boxes:
|
||||
@@ -225,7 +236,6 @@ def detect_graphic_elements(
|
||||
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
|
||||
exclusion[y1:y2, x1:x2] = 255
|
||||
|
||||
# Also exclude detected box regions
|
||||
if detected_boxes:
|
||||
for box in detected_boxes:
|
||||
bbx = int(box.get("x", 0))
|
||||
@@ -241,11 +251,8 @@ def detect_graphic_elements(
|
||||
exclusion[y1:y2, x1:x2] = 255
|
||||
|
||||
ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
|
||||
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))
|
||||
|
||||
# Remove colored regions already found in pass 1
|
||||
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
|
||||
|
||||
# Only look for LARGE remaining regions (black illustrations)
|
||||
contours_ink, _ = cv2.findContours(
|
||||
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
@@ -254,14 +261,10 @@ def detect_graphic_elements(
|
||||
for cnt in contours_ink:
|
||||
area = cv2.contourArea(cnt)
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
min_dim = min(bw, bh)
|
||||
|
||||
# Only large illustrations survive (area > 5000, min_dim > 40)
|
||||
if area < 5000 or min_dim < 40:
|
||||
if area < 5000 or min(bw, bh) < 40:
|
||||
continue
|
||||
|
||||
# Skip page-spanning contours
|
||||
if bw > w * 0.8 and bh > h * 0.8:
|
||||
if bw > w * 0.8 or bh > h * 0.8:
|
||||
continue
|
||||
|
||||
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
|
||||
@@ -274,16 +277,14 @@ def detect_graphic_elements(
|
||||
))
|
||||
|
||||
# =====================================================================
|
||||
# Deduplicate overlapping results and return
|
||||
# Deduplicate and return
|
||||
# =====================================================================
|
||||
candidates.sort(key=lambda g: g.area, reverse=True)
|
||||
|
||||
# Remove duplicates where bounding boxes overlap > 50%
|
||||
final: List[GraphicElement] = []
|
||||
for c in candidates:
|
||||
overlap = False
|
||||
for f in final:
|
||||
# Intersection
|
||||
ix1 = max(c.x, f.x)
|
||||
iy1 = max(c.y, f.y)
|
||||
ix2 = min(c.x + c.width, f.x + f.width)
|
||||
|
||||
Reference in New Issue
Block a user