feat: two-pass graphic detection (color channel + ink)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 31s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s
Pass 1 (color): Detect colored graphics on the HSV saturation channel. Black text is invisible on this channel, so no word exclusion is needed. This reliably catches colored balloons, arrows, and icons. Pass 2 (ink): Detect large black illustrations on the dark-ink mask after subtracting the word exclusion zones. Only contours with an area of at least 5000 and a minimum dimension of at least 40 px are kept, to avoid text fragments. Fixes: all 5 balloons are now detectable (previously, word exclusion zones were eating colored graphics that overlapped nearby OCR words). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
"""
|
||||
Graphical element detection for OCR pages.
|
||||
|
||||
Finds non-text visual elements (arrows, balloons, icons, illustrations)
|
||||
by subtracting known OCR word regions from the page ink and analysing
|
||||
remaining connected components via contour shape metrics.
|
||||
Two-pass approach:
|
||||
Pass 1 — COLOR PASS: Detect colored graphical elements (balloons, colored
|
||||
arrows, icons) on the saturation channel alone. Black text has
|
||||
zero saturation and is invisible on this channel, so no word
|
||||
exclusion is needed.
|
||||
Pass 2 — INK PASS: Detect large black-ink illustrations by subtracting
|
||||
OCR word boxes from the full ink mask and keeping only very large
|
||||
remaining contours.
|
||||
|
||||
Works on both color and grayscale scans.
|
||||
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
@@ -31,11 +36,11 @@ class GraphicElement:
|
||||
width: int
|
||||
height: int
|
||||
area: int
|
||||
shape: str # arrow, circle, line, icon, illustration
|
||||
shape: str # circle, illustration
|
||||
color_name: str # dominant color or 'black'
|
||||
color_hex: str
|
||||
confidence: float
|
||||
contour: Any = field(default=None, repr=False) # numpy contour, excluded from repr
|
||||
contour: Any = field(default=None, repr=False)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -89,46 +94,6 @@ def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 50) -> tuple:
|
||||
return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shape classification via contour analysis
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _classify_shape(
|
||||
contour: np.ndarray,
|
||||
bw: int,
|
||||
bh: int,
|
||||
area: float,
|
||||
) -> tuple:
|
||||
"""Classify contour shape → (shape_name, confidence).
|
||||
|
||||
Only detects high-confidence shapes that are clearly non-text:
|
||||
- circle/balloon: high circularity (very reliable)
|
||||
- illustration: large area (clearly a drawing/image)
|
||||
|
||||
Text fragments are classified as 'noise' and filtered out.
|
||||
Boxes and colors are detected by separate modules.
|
||||
"""
|
||||
perimeter = cv2.arcLength(contour, True)
|
||||
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
|
||||
aspect = bw / bh if bh > 0 else 1.0
|
||||
min_dim = min(bw, bh)
|
||||
|
||||
# --- Circle / balloon ---
|
||||
# High circularity is the most reliable non-text indicator.
|
||||
# Text characters rarely have circularity > 0.55.
|
||||
if circularity > 0.55 and 0.5 < aspect < 2.0 and min_dim > 15:
|
||||
conf = min(0.95, circularity)
|
||||
return "circle", conf
|
||||
|
||||
# --- Illustration (drawing, image, large graphic) ---
|
||||
# Large connected regions that survived word exclusion = genuine graphics.
|
||||
if area > 3000 and min_dim > 30:
|
||||
return "illustration", 0.6
|
||||
|
||||
# Everything else is likely a text fragment — skip
|
||||
return "noise", 0.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main detection
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -137,24 +102,20 @@ def detect_graphic_elements(
|
||||
img_bgr: np.ndarray,
|
||||
word_boxes: List[Dict],
|
||||
detected_boxes: Optional[List[Dict]] = None,
|
||||
min_area: int = 80,
|
||||
max_area_ratio: float = 0.25,
|
||||
word_pad: int = 5,
|
||||
max_elements: int = 50,
|
||||
) -> List[GraphicElement]:
|
||||
"""Find non-text graphical elements on the page.
|
||||
|
||||
1. Build ink mask (dark + colored pixels).
|
||||
2. Subtract OCR word regions and detected boxes.
|
||||
3. Find connected components and classify shapes.
|
||||
Two-pass approach:
|
||||
Pass 1 (color): Find colored elements via saturation channel.
|
||||
No word exclusion needed — black text is invisible.
|
||||
Pass 2 (ink): Find large black illustrations via ink mask minus
|
||||
word exclusion.
|
||||
|
||||
Args:
|
||||
img_bgr: BGR color image.
|
||||
word_boxes: List of OCR word dicts with left/top/width/height.
|
||||
detected_boxes: Optional list of detected box dicts (x/y/w/h).
|
||||
min_area: Minimum contour area to keep (80 filters tiny noise).
|
||||
max_area_ratio: Maximum area as fraction of image area.
|
||||
word_pad: Padding around word boxes for exclusion (5px).
|
||||
max_elements: Maximum number of elements to return.
|
||||
|
||||
Returns:
|
||||
@@ -164,27 +125,100 @@ def detect_graphic_elements(
|
||||
return []
|
||||
|
||||
h, w = img_bgr.shape[:2]
|
||||
max_area = int(h * w * max_area_ratio)
|
||||
img_area = h * w
|
||||
|
||||
logger.info("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
|
||||
w, h, len(word_boxes), len(detected_boxes or []))
|
||||
|
||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
|
||||
candidates: List[GraphicElement] = []
|
||||
|
||||
# --- 1. Build ink mask: dark pixels + saturated colored pixels ---
|
||||
# =====================================================================
|
||||
# PASS 1 — COLOR CHANNEL (no word exclusion needed)
|
||||
# =====================================================================
|
||||
# Saturated pixels = colored ink. Black text has sat ≈ 0 → invisible.
|
||||
sat_mask = (hsv[:, :, 1] > 50).astype(np.uint8) * 255
|
||||
# Exclude very bright backgrounds (white/near-white with color cast)
|
||||
val_mask = (hsv[:, :, 2] < 235).astype(np.uint8) * 255
|
||||
color_mask = cv2.bitwise_and(sat_mask, val_mask)
|
||||
|
||||
# Morphological cleanup: close small gaps, remove speckle
|
||||
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
|
||||
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_CLOSE, kernel)
|
||||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
|
||||
color_mask = cv2.morphologyEx(color_mask, cv2.MORPH_OPEN, kernel_open)
|
||||
|
||||
contours_color, _ = cv2.findContours(
|
||||
color_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
logger.info("GraphicDetect PASS1 (color): %d contours", len(contours_color))
|
||||
|
||||
for cnt in contours_color:
|
||||
area = cv2.contourArea(cnt)
|
||||
if area < 80:
|
||||
continue
|
||||
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
if bw < 8 or bh < 8:
|
||||
continue
|
||||
|
||||
# Skip page-spanning contours (background color cast)
|
||||
if bw > w * 0.8 and bh > h * 0.8:
|
||||
continue
|
||||
|
||||
perimeter = cv2.arcLength(cnt, True)
|
||||
circularity = (4 * np.pi * area) / (perimeter * perimeter) if perimeter > 0 else 0
|
||||
aspect = bw / bh if bh > 0 else 1.0
|
||||
min_dim = min(bw, bh)
|
||||
|
||||
# Colored circle / balloon
|
||||
if circularity > 0.45 and 0.4 < aspect < 2.5 and min_dim > 12:
|
||||
# Determine color
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
|
||||
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
|
||||
masked_hsv = roi_hsv[cnt_mask_roi > 0]
|
||||
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
|
||||
|
||||
conf = min(0.95, circularity)
|
||||
logger.info("GraphicDetect PASS1 ACCEPT circle at (%d,%d) %dx%d area=%d circ=%.2f color=%s",
|
||||
bx, by, bw, bh, int(area), circularity, color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area), shape="circle",
|
||||
color_name=color_name, color_hex=color_hex,
|
||||
confidence=conf, contour=cnt,
|
||||
))
|
||||
continue
|
||||
|
||||
# Colored illustration (large colored region)
|
||||
if area > 2000 and min_dim > 20:
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
cnt_mask_roi = np.zeros((bh, bw), dtype=np.uint8)
|
||||
cv2.drawContours(cnt_mask_roi, [cnt - np.array([bx, by])], -1, 255, -1)
|
||||
masked_hsv = roi_hsv[cnt_mask_roi > 0]
|
||||
color_name, color_hex = _dominant_color(masked_hsv, sat_threshold=30)
|
||||
|
||||
logger.info("GraphicDetect PASS1 ACCEPT illustration at (%d,%d) %dx%d area=%d color=%s",
|
||||
bx, by, bw, bh, int(area), color_name)
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area), shape="illustration",
|
||||
color_name=color_name, color_hex=color_hex,
|
||||
confidence=0.6, contour=cnt,
|
||||
))
|
||||
continue
|
||||
|
||||
# =====================================================================
|
||||
# PASS 2 — INK (dark pixels) with word exclusion
|
||||
# Only for large black illustrations (drawings in black ink).
|
||||
# =====================================================================
|
||||
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
|
||||
_, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
||||
|
||||
# Saturated colored pixels (catches colored arrows, markers)
|
||||
sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
|
||||
val_mask = (hsv[:, :, 2] < 230).astype(np.uint8) * 255
|
||||
color_ink = cv2.bitwise_and(sat_mask, val_mask)
|
||||
|
||||
ink_mask = cv2.bitwise_or(dark_mask, color_ink)
|
||||
|
||||
# --- 2. Build exclusion mask from OCR words ---
|
||||
# Build exclusion mask from words
|
||||
exclusion = np.zeros((h, w), dtype=np.uint8)
|
||||
|
||||
word_pad = 5
|
||||
for wb in word_boxes:
|
||||
x1 = max(0, int(wb.get("left", 0)) - word_pad)
|
||||
y1 = max(0, int(wb.get("top", 0)) - word_pad)
|
||||
@@ -192,110 +226,82 @@ def detect_graphic_elements(
|
||||
y2 = min(h, int(wb.get("top", 0) + wb.get("height", 0)) + word_pad)
|
||||
exclusion[y1:y2, x1:x2] = 255
|
||||
|
||||
# Also exclude detected box interiors (they contain text, not graphics)
|
||||
# But keep a border strip so arrows/icons at box edges are found
|
||||
# Also exclude detected box regions
|
||||
if detected_boxes:
|
||||
box_inset = 8
|
||||
for box in detected_boxes:
|
||||
bx = int(box.get("x", 0))
|
||||
by = int(box.get("y", 0))
|
||||
bbx = int(box.get("x", 0))
|
||||
bby = int(box.get("y", 0))
|
||||
bbw = int(box.get("w", box.get("width", 0)))
|
||||
bbh = int(box.get("h", box.get("height", 0)))
|
||||
x1 = max(0, bx + box_inset)
|
||||
y1 = max(0, by + box_inset)
|
||||
x2 = min(w, bx + bbw - box_inset)
|
||||
y2 = min(h, by + bbh - box_inset)
|
||||
inset = 8
|
||||
x1 = max(0, bbx + inset)
|
||||
y1 = max(0, bby + inset)
|
||||
x2 = min(w, bbx + bbw - inset)
|
||||
y2 = min(h, bby + bbh - inset)
|
||||
if x2 > x1 and y2 > y1:
|
||||
exclusion[y1:y2, x1:x2] = 255
|
||||
|
||||
excl_pct = int(np.sum(exclusion > 0) * 100 / (h * w)) if h * w else 0
|
||||
logger.info("GraphicDetect: exclusion mask covers %d%% of image", excl_pct)
|
||||
ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
|
||||
|
||||
# Subtract exclusion from ink
|
||||
graphic_mask = cv2.bitwise_and(ink_mask, cv2.bitwise_not(exclusion))
|
||||
# Remove colored regions already found in pass 1
|
||||
ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_mask))
|
||||
|
||||
# --- 3. Morphological cleanup ---
|
||||
# Close small gaps (connects arrow stroke + head) — but not too large
|
||||
# to avoid reconnecting text fragments
|
||||
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
|
||||
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_CLOSE, kernel_close)
|
||||
# Remove small noise
|
||||
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
|
||||
graphic_mask = cv2.morphologyEx(graphic_mask, cv2.MORPH_OPEN, kernel_open)
|
||||
|
||||
# --- 4. Find contours ---
|
||||
contours, _ = cv2.findContours(
|
||||
graphic_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
# Only look for LARGE remaining regions (black illustrations)
|
||||
contours_ink, _ = cv2.findContours(
|
||||
ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
|
||||
)
|
||||
logger.info("GraphicDetect PASS2 (ink): %d contours", len(contours_ink))
|
||||
|
||||
logger.info("GraphicDetect: %d raw contours after exclusion", len(contours))
|
||||
|
||||
# --- 5. Analyse and classify ---
|
||||
candidates: List[GraphicElement] = []
|
||||
skip_reasons: Dict[str, int] = {}
|
||||
for cnt in contours:
|
||||
for cnt in contours_ink:
|
||||
area = cv2.contourArea(cnt)
|
||||
if area < min_area or area > max_area:
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
reason = f"area={int(area)}<{min_area}" if area < min_area else f"area={int(area)}>{max_area}"
|
||||
logger.info("GraphicDetect SKIP: %s at (%d,%d) %dx%d", reason, bx, by, bw, bh)
|
||||
skip_reasons[f"area_filter"] = skip_reasons.get("area_filter", 0) + 1
|
||||
continue
|
||||
|
||||
bx, by, bw, bh = cv2.boundingRect(cnt)
|
||||
if bw < 8 or bh < 8:
|
||||
skip_reasons["too_small_dim"] = skip_reasons.get("too_small_dim", 0) + 1
|
||||
min_dim = min(bw, bh)
|
||||
|
||||
# Only large illustrations survive (area >= 5000 and min_dim >= 40)
|
||||
if area < 5000 or min_dim < 40:
|
||||
continue
|
||||
|
||||
# Skip elements that overlap significantly with the exclusion zone
|
||||
roi_excl = exclusion[by:by + bh, bx:bx + bw]
|
||||
excl_ratio = np.sum(roi_excl > 0) / (bw * bh) if bw * bh > 0 else 0
|
||||
if excl_ratio > 0.4:
|
||||
logger.info("GraphicDetect SKIP excl_ratio=%.2f at (%d,%d) %dx%d area=%d",
|
||||
excl_ratio, bx, by, bw, bh, int(area))
|
||||
skip_reasons["excl_overlap"] = skip_reasons.get("excl_overlap", 0) + 1
|
||||
# Skip page-spanning contours
|
||||
if bw > w * 0.8 and bh > h * 0.8:
|
||||
continue
|
||||
|
||||
# Classify shape
|
||||
shape, conf = _classify_shape(cnt, bw, bh, area)
|
||||
|
||||
# Skip noise (too small or text-like)
|
||||
if shape == "noise":
|
||||
logger.info("GraphicDetect SKIP noise at (%d,%d) %dx%d area=%d",
|
||||
bx, by, bw, bh, int(area))
|
||||
skip_reasons["noise"] = skip_reasons.get("noise", 0) + 1
|
||||
continue
|
||||
|
||||
# Determine dominant color
|
||||
roi_hsv = hsv[by:by + bh, bx:bx + bw]
|
||||
cnt_mask = np.zeros((bh, bw), dtype=np.uint8)
|
||||
shifted_cnt = cnt - np.array([bx, by])
|
||||
cv2.drawContours(cnt_mask, [shifted_cnt], -1, 255, -1)
|
||||
masked_hsv = roi_hsv[cnt_mask > 0]
|
||||
color_name, color_hex = _dominant_color(masked_hsv)
|
||||
|
||||
logger.info("GraphicDetect ACCEPT: %s at (%d,%d) %dx%d area=%d color=%s conf=%.2f",
|
||||
shape, bx, by, bw, bh, int(area), color_name, conf)
|
||||
logger.info("GraphicDetect PASS2 ACCEPT illustration at (%d,%d) %dx%d area=%d",
|
||||
bx, by, bw, bh, int(area))
|
||||
candidates.append(GraphicElement(
|
||||
x=bx, y=by, width=bw, height=bh,
|
||||
area=int(area),
|
||||
shape=shape,
|
||||
color_name=color_name,
|
||||
color_hex=color_hex,
|
||||
confidence=conf,
|
||||
contour=cnt,
|
||||
area=int(area), shape="illustration",
|
||||
color_name="black", color_hex="#000000",
|
||||
confidence=0.5, contour=cnt,
|
||||
))
|
||||
|
||||
if skip_reasons:
|
||||
logger.info("GraphicDetect: skipped contours: %s",
|
||||
", ".join(f"{k}={v}" for k, v in sorted(skip_reasons.items())))
|
||||
|
||||
# Sort by area descending, limit count
|
||||
# =====================================================================
|
||||
# Deduplicate overlapping results and return
|
||||
# =====================================================================
|
||||
candidates.sort(key=lambda g: g.area, reverse=True)
|
||||
result = candidates[:max_elements]
|
||||
|
||||
# Remove duplicates where bounding boxes overlap > 50%
|
||||
final: List[GraphicElement] = []
|
||||
for c in candidates:
|
||||
overlap = False
|
||||
for f in final:
|
||||
# Intersection
|
||||
ix1 = max(c.x, f.x)
|
||||
iy1 = max(c.y, f.y)
|
||||
ix2 = min(c.x + c.width, f.x + f.width)
|
||||
iy2 = min(c.y + c.height, f.y + f.height)
|
||||
if ix2 > ix1 and iy2 > iy1:
|
||||
inter = (ix2 - ix1) * (iy2 - iy1)
|
||||
smaller = min(c.width * c.height, f.width * f.height)
|
||||
if smaller > 0 and inter / smaller > 0.5:
|
||||
overlap = True
|
||||
break
|
||||
if not overlap:
|
||||
final.append(c)
|
||||
|
||||
result = final[:max_elements]
|
||||
|
||||
if result:
|
||||
shape_counts = {}
|
||||
shape_counts: Dict[str, int] = {}
|
||||
for g in result:
|
||||
shape_counts[g.shape] = shape_counts.get(g.shape, 0) + 1
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user