feat: run shading-based box detection alongside line detection
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 18s

Previously, color/shading detection ran only as a fallback when no line-based
boxes were found. Now both detectors always run and their results are merged
and deduplicated, so smaller shaded boxes (like "German leihen") are detected
even when larger bordered boxes have already been found. The shading detector
uses median-blur background analysis, which works for both colored and
grayscale/B&W scans.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 08:12:52 +01:00
parent a6951940b9
commit fbbec6cf5e

View File

@@ -5,9 +5,11 @@ Detects boxes (grammar tips, exercises, etc.) that span the page width and
interrupt the normal column layout. Splits the page into vertical zones so interrupt the normal column layout. Splits the page into vertical zones so
that column detection can run independently per zone. that column detection can run independently per zone.
Two-stage algorithm: Two-stage algorithm (both run, results merged):
1. Morphological line detection — finds bordered boxes via horizontal lines. 1. Morphological line detection — finds bordered boxes via horizontal lines.
2. Color/saturation fallback — finds shaded boxes without visible borders. 2. Background shading detection — finds shaded/colored boxes via median-blur
background analysis. Works for colored (blue, green) and grayscale
(gray shading on B/W scans) boxes.
Lizenz: Apache 2.0 (kommerziell nutzbar) Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -121,10 +123,10 @@ def _detect_boxes_by_lines(
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Stage 2: Color / saturation fallback # Stage 2: Background shading detection (color + grayscale)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _detect_boxes_by_color( def _detect_boxes_by_shading(
img_bgr: np.ndarray, img_bgr: np.ndarray,
content_x: int, content_x: int,
content_w: int, content_w: int,
@@ -133,6 +135,11 @@ def _detect_boxes_by_color(
) -> List[DetectedBox]: ) -> List[DetectedBox]:
"""Find boxes with shaded/colored background (no visible border lines). """Find boxes with shaded/colored background (no visible border lines).
Uses heavy median blur to remove text and reveal the underlying background.
Then detects rectangular regions where the background differs from white.
Works for both colored boxes (blue, green) and grayscale shading (gray on
B/W scans).
Args: Args:
img_bgr: BGR color image (full page). img_bgr: BGR color image (full page).
content_x, content_w: Horizontal content bounds. content_x, content_w: Horizontal content bounds.
@@ -143,24 +150,43 @@ def _detect_boxes_by_color(
""" """
h, w = img_bgr.shape[:2] h, w = img_bgr.shape[:2]
hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV) # --- Heavy median blur removes text strokes, keeps background ---
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) blur_size = 31 # large kernel to wipe out text
blurred = cv2.medianBlur(img_bgr, blur_size)
blur_gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
blur_hsv = cv2.cvtColor(blurred, cv2.COLOR_BGR2HSV)
# Mask: pixels that are saturated OR noticeably darker than white # Estimate page background from top-left / top-right corners
sat_mask = hsv[:, :, 1] > 25 corner_size = max(20, min(h // 10, w // 10))
dark_mask = gray < 220 corners = np.concatenate([
combined = (sat_mask | dark_mask).astype(np.uint8) * 255 blur_gray[:corner_size, :corner_size].ravel(),
blur_gray[:corner_size, -corner_size:].ravel(),
])
page_bg = float(np.median(corners))
# Close small gaps in the mask # Two masks: grayscale shading + color saturation
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15)) # Grayscale: regions noticeably darker than the page background
combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, kernel) shade_thresh = max(page_bg - 30, 150)
gray_mask = (blur_gray < shade_thresh).astype(np.uint8) * 255
# Color: regions with noticeable saturation (blue/green/etc. boxes)
sat_mask = (blur_hsv[:, :, 1] > 20).astype(np.uint8) * 255
combined = cv2.bitwise_or(gray_mask, sat_mask)
# Morphological cleanup: close gaps, remove small noise
kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10))
combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, kernel_close)
kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
combined = cv2.morphologyEx(combined, cv2.MORPH_OPEN, kernel_open)
contours, _ = cv2.findContours(combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) contours, _ = cv2.findContours(combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
min_area = content_w * content_h * 0.05 # Size thresholds: smaller boxes allowed (e.g. "German leihen" ~30% width)
min_box_h = 30 min_area = content_w * 30 # at least 30px tall at full width
min_box_h = 25
max_box_h = int(content_h * 0.70) max_box_h = int(content_h * 0.70)
min_width_ratio = 0.60 min_width_ratio = 0.25 # boxes can be ~25% of content width
boxes: List[DetectedBox] = [] boxes: List[DetectedBox] = []
for cnt in contours: for cnt in contours:
@@ -168,15 +194,9 @@ def _detect_boxes_by_color(
if area < min_area: if area < min_area:
continue continue
# Approximate to polygon — check if roughly rectangular
peri = cv2.arcLength(cnt, True)
approx = cv2.approxPolyDP(cnt, 0.04 * peri, True)
if len(approx) < 4 or len(approx) > 8:
continue
bx, by, bw, bh = cv2.boundingRect(cnt) bx, by, bw, bh = cv2.boundingRect(cnt)
# Width filter: must span most of the page # Width filter
if bw < content_w * min_width_ratio: if bw < content_w * min_width_ratio:
continue continue
@@ -184,12 +204,35 @@ def _detect_boxes_by_color(
if bh < min_box_h or bh > max_box_h: if bh < min_box_h or bh > max_box_h:
continue continue
# Rectangularity check: contour area / bounding-rect area must be at least 0.5
rect_area = bw * bh
if rect_area > 0 and area / rect_area < 0.5:
continue
# Verify that the background inside this region is actually shaded
roi_gray = blur_gray[by:by + bh, bx:bx + bw]
roi_hsv = blur_hsv[by:by + bh, bx:bx + bw]
if roi_gray.size == 0:
continue
median_val = float(np.median(roi_gray))
median_sat = float(np.median(roi_hsv[:, :, 1]))
# Must be noticeably different from page background
is_shaded = median_val < (page_bg - 15)
is_colored = median_sat > 15
if not is_shaded and not is_colored:
continue
conf = 0.7 if is_colored else 0.6
boxes.append(DetectedBox( boxes.append(DetectedBox(
x=bx, x=bx,
y=by, y=by,
width=bw, width=bw,
height=bh, height=bh,
confidence=0.6, confidence=conf,
border_thickness=0, border_thickness=0,
)) ))
@@ -208,12 +251,12 @@ def _validate_box(
median_row_gap: int, median_row_gap: int,
) -> bool: ) -> bool:
"""Validate that a detected box is genuine (not a table-row separator etc.).""" """Validate that a detected box is genuine (not a table-row separator etc.)."""
# Must span > 60% of content width # Must span > 25% of content width (lowered from 60% to allow smaller boxes)
if box.width < content_w * 0.60: if box.width < content_w * 0.25:
return False return False
# Height constraints # Height constraints
if box.height < 30 or box.height > content_h * 0.70: if box.height < 25 or box.height > content_h * 0.70:
return False return False
# Must not be confused with a table-row separator: # Must not be confused with a table-row separator:
@@ -222,7 +265,12 @@ def _validate_box(
return False return False
# Must contain some text (ink density check) # Must contain some text (ink density check)
roi = gray[box.y:box.y + box.height, box.x:box.x + box.width] h, w = gray.shape[:2]
y1 = max(0, box.y)
y2 = min(h, box.y + box.height)
x1 = max(0, box.x)
x2 = min(w, box.x + box.width)
roi = gray[y1:y2, x1:x2]
if roi.size == 0: if roi.size == 0:
return False return False
ink_ratio = np.sum(roi < 128) / roi.size ink_ratio = np.sum(roi < 128) / roi.size
@@ -236,6 +284,54 @@ def _validate_box(
# Public API: detect_boxes # Public API: detect_boxes
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _overlap_of_smaller(a, b) -> float:
    """Return the intersection area as a fraction of the smaller box's area.

    Yields 0.0 when the boxes do not intersect or when the smaller box has
    no area, so degenerate boxes never count as duplicates.
    """
    inter_w = min(a.x + a.width, b.x + b.width) - max(a.x, b.x)
    inter_h = min(a.y + a.height, b.y + b.height) - max(a.y, b.y)
    if inter_w <= 0 or inter_h <= 0:
        return 0.0
    smaller_area = min(a.width * a.height, b.width * b.height)
    if smaller_area <= 0:
        return 0.0
    return (inter_w * inter_h) / smaller_area


def _merge_overlapping_boxes(boxes: "List[DetectedBox]") -> "List[DetectedBox]":
    """Deduplicate boxes that cover substantially the same region.

    Two boxes count as duplicates when their intersection covers more than
    50% of the smaller box's area (this also catches one box containing the
    other). Of two duplicates, the box with the higher confidence survives;
    at equal confidence the larger one wins. Boxes are never fused into a
    union rectangle -- the weaker candidate is simply dropped.

    Candidates are visited strongest-first and compared only against boxes
    that have already been accepted. A box is therefore never removed
    because of a candidate that is itself discarded later.

    Args:
        boxes: Candidate boxes; each must expose ``x``, ``y``, ``width``,
            ``height`` and ``confidence``.

    Returns:
        The surviving boxes ordered by area, largest first. An input with
        at most one box is returned unchanged.
    """
    if len(boxes) <= 1:
        return boxes

    # Area-descending order is both the tie-breaker and the output order.
    by_area = sorted(boxes, key=lambda b: b.width * b.height, reverse=True)

    # sorted() is stable (also with reverse=True), so boxes with equal
    # confidence stay in area-descending order: larger boxes are tried first.
    ranked = sorted(
        range(len(by_area)),
        key=lambda idx: by_area[idx].confidence,
        reverse=True,
    )

    accepted = []
    for idx in ranked:
        candidate = by_area[idx]
        is_duplicate = any(
            _overlap_of_smaller(by_area[kept_idx], candidate) > 0.50
            for kept_idx in accepted
        )
        if not is_duplicate:
            accepted.append(idx)

    accepted_set = set(accepted)
    return [box for idx, box in enumerate(by_area) if idx in accepted_set]
def detect_boxes( def detect_boxes(
img_bgr: np.ndarray, img_bgr: np.ndarray,
content_x: int, content_x: int,
@@ -246,8 +342,8 @@ def detect_boxes(
) -> List[DetectedBox]: ) -> List[DetectedBox]:
"""Detect embedded boxes on a page image. """Detect embedded boxes on a page image.
Runs line-based detection first, then color-based fallback if no Runs BOTH line-based and shading-based detection, then merges and
bordered boxes are found. deduplicates results.
Args: Args:
img_bgr: BGR color image (full page or cropped). img_bgr: BGR color image (full page or cropped).
@@ -260,22 +356,28 @@ def detect_boxes(
""" """
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
# Stage 1: Line-based detection # Stage 1: Line-based detection (bordered boxes)
boxes = _detect_boxes_by_lines(gray, content_x, content_w, content_y, content_h) line_boxes = _detect_boxes_by_lines(gray, content_x, content_w, content_y, content_h)
# Stage 2: Color fallback if no bordered boxes found # Stage 2: Shading-based detection (colored/gray background boxes)
if not boxes: shade_boxes = _detect_boxes_by_shading(img_bgr, content_x, content_w, content_y, content_h)
boxes = _detect_boxes_by_color(img_bgr, content_x, content_w, content_y, content_h)
logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
len(line_boxes), len(shade_boxes))
# Combine and deduplicate
all_boxes = line_boxes + shade_boxes
merged = _merge_overlapping_boxes(all_boxes)
# Validate # Validate
validated = [b for b in boxes if _validate_box(b, gray, content_w, content_h, median_row_gap)] validated = [b for b in merged if _validate_box(b, gray, content_w, content_h, median_row_gap)]
# Sort top to bottom # Sort top to bottom
validated.sort(key=lambda b: b.y) validated.sort(key=lambda b: b.y)
if validated: if validated:
logger.info(f"BoxDetect: {len(validated)} box(es) detected " logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
f"(from {len(boxes)} candidates)") len(validated), len(line_boxes), len(shade_boxes), len(merged))
else: else:
logger.debug("BoxDetect: no boxes detected") logger.debug("BoxDetect: no boxes detected")