Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m54s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 18s
Previously color/shading detection only ran as fallback when no line-based boxes were found. Now both methods run in parallel with result merging, so smaller shaded boxes (like "German leihen") get detected even when larger bordered boxes are already found. Uses median-blur background analysis that works for both colored and grayscale/B&W scans. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
472 lines
15 KiB
Python
472 lines
15 KiB
Python
"""
|
|
Embedded box detection and page zone splitting for the CV vocabulary pipeline.
|
|
|
|
Detects boxes (grammar tips, exercises, etc.) that span the page width and
|
|
interrupt the normal column layout. Splits the page into vertical zones so
|
|
that column detection can run independently per zone.
|
|
|
|
Two-stage algorithm (both run, results merged):
|
|
1. Morphological line detection — finds bordered boxes via horizontal lines.
|
|
2. Background shading detection — finds shaded/colored boxes via median-blur
|
|
background analysis. Works for colored (blue, green) and grayscale
|
|
(gray shading on B/W scans) boxes.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Optional, Tuple
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
from cv_vocab_types import DetectedBox, PageZone
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
__all__ = [
|
|
"detect_boxes",
|
|
"split_page_into_zones",
|
|
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stage 1: Morphological line detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _detect_boxes_by_lines(
    gray: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Detect bordered boxes from pairs of long horizontal rules.

    The page is binarized, everything except long horizontal strokes is
    erased with a morphological opening, and the remaining rows are grouped
    into rule bands. Each band is then paired with the first later band that
    yields a plausible box height.

    Args:
        gray: Grayscale image (full page).
        content_x, content_w: Horizontal content bounds. Only these columns
            are counted when deciding whether a row belongs to a rule, and
            every returned box spans exactly this range.
        content_y: Accepted for signature symmetry; not used here (all image
            rows are scanned).
        content_h: Content height; caps the box height at 70% of it.

    Returns:
        One DetectedBox (confidence 0.8) per paired top/bottom rule, in
        top-to-bottom order of the top rule.
    """
    # Inverse threshold: pixels with value <= 180 become foreground (255).
    _, ink_mask = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

    # Opening with a 1-px-high kernel keeps only horizontal runs at least as
    # wide as the kernel (half the content width, but never below 50 px).
    span = max(50, content_w // 2)
    horiz_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (span, 1))
    rules = cv2.morphologyEx(ink_mask, cv2.MORPH_OPEN, horiz_kernel)

    # A row belongs to a rule when >= 30% of the content columns survived.
    content_cols = rules[:, content_x:content_x + content_w]
    row_hits = np.sum(content_cols > 0, axis=1)
    is_rule_row = row_hits >= content_w * 0.30

    # Run-length scan: pad with False on both sides so every run has a rising
    # and a falling edge, then read runs off as (start, end_exclusive) pairs.
    padded = np.concatenate(([False], is_rule_row, [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    rule_bands: List[Tuple[int, int]] = [
        (int(start), int(stop)) for start, stop in zip(flips[0::2], flips[1::2])
    ]

    if len(rule_bands) < 2:
        return []

    # Accepted box height: 30 px up to 70% of the content height.
    shortest = 30
    tallest = int(content_h * 0.70)

    found: List[DetectedBox] = []
    paired = set()
    for top_idx, (top_y0, top_y1) in enumerate(rule_bands):
        if top_idx in paired:
            continue
        for bot_idx in range(top_idx + 1, len(rule_bands)):
            if bot_idx in paired:
                continue
            bot_y0, bot_y1 = rule_bands[bot_idx]
            # Height runs from the top of the upper rule to the bottom of the lower one.
            span_h = bot_y1 - top_y0
            if not (shortest <= span_h <= tallest):
                continue

            found.append(DetectedBox(
                x=content_x,
                y=top_y0,
                width=content_w,
                height=span_h,
                confidence=0.8,
                # Border thickness is estimated as the thicker of the two rules.
                border_thickness=max(top_y1 - top_y0, bot_y1 - bot_y0),
            ))
            paired.update((top_idx, bot_idx))
            break  # this top rule is consumed; continue with the next one

    return found
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stage 2: Background shading detection (color + grayscale)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _detect_boxes_by_shading(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Detect borderless boxes that stand out by a shaded or tinted background.

    The page is median-blurred with a large kernel (intended to wipe out text
    strokes and leave the background), then regions that are either darker
    than the page background or noticeably saturated are extracted as
    contours and filtered by size, rectangularity and actual shading.

    Args:
        img_bgr: BGR color image (full page).
        content_x: Accepted for signature symmetry; not used here.
        content_w: Content width; scales the minimum area and minimum width.
        content_y: Accepted for signature symmetry; not used here.
        content_h: Content height; caps the box height at 70% of it.

    Returns:
        One DetectedBox per accepted region, confidence 0.7 when the region
        is colored and 0.6 when it is only gray-shaded; border_thickness 0.
    """
    page_h, page_w = img_bgr.shape[:2]

    # 31-px median filter: large enough that text is meant to vanish.
    smoothed = cv2.medianBlur(img_bgr, 31)
    bg_gray = cv2.cvtColor(smoothed, cv2.COLOR_BGR2GRAY)
    bg_sat = cv2.cvtColor(smoothed, cv2.COLOR_BGR2HSV)[:, :, 1]

    # Page background brightness = median of the two top corner patches.
    patch = max(20, min(page_h // 10, page_w // 10))
    corner_pixels = np.concatenate([
        bg_gray[:patch, :patch].ravel(),
        bg_gray[:patch, -patch:].ravel(),
    ])
    page_bg = float(np.median(corner_pixels))

    # Candidate pixels: clearly darker than the page (threshold is 30 below
    # the background but never below 150) OR with saturation above 20.
    darker_than_page = bg_gray < max(page_bg - 30, 150)
    tinted = bg_sat > 20
    candidate_mask = (darker_than_page | tinted).astype(np.uint8) * 255

    # Close small gaps first, then open to remove specks.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10))
    candidate_mask = cv2.morphologyEx(candidate_mask, cv2.MORPH_CLOSE, closing_kernel)
    opening_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    candidate_mask = cv2.morphologyEx(candidate_mask, cv2.MORPH_OPEN, opening_kernel)

    contours, _ = cv2.findContours(candidate_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Size limits; boxes as narrow as 25% of the content width are allowed.
    area_floor = content_w * 30  # equivalent to 30 px tall at full width
    height_floor = 25
    height_cap = int(content_h * 0.70)
    width_floor = content_w * 0.25

    shaded_boxes: List[DetectedBox] = []
    for contour in contours:
        contour_area = cv2.contourArea(contour)
        if contour_area < area_floor:
            continue

        left, top, box_w, box_h = cv2.boundingRect(contour)
        if box_w < width_floor:
            continue
        if not (height_floor <= box_h <= height_cap):
            continue

        # Rectangularity: the contour must fill at least half of its
        # bounding rectangle.
        bounding_area = box_w * box_h
        if bounding_area > 0 and contour_area / bounding_area < 0.5:
            continue

        gray_patch = bg_gray[top:top + box_h, left:left + box_w]
        if gray_patch.size == 0:
            continue
        sat_patch = bg_sat[top:top + box_h, left:left + box_w]

        # Re-check on the bounding rectangle: its median must really differ
        # from the page background, in brightness or in saturation.
        is_shaded = float(np.median(gray_patch)) < (page_bg - 15)
        is_colored = float(np.median(sat_patch)) > 15
        if not (is_shaded or is_colored):
            continue

        shaded_boxes.append(DetectedBox(
            x=left,
            y=top,
            width=box_w,
            height=box_h,
            confidence=0.7 if is_colored else 0.6,
            border_thickness=0,
        ))

    return shaded_boxes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Validation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _validate_box(
    box: DetectedBox,
    gray: np.ndarray,
    content_w: int,
    content_h: int,
    median_row_gap: int,
) -> bool:
    """Decide whether a candidate box looks like a real embedded box.

    Args:
        box: Candidate to check (x, y, width, height are read).
        gray: Grayscale page image used for the ink check.
        content_w: Content width; the box must span at least 25% of it.
        content_h: Content height; the box may be at most 70% of it tall.
        median_row_gap: Median row gap; when positive, boxes shorter than
            three times this value are rejected as table-row separators.

    Returns:
        True when the box passes all geometric checks and its region
        (clipped to the image) contains at least 0.2% dark pixels.
    """
    too_narrow = box.width < content_w * 0.25
    bad_height = box.height < 25 or box.height > content_h * 0.70
    if too_narrow or bad_height:
        return False

    # Two rules of a table row would otherwise pass as a tiny "box".
    if median_row_gap > 0 and box.height < median_row_gap * 3:
        return False

    # Clip the box to the image before sampling pixels.
    page_h, page_w = gray.shape[:2]
    top = max(0, box.y)
    bottom = min(page_h, box.y + box.height)
    left = max(0, box.x)
    right = min(page_w, box.x + box.width)
    region = gray[top:bottom, left:right]
    if region.size == 0:
        return False

    # Ink density: share of pixels darker than 128; nearly empty regions
    # are not treated as content boxes.
    dark_fraction = np.count_nonzero(region < 128) / region.size
    return bool(dark_fraction >= 0.002)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API: detect_boxes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _merge_overlapping_boxes(boxes: List[DetectedBox]) -> List[DetectedBox]:
    """Remove duplicate detections of the same region, keeping the strongest.

    Two boxes are duplicates when their intersection covers more than 50% of
    the smaller box's area (so full containment always counts). Of two
    duplicates the one with higher confidence wins; on equal confidence the
    larger one wins.

    Suppression is greedy in winner order: boxes are visited from strongest
    to weakest and a box is dropped only if it duplicates a box that is
    actually kept. A box that loses to a stronger one therefore never
    suppresses anything itself, so a weaker box that does not overlap the
    final winner is not lost.

    Args:
        boxes: Candidate boxes from all detectors.

    Returns:
        The surviving boxes, ordered by area (largest first). Lists with at
        most one element are returned unchanged.
    """
    if len(boxes) <= 1:
        return boxes

    def _is_duplicate(a, b) -> bool:
        """Return True if a and b overlap by more than half of the smaller box."""
        overlap_w = min(a.x + a.width, b.x + b.width) - max(a.x, b.x)
        overlap_h = min(a.y + a.height, b.y + b.height) - max(a.y, b.y)
        if overlap_w <= 0 or overlap_h <= 0:
            return False
        smaller_area = min(a.width * a.height, b.width * b.height)
        return smaller_area > 0 and (overlap_w * overlap_h) / smaller_area > 0.50

    # Area-descending order is both the tie-breaker and the output order.
    by_area = sorted(boxes, key=lambda b: b.width * b.height, reverse=True)

    # Visit strongest first; the sort is stable, so equal confidences keep
    # their area-descending order (larger box wins ties).
    visit_order = sorted(
        range(len(by_area)),
        key=lambda k: by_area[k].confidence,
        reverse=True,
    )

    survivors: List[int] = []
    for k in visit_order:
        if not any(_is_duplicate(by_area[k], by_area[s]) for s in survivors):
            survivors.append(k)

    return [by_area[k] for k in sorted(survivors)]
|
|
|
|
|
|
def detect_boxes(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
    median_row_gap: int = 0,
) -> List[DetectedBox]:
    """Find embedded boxes on a page image.

    Both detectors (border lines and background shading) always run; their
    candidates are concatenated, de-duplicated, validated and finally
    ordered from top to bottom.

    Args:
        img_bgr: BGR color image (full page or cropped).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.
        median_row_gap: Median row gap height, forwarded to validation to
            reject table-row separators (0 disables that check).

    Returns:
        Validated DetectedBox instances sorted by their y coordinate.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    bounds = (content_x, content_w, content_y, content_h)

    # Bordered boxes from the grayscale page, shaded boxes from the color page.
    line_boxes = _detect_boxes_by_lines(gray, *bounds)
    shade_boxes = _detect_boxes_by_shading(img_bgr, *bounds)
    n_line, n_shade = len(line_boxes), len(shade_boxes)

    logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
                 n_line, n_shade)

    # De-duplicate first, then validate what is left.
    merged = _merge_overlapping_boxes(line_boxes + shade_boxes)
    validated = sorted(
        (
            candidate
            for candidate in merged
            if _validate_box(candidate, gray, content_w, content_h, median_row_gap)
        ),
        key=lambda b: b.y,
    )

    if not validated:
        logger.debug("BoxDetect: no boxes detected")
        return validated

    logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
                len(validated), n_line, n_shade, len(merged))
    return validated
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Zone Splitting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def split_page_into_zones(
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    boxes: List[DetectedBox],
    min_zone_height: int = 40,
) -> List[PageZone]:
    """Split a page into vertical zones based on detected boxes.

    Regions above, between, and below boxes become 'content' zones spanning
    the full content width; each box becomes a 'box' zone with the box's own
    geometry. Zone indices are consecutive, starting at 0.

    Args:
        content_x, content_y, content_w, content_h: Content area bounds.
        boxes: Detected boxes, sorted by y position.
        min_zone_height: Minimum height for a content zone to be kept;
            shorter gaps are dropped (box zones are always kept).

    Returns:
        List of PageZone, ordered top to bottom. Without boxes, a single
        'content' zone covering the whole content area.
    """
    if not boxes:
        # Single zone: entire content area
        return [PageZone(
            index=0,
            zone_type='content',
            y=content_y,
            height=content_h,
            x=content_x,
            width=content_w,
        )]

    zones: List[PageZone] = []

    def _add_content_zone(top: int, height: int) -> None:
        """Append a full-width content zone unless it is too short."""
        if height >= min_zone_height:
            zones.append(PageZone(
                index=len(zones),
                zone_type='content',
                y=top,
                height=height,
                x=content_x,
                width=content_w,
            ))

    cursor_y = content_y
    for box in boxes:
        # Content between the lowest box edge seen so far and this box.
        _add_content_zone(cursor_y, box.y - cursor_y)

        zones.append(PageZone(
            index=len(zones),
            zone_type='box',
            y=box.y,
            height=box.height,
            x=box.x,
            width=box.width,
            box=box,
        ))

        # Never move the cursor upwards: a later box may end above an earlier
        # one (e.g. side-by-side boxes of different height), and the following
        # content zone must not overlap a box that was already emitted.
        cursor_y = max(cursor_y, box.y + box.height)

    # Content zone below the last box
    _add_content_zone(cursor_y, content_y + content_h - cursor_y)

    logger.info("ZoneSplit: %d zones from %d box(es): %s",
                len(zones), len(boxes), [z.zone_type for z in zones])

    return zones
|