Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 39s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m34s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 26s
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
472 lines
15 KiB
Python
472 lines
15 KiB
Python
"""
|
|
Embedded box detection and page zone splitting for the CV vocabulary pipeline.
|
|
|
|
Detects boxes (grammar tips, exercises, etc.) that span the page width and
|
|
interrupt the normal column layout. Splits the page into vertical zones so
|
|
that column detection can run independently per zone.
|
|
|
|
Two-stage algorithm (both run, results merged):
|
|
1. Morphological line detection — finds bordered boxes via horizontal lines.
|
|
2. Background shading detection — finds shaded/colored boxes via median-blur
|
|
background analysis. Works for colored (blue, green) and grayscale
|
|
(gray shading on B/W scans) boxes.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Optional, Tuple
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
from ..types import DetectedBox, PageZone
|
|
|
|
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# Names exported by ``from <module> import *``; the underscore-prefixed
# helpers defined below are intentionally left out.
__all__ = [
    "detect_boxes",
    "split_page_into_zones",
]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stage 1: Morphological line detection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _detect_boxes_by_lines(
    gray: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate bordered boxes by pairing long horizontal rules.

    The page is inverse-thresholded, opened with a wide one-pixel-tall kernel
    so that only long horizontal strokes survive, and the surviving rows are
    grouped into rule segments.  Each rule is then paired with the first rule
    below it that yields a plausible box height.

    Args:
        gray: Grayscale image (full page).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.  Only ``content_h`` is
            used (for the maximum box height); ``content_y`` is not read.

    Returns:
        One DetectedBox per paired top/bottom rule.  Every box spans the full
        content width and carries a fixed confidence of 0.8.
    """
    # Inverse threshold: pixels with value <= 180 become foreground (255).
    _, ink = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

    # Opening with a (width x 1) rectangle removes every horizontal run that
    # is shorter than the kernel, i.e. shorter than half the content width
    # (but never less than 50 px).
    rule_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(50, content_w // 2), 1)
    )
    rules = cv2.morphologyEx(ink, cv2.MORPH_OPEN, rule_kernel)

    # A row belongs to a rule when at least 30% of the content columns are set.
    row_counts = np.sum(rules[:, content_x:content_x + content_w] > 0, axis=1)
    is_rule_row = row_counts >= content_w * 0.30

    # Run-length grouping: pad with zeros so that runs touching the first or
    # last row still produce a rising and a falling edge.  Each segment is
    # (first_row, one_past_last_row); .tolist() keeps the values plain ints.
    edges = np.diff(np.concatenate(([0], is_rule_row.astype(np.int8), [0])))
    run_starts = np.flatnonzero(edges == 1).tolist()
    run_ends = np.flatnonzero(edges == -1).tolist()
    segments: List[Tuple[int, int]] = list(zip(run_starts, run_ends))

    if len(segments) < 2:
        return []

    # Accept box heights from 30 px up to 70% of the content height.
    min_box_h = 30
    max_box_h = int(content_h * 0.70)

    found: List[DetectedBox] = []
    consumed = set()  # indices of rules already assigned to a box
    for top_idx, (top_y0, top_y1) in enumerate(segments):
        if top_idx in consumed:
            continue
        for bot_idx in range(top_idx + 1, len(segments)):
            if bot_idx in consumed:
                continue
            bot_y0, bot_y1 = segments[bot_idx]

            # Height is measured from the top of the upper rule to the
            # bottom of the lower rule.
            span = bot_y1 - top_y0
            if not (min_box_h <= span <= max_box_h):
                continue

            # Border thickness is the thicker of the two rules.
            thickness = max(top_y1 - top_y0, bot_y1 - bot_y0)
            found.append(DetectedBox(
                x=content_x,
                y=top_y0,
                width=content_w,
                height=span,
                confidence=0.8,
                border_thickness=thickness,
            ))
            consumed.update((top_idx, bot_idx))
            break  # this top rule is matched; continue with the next one

    return found
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stage 2: Background shading detection (color + grayscale)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _detect_boxes_by_shading(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate boxes that stand out by background shading or color.

    A strong median blur wipes out text strokes so that only the background
    remains.  Pixels that are clearly darker than the page background, or
    that carry noticeable saturation, form a mask whose external contours
    are filtered by size, rectangularity and actual shading.

    Args:
        img_bgr: BGR color image (full page).
        content_x, content_w: Horizontal content bounds.  Only ``content_w``
            is used (size thresholds); ``content_x`` is not read.
        content_y, content_h: Vertical content bounds.  Only ``content_h`` is
            used (maximum box height); ``content_y`` is not read.

    Returns:
        One DetectedBox per accepted region, with confidence 0.7 for colored
        regions and 0.6 for purely gray-shaded ones; border_thickness is 0.
    """
    page_h, page_w = img_bgr.shape[:2]

    # 31 px median blur: large enough to erase text and keep the background.
    smooth = cv2.medianBlur(img_bgr, 31)
    smooth_gray = cv2.cvtColor(smooth, cv2.COLOR_BGR2GRAY)
    saturation = cv2.cvtColor(smooth, cv2.COLOR_BGR2HSV)[:, :, 1]

    # Page background brightness = median of the top-left and top-right
    # corner patches of the blurred image.
    patch = max(20, min(page_h // 10, page_w // 10))
    corner_pixels = np.concatenate(
        (smooth_gray[:patch, :patch], smooth_gray[:patch, -patch:]),
        axis=None,
    )
    page_bg = float(np.median(corner_pixels))

    # Gray shading: at least 30 levels darker than the page, with the
    # threshold never dropping below 150.
    shade_thresh = max(page_bg - 30, 150)
    darker = np.where(smooth_gray < shade_thresh, 255, 0).astype(np.uint8)

    # Color: saturation above 20.
    tinted = np.where(saturation > 20, 255, 0).astype(np.uint8)

    mask = cv2.bitwise_or(darker, tinted)

    # Close small gaps first, then open to remove isolated specks.
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_CLOSE, cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10))
    )
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_OPEN, cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    )

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Size limits.  The area floor equals a full-width strip 30 px tall;
    # width may be as small as 25% of the content width.
    min_area = content_w * 30
    min_width = content_w * 0.25
    min_box_h = 25
    max_box_h = int(content_h * 0.70)

    found: List[DetectedBox] = []
    for contour in contours:
        contour_area = cv2.contourArea(contour)
        if contour_area < min_area:
            continue

        bx, by, bw, bh = cv2.boundingRect(contour)
        too_narrow = bw < min_width
        bad_height = bh < min_box_h or bh > max_box_h
        if too_narrow or bad_height:
            continue

        # Rectangularity: the contour must fill at least half of its
        # bounding rectangle.
        bbox_area = bw * bh
        if bbox_area > 0 and contour_area / bbox_area < 0.5:
            continue

        # Confirm on the blurred image that the region really is shaded.
        region_gray = smooth_gray[by:by + bh, bx:bx + bw]
        region_sat = saturation[by:by + bh, bx:bx + bw]
        if region_gray.size == 0:
            continue

        is_shaded = float(np.median(region_gray)) < (page_bg - 15)
        is_colored = float(np.median(region_sat)) > 15
        if not (is_shaded or is_colored):
            continue

        found.append(DetectedBox(
            x=bx,
            y=by,
            width=bw,
            height=bh,
            confidence=0.7 if is_colored else 0.6,
            border_thickness=0,
        ))

    return found
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Validation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _validate_box(
    box: DetectedBox,
    gray: np.ndarray,
    content_w: int,
    content_h: int,
    median_row_gap: int,
) -> bool:
    """Decide whether a candidate box looks like a real content box.

    Args:
        box: Candidate box (reads x, y, width, height).
        gray: Grayscale page image used for the ink check.
        content_w: Content width; the box must cover at least 25% of it.
        content_h: Content height; the box may cover at most 70% of it.
        median_row_gap: Median row gap; when positive, the box must be at
            least three times as tall so that table-row separators are
            rejected.  Zero or negative disables this check.

    Returns:
        True if the box passes all size checks and contains some ink.
    """
    wide_enough = box.width >= content_w * 0.25
    height_ok = 25 <= box.height <= content_h * 0.70
    if not (wide_enough and height_ok):
        return False

    # Guard against mistaking the space between two table rows for a box.
    if median_row_gap > 0 and box.height < median_row_gap * 3:
        return False

    # Clip the box to the image before sampling pixels.
    page_h, page_w = gray.shape[:2]
    top, bottom = max(0, box.y), min(page_h, box.y + box.height)
    left, right = max(0, box.x), min(page_w, box.x + box.width)
    region = gray[top:bottom, left:right]
    if region.size == 0:
        return False

    # Ink density: fraction of pixels darker than 128.  A region with less
    # than 0.2% ink is considered empty and therefore not a content box.
    dark_fraction = np.count_nonzero(region < 128) / region.size
    return bool(dark_fraction >= 0.002)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Merging helper and public API: detect_boxes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _merge_overlapping_boxes(boxes: List[DetectedBox]) -> List[DetectedBox]:
    """Drop duplicate detections of the same box, keeping the strongest one.

    Two boxes are treated as duplicates when their intersection covers more
    than 50% of the smaller box's area (full containment included).  Among
    duplicates the box with the higher confidence survives; on equal
    confidence the larger box survives.

    Candidates are visited strongest-first, and a candidate is discarded only
    when it duplicates a box that has already been kept.  A box that is itself
    discarded therefore never suppresses another box.

    Args:
        boxes: Candidate boxes; each must expose x, y, width, height and
            confidence.

    Returns:
        The surviving boxes ordered by area, largest first.  A list with
        fewer than two boxes is returned unchanged.
    """
    if len(boxes) <= 1:
        return boxes

    def is_duplicate(a, b) -> bool:
        """Return True if the intersection exceeds half of the smaller box."""
        left = max(a.x, b.x)
        top = max(a.y, b.y)
        right = min(a.x + a.width, b.x + b.width)
        bottom = min(a.y + a.height, b.y + b.height)
        if right <= left or bottom <= top:
            return False  # no overlap at all
        smaller_area = min(a.width * a.height, b.width * b.height)
        if smaller_area <= 0:
            return False
        return (right - left) * (bottom - top) / smaller_area > 0.50

    # Area-descending order is both the tie-break order and the output order.
    by_area = sorted(boxes, key=lambda b: b.width * b.height, reverse=True)

    # Strongest first.  sorted() is stable, so boxes with equal confidence
    # keep their area-descending order and the larger one is visited first.
    visit_order = sorted(
        range(len(by_area)),
        key=lambda idx: by_area[idx].confidence,
        reverse=True,
    )

    kept: List[int] = []
    for idx in visit_order:
        candidate = by_area[idx]
        if not any(is_duplicate(candidate, by_area[k]) for k in kept):
            kept.append(idx)

    return [by_area[idx] for idx in sorted(kept)]
|
|
|
|
|
|
def detect_boxes(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
    median_row_gap: int = 0,
) -> List[DetectedBox]:
    """Find embedded boxes on a page image.

    Both detectors (border lines and background shading) are run on the same
    image; their candidates are concatenated, deduplicated, validated and
    finally ordered from top to bottom.

    Args:
        img_bgr: BGR color image (full page or cropped).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.
        median_row_gap: Median row gap height, forwarded to validation to
            filter out table separators.

    Returns:
        Validated DetectedBox instances sorted by their y position.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    bounds = (content_x, content_w, content_y, content_h)

    # Stage 1 works on the grayscale image, stage 2 on the color image.
    line_boxes = _detect_boxes_by_lines(gray, *bounds)
    shade_boxes = _detect_boxes_by_shading(img_bgr, *bounds)

    logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
                 len(line_boxes), len(shade_boxes))

    # Deduplicate across both stages before validating.
    merged = _merge_overlapping_boxes(line_boxes + shade_boxes)

    validated = sorted(
        (
            candidate
            for candidate in merged
            if _validate_box(candidate, gray, content_w, content_h, median_row_gap)
        ),
        key=lambda candidate: candidate.y,
    )

    if not validated:
        logger.debug("BoxDetect: no boxes detected")
        return validated

    logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
                len(validated), len(line_boxes), len(shade_boxes), len(merged))
    return validated
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Zone Splitting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def split_page_into_zones(
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    boxes: List[DetectedBox],
    min_zone_height: int = 40,
) -> List[PageZone]:
    """Split a page into vertical zones based on detected boxes.

    Regions above, between, and below boxes become 'content' zones that span
    the full content width; every box becomes a 'box' zone with the box's own
    geometry.

    Args:
        content_x, content_y, content_w, content_h: Content area bounds.
        boxes: Detected boxes.  They are processed in ascending y order; an
            already sorted list is handled exactly as before.
        min_zone_height: Minimum height for a content zone to be kept.
            Shorter gaps are dropped (no zone is emitted for them).

    Returns:
        List of PageZone, ordered top to bottom, with consecutive ``index``
        values starting at 0.
    """
    if not boxes:
        # No boxes: the entire content area is a single content zone.
        return [PageZone(
            index=0,
            zone_type='content',
            y=content_y,
            height=content_h,
            x=content_x,
            width=content_w,
        )]

    zones: List[PageZone] = []
    cursor_y = content_y  # lowest y already covered by an emitted zone
    content_bottom = content_y + content_h

    # sorted() is stable, so pre-sorted input keeps its order.
    for box in sorted(boxes, key=lambda b: b.y):
        # Content zone between the cursor and the top of this box.
        gap_above = box.y - cursor_y
        if gap_above >= min_zone_height:
            zones.append(PageZone(
                index=len(zones),
                zone_type='content',
                y=cursor_y,
                height=gap_above,
                x=content_x,
                width=content_w,
            ))

        zones.append(PageZone(
            index=len(zones),
            zone_type='box',
            y=box.y,
            height=box.height,
            x=box.x,
            width=box.width,
            box=box,
        ))

        # Never move the cursor upward: a box that ends above an earlier box
        # (e.g. two boxes side by side) must not re-open rows that the
        # earlier box already covers.
        cursor_y = max(cursor_y, box.y + box.height)

    # Content zone below the last box.
    remaining = content_bottom - cursor_y
    if remaining >= min_zone_height:
        zones.append(PageZone(
            index=len(zones),
            zone_type='content',
            y=cursor_y,
            height=remaining,
            x=content_x,
            width=content_w,
        ))

    # Lazy %-formatting, consistent with the other log calls in this module;
    # the rendered message is unchanged.
    logger.info("ZoneSplit: %d zones from %d box(es): %s",
                len(zones), len(boxes), [z.zone_type for z in zones])

    return zones
|