feat: generische Box-Erkennung fuer zonenbasierte Spaltenerkennung

- Neue Datei cv_box_detect.py: 2-Stufen-Algorithmus (Linien + Farbe)
- DetectedBox/PageZone Dataclasses in cv_vocab_types.py
- detect_column_geometry_zoned() in cv_layout.py
- API-Endpoints erweitert: zones/boxes_detected im column_result
- Overlay-Funktionen zeichnen Box-Grenzen als gestrichelte Rechtecke
- Fix: numpy array or-Verknuepfung an 7 Stellen in ocr_pipeline_api.py
- 12 Unit-Tests fuer Box-Erkennung und Zone-Splitting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:06:23 +01:00
parent e60254bc75
commit 7005b18561
6 changed files with 821 additions and 14 deletions
--- a/klausur-service/backend/cv_box_detect.py
+++ b/klausur-service/backend/cv_box_detect.py
@@ -0,0 +1,369 @@
+"""
+Embedded box detection and page zone splitting for the CV vocabulary pipeline.
+
+Detects boxes (grammar tips, exercises, etc.) that span the page width and
+interrupt the normal column layout. Splits the page into vertical zones so
+that column detection can run independently per zone.
+
+Two-stage algorithm:
+  1. Morphological line detection — finds bordered boxes via horizontal lines.
+  2. Color/saturation fallback — finds shaded boxes without visible borders.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+import cv2
+import numpy as np
+
+from cv_vocab_types import DetectedBox, PageZone
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Names exported by `from cv_box_detect import *`; the underscore-prefixed
+# detection and validation helpers in this module are not exported.
+__all__ = [
+    "detect_boxes",
+    "split_page_into_zones",
+]
+
+
+# ---------------------------------------------------------------------------
+# Stage 1: Morphological line detection
+# ---------------------------------------------------------------------------
+
+def _detect_boxes_by_lines(
+    gray: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+) -> List[DetectedBox]:
+    """Detect bordered boxes from pairs of long horizontal rules.
+
+    Dark pixels are isolated with a fixed threshold, a wide horizontal
+    opening keeps only long horizontal runs, and rows holding enough of
+    those pixels are grouped into line segments.  Segments are then paired
+    greedily, top to bottom, into boxes spanning the content width.
+
+    Args:
+        gray: Grayscale image (full page).
+        content_x, content_w: Horizontal content bounds.  They limit the
+            columns that are scanned and define x/width of every box.
+        content_y, content_h: Vertical content bounds.  Only content_h is
+            used (to derive the maximum box height).
+            NOTE(review): content_y is unused, so rules above or below the
+            content area are candidates too -- confirm this is intended.
+
+    Returns:
+        One DetectedBox (confidence 0.8) per paired top/bottom line, in
+        top-to-bottom order.  Empty when fewer than two lines are found.
+    """
+    # Inverse threshold: pixels at or below 180 become foreground (255).
+    _, ink = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
+
+    # Opening with a 1-px-high kernel that is half the content width wide
+    # (never below 50 px) removes everything except long horizontal runs.
+    run_len = max(50, content_w // 2)
+    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (run_len, 1))
+    rules = cv2.morphologyEx(ink, cv2.MORPH_OPEN, h_kernel)
+
+    # Per-row count of surviving pixels inside the content columns; a row
+    # belongs to a line once it reaches 30% of the content width.
+    x_end = content_x + content_w
+    row_counts = np.sum(rules[:, content_x:x_end] > 0, axis=1)
+    is_line_row = row_counts >= content_w * 0.30
+
+    # Run-length encode the row mask: pad with False on both ends so every
+    # run has a rising and a falling edge, then read the edges pairwise as
+    # (y_start, y_end) with y_end exclusive.
+    padded = np.concatenate(([False], is_line_row, [False]))
+    edges = np.flatnonzero(padded[1:] != padded[:-1]).tolist()
+    line_segments: List[Tuple[int, int]] = list(zip(edges[0::2], edges[1::2]))
+
+    if len(line_segments) < 2:
+        return []
+
+    # Accepted box height: at least 30 px, at most 70% of content height.
+    min_box_h = 30
+    max_box_h = int(content_h * 0.70)
+
+    boxes: List[DetectedBox] = []
+    paired = set()  # indices of segments already consumed by a box
+    for top_idx, (top_start, top_end) in enumerate(line_segments):
+        if top_idx in paired:
+            continue
+        for bot_idx in range(top_idx + 1, len(line_segments)):
+            if bot_idx in paired:
+                continue
+            bot_start, bot_end = line_segments[bot_idx]
+            box_h = bot_end - top_start
+            if box_h < min_box_h:
+                continue
+            if box_h > max_box_h:
+                # Segments are in ascending order, so every later bottom
+                # line would give an even taller box.
+                break
+
+            # Border thickness is estimated as the thicker of the two rules.
+            thickness = max(top_end - top_start, bot_end - bot_start)
+            boxes.append(DetectedBox(
+                x=content_x,
+                y=top_start,
+                width=content_w,
+                height=box_h,
+                confidence=0.8,
+                border_thickness=thickness,
+            ))
+            paired.update((top_idx, bot_idx))
+            break  # this top line is consumed; go on to the next one
+
+    return boxes
+
+
+# ---------------------------------------------------------------------------
+# Stage 2: Color / saturation fallback
+# ---------------------------------------------------------------------------
+
+def _detect_boxes_by_color(
+    img_bgr: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+) -> List[DetectedBox]:
+    """Find boxes with shaded/colored background (no visible border lines).
+
+    Builds a mask of pixels that are either saturated or darker than
+    near-white, closes small gaps, and keeps external contours that are
+    large, wide, of plausible height and roughly polygonal.
+
+    Args:
+        img_bgr: BGR color image (full page).
+        content_x, content_w: Horizontal content bounds.  Only content_w is
+            used (area and width thresholds).
+        content_y, content_h: Vertical content bounds.  Only content_h is
+            used (area and height thresholds).
+            NOTE(review): the mask covers the whole image and boxes are not
+            clipped to the content area -- confirm this is intended.
+
+    Returns:
+        One DetectedBox (confidence 0.6, border_thickness 0) per accepted
+        contour, using the contour's bounding rectangle.
+    """
+    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+
+    # Mask: pixels that are saturated OR noticeably darker than white
+    sat_mask = hsv[:, :, 1] > 25
+    dark_mask = gray < 220
+    combined = (sat_mask | dark_mask).astype(np.uint8) * 255
+
+    # Close small gaps in the mask
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))
+    combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, kernel)
+
+    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
+    # (image, contours, hierarchy) in 3.x; index -2 is the contour list in
+    # every version, so this does not break on a 3.x install.
+    contours = cv2.findContours(
+        combined, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+    )[-2]
+
+    min_area = content_w * content_h * 0.05
+    min_box_h = 30
+    max_box_h = int(content_h * 0.70)
+    min_box_w = content_w * 0.60
+
+    boxes: List[DetectedBox] = []
+    for cnt in contours:
+        if cv2.contourArea(cnt) < min_area:
+            continue
+
+        # Cheap bounding-box filters first: the box must span most of the
+        # content width and have a plausible height.
+        bx, by, bw, bh = cv2.boundingRect(cnt)
+        if bw < min_box_w:
+            continue
+        if bh < min_box_h or bh > max_box_h:
+            continue
+
+        # Approximate to polygon — keep only roughly rectangular shapes
+        # (4 to 8 vertices after simplification).
+        peri = cv2.arcLength(cnt, True)
+        approx = cv2.approxPolyDP(cnt, 0.04 * peri, True)
+        if len(approx) < 4 or len(approx) > 8:
+            continue
+
+        boxes.append(DetectedBox(
+            x=bx,
+            y=by,
+            width=bw,
+            height=bh,
+            confidence=0.6,
+            border_thickness=0,
+        ))
+
+    return boxes
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+def _validate_box(
+    box: DetectedBox,
+    gray: np.ndarray,
+    content_w: int,
+    content_h: int,
+    median_row_gap: int,
+) -> bool:
+    """Decide whether a candidate box is a genuine embedded box.
+
+    A box is accepted only if it passes all checks, in this order:
+      * width is at least 60% of the content width,
+      * height lies between 30 px and 70% of the content height,
+      * height is at least 3x median_row_gap (skipped when the gap is 0
+        or negative) so table-row separators are not taken for boxes,
+      * at least 0.2% of the pixels inside the box are darker than 128.
+
+    Args:
+        box: Candidate box; x, y, width and height are read.
+        gray: Grayscale page image the box coordinates refer to.
+        content_w, content_h: Size of the content area.
+        median_row_gap: Median row gap height; 0 disables that check.
+
+    Returns:
+        True if the box passes every check, otherwise False.
+    """
+    # Width: must cover most of the content area.
+    if not box.width >= content_w * 0.60:
+        return False
+
+    # Height: neither a sliver nor most of the page.
+    if not 30 <= box.height <= content_h * 0.70:
+        return False
+
+    # Row-gap guard against table-row separators.
+    if median_row_gap > 0:
+        if box.height < median_row_gap * 3:
+            return False
+
+    # Ink density inside the box region.
+    # NOTE(review): for line-detected boxes the border rules lie inside this
+    # region and count as ink too -- confirm that is acceptable.
+    y0, x0 = box.y, box.x
+    region = gray[y0:y0 + box.height, x0:x0 + box.width]
+    if region.size == 0:
+        return False
+    dark_pixels = np.count_nonzero(region < 128)
+    return dark_pixels / region.size >= 0.002
+
+
+# ---------------------------------------------------------------------------
+# Public API: detect_boxes
+# ---------------------------------------------------------------------------
+
+def detect_boxes(
+    img_bgr: np.ndarray,
+    content_x: int,
+    content_w: int,
+    content_y: int,
+    content_h: int,
+    median_row_gap: int = 0,
+) -> List[DetectedBox]:
+    """Find embedded boxes on a page and return the validated ones.
+
+    Stage 1 looks for boxes bordered by horizontal lines.  Stage 2 (shaded
+    background) runs only when stage 1 yields no candidates at all; it is
+    not run when stage-1 candidates exist but are all rejected afterwards.
+    Candidates from whichever stage produced them are then validated.
+
+    Args:
+        img_bgr: BGR color image (full page or cropped).
+        content_x, content_w: Horizontal content bounds.
+        content_y, content_h: Vertical content bounds.
+        median_row_gap: Median row gap height, used during validation to
+            filter out table separators.  0 disables that filter.
+
+    Returns:
+        Validated DetectedBox instances, sorted by y position.
+    """
+    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    bounds = (content_x, content_w, content_y, content_h)
+
+    # An empty (falsy) stage-1 result falls through to the color stage.
+    boxes = (
+        _detect_boxes_by_lines(gray, *bounds)
+        or _detect_boxes_by_color(img_bgr, *bounds)
+    )
+
+    # Keep the candidates that pass validation, ordered top to bottom.
+    validated = sorted(
+        (
+            b for b in boxes
+            if _validate_box(b, gray, content_w, content_h, median_row_gap)
+        ),
+        key=lambda b: b.y,
+    )
+
+    if not validated:
+        logger.debug("BoxDetect: no boxes detected")
+        return validated
+
+    logger.info(f"BoxDetect: {len(validated)} box(es) detected "
+                f"(from {len(boxes)} candidates)")
+    return validated
+
+
+# ---------------------------------------------------------------------------
+# Zone Splitting
+# ---------------------------------------------------------------------------
+
+def split_page_into_zones(
+    content_x: int,
+    content_y: int,
+    content_w: int,
+    content_h: int,
+    boxes: List[DetectedBox],
+    min_zone_height: int = 40,
+) -> List[PageZone]:
+    """Split a page into vertical zones based on detected boxes.
+
+    Regions above, between, and below boxes become 'content' zones that
+    span the full content width; each box becomes a 'box' zone with the
+    box's own geometry.  Content gaps shorter than min_zone_height are
+    dropped.  With no boxes the whole content area is one 'content' zone.
+
+    Args:
+        content_x, content_y, content_w, content_h: Content area bounds.
+        boxes: Detected boxes.  They are expected to be sorted by y; they
+            are sorted again here so unsorted input is handled as well.
+        min_zone_height: Minimum height for a content zone to be kept.
+
+    Returns:
+        List of PageZone, ordered top to bottom, indexed from 0.
+    """
+    def _content_zone(index: int, top: int, height: int) -> PageZone:
+        # Content zones always use the horizontal content bounds.
+        return PageZone(
+            index=index,
+            zone_type='content',
+            y=top,
+            height=height,
+            x=content_x,
+            width=content_w,
+        )
+
+    if not boxes:
+        # Single zone: entire content area
+        return [_content_zone(0, content_y, content_h)]
+
+    zones: List[PageZone] = []
+    cursor_y = content_y  # first row not yet assigned to a zone
+    content_bottom = content_y + content_h
+
+    # sorted() is stable, so already-sorted input keeps its order.
+    for box in sorted(boxes, key=lambda b: b.y):
+        # Content zone above this box (skipped if too short or negative,
+        # i.e. when the box starts above the cursor).
+        gap_above = box.y - cursor_y
+        if gap_above >= min_zone_height:
+            zones.append(_content_zone(len(zones), cursor_y, gap_above))
+
+        # Box zone
+        zones.append(PageZone(
+            index=len(zones),
+            zone_type='box',
+            y=box.y,
+            height=box.height,
+            x=box.x,
+            width=box.width,
+            box=box,
+        ))
+
+        # Never move the cursor upwards: a box that overlaps or is nested
+        # in the previous one must not re-open rows that box already covers,
+        # otherwise the next content zone would overlap a box zone.
+        cursor_y = max(cursor_y, box.y + box.height)
+
+    # Content zone below last box
+    remaining = content_bottom - cursor_y
+    if remaining >= min_zone_height:
+        zones.append(_content_zone(len(zones), cursor_y, remaining))
+
+    logger.info(f"ZoneSplit: {len(zones)} zones from {len(boxes)} box(es): "
+                f"{[z.zone_type for z in zones]}")
+
+    return zones