[split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
388
klausur-service/backend/page_crop_edges.py
Normal file
388
klausur-service/backend/page_crop_edges.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
Page Crop - Edge Detection Helpers
|
||||
|
||||
Spine shadow detection, gutter continuity analysis, projection-based
|
||||
edge detection, and narrow-run filtering for content cropping.
|
||||
|
||||
Extracted from page_crop.py to keep files under 500 LOC.
|
||||
License: Apache 2.0
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Minimum ink density (fraction of pixels) to count a row/column as "content"
_INK_THRESHOLD = 0.003  # 0.3%

# Minimum run length (fraction of dimension) to keep — shorter runs are noise
_MIN_RUN_FRAC = 0.005  # 0.5%
def _detect_spine_shadow(
|
||||
gray: np.ndarray,
|
||||
search_region: np.ndarray,
|
||||
offset_x: int,
|
||||
w: int,
|
||||
side: str,
|
||||
) -> Optional[int]:
|
||||
"""Find the book spine center (darkest point) in a scanner shadow.
|
||||
|
||||
The scanner produces a gray strip where the book spine presses against
|
||||
the glass. The darkest column in that strip is the spine center —
|
||||
that's where we crop.
|
||||
|
||||
Distinguishes real spine shadows from text content by checking:
|
||||
1. Strong brightness range (> 40 levels)
|
||||
2. Darkest point is genuinely dark (< 180 mean brightness)
|
||||
3. The dark area is a NARROW valley, not a text-content plateau
|
||||
4. Brightness rises significantly toward the page content side
|
||||
|
||||
Args:
|
||||
gray: Full grayscale image (for context).
|
||||
search_region: Column slice of the grayscale image to search in.
|
||||
offset_x: X offset of search_region relative to full image.
|
||||
w: Full image width.
|
||||
side: 'left' or 'right' (for logging).
|
||||
|
||||
Returns:
|
||||
X coordinate (in full image) of the spine center, or None.
|
||||
"""
|
||||
region_w = search_region.shape[1]
|
||||
if region_w < 10:
|
||||
return None
|
||||
|
||||
# Column-mean brightness in the search region
|
||||
col_means = np.mean(search_region, axis=0).astype(np.float64)
|
||||
|
||||
# Smooth with boxcar kernel (width = 1% of image width, min 5)
|
||||
kernel_size = max(5, w // 100)
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size += 1
|
||||
kernel = np.ones(kernel_size) / kernel_size
|
||||
smoothed_raw = np.convolve(col_means, kernel, mode="same")
|
||||
|
||||
# Trim convolution edge artifacts (edges are zero-padded -> artificially low)
|
||||
margin = kernel_size // 2
|
||||
if region_w <= 2 * margin + 10:
|
||||
return None
|
||||
smoothed = smoothed_raw[margin:region_w - margin]
|
||||
trim_offset = margin # offset of smoothed[0] relative to search_region
|
||||
|
||||
val_min = float(np.min(smoothed))
|
||||
val_max = float(np.max(smoothed))
|
||||
shadow_range = val_max - val_min
|
||||
|
||||
# --- Check 1: Strong brightness gradient ---
|
||||
if shadow_range <= 40:
|
||||
logger.debug(
|
||||
"%s edge: no spine (range=%.0f <= 40)", side.capitalize(), shadow_range,
|
||||
)
|
||||
return None
|
||||
|
||||
# --- Check 2: Darkest point must be genuinely dark ---
|
||||
if val_min > 180:
|
||||
logger.debug(
|
||||
"%s edge: no spine (darkest=%.0f > 180, likely text)", side.capitalize(), val_min,
|
||||
)
|
||||
return None
|
||||
|
||||
spine_idx = int(np.argmin(smoothed)) # index in trimmed array
|
||||
spine_local = spine_idx + trim_offset # index in search_region
|
||||
trimmed_len = len(smoothed)
|
||||
|
||||
# --- Check 3: Valley width (spine is narrow, text plateau is wide) ---
|
||||
valley_thresh = val_min + shadow_range * 0.20
|
||||
valley_mask = smoothed < valley_thresh
|
||||
valley_width = int(np.sum(valley_mask))
|
||||
max_valley_frac = 0.50
|
||||
if valley_width > trimmed_len * max_valley_frac:
|
||||
logger.debug(
|
||||
"%s edge: no spine (valley too wide: %d/%d = %.0f%%)",
|
||||
side.capitalize(), valley_width, trimmed_len,
|
||||
100.0 * valley_width / trimmed_len,
|
||||
)
|
||||
return None
|
||||
|
||||
# --- Check 4: Brightness must rise toward page content ---
|
||||
rise_check_w = max(5, trimmed_len // 5)
|
||||
if side == "left":
|
||||
right_start = min(spine_idx + 5, trimmed_len - 1)
|
||||
right_end = min(right_start + rise_check_w, trimmed_len)
|
||||
if right_end > right_start:
|
||||
rise_brightness = float(np.mean(smoothed[right_start:right_end]))
|
||||
rise = rise_brightness - val_min
|
||||
if rise < shadow_range * 0.3:
|
||||
logger.debug(
|
||||
"%s edge: no spine (insufficient rise: %.0f, need %.0f)",
|
||||
side.capitalize(), rise, shadow_range * 0.3,
|
||||
)
|
||||
return None
|
||||
else: # right
|
||||
left_end = max(spine_idx - 5, 0)
|
||||
left_start = max(left_end - rise_check_w, 0)
|
||||
if left_end > left_start:
|
||||
rise_brightness = float(np.mean(smoothed[left_start:left_end]))
|
||||
rise = rise_brightness - val_min
|
||||
if rise < shadow_range * 0.3:
|
||||
logger.debug(
|
||||
"%s edge: no spine (insufficient rise: %.0f, need %.0f)",
|
||||
side.capitalize(), rise, shadow_range * 0.3,
|
||||
)
|
||||
return None
|
||||
|
||||
spine_x = offset_x + spine_local
|
||||
|
||||
logger.info(
|
||||
"%s edge: spine center at x=%d (brightness=%.0f, range=%.0f, valley=%dpx)",
|
||||
side.capitalize(), spine_x, val_min, shadow_range, valley_width,
|
||||
)
|
||||
return spine_x
|
||||
|
||||
|
||||
def _detect_gutter_continuity(
|
||||
gray: np.ndarray,
|
||||
search_region: np.ndarray,
|
||||
offset_x: int,
|
||||
w: int,
|
||||
side: str,
|
||||
) -> Optional[int]:
|
||||
"""Detect gutter shadow via vertical continuity analysis.
|
||||
|
||||
Camera book scans produce a subtle brightness gradient at the gutter
|
||||
that is too faint for scanner-shadow detection (range < 40). However,
|
||||
the gutter shadow has a unique property: it runs **continuously from
|
||||
top to bottom** without interruption.
|
||||
|
||||
Algorithm:
|
||||
1. Divide image into N horizontal strips (~60px each)
|
||||
2. For each column, compute what fraction of strips are darker than
|
||||
the page median (from the center 50% of the full image)
|
||||
3. A "gutter column" has >= 75% of strips darker than page_median - d
|
||||
4. Smooth the dark-fraction profile and find the transition point
|
||||
5. Validate: gutter band must be 0.5%-10% of image width
|
||||
"""
|
||||
region_h, region_w = search_region.shape[:2]
|
||||
if region_w < 20 or region_h < 100:
|
||||
return None
|
||||
|
||||
# --- 1. Divide into horizontal strips ---
|
||||
strip_target_h = 60
|
||||
n_strips = max(10, region_h // strip_target_h)
|
||||
strip_h = region_h // n_strips
|
||||
|
||||
strip_means = np.zeros((n_strips, region_w), dtype=np.float64)
|
||||
for s in range(n_strips):
|
||||
y0 = s * strip_h
|
||||
y1 = min((s + 1) * strip_h, region_h)
|
||||
strip_means[s] = np.mean(search_region[y0:y1, :], axis=0)
|
||||
|
||||
# --- 2. Page median from center 50% of full image ---
|
||||
center_lo = w // 4
|
||||
center_hi = 3 * w // 4
|
||||
page_median = float(np.median(gray[:, center_lo:center_hi]))
|
||||
|
||||
dark_thresh = page_median - 5.0
|
||||
|
||||
if page_median < 180:
|
||||
return None
|
||||
|
||||
# --- 3. Per-column dark fraction ---
|
||||
dark_count = np.sum(strip_means < dark_thresh, axis=0).astype(np.float64)
|
||||
dark_frac = dark_count / n_strips
|
||||
|
||||
# --- 4. Smooth and find transition ---
|
||||
smooth_w = max(5, w // 100)
|
||||
if smooth_w % 2 == 0:
|
||||
smooth_w += 1
|
||||
kernel = np.ones(smooth_w) / smooth_w
|
||||
frac_smooth = np.convolve(dark_frac, kernel, mode="same")
|
||||
|
||||
margin = smooth_w // 2
|
||||
if region_w <= 2 * margin + 10:
|
||||
return None
|
||||
|
||||
transition_thresh = 0.50
|
||||
peak_frac = float(np.max(frac_smooth[margin:region_w - margin]))
|
||||
|
||||
if peak_frac < 0.70:
|
||||
logger.debug(
|
||||
"%s gutter: peak dark fraction %.2f < 0.70", side.capitalize(), peak_frac,
|
||||
)
|
||||
return None
|
||||
|
||||
peak_x = int(np.argmax(frac_smooth[margin:region_w - margin])) + margin
|
||||
gutter_inner = None
|
||||
|
||||
if side == "right":
|
||||
for x in range(peak_x, margin, -1):
|
||||
if frac_smooth[x] < transition_thresh:
|
||||
gutter_inner = x + 1
|
||||
break
|
||||
else:
|
||||
for x in range(peak_x, region_w - margin):
|
||||
if frac_smooth[x] < transition_thresh:
|
||||
gutter_inner = x - 1
|
||||
break
|
||||
|
||||
if gutter_inner is None:
|
||||
return None
|
||||
|
||||
# --- 5. Validate gutter width ---
|
||||
if side == "right":
|
||||
gutter_width = region_w - gutter_inner
|
||||
else:
|
||||
gutter_width = gutter_inner
|
||||
|
||||
min_gutter = max(3, int(w * 0.005))
|
||||
max_gutter = int(w * 0.10)
|
||||
|
||||
if gutter_width < min_gutter:
|
||||
logger.debug(
|
||||
"%s gutter: too narrow (%dpx < %dpx)", side.capitalize(),
|
||||
gutter_width, min_gutter,
|
||||
)
|
||||
return None
|
||||
|
||||
if gutter_width > max_gutter:
|
||||
logger.debug(
|
||||
"%s gutter: too wide (%dpx > %dpx)", side.capitalize(),
|
||||
gutter_width, max_gutter,
|
||||
)
|
||||
return None
|
||||
|
||||
if side == "right":
|
||||
gutter_brightness = float(np.mean(strip_means[:, gutter_inner:]))
|
||||
else:
|
||||
gutter_brightness = float(np.mean(strip_means[:, :gutter_inner]))
|
||||
|
||||
brightness_drop = page_median - gutter_brightness
|
||||
if brightness_drop < 3:
|
||||
logger.debug(
|
||||
"%s gutter: insufficient brightness drop (%.1f levels)",
|
||||
side.capitalize(), brightness_drop,
|
||||
)
|
||||
return None
|
||||
|
||||
gutter_x = offset_x + gutter_inner
|
||||
|
||||
logger.info(
|
||||
"%s gutter (continuity): x=%d, width=%dpx (%.1f%%), "
|
||||
"brightness=%.0f vs page=%.0f (drop=%.0f), frac@edge=%.2f",
|
||||
side.capitalize(), gutter_x, gutter_width,
|
||||
100.0 * gutter_width / w, gutter_brightness, page_median,
|
||||
brightness_drop, float(frac_smooth[gutter_inner]),
|
||||
)
|
||||
return gutter_x
|
||||
|
||||
|
||||
def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect left content edge, accounting for book-spine shadow.

    Runs three detectors over the left quarter of the image and returns
    the first hit:
    1. Scanner spine-shadow (dark gradient, range > 40)
    2. Camera gutter continuity (subtle shadow running top-to-bottom)
    3. Binary projection fallback (first ink column)
    """
    strip = gray[:, :max(1, w // 4)]
    for detector in (_detect_spine_shadow, _detect_gutter_continuity):
        edge_x = detector(gray, strip, 0, w, "left")
        if edge_x is not None:
            return edge_x

    # Neither shadow detector fired — fall back to raw ink projection.
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
def _detect_right_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect right content edge, accounting for book-spine shadow.

    Runs three detectors over the right quarter of the image and returns
    the first hit:
    1. Scanner spine-shadow (dark gradient, range > 40)
    2. Camera gutter continuity (subtle shadow running top-to-bottom)
    3. Binary projection fallback (last ink column)
    """
    right_start = w - max(1, w // 4)
    strip = gray[:, right_start:]
    for detector in (_detect_spine_shadow, _detect_gutter_continuity):
        edge_x = detector(gray, strip, right_start, w, "right")
        if edge_x is not None:
            return edge_x

    # Neither shadow detector fired — fall back to raw ink projection.
    return _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Detect top and bottom content edges via binary horizontal projection."""
    return (
        _detect_edge_projection(binary, axis=1, from_start=True, dim=h),
        _detect_edge_projection(binary, axis=1, from_start=False, dim=h),
    )
def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the first/last row or column with ink density above threshold.

    axis=0 -> project vertically (column densities) -> returns x position
    axis=1 -> project horizontally (row densities) -> returns y position

    Ink runs shorter than _MIN_RUN_FRAC of the dimension are treated as
    noise and discarded before the edge is picked.
    """
    # Per-row/column ink density, normalized to [0, 1].
    density = np.mean(binary, axis=axis) / 255.0

    ink_mask = _filter_narrow_runs(
        density >= _INK_THRESHOLD,
        max(1, int(dim * _MIN_RUN_FRAC)),
    )

    positions = np.flatnonzero(ink_mask)
    if positions.size == 0:
        # No content found: fall back to the full extent on that side.
        # NOTE(review): returns `dim` (one past the last index) here but an
        # inclusive index when ink exists — presumably what the caller's
        # crop arithmetic expects; verify against page_crop.py.
        return 0 if from_start else dim

    return int(positions[0] if from_start else positions[-1])
def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
|
||||
"""Remove True-runs shorter than min_run pixels."""
|
||||
if min_run <= 1:
|
||||
return mask
|
||||
|
||||
result = mask.copy()
|
||||
n = len(result)
|
||||
i = 0
|
||||
while i < n:
|
||||
if result[i]:
|
||||
start = i
|
||||
while i < n and result[i]:
|
||||
i += 1
|
||||
if i - start < min_run:
|
||||
result[start:i] = False
|
||||
else:
|
||||
i += 1
|
||||
return result
|
||||
Reference in New Issue
Block a user