[split-required] Split 700-870 LOC files across all services

backend-lehrer (7 of 11 files listed):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (10 of 12 files listed):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 08:01:18 +02:00
parent b6983ab1dc
commit 34da9f4cda
106 changed files with 16500 additions and 16947 deletions

View File

@@ -0,0 +1,342 @@
"""
Page Crop - Core Crop and Format Detection
Content-based crop for scanned pages and book scans. Detects the content
boundary by analysing ink density projections and (for book scans) the
spine shadow gradient.
Extracted from page_crop.py to keep files under 500 LOC.
License: Apache 2.0
"""
import logging
from typing import Any, Dict, List, Tuple

import cv2
import numpy as np

from page_crop_edges import (
    _detect_left_edge_shadow,
    _detect_right_edge_shadow,
    _detect_top_bottom_edges,
)
logger = logging.getLogger(__name__)
# Known paper format aspect ratios (height / width, portrait orientation).
# Consumed by _detect_format() to label a page/crop with the closest
# standard size; values are plain floats so comparison is a simple abs-diff.
PAPER_FORMATS: Dict[str, float] = {
    "A4": 297.0 / 210.0,      # 1.4143
    "A5": 210.0 / 148.0,      # 1.4189
    "Letter": 11.0 / 8.5,     # 1.2941
    "Legal": 14.0 / 8.5,      # 1.6471
    "A3": 420.0 / 297.0,      # 1.4141
}
def _find_dark_runs(is_dark: np.ndarray) -> List[Tuple[int, int]]:
    """Return half-open ``(start, end)`` ranges of contiguous True entries.

    Args:
        is_dark: 1-D boolean array (True where the column is below the
            spine brightness threshold).

    Returns:
        List of index ranges; ``end`` is exclusive.
    """
    runs: List[Tuple[int, int]] = []
    run_start = -1
    for idx, dark in enumerate(is_dark):
        if dark:
            if run_start < 0:
                run_start = idx
        elif run_start >= 0:
            runs.append((run_start, idx))
            run_start = -1
    # Close a run that extends to the end of the array.
    if run_start >= 0:
        runs.append((run_start, len(is_dark)))
    return runs


def _split_points_to_pages(
    split_points: List[int], w: int, h: int
) -> List[Dict[str, Any]]:
    """Convert split x-coordinates into full-height page rectangles.

    Args:
        split_points: Sorted x positions where the image is cut.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        One ``{x, y, width, height, page_index}`` dict per page slice.
    """
    bounds = [0, *split_points, w]
    return [
        {"x": x0, "y": 0, "width": x1 - x0, "height": h, "page_index": i}
        for i, (x0, x1) in enumerate(zip(bounds[:-1], bounds[1:]))
    ]


def detect_page_splits(
    img_bgr: np.ndarray,
) -> List[Dict[str, Any]]:
    """Detect if the image is a multi-page spread and return split rectangles.

    Uses **brightness** (not ink density) to find the spine area:
    the scanner bed produces a characteristic gray strip where pages meet,
    which is darker than the white paper on either side.

    Args:
        img_bgr: Input BGR image.

    Returns:
        A list of page dicts ``{x, y, width, height, page_index}``
        or an empty list if only one page is detected.
    """
    h, w = img_bgr.shape[:2]
    # Only check landscape-ish images (width > height * 1.15) — a portrait
    # scan cannot be a two-page spread.
    if w < h * 1.15:
        return []
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Column-mean brightness (0-255) — the spine is darker (gray scanner bed)
    col_brightness = np.mean(gray, axis=0).astype(np.float64)
    # Heavy box-filter smoothing (~2% of width, odd kernel) to ignore
    # individual text lines.
    kern = max(11, w // 50)
    if kern % 2 == 0:
        kern += 1
    brightness_smooth = np.convolve(col_brightness, np.ones(kern) / kern, mode="same")
    # Page paper is bright (typically > 200), spine/scanner bed is darker
    page_brightness = float(np.max(brightness_smooth))
    if page_brightness < 100:
        return []  # Very dark image, skip
    # Spine threshold: significantly darker than the page
    spine_thresh = page_brightness * 0.88
    # Search only in the center region (30-70% of width).
    center_lo = int(w * 0.30)
    center_hi = int(w * 0.70)
    center_brightness = brightness_smooth[center_lo:center_hi]
    darkest_val = float(np.min(center_brightness))
    if darkest_val >= spine_thresh:
        logger.debug("No spine detected: min brightness %.0f >= threshold %.0f",
                     darkest_val, spine_thresh)
        return []
    # Find ALL contiguous dark runs in the center region, then drop runs
    # narrower than 1% of image width (noise, not a spine).
    dark_runs = _find_dark_runs(center_brightness < spine_thresh)
    min_spine_px = int(w * 0.01)
    dark_runs = [(s, e) for s, e in dark_runs if e - s >= min_spine_px]
    if not dark_runs:
        logger.debug("No dark runs wider than %dpx in center region", min_spine_px)
        return []
    # Score each dark run: prefer centered, dark, narrow valleys
    center_region_len = center_hi - center_lo
    image_center_in_region = (w * 0.5 - center_lo)
    best_score = -1.0
    best_start, best_end = dark_runs[0]
    for run_start, run_end in dark_runs:
        run_width = run_end - run_start
        run_center = (run_start + run_end) / 2.0
        # Gaussian falloff with distance from the horizontal image center.
        sigma = center_region_len * 0.15
        dist = abs(run_center - image_center_in_region)
        center_factor = float(np.exp(-0.5 * (dist / sigma) ** 2))
        # How far below the spine threshold the run's mean brightness sits.
        run_brightness = float(np.mean(center_brightness[run_start:run_end]))
        darkness_factor = max(0.0, (spine_thresh - run_brightness) / spine_thresh)
        # Narrow valleys get full bonus up to 5% width, fading out at 15%.
        width_frac = run_width / w
        if width_frac <= 0.05:
            narrowness_bonus = 1.0
        elif width_frac <= 0.15:
            narrowness_bonus = 1.0 - (width_frac - 0.05) / 0.10
        else:
            narrowness_bonus = 0.0
        score = center_factor * darkness_factor * (0.3 + 0.7 * narrowness_bonus)
        logger.debug(
            "Dark run x=%d..%d (w=%d): center_f=%.3f dark_f=%.3f narrow_b=%.3f -> score=%.4f",
            center_lo + run_start, center_lo + run_end, run_width,
            center_factor, darkness_factor, narrowness_bonus, score,
        )
        if score > best_score:
            best_score = score
            best_start, best_end = run_start, run_end
    spine_w = best_end - best_start
    spine_x = center_lo + best_start
    spine_center = spine_x + spine_w // 2
    logger.debug(
        "Best spine candidate: x=%d..%d (w=%d), score=%.4f",
        spine_x, spine_x + spine_w, spine_w, best_score,
    )
    # Verify: must have bright (paper) content on BOTH sides of the valley.
    left_brightness = float(np.mean(brightness_smooth[max(0, spine_x - w // 10):spine_x]))
    right_end = center_lo + best_end
    right_brightness = float(np.mean(brightness_smooth[right_end:min(w, right_end + w // 10)]))
    if left_brightness < spine_thresh or right_brightness < spine_thresh:
        logger.debug("No bright paper flanking spine: left=%.0f right=%.0f thresh=%.0f",
                     left_brightness, right_brightness, spine_thresh)
        return []
    logger.info(
        "Spine detected: x=%d..%d (w=%d), brightness=%.0f vs paper=%.0f, "
        "left_paper=%.0f, right_paper=%.0f",
        spine_x, right_end, spine_w, darkest_val, page_brightness,
        left_brightness, right_brightness,
    )
    # Split at the spine center and build the page rectangles.
    split_points = [spine_center]
    pages = _split_points_to_pages(split_points, w, h)
    # Filter out tiny pages (< 15% of total width); need at least two left.
    pages = [p for p in pages if p["width"] >= w * 0.15]
    if len(pages) < 2:
        return []
    # Re-index after filtering so page_index stays dense.
    for i, p in enumerate(pages):
        p["page_index"] = i
    logger.info(
        "Page split detected: %d pages, spine_w=%d, split_points=%s",
        len(pages), spine_w, split_points,
    )
    return pages
def detect_and_crop_page(
    img_bgr: np.ndarray,
    margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect the content boundary of a scanned page and trim its borders.

    Pipeline (four independently detected edges):
      1. Adaptive threshold -> inverted binary mask (ink=255, bg=0)
      2. Left edge: spine-shadow detection on grayscale column means,
         with a binary-projection fallback inside the helper
      3. Right edge: binary vertical projection (last ink column)
      4. Top/bottom edges: binary horizontal projection
      5. Sanity checks, then crop with a configurable margin

    Args:
        img_bgr: Input BGR image (should already be deskewed/dewarped)
        margin_frac: Extra margin around content (fraction of dimension, default 1%)

    Returns:
        Tuple of (cropped_image, result_dict)
    """
    h, w = img_bgr.shape[:2]
    page_area = h * w
    # Default report: no crop applied, original geometry echoed back.
    report: Dict[str, Any] = {
        "crop_applied": False,
        "crop_rect": None,
        "crop_rect_pct": None,
        "original_size": {"width": w, "height": h},
        "cropped_size": {"width": w, "height": h},
        "detected_format": None,
        "format_confidence": 0.0,
        "aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
        "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
    }
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Inverted adaptive threshold: ink becomes white on a black background.
    ink_mask = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, blockSize=51, C=15,
    )
    # Locate the four content edges.
    x_left = _detect_left_edge_shadow(gray, ink_mask, w, h)
    x_right = _detect_right_edge_shadow(gray, ink_mask, w, h)
    y_top, y_bottom = _detect_top_bottom_edges(ink_mask, w, h)
    # Border sizes as fractions of the corresponding dimension.
    frac_top = y_top / h
    frac_bottom = (h - y_bottom) / h
    frac_left = x_left / w
    frac_right = (w - x_right) / w
    report["border_fractions"] = {
        "top": round(frac_top, 4),
        "bottom": round(frac_bottom, 4),
        "left": round(frac_left, 4),
        "right": round(frac_right, 4),
    }
    # Sanity: skip the crop entirely unless at least one border exceeds 2%.
    min_border = 0.02
    if max(frac_top, frac_bottom, frac_left, frac_right) < min_border:
        logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
        report["detected_format"], report["format_confidence"] = _detect_format(w, h)
        return img_bgr, report
    # Expand the content box by the requested margin, clamped to the image.
    pad_x = int(w * margin_frac)
    pad_y = int(h * margin_frac)
    cx0 = max(0, x_left - pad_x)
    cy0 = max(0, y_top - pad_y)
    cx1 = min(w, x_right + pad_x)
    cy1 = min(h, y_bottom + pad_y)
    cw = cx1 - cx0
    ch = cy1 - cy0
    # Sanity: refuse crops that would discard more than 60% of the page.
    if cw * ch < 0.40 * page_area:
        logger.warning("Cropped area too small (%.0f%%) — skipping crop",
                       100.0 * cw * ch / page_area)
        report["detected_format"], report["format_confidence"] = _detect_format(w, h)
        return img_bgr, report
    cropped = img_bgr[cy0:cy1, cx0:cx1].copy()
    fmt_name, fmt_conf = _detect_format(cw, ch)
    report["crop_applied"] = True
    report["crop_rect"] = {"x": cx0, "y": cy0, "width": cw, "height": ch}
    report["crop_rect_pct"] = {
        "x": round(100.0 * cx0 / w, 2),
        "y": round(100.0 * cy0 / h, 2),
        "width": round(100.0 * cw / w, 2),
        "height": round(100.0 * ch / h, 2),
    }
    report["cropped_size"] = {"width": cw, "height": ch}
    report["detected_format"] = fmt_name
    report["format_confidence"] = fmt_conf
    report["aspect_ratio"] = round(max(cw, ch) / max(min(cw, ch), 1), 4)
    logger.info(
        "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
        "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
        w, h, cw, ch, fmt_name, fmt_conf * 100,
        frac_top * 100, frac_bottom * 100,
        frac_left * 100, frac_right * 100,
    )
    return cropped, report
# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------
def _detect_format(width: int, height: int) -> Tuple[str, float]:
    """Classify the given dimensions as the closest known paper format.

    Compares the portrait aspect ratio against PAPER_FORMATS and maps the
    ratio difference to a confidence score; matches below 0.3 confidence
    are reported as unknown.
    """
    if width <= 0 or height <= 0:
        return "unknown", 0.0
    aspect = max(width, height) / min(width, height)
    # Closest reference format by absolute aspect-ratio difference
    # (first entry wins on ties, matching dict insertion order).
    best_format, ref_ratio = min(
        PAPER_FORMATS.items(), key=lambda item: abs(aspect - item[1])
    )
    best_diff = abs(aspect - ref_ratio)
    # Each 0.2 of ratio difference costs the full confidence budget.
    confidence = max(0.0, 1.0 - best_diff * 5.0)
    if confidence < 0.3:
        return "unknown", 0.0
    return best_format, round(confidence, 3)