Fix: Remove broken getKlausurApiUrl and clean up empty lines

A sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions
--- a/klausur-service/backend/cv_layout_analyze.py
+++ b/klausur-service/backend/cv_layout_analyze.py
@@ -0,0 +1,257 @@
+"""
+Legacy layout analysis using projection profiles.
+
+Extracted from cv_layout_columns.py — contains:
+- analyze_layout()   (projection-profile based column/header/footer detection)
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+from typing import List
+
+import numpy as np
+
+from cv_vocab_types import PageRegion
+from cv_layout_detection import _find_content_bounds
+
+logger = logging.getLogger(__name__)
+
+try:
+    import cv2
+except ImportError:
+    cv2 = None  # type: ignore[assignment]
+
+
+def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
+    """Detect columns, header, and footer using projection profiles.
+
+    Uses content-bounds detection to exclude page margins before searching
+    for column separators within the actual text area.
+
+    Args:
+        layout_img: CLAHE-enhanced grayscale image.
+        ocr_img: Binarized image for text density analysis.
+
+    Returns:
+        List of PageRegion objects describing detected regions.
+    """
+    h, w = ocr_img.shape[:2]
+
+    # Invert: black text on white → white text on black for projection
+    inv = cv2.bitwise_not(ocr_img)
+
+    # --- Find actual content bounds (exclude page margins) ---
+    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
+    content_w = right_x - left_x
+    content_h = bottom_y - top_y
+
+    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
+                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")
+
+    if content_w < w * 0.3 or content_h < h * 0.3:
+        # Fallback if detection seems wrong
+        left_x, right_x = 0, w
+        top_y, bottom_y = 0, h
+        content_w, content_h = w, h
+
+    # --- Vertical projection within content area to find column separators ---
+    content_strip = inv[top_y:bottom_y, left_x:right_x]
+    v_proj = np.sum(content_strip, axis=0).astype(float)
+    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
+
+    # Smooth the projection profile
+    kernel_size = max(5, content_w // 50)
+    if kernel_size % 2 == 0:
+        kernel_size += 1
+    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
+
+    # Debug: log projection profile statistics
+    p_mean = float(np.mean(v_proj_smooth))
+    p_median = float(np.median(v_proj_smooth))
+    p_min = float(np.min(v_proj_smooth))
+    p_max = float(np.max(v_proj_smooth))
+    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
+                f"mean={p_mean:.4f}, median={p_median:.4f}")
+
+    # Find valleys using multiple threshold strategies
+    # Strategy 1: relative to median (catches clear separators)
+    # Strategy 2: local minima approach (catches subtle gaps)
+    threshold = max(p_median * 0.3, p_mean * 0.2)
+    logger.info(f"Layout: valley threshold={threshold:.4f}")
+
+    in_valley = v_proj_smooth < threshold
+
+    # Find contiguous valley regions
+    all_valleys = []
+    start = None
+    for x in range(len(v_proj_smooth)):
+        if in_valley[x] and start is None:
+            start = x
+        elif not in_valley[x] and start is not None:
+            valley_width = x - start
+            valley_depth = float(np.min(v_proj_smooth[start:x]))
+            # Valley must be at least 3px wide
+            if valley_width >= 3:
+                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
+            start = None
+
+    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
+                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")
+
+    # Filter: valleys must be inside the content area (not at edges)
+    inner_margin = int(content_w * 0.08)
+    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]
+
+    # If no valleys found with strict threshold, try local minima approach
+    if len(valleys) < 2:
+        logger.info("Layout: trying local minima approach for column detection")
+        # Divide content into 20 segments, find the 2 lowest
+        seg_count = 20
+        seg_width = content_w // seg_count
+        seg_scores = []
+        for i in range(seg_count):
+            sx = i * seg_width
+            ex = min((i + 1) * seg_width, content_w)
+            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
+            seg_scores.append((i, sx, ex, seg_mean))
+
+        seg_scores.sort(key=lambda s: s[3])
+        logger.info(f"Layout: segment scores (lowest 5): "
+                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")
+
+        # Find two lowest non-adjacent segments that create reasonable columns
+        candidate_valleys = []
+        for seg_idx, sx, ex, seg_mean in seg_scores:
+            # Must not be at the edges
+            if seg_idx <= 1 or seg_idx >= seg_count - 2:
+                continue
+            # Must be significantly lower than overall mean
+            if seg_mean < p_mean * 0.6:
+                center = (sx + ex) // 2
+                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))
+
+        if len(candidate_valleys) >= 2:
+            # Pick the best pair: non-adjacent, creating reasonable column widths
+            candidate_valleys.sort(key=lambda v: v[2])
+            best_pair = None
+            best_score = float('inf')
+            for i in range(len(candidate_valleys)):
+                for j in range(i + 1, len(candidate_valleys)):
+                    c1 = candidate_valleys[i][2]
+                    c2 = candidate_valleys[j][2]
+                    # Must be at least 20% apart
+                    if (c2 - c1) < content_w * 0.2:
+                        continue
+                    col1 = c1
+                    col2 = c2 - c1
+                    col3 = content_w - c2
+                    # Each column at least 15%
+                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
+                        continue
+                    parts = sorted([col1, col2, col3])
+                    score = parts[2] - parts[0]
+                    if score < best_score:
+                        best_score = score
+                        best_pair = (candidate_valleys[i], candidate_valleys[j])
+
+            if best_pair:
+                valleys = list(best_pair)
+                logger.info(f"Layout: local minima found 2 valleys: "
+                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+    logger.info(f"Layout: final {len(valleys)} valleys: "
+                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
+
+    regions = []
+
+    if len(valleys) >= 2:
+        # 3-column layout detected
+        valleys.sort(key=lambda v: v[2])
+
+        if len(valleys) == 2:
+            sep1_center = valleys[0][2]
+            sep2_center = valleys[1][2]
+        else:
+            # Pick the two valleys that best divide into 3 parts
+            # Prefer wider valleys (more likely true separators)
+            best_pair = None
+            best_score = float('inf')
+            for i in range(len(valleys)):
+                for j in range(i + 1, len(valleys)):
+                    c1, c2 = valleys[i][2], valleys[j][2]
+                    # Each column should be at least 15% of content width
+                    col1 = c1
+                    col2 = c2 - c1
+                    col3 = content_w - c2
+                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
+                        continue
+                    # Score: lower is better (more even distribution)
+                    parts = sorted([col1, col2, col3])
+                    score = parts[2] - parts[0]
+                    # Bonus for wider valleys (subtract valley width)
+                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
+                    if score < best_score:
+                        best_score = score
+                        best_pair = (c1, c2)
+            if best_pair:
+                sep1_center, sep2_center = best_pair
+            else:
+                sep1_center = valleys[0][2]
+                sep2_center = valleys[1][2]
+
+        # Convert from content-relative to absolute coordinates
+        abs_sep1 = sep1_center + left_x
+        abs_sep2 = sep2_center + left_x
+
+        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
+                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")
+
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=abs_sep1, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=abs_sep1, y=top_y,
+            width=abs_sep2 - abs_sep1, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_example', x=abs_sep2, y=top_y,
+            width=w - abs_sep2, height=content_h
+        ))
+
+    elif len(valleys) == 1:
+        # 2-column layout
+        abs_sep = valleys[0][2] + left_x
+
+        logger.info(f"Layout: 2 columns at separator x={abs_sep}")
+
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=abs_sep, height=content_h
+        ))
+        regions.append(PageRegion(
+            type='column_de', x=abs_sep, y=top_y,
+            width=w - abs_sep, height=content_h
+        ))
+
+    else:
+        # No columns detected — run full-page OCR as single column
+        logger.warning("Layout: no column separators found, using full page")
+        regions.append(PageRegion(
+            type='column_en', x=0, y=top_y,
+            width=w, height=content_h
+        ))
+
+    # Add header/footer info (gap-based detection with fallback)
+    # Lazy import to avoid circular dependency with cv_layout.py
+    from cv_layout_detection import _add_header_footer
+    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
+
+    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
+    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
+    col_count = len([r for r in regions if r.type.startswith('column')])
+    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
+
+    return regions