fix: Seite an Sub-Headern segmentieren, groesstes Segment fuer Projektion

Statt full-width Zeilen zu maskieren, wird die Seite jetzt an grossen horizontalen Luecken (Sub-Header, Kapitelgrenzen) in Segmente unterteilt. Das groesste Segment wird fuer die vertikale Projektion verwendet. Dadurch stoeren Illustrationen und Ueberschriften nicht mehr.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 23:07:23 +01:00
parent cb2b924a7b
commit 391449fedf
1 changed file with 60 additions and 30 deletions
@@ -2131,45 +2131,75 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

-    # --- Step 2b: Mask out full-width rows (sub-headers, colored bands) ---
-    # Rows where ink spans nearly the full content width distort the vertical
-    # projection by filling in column gaps.  Detect them via horizontal density
-    # and zero them out before computing v_proj.
+    # --- Step 2b: Segment by sub-headers ---
+    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
+    # text bands that pollute the vertical projection.  We detect large
+    # horizontal gaps (= whitespace rows separating sections) and use only
+    # the tallest content segment for the projection.  This makes column
+    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

-    FULLWIDTH_THRESHOLD = 0.40  # normal text ~10-25%; full-width bands 40%+
-    fullwidth_mask = h_proj_row_norm > FULLWIDTH_THRESHOLD
+    # Find horizontal gaps (near-empty rows)
+    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
+    h_in_gap = h_proj_row_norm < H_GAP_THRESH
+    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

-    # Only mask contiguous bands (>=3 rows), not isolated noisy rows
-    masked_strip = content_strip.copy()
-    n_masked = 0
-    band_start = None
-    for y_idx in range(len(fullwidth_mask)):
-        if fullwidth_mask[y_idx]:
-            if band_start is None:
-                band_start = y_idx
+    h_gaps: List[Tuple[int, int]] = []
+    h_gap_start = None
+    for y_idx in range(len(h_in_gap)):
+        if h_in_gap[y_idx]:
+            if h_gap_start is None:
+                h_gap_start = y_idx
        else:
-            if band_start is not None:
-                band_height = y_idx - band_start
-                if band_height >= 3:
-                    masked_strip[band_start:y_idx, :] = 0
-                    n_masked += band_height
-                band_start = None
-    if band_start is not None:
-        band_height = len(fullwidth_mask) - band_start
-        if band_height >= 3:
-            masked_strip[band_start:len(fullwidth_mask), :] = 0
-            n_masked += band_height
+            if h_gap_start is not None:
+                if y_idx - h_gap_start >= H_MIN_GAP:
+                    h_gaps.append((h_gap_start, y_idx))
+                h_gap_start = None
+    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
+        h_gaps.append((h_gap_start, len(h_in_gap)))

-    if n_masked > 0:
-        logger.info(f"ColumnGeometry: masked {n_masked} full-width rows "
-                    f"({n_masked * 100 / content_h:.1f}% of content height)")
+    # Identify "large" gaps (>= 1.8x median, min H_MIN_GAP + 3 px) marking section
+    # boundaries (sub-headers, chapter titles); with < 3 gaps, all count as large.
+    if len(h_gaps) >= 3:
+        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
+        median_gap_h = gap_sizes[len(gap_sizes) // 2]
+        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
+        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
+    else:
+        large_gaps = h_gaps
+
+    # Build content segments between large gaps and pick the tallest
+    seg_boundaries = [0]
+    for gs, ge in large_gaps:
+        seg_boundaries.append(gs)
+        seg_boundaries.append(ge)
+    seg_boundaries.append(content_h)
+
+    segments = []
+    for i in range(0, len(seg_boundaries) - 1, 2):
+        seg_top = seg_boundaries[i]
+        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
+        seg_height = seg_bot - seg_top
+        if seg_height > 20:  # ignore tiny fragments
+            segments.append((seg_top, seg_bot, seg_height))
+
+    if segments:
+        segments.sort(key=lambda s: s[2], reverse=True)
+        best_seg = segments[0]
+        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
+        effective_h = best_seg[2]
+        if len(segments) > 1:
+            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
+                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
+                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
+    else:
+        proj_strip = content_strip
+        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
-    effective_h = content_h - n_masked
-    v_proj = np.sum(masked_strip, axis=0).astype(float)
+    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps