fix: word-coverage gap detection als Fallback bei Illustrationen

Wenn die pixelbasierte Projektion weniger als zwei Spaltenluecken findet (z.B. weil Illustrationen/Grafiken die Luecken fuellen), wird jetzt eine wortbasierte Gap-Detection als Zwischenschritt vor dem Clustering ausgefuehrt. Die Wort-Bounding-Boxes von Tesseract sind immun gegen dekorative Grafiken.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-05 22:58:27 +01:00
parent 8f3a50b981
commit cb2b924a7b
1 changed files with 44 additions and 0 deletions
@@ -2250,6 +2250,50 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

+    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
+    # When pixel-based projection fails (e.g. due to illustrations or colored
+    # bands), use word bounding boxes to find clear vertical gaps.  This is
+    # immune to decorative graphics that Tesseract doesn't recognise as words.
+    if len(validated_gaps) < 2:
+        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
+        word_coverage = np.zeros(content_w, dtype=np.int32)
+        for wd in word_dicts:
+            wl = max(0, wd['left'])
+            wr = min(wd['left'] + wd['width'], content_w)
+            if wr > wl:
+                word_coverage[wl:wr] += 1
+
+        # Smooth slightly to bridge tiny 1-2px noise gaps between words
+        wc_kernel = max(3, content_w // 300)
+        if wc_kernel % 2 == 0:
+            wc_kernel += 1
+        wc_smooth = np.convolve(word_coverage.astype(float),
+                                np.ones(wc_kernel) / wc_kernel, mode='same')
+
+        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
+        WC_MIN_GAP = max(4, content_w // 300)
+
+        wc_gaps: List[Tuple[int, int]] = []
+        wc_gap_start = None
+        for x in range(len(wc_in_gap)):
+            if wc_in_gap[x]:
+                if wc_gap_start is None:
+                    wc_gap_start = x
+            else:
+                if wc_gap_start is not None:
+                    if x - wc_gap_start >= WC_MIN_GAP:
+                        wc_gaps.append((wc_gap_start, x))
+                    wc_gap_start = None
+        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
+            wc_gaps.append((wc_gap_start, len(wc_in_gap)))
+
+        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
+                    f"(min_width={WC_MIN_GAP}px): "
+                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")
+
+        if len(wc_gaps) >= 2:
+            validated_gaps = wc_gaps
+
    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")