fix: expand narrow columns + lower dewarp thresholds for small angles

Two fixes for an edge case where residual shear pushes content out of narrow columns (marker, page_ref):

1. Column expansion (Step 10): After detection, narrow columns (<10% of content width) expand into adjacent whitespace gaps, claiming up to 40% of the gap but never past the nearest word in the neighbor column. This gives marker/page_ref columns breathing room.

2. Dewarp sensitivity: Lower the minimum angle from 0.15° to 0.08°, the ensemble minimum confidence from 0.5 to 0.35, and the final confidence threshold from 0.5 to 0.4; skip the quality gate for small corrections (<0.5°), where the projection variance change is negligible.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-04 09:32:47 +01:00
parent 0d3f001acb
commit e426de937c
1 changed files with 78 additions and 6 deletions
@@ -793,8 +793,9 @@ def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str
    Returns:
        (shear_degrees, ensemble_confidence, methods_used_str)
    """
-    # Higher confidence threshold — "im Zweifel nichts tun"
-    _MIN_CONF = 0.5
+    # Confidence threshold — lowered from 0.5 to 0.35 to catch subtle shear
+    # that individual methods detect with moderate confidence.
+    _MIN_CONF = 0.35

    # text_lines gets a weight boost as the most content-aware method
    _METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
@@ -910,16 +911,22 @@ def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray
        for d in detections
    ]

-    # Higher thresholds: subtle shear (<0.15°) is irrelevant for OCR
-    if abs(shear_deg) < 0.15 or confidence < 0.5:
+    # Thresholds: very small shear (<0.08°) is truly irrelevant for OCR.
+    # For ensemble confidence, require at least 0.4 (lowered from 0.5 to
+    # catch moderate-confidence detections from multiple agreeing methods).
+    if abs(shear_deg) < 0.08 or confidence < 0.4:
        no_correction["detections"] = _all_detections
        return img, no_correction

    # Apply correction (negate the detected shear to straighten)
    corrected = _apply_shear(img, -shear_deg)

-    # Quality gate: verify the correction actually improved alignment
-    if not _dewarp_quality_check(img, corrected):
+    # Quality gate: verify the correction actually improved alignment.
+    # For small corrections (< 0.5°), the projection variance change can be
+    # negligible, so we skip the quality gate — the cost of a tiny wrong
+    # correction is much less than the cost of leaving 0.4° uncorrected
+    # (which shifts content ~25px at image edges on tall scans).
+    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f°) — "
                     "projection variance did not improve", shear_deg)
        no_correction["detections"] = _all_detections
@@ -1876,6 +1883,71 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

+    # --- Step 10: Expand narrow columns into adjacent gaps ---
+    # Narrow columns (marker, page_ref, < 10% width) often lose content at
+    # image edges due to residual shear.  Expand them into the gap toward
+    # the neighbouring column, but never past 40 % of the gap or past the
+    # nearest word in the neighbour.
+    _NARROW_THRESHOLD_PCT = 10.0  # columns below this % of content_w are "narrow"
+    _GAP_CLAIM_RATIO = 0.40       # narrow col may claim up to 40 % of the gap
+    _MIN_WORD_MARGIN = 4          # always keep 4 px between col edge and nearest word
+
+    if len(geometries) >= 2:
+        for i, g in enumerate(geometries):
+            col_pct = g.width / content_w * 100 if content_w > 0 else 100
+            if col_pct >= _NARROW_THRESHOLD_PCT:
+                continue  # not narrow — skip
+
+            expanded = False
+
+            # --- try expanding to the LEFT (into gap with left neighbor) ---
+            if i > 0:
+                left_nb = geometries[i - 1]
+                gap_left = g.x - (left_nb.x + left_nb.width)
+                if gap_left > _MIN_WORD_MARGIN * 2:
+                    # Find nearest word in left neighbor (right edge)
+                    nb_right_rel = (left_nb.x + left_nb.width) - left_x
+                    nb_words_right = [wd['left'] + wd.get('width', 0)
+                                      for wd in left_nb.words]
+                    max_word_right = max(nb_words_right) if nb_words_right else (nb_right_rel - 20)
+                    # max_word_right is relative to left_x
+                    safe_left_abs = left_x + max_word_right + _MIN_WORD_MARGIN
+                    max_expand = int(gap_left * _GAP_CLAIM_RATIO)
+                    new_x = max(safe_left_abs, g.x - max_expand)
+                    if new_x < g.x:
+                        delta = g.x - new_x
+                        g.width += delta
+                        g.x = new_x
+                        expanded = True
+
+            # --- try expanding to the RIGHT (into gap with right neighbor) ---
+            if i + 1 < len(geometries):
+                right_nb = geometries[i + 1]
+                gap_right = right_nb.x - (g.x + g.width)
+                if gap_right > _MIN_WORD_MARGIN * 2:
+                    # Find nearest word in right neighbor (left edge)
+                    nb_words_left = [wd['left'] for wd in right_nb.words]
+                    min_word_left_rel = min(nb_words_left) if nb_words_left else ((right_nb.x - left_x) + 20)
+                    safe_right_abs = left_x + min_word_left_rel - _MIN_WORD_MARGIN
+                    max_expand = int(gap_right * _GAP_CLAIM_RATIO)
+                    new_right = min(safe_right_abs, g.x + g.width + max_expand)
+                    if new_right > g.x + g.width:
+                        g.width = new_right - g.x
+                        expanded = True
+
+            if expanded:
+                # Re-assign words to this expanded column
+                col_left_rel = g.x - left_x
+                col_right_rel = col_left_rel + g.width
+                g.words = [wd for wd in word_dicts
+                           if col_left_rel <= wd['left'] < col_right_rel]
+                g.word_count = len(g.words)
+                g.width_ratio = g.width / content_w if content_w > 0 else 0.0
+                logger.info(
+                    "ColumnGeometry: expanded narrow col %d "
+                    "(%.1f%% → %.1f%%) x=%d w=%d",
+                    i, col_pct, g.width / content_w * 100, g.x, g.width)
+
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)