Fix cross-column word assignment by splitting OCR merge artifacts

When OCR merges adjacent words from different columns into one word box (e.g. "sichzie" spanning Col 1+2, "dasZimmer" crossing the boundary), the grid builder assigned the entire merged word to one column.

The new _split_cross_column_words() function splits these words at column boundaries, using case transitions and spellchecker validation to avoid false positives on real words like "oder", "Kabel", "Zeitung".

Regression: 12/12 GT sessions pass with diff=+0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 10:54:41 +01:00
parent 0168ab1a67
commit 21b69e06be
1 changed files with 148 additions and 0 deletions
@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
 logger = logging.getLogger(__name__)


+# ---------------------------------------------------------------------------
+# Cross-column word splitting
+# ---------------------------------------------------------------------------
+
+_spell_cache: Optional[Any] = None  # lazily created SpellChecker, or None
+_spell_loaded = False  # True once a load was attempted, even if it failed
+
+
+def _is_recognized_word(text: str) -> bool:
+    """Return True if *text* is in the German spellchecker dictionary.
+
+    Uses the spellchecker library; only its "de" dictionary is loaded,
+    lazily and with a single attempt per process.
+    Returns True for real words like "oder", "Kabel", "Zeitung".
+    Returns False for OCR merge artifacts like "sichzie", "dasZimmer", for
+    text shorter than 2 characters, and for every input when the library
+    or dictionary could not be loaded (that failure is logged once).
+    """
+    global _spell_cache, _spell_loaded
+    if not text or len(text) < 2:
+        return False
+
+    if not _spell_loaded:
+        _spell_loaded = True  # set first so a failed load is never retried
+        try:
+            from spellchecker import SpellChecker
+            _spell_cache = SpellChecker(language="de")
+        except Exception:  # missing package or dictionary: degrade, but log
+            logger.warning("spellchecker failed to load", exc_info=True)
+
+    return _spell_cache is not None and text.lower() in _spell_cache
+
+
+def _split_cross_column_words(
+    words: List[Dict],
+    columns: List[Dict],
+) -> List[Dict]:
+    """Split word boxes that span across column boundaries.
+
+    When OCR merges adjacent words from different columns (e.g. "sichzie"
+    spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
+    split the word box at the column boundary so each piece is assigned
+    to the correct column.
+
+    Only splits when:
+    - The word has significant overlap (>15% of its width) on both sides
+    - AND the word is not a recognized real word (OCR merge artifact), OR
+      the word contains a case transition (lowercase→uppercase) near the
+      boundary indicating two merged words like "dasZimmer".
+
+    *words* dicts need "left", "width" and "text"; *columns* dicts need
+    "x_min"/"x_max" and are read in list order.  Returns *words* itself for
+    fewer than two columns, otherwise a new list in which the two pieces of
+    a split word copy all other keys and share one rounded x edge.
+    """
+    if len(columns) < 2:
+        return words
+
+    # Column boundaries = midpoints between adjacent column edges
+    boundaries = [
+        (prev_col["x_max"] + next_col["x_min"]) / 2
+        for prev_col, next_col in zip(columns, columns[1:])
+    ]
+
+    new_words: List[Dict] = []
+    split_count = 0
+    for w in words:
+        w_left = w["left"]
+        w_width = w["width"]
+        w_right = w_left + w_width
+        text = (w.get("text") or "").strip()
+
+        if not text or len(text) < 4 or w_width < 10:
+            new_words.append(w)
+            continue
+
+        # Find the first boundary this word straddles significantly
+        min_side = w_width * 0.15
+        split_boundary = None
+        for b in boundaries:
+            if w_left < b < w_right:
+                # Both sides must have at least 15% of the word width
+                if b - w_left > min_side and w_right - b > min_side:
+                    split_boundary = b
+                    break
+
+        if split_boundary is None:
+            new_words.append(w)
+            continue
+
+        # Compute approximate split position in the text.
+        approx_pos = len(text) * ((split_boundary - w_left) / w_width)
+
+        # Strategy 1: look for a case transition (lowercase→uppercase) near
+        # the approximate split point — e.g. "dasZimmer" splits at 'Z'.
+        split_char = None
+        search_lo = max(1, int(approx_pos) - 3)
+        search_hi = min(len(text), int(approx_pos) + 2)
+        for i in range(search_lo, search_hi):
+            if text[i - 1].islower() and text[i].isupper():
+                split_char = i
+                break
+
+        # Strategy 2: if no case transition, only split if the whole word
+        # is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
+        # Real words like "oder", "Kabel", "Zeitung" must not be split.
+        if split_char is None:
+            clean = re.sub(r"[,;:.!?]+$", "", text)  # strip trailing punct
+            if _is_recognized_word(clean):
+                new_words.append(w)
+                continue
+            # Not a real word — use floor of proportional position
+            split_char = max(1, min(len(text) - 1, int(approx_pos)))
+
+        left_text = text[:split_char].rstrip()
+        right_text = text[split_char:].lstrip()
+
+        if len(left_text) < 2 or len(right_text) < 2:
+            new_words.append(w)
+            continue
+
+        # Round the boundary once and derive both widths from it; rounding the
+        # left width separately can leave the pieces 1px out of step (x.5 cases).
+        split_x = round(split_boundary)
+        new_words.append({**w, "text": left_text, "width": split_x - w_left})
+        new_words.append({
+            **w,
+            "text": right_text,
+            "left": split_x,
+            "width": w_right - split_x,
+        })
+        split_count += 1
+        logger.info(
+            "split cross-column word %r → %r + %r at boundary %.0f",
+            text, left_text, right_text, split_boundary,
+        )
+
+    if split_count:
+        logger.info("split %d cross-column word(s)", split_count)
+    return new_words
+
+
 def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
    """Remove page-border decoration strip words BEFORE column detection.

@@ -1111,6 +1253,12 @@ def _build_zone_grid(
            "header_rows": [],
        }

+    # Split word boxes that straddle column boundaries (e.g. "sichzie"
+    # spanning Col 1 + Col 2).  Must happen after column detection and
+    # before cell assignment.
+    if len(columns) >= 2:
+        zone_words = _split_cross_column_words(zone_words, columns)
+
    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)