Fix cross-column word assignment by splitting OCR merge artifacts

When OCR merges adjacent words from different columns into one word box (e.g. "sichzie" spanning Col 1+2, or "dasZimmer" crossing the boundary), the grid builder assigned the entire merged word to a single column. The new _split_cross_column_words() function splits such boxes at the column boundary, using case transitions and spellchecker validation to avoid false positives on real words like "oder", "Kabel", "Zeitung". Regression: 12/12 GT sessions pass with diff=+0. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 10:54:41 +01:00
parent 0168ab1a67
commit 21b69e06be
1 changed files with 148 additions and 0 deletions
@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Cross-column word splitting
 # ---------------------------------------------------------------------------
 # Lazily created SpellChecker instance used by _is_recognized_word().
 # Stays None until the first lookup, and remains None if loading fails.
 _spell_cache: Optional[Any] = None
 # Set to True as soon as loading has been attempted, so a failed
 # import/initialisation is not retried on every call.
 _spell_loaded: bool = False
 def _is_recognized_word(text: str) -> bool:
    """Check if *text* is a recognized German or English word.
    Uses the spellchecker library (same as cv_syllable_detect.py).
    Returns True for real words like "oder", "Kabel", "Zeitung".
    Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
    """
    global _spell_cache, _spell_loaded
    if not text or len(text) < 2:
        return False
    if not _spell_loaded:
        _spell_loaded = True
        try:
            from spellchecker import SpellChecker
            _spell_cache = SpellChecker(language="de")
        except Exception:
            pass
    if _spell_cache is None:
        return False
    return text.lower() in _spell_cache
 def _split_cross_column_words(
    words: List[Dict],
    columns: List[Dict],
 ) -> List[Dict]:
    """Split word boxes that span across column boundaries.
    When OCR merges adjacent words from different columns (e.g. "sichzie"
    spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
    split the word box at the column boundary so each piece is assigned
    to the correct column.
    Only splits when:
    - The word has significant overlap (>15% of its width) on both sides
    - AND the word is not a recognized real word (OCR merge artifact), OR
      the word contains a case transition (lowercase→uppercase) near the
      boundary indicating two merged words like "dasZimmer".
    """
    if len(columns) < 2:
        return words
    # Column boundaries = midpoints between adjacent column edges
    boundaries = []
    for i in range(len(columns) - 1):
        boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
        boundaries.append(boundary)
    new_words: List[Dict] = []
    split_count = 0
    for w in words:
        w_left = w["left"]
        w_width = w["width"]
        w_right = w_left + w_width
        text = (w.get("text") or "").strip()
        if not text or len(text) < 4 or w_width < 10:
            new_words.append(w)
            continue
        # Find the first boundary this word straddles significantly
        split_boundary = None
        for b in boundaries:
            if w_left < b < w_right:
                left_part = b - w_left
                right_part = w_right - b
                # Both sides must have at least 15% of the word width
                if left_part > w_width * 0.15 and right_part > w_width * 0.15:
                    split_boundary = b
                    break
        if split_boundary is None:
            new_words.append(w)
            continue
        # Compute approximate split position in the text.
        left_width = split_boundary - w_left
        split_ratio = left_width / w_width
        approx_pos = len(text) * split_ratio
        # Strategy 1: look for a case transition (lowercase→uppercase) near
        # the approximate split point — e.g. "dasZimmer" splits at 'Z'.
        split_char = None
        search_lo = max(1, int(approx_pos) - 3)
        search_hi = min(len(text), int(approx_pos) + 2)
        for i in range(search_lo, search_hi):
            if text[i - 1].islower() and text[i].isupper():
                split_char = i
                break
        # Strategy 2: if no case transition, only split if the whole word
        # is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
        # Real words like "oder", "Kabel", "Zeitung" must not be split.
        if split_char is None:
            clean = re.sub(r"[,;:.!?]+$", "", text)  # strip trailing punct
            if _is_recognized_word(clean):
                new_words.append(w)
                continue
            # Not a real word — use floor of proportional position
            split_char = max(1, min(len(text) - 1, int(approx_pos)))
        left_text = text[:split_char].rstrip()
        right_text = text[split_char:].lstrip()
        if len(left_text) < 2 or len(right_text) < 2:
            new_words.append(w)
            continue
        right_width = w_width - round(left_width)
        new_words.append({
            **w,
            "text": left_text,
            "width": round(left_width),
        })
        new_words.append({
            **w,
            "text": right_text,
            "left": round(split_boundary),
            "width": right_width,
        })
        split_count += 1
        logger.info(
            "split cross-column word %r → %r + %r at boundary %.0f",
            text, left_text, right_text, split_boundary,
        )
    if split_count:
        logger.info("split %d cross-column word(s)", split_count)
    return new_words
 def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
    """Remove page-border decoration strip words BEFORE column detection.
@@ -1111,6 +1253,12 @@ def _build_zone_grid(
            "header_rows": [],
        }
    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2).  Must happen after column detection and
    # before cell assignment.
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)
    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)