Fix cross-column word assignment by splitting OCR merge artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s

When OCR merges adjacent words from different columns into one word box
(e.g. "sichzie" spanning Col 1+2, "dasZimmer" crossing boundary), the
grid builder assigned the entire merged word to one column.

New _split_cross_column_words() function splits these at column
boundaries using case transitions and spellchecker validation to
avoid false positives on real words like "oder", "Kabel", "Zeitung".

Regression: 12/12 GT sessions pass with diff=+0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-28 10:54:41 +01:00
parent 0168ab1a67
commit 21b69e06be

View File

@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Cross-column word splitting
# ---------------------------------------------------------------------------
_spell_cache: Optional[Any] = None
_spell_loaded = False
def _is_recognized_word(text: str) -> bool:
"""Check if *text* is a recognized German or English word.
Uses the spellchecker library (same as cv_syllable_detect.py).
Returns True for real words like "oder", "Kabel", "Zeitung".
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
"""
global _spell_cache, _spell_loaded
if not text or len(text) < 2:
return False
if not _spell_loaded:
_spell_loaded = True
try:
from spellchecker import SpellChecker
_spell_cache = SpellChecker(language="de")
except Exception:
pass
if _spell_cache is None:
return False
return text.lower() in _spell_cache
def _split_cross_column_words(
words: List[Dict],
columns: List[Dict],
) -> List[Dict]:
"""Split word boxes that span across column boundaries.
When OCR merges adjacent words from different columns (e.g. "sichzie"
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
split the word box at the column boundary so each piece is assigned
to the correct column.
Only splits when:
- The word has significant overlap (>15% of its width) on both sides
- AND the word is not a recognized real word (OCR merge artifact), OR
the word contains a case transition (lowercase→uppercase) near the
boundary indicating two merged words like "dasZimmer".
"""
if len(columns) < 2:
return words
# Column boundaries = midpoints between adjacent column edges
boundaries = []
for i in range(len(columns) - 1):
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
boundaries.append(boundary)
new_words: List[Dict] = []
split_count = 0
for w in words:
w_left = w["left"]
w_width = w["width"]
w_right = w_left + w_width
text = (w.get("text") or "").strip()
if not text or len(text) < 4 or w_width < 10:
new_words.append(w)
continue
# Find the first boundary this word straddles significantly
split_boundary = None
for b in boundaries:
if w_left < b < w_right:
left_part = b - w_left
right_part = w_right - b
# Both sides must have at least 15% of the word width
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
split_boundary = b
break
if split_boundary is None:
new_words.append(w)
continue
# Compute approximate split position in the text.
left_width = split_boundary - w_left
split_ratio = left_width / w_width
approx_pos = len(text) * split_ratio
# Strategy 1: look for a case transition (lowercase→uppercase) near
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
split_char = None
search_lo = max(1, int(approx_pos) - 3)
search_hi = min(len(text), int(approx_pos) + 2)
for i in range(search_lo, search_hi):
if text[i - 1].islower() and text[i].isupper():
split_char = i
break
# Strategy 2: if no case transition, only split if the whole word
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
# Real words like "oder", "Kabel", "Zeitung" must not be split.
if split_char is None:
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
if _is_recognized_word(clean):
new_words.append(w)
continue
# Not a real word — use floor of proportional position
split_char = max(1, min(len(text) - 1, int(approx_pos)))
left_text = text[:split_char].rstrip()
right_text = text[split_char:].lstrip()
if len(left_text) < 2 or len(right_text) < 2:
new_words.append(w)
continue
right_width = w_width - round(left_width)
new_words.append({
**w,
"text": left_text,
"width": round(left_width),
})
new_words.append({
**w,
"text": right_text,
"left": round(split_boundary),
"width": right_width,
})
split_count += 1
logger.info(
"split cross-column word %r%r + %r at boundary %.0f",
text, left_text, right_text, split_boundary,
)
if split_count:
logger.info("split %d cross-column word(s)", split_count)
return new_words
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
@@ -1111,6 +1253,12 @@ def _build_zone_grid(
"header_rows": [],
}
# Split word boxes that straddle column boundaries (e.g. "sichzie"
# spanning Col 1 + Col 2). Must happen after column detection and
# before cell assignment.
if len(columns) >= 2:
zone_words = _split_cross_column_words(zone_words, columns)
# Build cells
cells = _build_cells(zone_words, columns, rows, img_w, img_h)