Fix cross-column word assignment by splitting OCR merge artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s
When OCR merges adjacent words from different columns into one word box (e.g. "sichzie" spanning Col 1+2, "dasZimmer" crossing boundary), the grid builder assigned the entire merged word to one column. New _split_cross_column_words() function splits these at column boundaries using case transitions and spellchecker validation to avoid false positives on real words like "oder", "Kabel", "Zeitung". Regression: 12/12 GT sessions pass with diff=+0. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-column word splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_spell_cache: Optional[Any] = None
|
||||
_spell_loaded = False
|
||||
|
||||
|
||||
def _is_recognized_word(text: str) -> bool:
|
||||
"""Check if *text* is a recognized German or English word.
|
||||
|
||||
Uses the spellchecker library (same as cv_syllable_detect.py).
|
||||
Returns True for real words like "oder", "Kabel", "Zeitung".
|
||||
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
|
||||
"""
|
||||
global _spell_cache, _spell_loaded
|
||||
if not text or len(text) < 2:
|
||||
return False
|
||||
|
||||
if not _spell_loaded:
|
||||
_spell_loaded = True
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
_spell_cache = SpellChecker(language="de")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if _spell_cache is None:
|
||||
return False
|
||||
|
||||
return text.lower() in _spell_cache
|
||||
|
||||
|
||||
def _split_cross_column_words(
|
||||
words: List[Dict],
|
||||
columns: List[Dict],
|
||||
) -> List[Dict]:
|
||||
"""Split word boxes that span across column boundaries.
|
||||
|
||||
When OCR merges adjacent words from different columns (e.g. "sichzie"
|
||||
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
|
||||
split the word box at the column boundary so each piece is assigned
|
||||
to the correct column.
|
||||
|
||||
Only splits when:
|
||||
- The word has significant overlap (>15% of its width) on both sides
|
||||
- AND the word is not a recognized real word (OCR merge artifact), OR
|
||||
the word contains a case transition (lowercase→uppercase) near the
|
||||
boundary indicating two merged words like "dasZimmer".
|
||||
"""
|
||||
if len(columns) < 2:
|
||||
return words
|
||||
|
||||
# Column boundaries = midpoints between adjacent column edges
|
||||
boundaries = []
|
||||
for i in range(len(columns) - 1):
|
||||
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
|
||||
boundaries.append(boundary)
|
||||
|
||||
new_words: List[Dict] = []
|
||||
split_count = 0
|
||||
for w in words:
|
||||
w_left = w["left"]
|
||||
w_width = w["width"]
|
||||
w_right = w_left + w_width
|
||||
text = (w.get("text") or "").strip()
|
||||
|
||||
if not text or len(text) < 4 or w_width < 10:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
# Find the first boundary this word straddles significantly
|
||||
split_boundary = None
|
||||
for b in boundaries:
|
||||
if w_left < b < w_right:
|
||||
left_part = b - w_left
|
||||
right_part = w_right - b
|
||||
# Both sides must have at least 15% of the word width
|
||||
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
|
||||
split_boundary = b
|
||||
break
|
||||
|
||||
if split_boundary is None:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
# Compute approximate split position in the text.
|
||||
left_width = split_boundary - w_left
|
||||
split_ratio = left_width / w_width
|
||||
approx_pos = len(text) * split_ratio
|
||||
|
||||
# Strategy 1: look for a case transition (lowercase→uppercase) near
|
||||
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
|
||||
split_char = None
|
||||
search_lo = max(1, int(approx_pos) - 3)
|
||||
search_hi = min(len(text), int(approx_pos) + 2)
|
||||
for i in range(search_lo, search_hi):
|
||||
if text[i - 1].islower() and text[i].isupper():
|
||||
split_char = i
|
||||
break
|
||||
|
||||
# Strategy 2: if no case transition, only split if the whole word
|
||||
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
|
||||
# Real words like "oder", "Kabel", "Zeitung" must not be split.
|
||||
if split_char is None:
|
||||
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
|
||||
if _is_recognized_word(clean):
|
||||
new_words.append(w)
|
||||
continue
|
||||
# Not a real word — use floor of proportional position
|
||||
split_char = max(1, min(len(text) - 1, int(approx_pos)))
|
||||
|
||||
left_text = text[:split_char].rstrip()
|
||||
right_text = text[split_char:].lstrip()
|
||||
|
||||
if len(left_text) < 2 or len(right_text) < 2:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
right_width = w_width - round(left_width)
|
||||
new_words.append({
|
||||
**w,
|
||||
"text": left_text,
|
||||
"width": round(left_width),
|
||||
})
|
||||
new_words.append({
|
||||
**w,
|
||||
"text": right_text,
|
||||
"left": round(split_boundary),
|
||||
"width": right_width,
|
||||
})
|
||||
split_count += 1
|
||||
logger.info(
|
||||
"split cross-column word %r → %r + %r at boundary %.0f",
|
||||
text, left_text, right_text, split_boundary,
|
||||
)
|
||||
|
||||
if split_count:
|
||||
logger.info("split %d cross-column word(s)", split_count)
|
||||
return new_words
|
||||
|
||||
|
||||
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
||||
"""Remove page-border decoration strip words BEFORE column detection.
|
||||
|
||||
@@ -1111,6 +1253,12 @@ def _build_zone_grid(
|
||||
"header_rows": [],
|
||||
}
|
||||
|
||||
# Split word boxes that straddle column boundaries (e.g. "sichzie"
|
||||
# spanning Col 1 + Col 2). Must happen after column detection and
|
||||
# before cell assignment.
|
||||
if len(columns) >= 2:
|
||||
zone_words = _split_cross_column_words(zone_words, columns)
|
||||
|
||||
# Build cells
|
||||
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user