Fix cross-column word assignment by splitting OCR merge artifacts
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 47s
CI / test-go-edu-search (push) Successful in 36s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 23s
When OCR merges adjacent words from different columns into one word box (e.g. "sichzie" spanning Col 1+2, or "dasZimmer" crossing the boundary), the grid builder assigned the entire merged word to a single column.

The new _split_cross_column_words() function splits such word boxes at the column boundary. It uses case transitions and spellchecker validation to avoid false positives on real words like "oder", "Kabel", and "Zeitung".

Regression: 12/12 GT sessions pass with diff=+0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,148 @@ from cv_ocr_engines import _text_has_garbled_ipa
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cross-column word splitting
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_spell_cache: Optional[Any] = None
|
||||||
|
_spell_loaded = False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_recognized_word(text: str) -> bool:
|
||||||
|
"""Check if *text* is a recognized German or English word.
|
||||||
|
|
||||||
|
Uses the spellchecker library (same as cv_syllable_detect.py).
|
||||||
|
Returns True for real words like "oder", "Kabel", "Zeitung".
|
||||||
|
Returns False for OCR merge artifacts like "sichzie", "dasZimmer".
|
||||||
|
"""
|
||||||
|
global _spell_cache, _spell_loaded
|
||||||
|
if not text or len(text) < 2:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not _spell_loaded:
|
||||||
|
_spell_loaded = True
|
||||||
|
try:
|
||||||
|
from spellchecker import SpellChecker
|
||||||
|
_spell_cache = SpellChecker(language="de")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if _spell_cache is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return text.lower() in _spell_cache
|
||||||
|
|
||||||
|
|
||||||
|
def _split_cross_column_words(
|
||||||
|
words: List[Dict],
|
||||||
|
columns: List[Dict],
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""Split word boxes that span across column boundaries.
|
||||||
|
|
||||||
|
When OCR merges adjacent words from different columns (e.g. "sichzie"
|
||||||
|
spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary),
|
||||||
|
split the word box at the column boundary so each piece is assigned
|
||||||
|
to the correct column.
|
||||||
|
|
||||||
|
Only splits when:
|
||||||
|
- The word has significant overlap (>15% of its width) on both sides
|
||||||
|
- AND the word is not a recognized real word (OCR merge artifact), OR
|
||||||
|
the word contains a case transition (lowercase→uppercase) near the
|
||||||
|
boundary indicating two merged words like "dasZimmer".
|
||||||
|
"""
|
||||||
|
if len(columns) < 2:
|
||||||
|
return words
|
||||||
|
|
||||||
|
# Column boundaries = midpoints between adjacent column edges
|
||||||
|
boundaries = []
|
||||||
|
for i in range(len(columns) - 1):
|
||||||
|
boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2
|
||||||
|
boundaries.append(boundary)
|
||||||
|
|
||||||
|
new_words: List[Dict] = []
|
||||||
|
split_count = 0
|
||||||
|
for w in words:
|
||||||
|
w_left = w["left"]
|
||||||
|
w_width = w["width"]
|
||||||
|
w_right = w_left + w_width
|
||||||
|
text = (w.get("text") or "").strip()
|
||||||
|
|
||||||
|
if not text or len(text) < 4 or w_width < 10:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the first boundary this word straddles significantly
|
||||||
|
split_boundary = None
|
||||||
|
for b in boundaries:
|
||||||
|
if w_left < b < w_right:
|
||||||
|
left_part = b - w_left
|
||||||
|
right_part = w_right - b
|
||||||
|
# Both sides must have at least 15% of the word width
|
||||||
|
if left_part > w_width * 0.15 and right_part > w_width * 0.15:
|
||||||
|
split_boundary = b
|
||||||
|
break
|
||||||
|
|
||||||
|
if split_boundary is None:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Compute approximate split position in the text.
|
||||||
|
left_width = split_boundary - w_left
|
||||||
|
split_ratio = left_width / w_width
|
||||||
|
approx_pos = len(text) * split_ratio
|
||||||
|
|
||||||
|
# Strategy 1: look for a case transition (lowercase→uppercase) near
|
||||||
|
# the approximate split point — e.g. "dasZimmer" splits at 'Z'.
|
||||||
|
split_char = None
|
||||||
|
search_lo = max(1, int(approx_pos) - 3)
|
||||||
|
search_hi = min(len(text), int(approx_pos) + 2)
|
||||||
|
for i in range(search_lo, search_hi):
|
||||||
|
if text[i - 1].islower() and text[i].isupper():
|
||||||
|
split_char = i
|
||||||
|
break
|
||||||
|
|
||||||
|
# Strategy 2: if no case transition, only split if the whole word
|
||||||
|
# is NOT a real word (i.e. it's an OCR merge artifact like "sichzie").
|
||||||
|
# Real words like "oder", "Kabel", "Zeitung" must not be split.
|
||||||
|
if split_char is None:
|
||||||
|
clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct
|
||||||
|
if _is_recognized_word(clean):
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
# Not a real word — use floor of proportional position
|
||||||
|
split_char = max(1, min(len(text) - 1, int(approx_pos)))
|
||||||
|
|
||||||
|
left_text = text[:split_char].rstrip()
|
||||||
|
right_text = text[split_char:].lstrip()
|
||||||
|
|
||||||
|
if len(left_text) < 2 or len(right_text) < 2:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
|
||||||
|
right_width = w_width - round(left_width)
|
||||||
|
new_words.append({
|
||||||
|
**w,
|
||||||
|
"text": left_text,
|
||||||
|
"width": round(left_width),
|
||||||
|
})
|
||||||
|
new_words.append({
|
||||||
|
**w,
|
||||||
|
"text": right_text,
|
||||||
|
"left": round(split_boundary),
|
||||||
|
"width": right_width,
|
||||||
|
})
|
||||||
|
split_count += 1
|
||||||
|
logger.info(
|
||||||
|
"split cross-column word %r → %r + %r at boundary %.0f",
|
||||||
|
text, left_text, right_text, split_boundary,
|
||||||
|
)
|
||||||
|
|
||||||
|
if split_count:
|
||||||
|
logger.info("split %d cross-column word(s)", split_count)
|
||||||
|
return new_words
|
||||||
|
|
||||||
|
|
||||||
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
||||||
"""Remove page-border decoration strip words BEFORE column detection.
|
"""Remove page-border decoration strip words BEFORE column detection.
|
||||||
|
|
||||||
@@ -1111,6 +1253,12 @@ def _build_zone_grid(
|
|||||||
"header_rows": [],
|
"header_rows": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Split word boxes that straddle column boundaries (e.g. "sichzie"
|
||||||
|
# spanning Col 1 + Col 2). Must happen after column detection and
|
||||||
|
# before cell assignment.
|
||||||
|
if len(columns) >= 2:
|
||||||
|
zone_words = _split_cross_column_words(zone_words, columns)
|
||||||
|
|
||||||
# Build cells
|
# Build cells
|
||||||
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user