# ---------------------------------------------------------------------------
# Cross-column word splitting
# ---------------------------------------------------------------------------
#
# Provenance: commit 21b69e06be29e77e5eb6e3f042f1f9edd1e99a03
# (Benjamin Admin, 2026-03-28, co-authored by Claude Opus 4.6),
# "Fix cross-column word assignment by splitting OCR merge artifacts",
# targeting klausur-service/backend/grid_editor_helpers.py.
#
# When OCR merges adjacent words from different columns into one word box
# (e.g. "sichzie" spanning Col 1+2, "dasZimmer" crossing the boundary), the
# grid builder used to assign the entire merged word to one column.
# _split_cross_column_words() splits such boxes at the column boundary, using
# case transitions and spellchecker validation to avoid false positives on
# real words like "oder", "Kabel", "Zeitung".
#
# In grid_editor_helpers.py these definitions sit directly below the
# module-level ``logger`` and above ``_filter_border_strip_words``.

import logging
import re
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Lazily created spellchecker instance. ``_spell_loaded`` records that loading
# was *attempted*, so a failed import is not retried for every word.
_spell_cache: Optional[Any] = None
_spell_loaded = False

# Each side of a column boundary must hold more than this share of the word
# width before the word counts as straddling the boundary.
_MIN_SIDE_RATIO = 0.15
# Words shorter than this (characters) or narrower than this (pixels) are
# never split.
_MIN_SPLIT_CHARS = 4
_MIN_SPLIT_WIDTH = 10
# Each piece produced by a split must keep at least this many characters.
_MIN_PIECE_CHARS = 2
# Leading/trailing non-word characters (punctuation, quotes, brackets) that
# are ignored for the dictionary lookup.
_EDGE_PUNCT_RE = re.compile(r"^\W+|\W+$")


def _load_spellchecker() -> Optional[Any]:
    """Return the shared spellchecker, or ``None`` if it cannot be loaded.

    The first call tries to import ``spellchecker`` and build a German
    dictionary; the outcome (success or failure) is cached in the module
    globals ``_spell_cache`` / ``_spell_loaded``.
    """
    global _spell_cache, _spell_loaded
    if not _spell_loaded:
        _spell_loaded = True
        try:
            from spellchecker import SpellChecker
            _spell_cache = SpellChecker(language="de")
        except Exception:
            # Best-effort dependency: keep running, but leave a trace instead
            # of swallowing the failure silently.
            _spell_cache = None
            logger.warning(
                "spellchecker unavailable; dictionary-based cross-column "
                "splitting is disabled",
                exc_info=True,
            )
    return _spell_cache


def _is_recognized_word(text: str) -> bool:
    """Check whether *text* is in the spellchecker dictionary.

    The lookup is case-insensitive (the text is lowercased first) and uses
    the German dictionary of the ``spellchecker`` library.

    NOTE(review): an earlier version of this docstring promised "German or
    English"; only ``language="de"`` is actually loaded. Confirm whether
    English vocabulary columns need an additional dictionary.

    Returns:
        True for dictionary words such as "oder", "Kabel", "Zeitung".
        False for OCR merge artifacts such as "sichzie", for empty or
        one-character input, and whenever no spellchecker is available.
    """
    if not text or len(text) < 2:
        return False

    spell = _load_spellchecker()
    if spell is None:
        return False

    return text.lower() in spell


def _column_boundaries(columns: List[Dict]) -> List[float]:
    """Return the midpoints of the gaps between horizontally adjacent columns."""
    # Sort defensively so "adjacent" really means neighbouring on the page.
    ordered = sorted(columns, key=lambda col: col["x_min"])
    return [
        (left_col["x_max"] + right_col["x_min"]) / 2
        for left_col, right_col in zip(ordered, ordered[1:])
    ]


def _first_straddled_boundary(
    w_left: float,
    w_width: float,
    boundaries: List[float],
) -> Optional[float]:
    """Return the first boundary the word box overlaps significantly on both sides."""
    w_right = w_left + w_width
    min_side = w_width * _MIN_SIDE_RATIO
    for boundary in boundaries:
        if not w_left < boundary < w_right:
            continue
        if boundary - w_left > min_side and w_right - boundary > min_side:
            return boundary
    return None


def _case_transition_near(text: str, approx_pos: float) -> Optional[int]:
    """Return the index of an uppercase char following a lowercase one near *approx_pos*."""
    search_lo = max(1, int(approx_pos) - 3)
    search_hi = min(len(text), int(approx_pos) + 2)
    for idx in range(search_lo, search_hi):
        if text[idx - 1].islower() and text[idx].isupper():
            return idx
    return None


def _split_cross_column_words(
    words: List[Dict],
    columns: List[Dict],
) -> List[Dict]:
    """Split word boxes that span across column boundaries.

    When OCR merges adjacent words from different columns (e.g. "sichzie"
    spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary), the
    word box is split at the column boundary so each piece can be assigned
    to the correct column.

    A word is split only when more than 15% of its width lies on each side
    of a boundary AND one of the following holds:

    - it contains a lowercase→uppercase transition near the boundary
      (two merged words such as "dasZimmer"), or
    - a spellchecker is available and the word, ignoring surrounding
      punctuation, is not a dictionary word (a merge artifact such as
      "sichzie").

    Without a spellchecker, real words cannot be told apart from artifacts,
    so the dictionary-based split is skipped and only case transitions are
    used. Only the first significantly straddled boundary is handled.

    Args:
        words: Word boxes with at least "left", "width" and "text" keys.
        columns: Column dicts with "x_min" and "x_max"; order is irrelevant.

    Returns:
        *words* itself when there are fewer than two columns; otherwise a
        new list in which every split word is replaced by two copies with
        adjusted "text", "left" and "width". Unsplit word dicts are passed
        through unchanged (same objects).
    """
    if len(columns) < 2:
        return words

    boundaries = _column_boundaries(columns)

    new_words: List[Dict] = []
    split_count = 0
    for w in words:
        w_left = w["left"]
        w_width = w["width"]
        text = (w.get("text") or "").strip()

        if len(text) < _MIN_SPLIT_CHARS or w_width < _MIN_SPLIT_WIDTH:
            new_words.append(w)
            continue

        split_boundary = _first_straddled_boundary(w_left, w_width, boundaries)
        if split_boundary is None:
            new_words.append(w)
            continue

        # Map the geometric split position onto a character position.
        left_width = split_boundary - w_left
        approx_pos = len(text) * (left_width / w_width)

        # Strategy 1: a case transition near the approximate split point,
        # e.g. "dasZimmer" splits at 'Z'.
        split_char = _case_transition_near(text, approx_pos)

        # Strategy 2: no case transition, so split only if the word is
        # provably not a real word.
        if split_char is None:
            if _load_spellchecker() is None:
                # Fail safe: without a dictionary every word would look like
                # an artifact and real words would be torn apart.
                new_words.append(w)
                continue
            clean = _EDGE_PUNCT_RE.sub("", text)
            if _is_recognized_word(clean):
                new_words.append(w)
                continue
            # Not a real word, so use the floor of the proportional position.
            split_char = max(1, min(len(text) - 1, int(approx_pos)))

        left_text = text[:split_char].rstrip()
        right_text = text[split_char:].lstrip()

        if len(left_text) < _MIN_PIECE_CHARS or len(right_text) < _MIN_PIECE_CHARS:
            new_words.append(w)
            continue

        left_px = round(left_width)
        new_words.append({
            **w,
            "text": left_text,
            "width": left_px,
        })
        new_words.append({
            **w,
            "text": right_text,
            "left": round(split_boundary),
            "width": w_width - left_px,
        })
        split_count += 1
        logger.info(
            "split cross-column word %r → %r + %r at boundary %.0f",
            text, left_text, right_text, split_boundary,
        )

    if split_count:
        logger.info("split %d cross-column word(s)", split_count)
    return new_words


# Integration note (second hunk of the same patch): _build_zone_grid() calls
# _split_cross_column_words(zone_words, columns) when at least two columns
# were detected -- after column detection and before _build_cells(), so that
# cell assignment already sees the split boxes.