diff --git a/klausur-service/backend/cv_syllable_detect.py b/klausur-service/backend/cv_syllable_detect.py
index c67f3c6..96b21b6 100644
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -70,6 +70,14 @@ def _get_hyphenators():
     return _hyph_de, _hyph_en
 
 
+def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
+    """Check whether pyphen recognises a word (DE or EN)."""
+    if len(word) < 2:
+        return False
+    return ('|' in hyph_de.inserted(word, hyphen='|')
+            or '|' in hyph_en.inserted(word, hyphen='|'))
+
+
 def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
     """Try to hyphenate a word using DE then EN dictionary.
 
@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
     return None
 
 
+def _autocorrect_piped_word(
+    word_with_pipes: str, hyph_de, hyph_en,
+) -> Optional[str]:
+    """Try to correct a word that has OCR pipe artifacts.
+
+    Printed syllable divider lines on dictionary pages confuse OCR:
+    the vertical stroke is often read as an extra character (commonly
+    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
+
+    Strategy:
+    1. Strip ``|`` — if pyphen recognises the result, done.
+    2. Record where the pipes were in the stripped string.
+    3. Try deleting one character near each pipe position (the extra
+       character the OCR inserted). If pyphen recognises the
+       candidate, return it.
+    """
+    stripped = word_with_pipes.replace('|', '')
+    if not stripped or len(stripped) < 3:
+        return stripped  # too short to validate
+
+    # Case-preserved check; pyphen is case-insensitive internally
+    if _is_known_word(stripped, hyph_de, hyph_en):
+        return stripped
+
+    # Map pipe positions into the stripped string.
+    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
+    # which map to positions 2, 5 in "Zeplpelin".
+    pipe_positions: List[int] = []
+    offset = 0
+    for i, c in enumerate(word_with_pipes):
+        if c == '|':
+            pipe_positions.append(i - offset)
+            offset += 1
+
+    # Try single-character deletion near each pipe position.
+    # OCR typically inserts ONE extra char per pipe stroke.
+    seen: set = set()
+    for pos in pipe_positions:
+        for delta in (0, 1, -1, 2, -2):
+            idx = pos + delta
+            if idx < 0 or idx >= len(stripped):
+                continue
+            candidate = stripped[:idx] + stripped[idx + 1:]
+            if candidate in seen or len(candidate) < 3:
+                continue
+            seen.add(candidate)
+            if _is_known_word(candidate, hyph_de, hyph_en):
+                return candidate
+
+    return None  # could not fix
+
+
+def autocorrect_pipe_artifacts(
+    zones_data: List[Dict], session_id: str,
+) -> int:
+    """Strip OCR pipe artifacts and correct garbled words in-place.
+
+    Printed syllable divider lines on dictionary scans are read by OCR
+    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
+    This function:
+
+    1. Strips ``|`` from every word in content cells.
+    2. Validates the stripped word with pyphen.
+    3. If not recognised, tries deleting characters that the OCR inserted
+       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
+    4. Updates both word-box texts and cell text.
+
+    Returns the number of cells modified.
+ """ + hyph_de, hyph_en = _get_hyphenators() + if hyph_de is None: + return 0 + + modified = 0 + for z in zones_data: + for cell in z.get("cells", []): + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + + cell_changed = False + + # --- Fix word boxes --- + for wb in cell.get("word_boxes", []): + wb_text = wb.get("text", "") + if "|" not in wb_text: + continue + + # Separate trailing punctuation + m = re.match( + r'^([^a-zA-ZäöüÄÖÜßẞ]*)' + r'(.*?)' + r'([^a-zA-ZäöüÄÖÜßẞ]*)$', + wb_text, + ) + if not m: + continue + lead, core, trail = m.group(1), m.group(2), m.group(3) + if "|" not in core: + continue + + corrected = _autocorrect_piped_word(core, hyph_de, hyph_en) + if corrected is not None and corrected != core: + wb["text"] = lead + corrected + trail + cell_changed = True + + # --- Rebuild cell text from word boxes --- + if cell_changed: + wbs = cell.get("word_boxes", []) + if wbs: + cell["text"] = " ".join( + (wb.get("text") or "") for wb in wbs + ) + modified += 1 + + # --- Fallback: strip residual | from cell text --- + # (covers cases where word_boxes don't exist or weren't fixed) + text = cell.get("text", "") + if "|" in text: + clean = text.replace("|", "") + if clean != text: + cell["text"] = clean + if not cell_changed: + modified += 1 + + if modified: + logger.info( + "build-grid session %s: autocorrected pipe artifacts in %d cells", + session_id, modified, + ) + return modified + + def _try_merge_pipe_gaps(text: str, hyph_de) -> str: """Merge fragments separated by single spaces where OCR split at a pipe. @@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int: def _try_merge_word_gaps(text: str, hyph_de) -> str: - """Merge OCR word fragments with relaxed threshold (max_short=6). + """Merge OCR word fragments with relaxed threshold (max_short=5). Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments (max_short=5 instead of 3). 
     Still requires pyphen to recognize the
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index 619798c..003de0d 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
                         and wb.get("conf", 100) < 85):
                     to_remove.add(i)
 
+            # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
+            # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
+            # Remove short (<= 2 char) word boxes that contain NO letters or digits.
+            for i, wb in enumerate(wbs):
+                t = (wb.get("text") or "").strip()
+                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
+                    to_remove.add(i)
+
             # Rule (b) + (c): overlap and duplicate detection
             # Sort by x for pairwise comparison
             _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
             except Exception as e:
                 logger.warning("Word-gap merge failed: %s", e)
 
+    # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
+    # Strips | from words, validates with pyphen, tries char-deletion for garbled
+    # words like "Ze|plpe|lin" → "Zeppelin".
+    try:
+        from cv_syllable_detect import autocorrect_pipe_artifacts
+        autocorrect_pipe_artifacts(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Pipe autocorrect failed: %s", e)
+
     # --- Syllable divider insertion for dictionary pages ---
     # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
     # "all" = force on all content words, "en" = English column only,