Add pipe auto-correction and graphic artifact filter for grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed syllable dividers, validates with pyphen, tries char-deletion near pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin")
- Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars, no letters/digits) — catches small icons OCR'd as ">", "<", etc.
- Both fixes are generic: pyphen-validated, no session-specific logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,14 @@ def _get_hyphenators():
|
|||||||
return _hyph_de, _hyph_en
|
return _hyph_de, _hyph_en
|
||||||
|
|
||||||
|
|
||||||
|
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||||
|
"""Check whether pyphen recognises a word (DE or EN)."""
|
||||||
|
if len(word) < 2:
|
||||||
|
return False
|
||||||
|
return ('|' in hyph_de.inserted(word, hyphen='|')
|
||||||
|
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||||||
|
|
||||||
|
|
||||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||||
"""Try to hyphenate a word using DE then EN dictionary.
|
"""Try to hyphenate a word using DE then EN dictionary.
|
||||||
|
|
||||||
@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _autocorrect_piped_word(
    word_with_pipes: str, hyph_de, hyph_en,
) -> Optional[str]:
    """Try to correct a word that has OCR pipe artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR:
    the vertical stroke is often read as an extra character (commonly
    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.

    Strategy:
    1. Strip ``|`` — if pyphen recognises the result, done.
    2. Record where the pipes were in the stripped string.
    3. Try deleting one character near each pipe position (the extra
       character the OCR inserted). If pyphen recognises the
       candidate, return it.

    Args:
        word_with_pipes: Raw OCR token containing ``|`` characters,
            e.g. ``"Ze|plpe|lin"``.
        hyph_de: German pyphen hyphenator.
        hyph_en: English pyphen hyphenator.

    Returns:
        The corrected word, the pipe-stripped word when it is too short
        to validate, or ``None`` when no pyphen-recognised candidate
        could be found.
    """
    stripped = word_with_pipes.replace('|', '')
    if not stripped or len(stripped) < 3:
        return stripped  # too short to validate

    # Case-preserved check; pyphen is case-insensitive internally.
    if _is_known_word(stripped, hyph_de, hyph_en):
        return stripped

    # Map pipe positions into the stripped string.
    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
    # which map to positions 2, 5 in "Zeplpelin".
    pipe_positions: List[int] = []
    offset = 0
    for i, c in enumerate(word_with_pipes):
        if c == '|':
            pipe_positions.append(i - offset)
            offset += 1

    # Try single-character deletion near each pipe position.
    # OCR typically inserts ONE extra char per pipe stroke.
    seen: set = set()
    for pos in pipe_positions:
        # Search outward from the pipe position: exact spot first,
        # then up to two characters away on either side.
        for delta in (0, 1, -1, 2, -2):
            idx = pos + delta
            if idx < 0 or idx >= len(stripped):
                continue
            candidate = stripped[:idx] + stripped[idx + 1:]
            if candidate in seen or len(candidate) < 3:
                continue
            seen.add(candidate)
            if _is_known_word(candidate, hyph_de, hyph_en):
                return candidate

    return None  # could not fix
|
||||||
|
|
||||||
|
|
||||||
|
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    This function:

    1. Strips ``|`` from every word in content cells.
    2. Validates the stripped word with pyphen.
    3. If not recognised, tries deleting characters that the OCR inserted
       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
    4. Updates both word-box texts and cell text.

    Args:
        zones_data: Zone dicts (mutated in place); each zone carries a
            ``"cells"`` list whose cells may have ``"word_boxes"`` and
            ``"text"`` entries.
        session_id: Session identifier, used only for logging.

    Returns:
        The number of cells modified.
    """
    hyph_de, hyph_en = _get_hyphenators()
    # Without dictionaries there is nothing to validate against; bail out.
    if hyph_de is None:
        return 0

    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            # Only touch content columns; headers/labels keep their text.
            if not ct.startswith("column_"):
                continue

            cell_changed = False

            # --- Fix word boxes ---
            for wb in cell.get("word_boxes", []):
                wb_text = wb.get("text", "")
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing punctuation from the word core
                # so punctuation survives the correction unchanged.
                m = re.match(
                    r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
                    r'(.*?)'
                    r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
                    wb_text,
                )
                if not m:
                    continue
                lead, core, trail = m.group(1), m.group(2), m.group(3)
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                wbs = cell.get("word_boxes", [])
                if wbs:
                    cell["text"] = " ".join(
                        (wb.get("text") or "") for wb in wbs
                    )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # (covers cases where word_boxes don't exist or weren't fixed)
            text = cell.get("text", "")
            if "|" in text:
                clean = text.replace("|", "")
                if clean != text:
                    cell["text"] = clean
                    # Avoid double-counting a cell already counted above.
                    if not cell_changed:
                        modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||||||
|
|
||||||
|
|
||||||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||||||
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
||||||
|
|
||||||
@@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||||||
"""Merge OCR word fragments with relaxed threshold (max_short=6).
|
"""Merge OCR word fragments with relaxed threshold (max_short=5).
|
||||||
|
|
||||||
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
||||||
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
||||||
|
|||||||
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
|
|||||||
and wb.get("conf", 100) < 85):
|
and wb.get("conf", 100) < 85):
|
||||||
to_remove.add(i)
|
to_remove.add(i)
|
||||||
|
|
||||||
|
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
|
||||||
|
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
|
||||||
|
# Remove word boxes that contain NO letters or digits.
|
||||||
|
for i, wb in enumerate(wbs):
|
||||||
|
t = (wb.get("text") or "").strip()
|
||||||
|
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
|
||||||
|
to_remove.add(i)
|
||||||
|
|
||||||
# Rule (b) + (c): overlap and duplicate detection
|
# Rule (b) + (c): overlap and duplicate detection
|
||||||
# Sort by x for pairwise comparison
|
# Sort by x for pairwise comparison
|
||||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||||
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Word-gap merge failed: %s", e)
|
logger.warning("Word-gap merge failed: %s", e)
|
||||||
|
|
||||||
|
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
|
||||||
|
# Strips | from words, validates with pyphen, tries char-deletion for garbled
|
||||||
|
# words like "Ze|plpe|lin" → "Zeppelin".
|
||||||
|
try:
|
||||||
|
from cv_syllable_detect import autocorrect_pipe_artifacts
|
||||||
|
autocorrect_pipe_artifacts(zones_data, session_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Pipe autocorrect failed: %s", e)
|
||||||
|
|
||||||
# --- Syllable divider insertion for dictionary pages ---
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||||
# "all" = force on all content words, "en" = English column only,
|
# "all" = force on all content words, "en" = English column only,
|
||||||
|
|||||||
Reference in New Issue
Block a user