Add pipe auto-correction and graphic artifact filter for grid builder

- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed syllable dividers, validates with pyphen, tries char-deletion near pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin")
- Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars, no letters/digits) — catches small icons OCR'd as ">", "<" etc.
- Both fixes are generic: pyphen-validated, no session-specific logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 16:33:38 +01:00
parent 0685fb12da
commit cc4cb3bc2f
2 changed files with 159 additions and 1 deletions
@@ -70,6 +70,14 @@ def _get_hyphenators():
    return _hyph_de, _hyph_en


+def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
+    """Check whether pyphen hyphenates a word with the DE or EN dictionary.
+
+    A word counts as "known" when at least one hyphenator inserts a
+    hyphenation point into it.  A word that no hyphenator splits is
+    reported as unknown, and so is any word shorter than two characters.
+
+    NOTE(review): this treats "can be hyphenated" as "is a real word";
+    presumably pyphen splits by pattern, so confirm this is strict enough.
+
+    Args:
+        word: Candidate word.  It should not contain ``|`` itself, because
+            ``|`` is the probe hyphen searched for in the result.
+        hyph_de: German hyphenator exposing ``inserted(word, hyphen=...)``,
+            or ``None`` when unavailable.
+        hyph_en: English hyphenator with the same interface, or ``None``.
+
+    Returns:
+        ``True`` if either available hyphenator splits the word.
+    """
+    if len(word) < 2:
+        return False
+    # Skip a missing hyphenator instead of raising AttributeError, so one
+    # unavailable dictionary does not disable the other.
+    return any(
+        hyph is not None and '|' in hyph.inserted(word, hyphen='|')
+        for hyph in (hyph_de, hyph_en)
+    )
+
+
 def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Try to hyphenate a word using DE then EN dictionary.

@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    return None


+def _autocorrect_piped_word(
+    word_with_pipes: str, hyph_de, hyph_en,
+) -> Optional[str]:
+    """Repair a single word that contains OCR ``|`` artifacts.
+
+    Printed syllable divider lines on dictionary pages confuse OCR: besides
+    the ``|`` itself, the vertical stroke is often read as one extra
+    character (commonly ``l``, ``I``, ``1``, ``i``) next to the pipe.
+
+    Steps:
+        1. Remove every ``|``.  A result shorter than 3 characters is
+           returned as-is without validation.
+        2. If ``_is_known_word`` accepts the stripped word, return it.
+        3. Otherwise delete one character at offsets 0, +1, -1, +2, -2
+           around each former pipe position and return the first
+           candidate (of length >= 3) that ``_is_known_word`` accepts.
+
+    Returns:
+        The corrected word, or ``None`` when no candidate is accepted.
+    """
+    bare = word_with_pipes.replace('|', '')
+    if len(bare) < 3:
+        return bare  # too short to validate
+
+    # Case is left untouched; validation is delegated to _is_known_word.
+    if _is_known_word(bare, hyph_de, hyph_en):
+        return bare
+
+    # Position of each pipe inside ``bare``: its index in the original
+    # minus the number of pipes that precede it.
+    # e.g. "Ze|plpe|lin" has pipes at indices 2 and 7, which land on
+    # positions 2 and 6 of "Zeplpelin".
+    pipe_indices = [i for i, ch in enumerate(word_with_pipes) if ch == '|']
+    anchors: List[int] = [idx - n for n, idx in enumerate(pipe_indices)]
+
+    # One deletion per attempt: OCR typically adds ONE stray character
+    # per pipe stroke.  ``tried`` avoids validating a candidate twice.
+    tried: set = set()
+    for anchor in anchors:
+        for shift in (0, 1, -1, 2, -2):
+            cut = anchor + shift
+            if not 0 <= cut < len(bare):
+                continue
+            shorter = bare[:cut] + bare[cut + 1:]
+            if len(shorter) < 3 or shorter in tried:
+                continue
+            tried.add(shorter)
+            if _is_known_word(shorter, hyph_de, hyph_en):
+                return shorter
+
+    return None  # could not fix
+
+
+def autocorrect_pipe_artifacts(
+    zones_data: List[Dict], session_id: str,
+) -> int:
+    """Strip OCR pipe artifacts and correct garbled words in-place.
+
+    Printed syllable divider lines on dictionary scans are read by OCR
+    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
+
+    Only cells whose ``col_type`` starts with ``"column_"`` are processed.
+    For each such cell:
+
+    1. Every word box whose text contains ``|`` is split into leading
+       non-letters, a core, and trailing non-letters.  If the core holds a
+       ``|`` it is passed to ``_autocorrect_piped_word``; a successful
+       correction replaces the word-box text (lead and trail are kept).
+    2. If any word box changed, the cell text is rebuilt by joining all
+       word-box texts with single spaces.
+    3. Any ``|`` still present in the cell text is removed.
+
+    Note that a word box whose core could not be corrected keeps its
+    original text (including ``|``), while the cell text still has every
+    ``|`` removed by step 3.
+
+    Args:
+        zones_data: Zone dicts; each may carry a ``"cells"`` list whose
+            cells are mutated in place.
+        session_id: Only used in the log message.
+
+    Returns:
+        The number of cells modified (0 when no German hyphenator is
+        available).
+    """
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
+        return 0
+
+    # Splits a token into (leading non-letters, core, trailing non-letters).
+    # Built once here rather than for every word box.
+    split_re = re.compile(
+        r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
+        r'(.*?)'
+        r'([^a-zA-ZäöüÄÖÜßẞ]*)$'
+    )
+
+    modified = 0
+    for z in zones_data:
+        for cell in z.get("cells") or []:
+            # ``or ""`` also covers keys that are present but set to None,
+            # matching how word-box text is read when rebuilding cell text.
+            ct = cell.get("col_type") or ""
+            if not ct.startswith("column_"):
+                continue
+
+            cell_changed = False
+            wbs = cell.get("word_boxes") or []
+
+            # --- Fix word boxes ---
+            for wb in wbs:
+                wb_text = wb.get("text") or ""
+                if "|" not in wb_text:
+                    continue
+
+                m = split_re.match(wb_text)
+                if not m:
+                    continue
+                lead, core, trail = m.groups()
+                # Pipes that sit only in lead/trail are left to the
+                # cell-text fallback below.
+                if "|" not in core:
+                    continue
+
+                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
+                if corrected is not None and corrected != core:
+                    wb["text"] = lead + corrected + trail
+                    cell_changed = True
+
+            # --- Rebuild cell text from word boxes ---
+            # cell_changed implies at least one word box exists.
+            if cell_changed:
+                cell["text"] = " ".join(
+                    (wb.get("text") or "") for wb in wbs
+                )
+                modified += 1
+
+            # --- Fallback: strip residual | from cell text ---
+            # (covers cases where word_boxes don't exist or weren't fixed)
+            text = cell.get("text") or ""
+            if "|" in text:
+                cell["text"] = text.replace("|", "")
+                # Count the cell only once.
+                if not cell_changed:
+                    modified += 1
+
+    if modified:
+        logger.info(
+            "build-grid session %s: autocorrected pipe artifacts in %d cells",
+            session_id, modified,
+        )
+    return modified
+
+
 def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

@@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:


 def _try_merge_word_gaps(text: str, hyph_de) -> str:
-    """Merge OCR word fragments with relaxed threshold (max_short=6).
+    """Merge OCR word fragments with relaxed threshold (max_short=5).

    Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
    (max_short=5 instead of 3).  Still requires pyphen to recognize the
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

+            # Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
+            # Small images/icons next to words get OCR'd as ">", "<", "~", etc.
+            # Remove word boxes of at most 2 characters that contain NO
+            # letters or digits.
+            for i, wb in enumerate(wbs):
+                t = (wb.get("text") or "").strip()
+                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
+                    to_remove.add(i)
+
            # Rule (b) + (c): overlap and duplicate detection
            # Sort by x for pairwise comparison
            _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

+    # --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
+    # Strips | from words, validates with pyphen, tries char-deletion for garbled
+    # words like "Ze|plpe|lin" → "Zeppelin".
+    try:
+        from cv_syllable_detect import autocorrect_pipe_artifacts
+        autocorrect_pipe_artifacts(zones_data, session_id)
+    except Exception as e:
+        logger.warning("Pipe autocorrect failed: %s", e)
+
    # --- Syllable divider insertion for dictionary pages ---
    # syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
    #   "all" = force on all content words, "en" = English column only,