Add pipe auto-correction and graphic artifact filter for grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s

- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed
  syllable dividers, validates with pyphen, tries char-deletion near
  pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin")
- Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars,
  no letters/digits) — catches small icons OCR'd as ">", "<" etc.
- Both fixes are generic: pyphen-validated, no session-specific logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-27 16:33:38 +01:00
parent 0685fb12da
commit cc4cb3bc2f
2 changed files with 159 additions and 1 deletions

View File

@@ -70,6 +70,14 @@ def _get_hyphenators():
return _hyph_de, _hyph_en
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Return True when a hyphenator finds a split point in *word*.

    The German hyphenator is consulted first; the English one is only
    asked when the German one inserts no ``|``.  Words shorter than two
    characters are never treated as known.
    """
    if len(word) < 2:
        return False
    # A '|' in the hyphenated output means the word was split somewhere.
    # ``any`` over a generator stops at the first hyphenator that matches.
    return any(
        '|' in hyphenator.inserted(word, hyphen='|')
        for hyphenator in (hyph_de, hyph_en)
    )
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
"""Try to hyphenate a word using DE then EN dictionary.
@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
return None
def _autocorrect_piped_word(
    word_with_pipes: str, hyph_de, hyph_en,
) -> Optional[str]:
    """Repair a single word that contains OCR ``|`` artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR: the
    vertical stroke is often read as an extra character (commonly ``l``,
    ``I``, ``1``, ``i``) next to where the pipe appears.

    The repair proceeds in three steps:

    1. Remove every ``|``; if the result passes ``_is_known_word`` it is
       returned as-is.
    2. Work out where each pipe sat, expressed as an index into the
       pipe-free string.
    3. Around each of those indices (offsets 0, +1, -1, +2, -2, in that
       order) delete exactly one character and return the first
       candidate that passes ``_is_known_word``.

    Returns:
        The pipe-free word (possibly one character shorter) when a
        candidate is accepted, the pipe-free word unchanged when it is
        shorter than three characters (too short to validate), or
        ``None`` when no candidate is accepted.
    """
    stripped = word_with_pipes.replace('|', '')
    if len(stripped) < 3:
        # Too short to validate (this also covers the empty string).
        return stripped

    if _is_known_word(stripped, hyph_de, hyph_en):
        return stripped

    # Translate pipe locations into indices of the pipe-free string by
    # counting how many real characters precede each pipe.
    # e.g. "Ze|plpe|lin": pipes at 2 and 7 in the original map to
    # positions 2 and 6 in "Zeplpelin".
    pipe_positions: List[int] = []
    kept = 0
    for ch in word_with_pipes:
        if ch == '|':
            pipe_positions.append(kept)
        else:
            kept += 1

    # One deletion per attempt: the OCR is expected to add a single stray
    # character per pipe stroke.  ``tried`` avoids re-validating duplicates.
    tried: set = set()
    last_index = len(stripped) - 1
    deletion_indices = (
        pos + delta
        for pos in pipe_positions
        for delta in (0, 1, -1, 2, -2)
    )
    for idx in deletion_indices:
        if not 0 <= idx <= last_index:
            continue
        candidate = stripped[:idx] + stripped[idx + 1:]
        if len(candidate) < 3 or candidate in tried:
            continue
        tried.add(candidate)
        if _is_known_word(candidate, hyph_de, hyph_en):
            return candidate

    return None  # no single-character deletion produced a known word
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``,
    ``Ze|plpe|lin``).  For every cell whose ``col_type`` starts with
    ``"column_"`` this function:

    1. Splits each word-box text into leading non-letters, a core and
       trailing non-letters, and hands a core containing ``|`` to
       ``_autocorrect_piped_word``.
    2. Writes the corrected core back into the word box when a
       correction was found.
    3. Rebuilds the cell text by joining the word-box texts with single
       spaces whenever at least one word box changed.
    4. Finally removes any ``|`` still present in the cell text.

    Missing or ``None`` values for ``"cells"``, ``"col_type"``,
    ``"word_boxes"`` and ``"text"`` are treated as empty, so a single
    incomplete entry does not abort the whole pass.

    Args:
        zones_data: Zone dicts; each may carry a ``"cells"`` list whose
            entries are mutated in place.
        session_id: Only used in the summary log message.

    Returns:
        The number of cells modified (0 when no German hyphenator is
        available).
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        return 0

    # Compiled once instead of on every word box: separates surrounding
    # non-letter characters (punctuation etc.) from the word core.
    split_re = re.compile(
        r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
        r'(.*?)'
        r'([^a-zA-ZäöüÄÖÜßẞ]*)$'
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            # ``or ""`` also covers an explicit None value, which
            # ``.get(key, "")`` would pass through and crash on.
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue

            wbs = cell.get("word_boxes") or []
            cell_changed = False

            # --- Fix word boxes ---
            for wb in wbs:
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue
                m = split_re.match(wb_text)
                if not m:
                    continue
                lead, core, trail = m.groups()
                if "|" not in core:
                    # Pipe sits only in the surrounding non-letter part.
                    continue
                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in wbs
                )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # Covers cells without word boxes and word boxes that could
            # not be corrected.
            # NOTE(review): in the latter case the word box itself keeps
            # its pipes while the cell text loses them; confirm this
            # divergence is intended.
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    # Count the cell only once.
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
"""Merge fragments separated by single spaces where OCR split at a pipe.
@@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
def _try_merge_word_gaps(text: str, hyph_de) -> str:
"""Merge OCR word fragments with relaxed threshold (max_short=6).
"""Merge OCR word fragments with relaxed threshold (max_short=5).
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
(max_short=5 instead of 3). Still requires pyphen to recognize the

View File

@@ -1323,6 +1323,14 @@ async def _build_grid_core(
and wb.get("conf", 100) < 85):
to_remove.add(i)
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
# Remove word boxes of at most 2 characters that contain NO letters or digits.
for i, wb in enumerate(wbs):
t = (wb.get("text") or "").strip()
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
to_remove.add(i)
# Rule (b) + (c): overlap and duplicate detection
# Sort by x for pairwise comparison
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
except Exception as e:
logger.warning("Word-gap merge failed: %s", e)
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
# Strips | from words, validates with pyphen, tries char-deletion for garbled
# words like "Ze|plpe|lin" → "Zeppelin".
try:
from cv_syllable_detect import autocorrect_pipe_artifacts
autocorrect_pipe_artifacts(zones_data, session_id)
except Exception as e:
logger.warning("Pipe autocorrect failed: %s", e)
# --- Syllable divider insertion for dictionary pages ---
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
# "all" = force on all content words, "en" = English column only,