Add pipe auto-correction and graphic artifact filter for grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed syllable dividers, validates with pyphen, tries char-deletion near pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin") - Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars, no letters/digits) — catches small icons OCR'd as ">", "<" etc. - Both fixes are generic: pyphen-validated, no session-specific logic Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,14 @@ def _get_hyphenators():
|
||||
return _hyph_de, _hyph_en
|
||||
|
||||
|
||||
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||
"""Check whether pyphen recognises a word (DE or EN)."""
|
||||
if len(word) < 2:
|
||||
return False
|
||||
return ('|' in hyph_de.inserted(word, hyphen='|')
|
||||
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||||
|
||||
|
||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
"""Try to hyphenate a word using DE then EN dictionary.
|
||||
|
||||
@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _autocorrect_piped_word(
    word_with_pipes: str, hyph_de, hyph_en,
) -> Optional[str]:
    """Repair a single word that contains OCR ``|`` artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR: the
    vertical stroke is often read as an extra character (commonly ``l``,
    ``I``, ``1``, ``i``) next to the spot where the pipe shows up.

    Steps:
      1. Remove every ``|``.  A result shorter than three characters is
         returned as-is (too short to validate).
      2. If ``_is_known_word`` accepts the pipe-free word, return it.
      3. Otherwise delete exactly one character at offsets 0, +1, -1, +2,
         -2 (in that order) around each former pipe position and return
         the first candidate that ``_is_known_word`` accepts.

    Returns the corrected word, or ``None`` when no candidate validated.
    """
    stripped = word_with_pipes.replace('|', '')
    if len(stripped) < 3:
        return stripped  # too short to validate

    # Casing is passed through untouched; the original author notes that
    # pyphen handles case internally.
    if _is_known_word(stripped, hyph_de, hyph_en):
        return stripped

    # Translate each pipe into an index of the pipe-free string: the index
    # equals the number of non-pipe characters seen before that pipe.
    # e.g. "Ze|plpe|lin" has pipes at indices 2 and 7, which become
    # positions 2 and 6 in "Zeplpelin".
    pipe_positions: List[int] = []
    kept = 0
    for ch in word_with_pipes:
        if ch == '|':
            pipe_positions.append(kept)
        else:
            kept += 1

    # One deletion per attempt: OCR typically inserts ONE extra character
    # per pipe stroke.  `tried` avoids validating the same candidate twice.
    tried: set = set()
    limit = len(stripped)
    for anchor in pipe_positions:
        for shift in (0, 1, -1, 2, -2):
            cut = anchor + shift
            if not 0 <= cut < limit:
                continue
            shortened = stripped[:cut] + stripped[cut + 1:]
            if len(shortened) < 3 or shortened in tried:
                continue
            tried.add(shortened)
            if _is_known_word(shortened, hyph_de, hyph_en):
                return shortened

    return None  # could not fix
|
||||
|
||||
|
||||
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).

    For every cell whose ``col_type`` starts with ``"column_"``:

    1. Each word box containing ``|`` is split into leading non-letters,
       a core, and trailing non-letters.
    2. The core is handed to ``_autocorrect_piped_word``; when that returns
       a different string, the word-box text is replaced.
    3. If any word box changed, the cell text is rebuilt by joining all
       word-box texts with single spaces.
    4. Finally every ``|`` still present in the cell text is removed.

    Note: a word box whose core could not be corrected (or whose only
    pipes sit in the leading/trailing non-letter part) keeps its ``|``;
    only the cell text is cleaned by step 4.

    Args:
        zones_data: Zone dicts; each may carry a ``"cells"`` list whose
            items may carry ``"col_type"``, ``"text"`` and ``"word_boxes"``.
            Missing keys and ``None`` values are treated as empty.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified.  ``0`` (and no changes) when the
        German hyphenator is unavailable.
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        return 0

    # Compiled once instead of per word box.  Groups: leading non-letters,
    # core (lazy), trailing non-letters.
    split_re = re.compile(
        r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
        r'(.*?)'
        r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            # `or ""` also covers an explicit None value, which
            # `.get(key, "")` would pass through and crash on.
            col_type = cell.get("col_type") or ""
            if not col_type.startswith("column_"):
                continue

            word_boxes = cell.get("word_boxes") or []
            cell_changed = False

            # --- Fix word boxes ---
            for wb in word_boxes:
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing punctuation from the word core
                m = split_re.match(wb_text)
                if not m:
                    continue
                lead, core, trail = m.groups()
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            # cell_changed implies word_boxes is non-empty.
            if cell_changed:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # (covers cases where word_boxes don't exist or weren't fixed)
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||||
|
||||
|
||||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||||
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
||||
|
||||
@@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
|
||||
|
||||
|
||||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||||
"""Merge OCR word fragments with relaxed threshold (max_short=6).
|
||||
"""Merge OCR word fragments with relaxed threshold (max_short=5).
|
||||
|
||||
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
||||
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
||||
|
||||
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
|
||||
and wb.get("conf", 100) < 85):
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
|
||||
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
|
||||
# Remove word boxes that contain NO letters or digits.
|
||||
for i, wb in enumerate(wbs):
|
||||
t = (wb.get("text") or "").strip()
|
||||
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (b) + (c): overlap and duplicate detection
|
||||
# Sort by x for pairwise comparison
|
||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
|
||||
except Exception as e:
|
||||
logger.warning("Word-gap merge failed: %s", e)
|
||||
|
||||
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
|
||||
# Strips | from words, validates with pyphen, tries char-deletion for garbled
|
||||
# words like "Ze|plpe|lin" → "Zeppelin".
|
||||
try:
|
||||
from cv_syllable_detect import autocorrect_pipe_artifacts
|
||||
autocorrect_pipe_artifacts(zones_data, session_id)
|
||||
except Exception as e:
|
||||
logger.warning("Pipe autocorrect failed: %s", e)
|
||||
|
||||
# --- Syllable divider insertion for dictionary pages ---
|
||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||
# "all" = force on all content words, "en" = English column only,
|
||||
|
||||
Reference in New Issue
Block a user