Add pipe auto-correction and graphic artifact filter for grid builder
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m10s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
- autocorrect_pipe_artifacts(): strips OCR pipe artifacts from printed syllable dividers, validates with pyphen, tries char-deletion near pipe positions for garbled words (e.g. "Ze|plpe|lin" → "Zeppelin")
- Rule (a2): filters isolated non-alphanumeric word boxes (≤2 chars, no letters/digits) — catches small icons OCR'd as ">", "<", etc.
- Both fixes are generic: pyphen-validated, no session-specific logic

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,14 @@ def _get_hyphenators():
|
|||||||
return _hyph_de, _hyph_en
|
return _hyph_de, _hyph_en
|
||||||
|
|
||||||
|
|
||||||
|
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||||||
|
"""Check whether pyphen recognises a word (DE or EN)."""
|
||||||
|
if len(word) < 2:
|
||||||
|
return False
|
||||||
|
return ('|' in hyph_de.inserted(word, hyphen='|')
|
||||||
|
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||||||
|
|
||||||
|
|
||||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||||||
"""Try to hyphenate a word using DE then EN dictionary.
|
"""Try to hyphenate a word using DE then EN dictionary.
|
||||||
|
|
||||||
@@ -84,6 +92,139 @@ def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _autocorrect_piped_word(
    word_with_pipes: str, hyph_de, hyph_en,
) -> Optional[str]:
    """Try to correct a word that has OCR pipe artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR:
    the vertical stroke is often read as an extra character (commonly
    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.

    Strategy:
    1. Strip ``|`` — if pyphen recognises the result, done.
    2. Record where the pipes were in the stripped string.
    3. Try deleting one character near each pipe position (the extra
       character the OCR inserted). If pyphen recognises the
       candidate, return it.

    Args:
        word_with_pipes: Raw OCR token containing ``|`` characters,
            e.g. ``"Ze|plpe|lin"``.
        hyph_de: German pyphen hyphenator.
        hyph_en: English pyphen hyphenator.

    Returns:
        The corrected word, the pipe-stripped word when it is too short
        to validate, or ``None`` when no pyphen-recognised candidate
        could be found.
    """
    stripped = word_with_pipes.replace('|', '')
    if not stripped or len(stripped) < 3:
        return stripped  # too short to validate

    # Case-preserved check; pyphen is case-insensitive internally.
    if _is_known_word(stripped, hyph_de, hyph_en):
        return stripped

    # Map pipe positions into the stripped string.
    # e.g. "Ze|plpe|lin" → pipes were at indices 2, 6 in the original,
    # which map to positions 2, 5 in "Zeplpelin".
    pipe_positions: List[int] = []
    offset = 0
    for i, c in enumerate(word_with_pipes):
        if c == '|':
            pipe_positions.append(i - offset)
            offset += 1

    # Try single-character deletion near each pipe position.
    # OCR typically inserts ONE extra char per pipe stroke.
    seen: set = set()
    for pos in pipe_positions:
        # Search outward from the pipe position: exact spot first,
        # then up to two characters away on either side.
        for delta in (0, 1, -1, 2, -2):
            idx = pos + delta
            if idx < 0 or idx >= len(stripped):
                continue
            candidate = stripped[:idx] + stripped[idx + 1:]
            if candidate in seen or len(candidate) < 3:
                continue
            seen.add(candidate)
            if _is_known_word(candidate, hyph_de, hyph_en):
                return candidate

    return None  # could not fix
|
||||||
|
|
||||||
|
|
||||||
|
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    This function:

    1. Strips ``|`` from every word in content cells.
    2. Validates the stripped word with pyphen.
    3. If not recognised, tries deleting characters that the OCR inserted
       around the pipe position (e.g. ``Zeplpelin`` → ``Zeppelin``).
    4. Updates both word-box texts and cell text.

    Args:
        zones_data: Zone dicts (mutated in place); each zone carries a
            ``"cells"`` list whose cells may have ``"word_boxes"`` and
            ``"text"`` entries.
        session_id: Session identifier, used only for logging.

    Returns:
        The number of cells modified.
    """
    hyph_de, hyph_en = _get_hyphenators()
    # Without dictionaries there is nothing to validate against; bail out.
    if hyph_de is None:
        return 0

    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            # Only touch content columns; headers/labels keep their text.
            if not ct.startswith("column_"):
                continue

            cell_changed = False

            # --- Fix word boxes ---
            for wb in cell.get("word_boxes", []):
                wb_text = wb.get("text", "")
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing punctuation from the word core
                # so punctuation survives the correction unchanged.
                m = re.match(
                    r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
                    r'(.*?)'
                    r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
                    wb_text,
                )
                if not m:
                    continue
                lead, core, trail = m.group(1), m.group(2), m.group(3)
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core, hyph_de, hyph_en)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                wbs = cell.get("word_boxes", [])
                if wbs:
                    cell["text"] = " ".join(
                        (wb.get("text") or "") for wb in wbs
                    )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # (covers cases where word_boxes don't exist or weren't fixed)
            text = cell.get("text", "")
            if "|" in text:
                clean = text.replace("|", "")
                if clean != text:
                    cell["text"] = clean
                    # Avoid double-counting a cell already counted above.
                    if not cell_changed:
                        modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||||||
|
|
||||||
|
|
||||||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||||||
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
||||||
|
|
||||||
@@ -185,7 +326,7 @@ def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||||||
"""Merge OCR word fragments with relaxed threshold (max_short=6).
|
"""Merge OCR word fragments with relaxed threshold (max_short=5).
|
||||||
|
|
||||||
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
||||||
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
||||||
|
|||||||
@@ -1323,6 +1323,14 @@ async def _build_grid_core(
|
|||||||
and wb.get("conf", 100) < 85):
|
and wb.get("conf", 100) < 85):
|
||||||
to_remove.add(i)
|
to_remove.add(i)
|
||||||
|
|
||||||
|
# Rule (a2): isolated non-alphanumeric symbols (graphic OCR artifacts)
|
||||||
|
# Small images/icons next to words get OCR'd as ">", "<", "~", etc.
|
||||||
|
# Remove word boxes that contain NO letters or digits.
|
||||||
|
for i, wb in enumerate(wbs):
|
||||||
|
t = (wb.get("text") or "").strip()
|
||||||
|
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
|
||||||
|
to_remove.add(i)
|
||||||
|
|
||||||
# Rule (b) + (c): overlap and duplicate detection
|
# Rule (b) + (c): overlap and duplicate detection
|
||||||
# Sort by x for pairwise comparison
|
# Sort by x for pairwise comparison
|
||||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||||
@@ -1619,6 +1627,15 @@ async def _build_grid_core(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Word-gap merge failed: %s", e)
|
logger.warning("Word-gap merge failed: %s", e)
|
||||||
|
|
||||||
|
# --- Pipe auto-correction: fix OCR artifacts from printed syllable dividers ---
|
||||||
|
# Strips | from words, validates with pyphen, tries char-deletion for garbled
|
||||||
|
# words like "Ze|plpe|lin" → "Zeppelin".
|
||||||
|
try:
|
||||||
|
from cv_syllable_detect import autocorrect_pipe_artifacts
|
||||||
|
autocorrect_pipe_artifacts(zones_data, session_id)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Pipe autocorrect failed: %s", e)
|
||||||
|
|
||||||
# --- Syllable divider insertion for dictionary pages ---
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
# syllable_mode: "auto" = only when original has pipe dividers (1% threshold),
|
||||||
# "all" = force on all content words, "en" = English column only,
|
# "all" = force on all content words, "en" = English column only,
|
||||||
|
|||||||
Reference in New Issue
Block a user