Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 2m29s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 20s
pyphen is a pattern-based hyphenator that accepts nonsense strings like "Zeplpelin". Switch to spellchecker (frequency-based word list) which correctly rejects garbled words and can suggest corrections. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
533 lines
18 KiB
Python
533 lines
18 KiB
Python
"""
|
||
Syllable divider insertion for dictionary pages.
|
||
|
||
For confirmed dictionary pages (is_dictionary=True), processes all content
|
||
column cells:
|
||
1. Strips existing | dividers for clean normalization
|
||
2. Merges pipe-gap spaces (where OCR split a word at a divider position)
|
||
3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
|
||
4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
|
||
|
||
No CV gate needed — the dictionary detection confidence is sufficient.
|
||
pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# IPA/phonetic characters — skip cells containing these
|
||
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||
|
||
# Common German words that should NOT be merged with adjacent tokens.
|
||
# These are function words that appear as standalone words between
|
||
# headwords/definitions on dictionary pages.
|
||
_STOP_WORDS = frozenset([
|
||
# Articles
|
||
'der', 'die', 'das', 'dem', 'den', 'des',
|
||
'ein', 'eine', 'einem', 'einen', 'einer',
|
||
# Pronouns
|
||
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
|
||
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
|
||
# Prepositions
|
||
'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
|
||
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
|
||
'zwischen', 'ohne', 'gegen',
|
||
# Conjunctions
|
||
'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
|
||
# Adverbs
|
||
'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
|
||
# Verbs
|
||
'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
|
||
'sein', 'haben',
|
||
# Other
|
||
'kein', 'keine', 'keinem', 'keinen', 'keiner',
|
||
])
|
||
|
||
# Cached hyphenators
|
||
_hyph_de = None
|
||
_hyph_en = None
|
||
|
||
# Cached spellchecker (for autocorrect_pipe_artifacts)
|
||
_spell_de = None
|
||
|
||
|
||
def _get_hyphenators():
    """Return the cached ``(German, English)`` pyphen hyphenator pair.

    The hyphenators are created on first use and kept in the module globals
    ``_hyph_de`` / ``_hyph_en``. pyphen is imported lazily; when the import
    fails the function returns ``(None, None)`` without caching anything,
    so the import is attempted again on the next call.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is None:
        try:
            import pyphen
        except ImportError:
            return None, None
        _hyph_de = pyphen.Pyphen(lang='de_DE')
        _hyph_en = pyphen.Pyphen(lang='en_US')
    return _hyph_de, _hyph_en
|
||
|
||
|
||
def _get_spellchecker():
    """Return the cached German ``SpellChecker`` instance, or ``None``.

    The checker is created on first use and stored in the module global
    ``_spell_de``. The ``spellchecker`` package is imported lazily; when the
    import fails ``None`` is returned and nothing is cached.
    """
    global _spell_de
    if _spell_de is None:
        try:
            from spellchecker import SpellChecker
        except ImportError:
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de
|
||
|
||
|
||
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
||
"""Check whether pyphen recognises a word (DE or EN)."""
|
||
if len(word) < 2:
|
||
return False
|
||
return ('|' in hyph_de.inserted(word, hyphen='|')
|
||
or '|' in hyph_en.inserted(word, hyphen='|'))
|
||
|
||
|
||
def _is_real_word(word: str) -> bool:
    """Report whether the German spellchecker contains *word*.

    The lookup is done on the lower-cased word. Returns ``False`` when the
    spellchecker is not available.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
|
||
|
||
|
||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
||
"""Try to hyphenate a word using DE then EN dictionary.
|
||
|
||
Returns word with | separators, or None if not recognized.
|
||
"""
|
||
hyph = hyph_de.inserted(word, hyphen='|')
|
||
if '|' in hyph:
|
||
return hyph
|
||
hyph = hyph_en.inserted(word, hyphen='|')
|
||
if '|' in hyph:
|
||
return hyph
|
||
return None
|
||
|
||
|
||
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
|
||
"""Try to correct a word that has OCR pipe artifacts.
|
||
|
||
Printed syllable divider lines on dictionary pages confuse OCR:
|
||
the vertical stroke is often read as an extra character (commonly
|
||
``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
|
||
Sometimes OCR reads one divider as ``|`` and another as a letter,
|
||
so the garbled character may be far from any detected pipe.
|
||
|
||
Uses ``spellchecker`` (frequency-based word list) for validation —
|
||
unlike pyphen which is a pattern-based hyphenator and accepts
|
||
nonsense strings like "Zeplpelin".
|
||
|
||
Strategy:
|
||
1. Strip ``|`` — if spellchecker knows the result, done.
|
||
2. Try deleting each pipe-like character (l, I, 1, i, t).
|
||
OCR inserts extra chars that resemble vertical strokes.
|
||
3. Fall back to spellchecker's own ``correction()`` method.
|
||
4. Preserve the original casing of the first letter.
|
||
"""
|
||
stripped = word_with_pipes.replace('|', '')
|
||
if not stripped or len(stripped) < 3:
|
||
return stripped # too short to validate
|
||
|
||
# Step 1: if the stripped word is already a real word, done
|
||
if _is_real_word(stripped):
|
||
return stripped
|
||
|
||
# Step 2: try deleting pipe-like characters (most likely artifacts)
|
||
_PIPE_LIKE = frozenset('lI1it')
|
||
for idx in range(len(stripped)):
|
||
if stripped[idx] not in _PIPE_LIKE:
|
||
continue
|
||
candidate = stripped[:idx] + stripped[idx + 1:]
|
||
if len(candidate) >= 3 and _is_real_word(candidate):
|
||
return candidate
|
||
|
||
# Step 3: use spellchecker's built-in correction
|
||
spell = _get_spellchecker()
|
||
if spell is not None:
|
||
suggestion = spell.correction(stripped.lower())
|
||
if suggestion and suggestion != stripped.lower():
|
||
# Preserve original first-letter case
|
||
if stripped[0].isupper():
|
||
suggestion = suggestion[0].upper() + suggestion[1:]
|
||
return suggestion
|
||
|
||
return None # could not fix
|
||
|
||
|
||
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    Only cells whose ``col_type`` starts with ``"column_"`` are processed.

    For every word box whose text contains ``|``:

    1. Leading/trailing non-letter characters are split off and the core is
       handed to ``_autocorrect_piped_word`` (spellchecker validation,
       stroke-character deletion, spellchecker correction).
    2. If a correction is found it replaces the core; the cell text is then
       rebuilt by joining all word-box texts with single spaces.
    3. If no correction is found (or the spellchecker is not installed) the
       pipes are still removed from the word box, so word boxes never keep
       a ``|`` that the cell text no longer has.

    Finally any ``|`` remaining in the cell text is removed; this also
    covers cells without word boxes.

    Args:
        zones_data: Zone dicts, each optionally holding a ``"cells"`` list.
            Cells and word boxes are modified in place.
        session_id: Only used in the log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # Without the spellchecker no word can be validated or corrected;
        # the loop below still strips the pipes themselves.
        logger.warning("spellchecker not available — pipe autocorrect limited")

    # lead = leading non-letters, core = the word, trail = trailing non-letters
    token_re = re.compile(
        r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
        r'(.*?)'
        r'([^a-zA-ZäöüÄÖÜßẞ]*)$'
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue

            word_boxes = cell.get("word_boxes", [])
            corrected_any = False  # a core was validated/corrected
            stripped_any = False   # pipes were removed without a correction

            # --- Fix word boxes ---
            for wb in word_boxes:
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue

                m = token_re.match(wb_text)
                if not m:
                    # Cannot split the token; at least drop the pipes.
                    wb["text"] = wb_text.replace("|", "")
                    stripped_any = True
                    continue

                lead, core, trail = m.groups()
                lead = lead.replace("|", "")
                trail = trail.replace("|", "")

                fixed = _autocorrect_piped_word(core) if "|" in core else None
                if fixed is not None and fixed != core:
                    wb["text"] = lead + fixed + trail
                    corrected_any = True
                else:
                    # No validated correction: keep the letters, drop the
                    # pipes so the word box matches the stripped cell text.
                    wb["text"] = lead + core.replace("|", "") + trail
                    stripped_any = True

            # --- Rebuild cell text from word boxes ---
            # Only when a word was actually corrected; pipe-only stripping
            # is handled on the existing cell text below.
            if corrected_any and word_boxes:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )

            # --- Fallback: strip residual | from cell text ---
            # (covers cells without word boxes and uncorrected words)
            text = cell.get("text") or ""
            text_stripped = "|" in text
            if text_stripped:
                cell["text"] = text.replace("|", "")

            if corrected_any or stripped_any or text_stripped:
                modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||
|
||
|
||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
|
||
"""Merge fragments separated by single spaces where OCR split at a pipe.
|
||
|
||
Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
|
||
Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
|
||
|
||
Guards against false merges:
|
||
- The FIRST token must be pure alpha (word start — no attached punctuation)
|
||
- The second token may have trailing punctuation (comma, period) which
|
||
stays attached to the merged word: "Kä" + "fer," -> "Käfer,"
|
||
- Common German function words (der, die, das, ...) are never merged
|
||
- At least one fragment must be very short (<=3 alpha chars)
|
||
"""
|
||
parts = text.split(' ')
|
||
if len(parts) < 2:
|
||
return text
|
||
|
||
result = [parts[0]]
|
||
i = 1
|
||
while i < len(parts):
|
||
prev = result[-1]
|
||
curr = parts[i]
|
||
|
||
# Extract alpha-only core for lookup
|
||
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
|
||
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
|
||
|
||
# Guard 1: first token must be pure alpha (word-start fragment)
|
||
# second token may have trailing punctuation
|
||
# Guard 2: neither alpha core can be a common German function word
|
||
# Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal)
|
||
# Guard 4: combined length must be >= 4
|
||
should_try = (
|
||
prev == prev_alpha # first token: pure alpha (word start)
|
||
and prev_alpha and curr_alpha
|
||
and prev_alpha.lower() not in _STOP_WORDS
|
||
and curr_alpha.lower() not in _STOP_WORDS
|
||
and min(len(prev_alpha), len(curr_alpha)) <= 3
|
||
and len(prev_alpha) + len(curr_alpha) >= 4
|
||
)
|
||
|
||
if should_try:
|
||
merged_alpha = prev_alpha + curr_alpha
|
||
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
|
||
if '-' in hyph:
|
||
# pyphen recognizes merged word — collapse the space
|
||
result[-1] = prev + curr
|
||
i += 1
|
||
continue
|
||
|
||
result.append(curr)
|
||
i += 1
|
||
|
||
return ' '.join(result)
|
||
|
||
|
||
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Merge OCR word-gap fragments in cell texts, validated by pyphen.

    OCR often splits a word at a syllable boundary into separate word
    boxes, giving text like "zerknit tert" instead of "zerknittert". Every
    cell whose ``col_type`` starts with ``"column_"`` and whose text
    contains a space is passed through ``_try_merge_word_gaps`` (fragment
    threshold 5 instead of the 3 used by ``_try_merge_pipe_gaps``).

    Cells that contain IPA characters outside square brackets are skipped.
    Only ``cell["text"]`` is updated.

    Returns:
        The number of cells modified; 0 when pyphen is unavailable.
    """
    hyph_de, _ = _get_hyphenators()
    if hyph_de is None:
        return 0

    changed_cells = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue

            original = cell.get("text", "")
            if not original or " " not in original:
                continue

            # Skip IPA cells (bracketed content does not count).
            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
                continue

            merged = _try_merge_word_gaps(original, hyph_de)
            if merged == original:
                continue
            cell["text"] = merged
            changed_cells += 1

    if changed_cells:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, changed_cells,
        )
    return changed_cells
|
||
|
||
|
||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
|
||
"""Merge OCR word fragments with relaxed threshold (max_short=5).
|
||
|
||
Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
|
||
(max_short=5 instead of 3). Still requires pyphen to recognize the
|
||
merged word.
|
||
"""
|
||
parts = text.split(' ')
|
||
if len(parts) < 2:
|
||
return text
|
||
|
||
result = [parts[0]]
|
||
i = 1
|
||
while i < len(parts):
|
||
prev = result[-1]
|
||
curr = parts[i]
|
||
|
||
prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
|
||
curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
|
||
|
||
should_try = (
|
||
prev == prev_alpha
|
||
and prev_alpha and curr_alpha
|
||
and prev_alpha.lower() not in _STOP_WORDS
|
||
and curr_alpha.lower() not in _STOP_WORDS
|
||
and min(len(prev_alpha), len(curr_alpha)) <= 5
|
||
and len(prev_alpha) + len(curr_alpha) >= 4
|
||
)
|
||
|
||
if should_try:
|
||
merged_alpha = prev_alpha + curr_alpha
|
||
hyph = hyph_de.inserted(merged_alpha, hyphen='-')
|
||
if '-' in hyph:
|
||
result[-1] = prev + curr
|
||
i += 1
|
||
continue
|
||
|
||
result.append(curr)
|
||
i += 1
|
||
|
||
return ' '.join(result)
|
||
|
||
|
||
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
|
||
"""Syllabify all significant words in a text string.
|
||
|
||
1. Strip existing | dividers
|
||
2. Merge pipe-gap spaces where possible
|
||
3. Apply pyphen to each word >= 3 alphabetic chars
|
||
4. Words pyphen doesn't recognize stay as-is (no bad guesses)
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Skip cells that contain IPA transcription characters outside brackets.
|
||
# Bracket content like [bɪltʃøn] is programmatically inserted and should
|
||
# not block syllabification of the surrounding text.
|
||
text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
|
||
if _IPA_RE.search(text_no_brackets):
|
||
return text
|
||
|
||
# Phase 1: strip existing pipe dividers for clean normalization
|
||
clean = text.replace('|', '')
|
||
|
||
# Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
|
||
clean = _try_merge_pipe_gaps(clean, hyph_de)
|
||
|
||
# Phase 3: tokenize and syllabify each word
|
||
# Split on whitespace and comma/semicolon sequences, keeping separators
|
||
tokens = re.split(r'(\s+|[,;:]+\s*)', clean)
|
||
|
||
result = []
|
||
for tok in tokens:
|
||
if not tok or re.match(r'^[\s,;:]+$', tok):
|
||
result.append(tok)
|
||
continue
|
||
|
||
# Strip trailing/leading punctuation for pyphen lookup
|
||
m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
|
||
if not m:
|
||
result.append(tok)
|
||
continue
|
||
lead, word, trail = m.group(1), m.group(2), m.group(3)
|
||
|
||
if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word):
|
||
result.append(tok)
|
||
continue
|
||
|
||
hyph = _hyphenate_word(word, hyph_de, hyph_en)
|
||
if hyph:
|
||
result.append(lead + hyph + trail)
|
||
else:
|
||
result.append(tok)
|
||
|
||
return ''.join(result)
|
||
|
||
|
||
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert pipe syllable dividers into dictionary cells.

    Content column cells (``col_type`` starting with ``"column_"``) are
    normalised with ``_syllabify_text``: existing pipes are stripped,
    pipe-gap spaces merged and the words re-syllabified using pyphen.

    Pre-check (skipped when ``force`` is true): at least 1% of the content
    cells must already contain ``|`` from OCR, otherwise nothing is done.
    In this auto mode only cells that already contain ``|`` are rewritten.
    (The primary guard — article_col_index — is checked at the call site.)

    Args:
        zones_data: Zone dicts holding ``"cells"``; cell texts are updated
            in place.
        img_bgr: Not used by this function.
        session_id: Only used in log messages.
        force: If True, skip the pipe-ratio pre-check and syllabify all
            content words regardless of whether the cell has pipe dividers.
        col_filter: If set, only process cells whose col_type is in this
            set. None means process all content columns.

    Returns:
        The number of cells modified; 0 when pyphen is not installed.
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    # Pre-check: pages with printed syllable dividers have OCR-detected
    # pipes in many cells; pages without them have none and are skipped to
    # avoid false syllabification.
    if not force:
        column_texts = [
            cell.get("text", "")
            for zone in zones_data
            for cell in zone.get("cells", [])
            if cell.get("col_type", "").startswith("column_")
        ]
        if column_texts:
            pipe_ratio = (
                sum("|" in cell_text for cell_text in column_texts)
                / len(column_texts)
            )
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion — "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            col_type = cell.get("col_type", "")
            if not col_type.startswith("column_"):
                continue
            if col_filter is not None and col_type not in col_filter:
                continue

            before = cell.get("text", "")
            if not before:
                continue

            # Auto mode: only normalise cells that already carry printed
            # dividers; never add new syllable marks to other words.
            if not force and "|" not in before:
                continue

            after = _syllabify_text(before, hyph_de, hyph_en)
            if after == before:
                continue
            cell["text"] = after
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions
|