feat: auto-insert syllable dividers via pyphen on dictionary pages
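OCR engines rarely detect the "|" pipe characters that dictionaries use as syllable dividers. After dictionary detection confirms the page (dict_detection["is_dictionary"] is truthy), use pyphen (MIT) to insert syllable breaks into headword cells (cells whose col_type starts with "column_"). Each word is tried with the German (de_DE) hyphenation dictionary first, then English (en_US). Skips cells that contain IPA content or already contain "|", and leaves words shorter than 4 characters (after stripping parentheses, dots, and hyphens) unchanged. Also adds pyphen>=0.16.0 to requirements.txt. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>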
This commit is contained in:
@@ -2801,6 +2801,71 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Dictionary detection failed: %s", e)
|
logger.warning("Dictionary detection failed: %s", e)
|
||||||
|
|
||||||
|
# --- Syllable divider insertion for dictionary pages ---
|
||||||
|
# Dictionary pages show syllable breaks as "|" (e.g. "Ka|me|rad").
|
||||||
|
# OCR engines rarely detect "|", so we insert them via pyphen
|
||||||
|
# hyphenation rules when the page is confirmed as a dictionary.
|
||||||
|
syllable_insertions = 0
|
||||||
|
if dict_detection.get("is_dictionary"):
|
||||||
|
try:
|
||||||
|
import pyphen
|
||||||
|
_hyph_de = pyphen.Pyphen(lang='de_DE')
|
||||||
|
_hyph_en = pyphen.Pyphen(lang='en_US')
|
||||||
|
# IPA/bracket pattern — don't hyphenate phonetic transcriptions
|
||||||
|
_ipa_re = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||||
|
for z in zones_data:
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
ct = cell.get("col_type", "")
|
||||||
|
if not ct.startswith("column_"):
|
||||||
|
continue
|
||||||
|
text = cell.get("text", "")
|
||||||
|
if not text or "|" in text:
|
||||||
|
continue # already has pipes or empty
|
||||||
|
if _ipa_re.search(text):
|
||||||
|
continue # IPA content — skip
|
||||||
|
# Split on commas/semicolons to handle "Kabel, die Kabel"
|
||||||
|
parts = re.split(r'([,;]\s*)', text)
|
||||||
|
new_parts = []
|
||||||
|
changed = False
|
||||||
|
for part in parts:
|
||||||
|
if re.match(r'^[,;]\s*$', part):
|
||||||
|
new_parts.append(part)
|
||||||
|
continue
|
||||||
|
# Process individual words in each part
|
||||||
|
words_in = re.split(r'(\s+)', part)
|
||||||
|
new_words = []
|
||||||
|
for w in words_in:
|
||||||
|
if re.match(r'^\s+$', w):
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
# Only hyphenate words ≥ 4 chars, skip articles/short
|
||||||
|
clean = re.sub(r'[().\-]', '', w)
|
||||||
|
if len(clean) < 4:
|
||||||
|
new_words.append(w)
|
||||||
|
continue
|
||||||
|
# Try DE first, then EN
|
||||||
|
hyph = _hyph_de.inserted(w, hyphen='|')
|
||||||
|
if '|' not in hyph:
|
||||||
|
hyph = _hyph_en.inserted(w, hyphen='|')
|
||||||
|
if '|' in hyph and hyph != w:
|
||||||
|
new_words.append(hyph)
|
||||||
|
changed = True
|
||||||
|
else:
|
||||||
|
new_words.append(w)
|
||||||
|
new_parts.append(''.join(new_words))
|
||||||
|
if changed:
|
||||||
|
cell["text"] = ''.join(new_parts)
|
||||||
|
syllable_insertions += 1
|
||||||
|
if syllable_insertions:
|
||||||
|
logger.info(
|
||||||
|
"build-grid session %s: inserted syllable dividers in %d cells",
|
||||||
|
session_id, syllable_insertions,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("pyphen not installed — skipping syllable insertion")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Syllable insertion failed: %s", e)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"session_id": session_id,
|
"session_id": session_id,
|
||||||
"image_width": img_w,
|
"image_width": img_w,
|
||||||
|
|||||||
@@ -38,6 +38,9 @@ eng-to-ipa
|
|||||||
# Spell-checker for rule-based OCR correction (MIT license)
|
# Spell-checker for rule-based OCR correction (MIT license)
|
||||||
pyspellchecker>=0.8.1
|
pyspellchecker>=0.8.1
|
||||||
|
|
||||||
|
# Syllable hyphenation for dictionary pipe-divider insertion (MIT license)
|
||||||
|
pyphen>=0.16.0
|
||||||
|
|
||||||
# PostgreSQL (for metrics storage)
|
# PostgreSQL (for metrics storage)
|
||||||
psycopg2-binary>=2.9.0
|
psycopg2-binary>=2.9.0
|
||||||
asyncpg>=0.29.0
|
asyncpg>=0.29.0
|
||||||
|
|||||||
Reference in New Issue
Block a user