Files
breakpilot-lehrer/klausur-service/backend/cv_syllable_detect.py
Benjamin Admin 525de55791
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m16s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s
Fix syllable+IPA combination: strip bracket content before IPA guard
The _IPA_RE check in _syllabify_text() skipped entire cells containing
any IPA character. After German IPA insertion adds [bɪltʃøn], the check
blocked syllabification entirely. Now strips bracket content before
checking, so programmatically inserted IPA doesn't prevent syllable
divider insertion on the surrounding text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 00:03:10 +01:00

274 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Syllable divider insertion for dictionary pages.
For confirmed dictionary pages (is_dictionary=True), processes all content
column cells:
1. Strips existing | dividers for clean normalization
2. Merges pipe-gap spaces (where OCR split a word at a divider position)
3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
No CV gate needed — the dictionary detection confidence is sufficient.
pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Square brackets plus a fixed set of IPA/phonetic symbols.  _syllabify_text()
# leaves a cell untouched when this pattern still matches after balanced
# [...] groups have been removed from the text.
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
# German function words that must never be glued onto a neighbouring token
# by the pipe-gap merge.  They are looked up in lower case.
_STOP_WORDS = frozenset({
    # articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # pronouns
    'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    # prepositions
    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
    'zwischen', 'ohne', 'gegen',
    # conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # negative articles
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
})
# Cached hyphenators; both stay None until both have been built successfully.
_hyph_de = None
_hyph_en = None


def _get_hyphenators():
    """Return the cached ``(German, English)`` pyphen hyphenators.

    The hyphenators are created on first use and reused afterwards.

    Returns:
        A ``(hyph_de, hyph_en)`` tuple, or ``(None, None)`` when the
        ``pyphen`` package cannot be imported.

    Raises:
        Whatever ``pyphen.Pyphen(...)`` raises when a dictionary cannot be
        loaded.  In that case nothing is cached, so the next call retries
        instead of handing out a half-initialised pair.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    # Build both objects before publishing either one: if the second
    # constructor raises, the cache must not be left holding only the first,
    # because callers treat "hyph_de is not None" as "both are usable".
    de = pyphen.Pyphen(lang='de_DE')
    en = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = de, en
    return _hyph_de, _hyph_en
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Split *word* into syllables, asking the German hyphenator first.

    Each hyphenator is asked to insert ``|`` at its break points; the first
    answer that actually contains a ``|`` wins.

    Returns:
        The word with ``|`` separators, or ``None`` when neither hyphenator
        produced a break point.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Merge fragments separated by single spaces where OCR split at a pipe.

    Example: "Kaf fee" -> "Kaffee" (pyphen finds a break point in the merged
    word).  Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".

    Guards against false merges:
    - The FIRST token must be pure alpha (word start, no attached punctuation).
    - The SECOND token must begin with its letters; only trailing punctuation
      (comma, period) is tolerated and stays attached to the merged word:
      "Kä" + "fer," -> "Käfer,".  Tokens with leading or embedded non-letters
      such as "[bɪlt]" or "(pl.)" are never merged.
    - Common German function words (der, die, das, ...) are never merged.
    - At least one fragment must be very short (<= 3 alpha chars).
    - The combined alpha length must be >= 4.

    Args:
        text: Cell text with pipe characters already removed.
        hyph_de: Object with a pyphen-style ``inserted(word, hyphen=...)``
            method, used to decide whether the merged word is plausible.

    Returns:
        The text with accepted fragment pairs joined; all other spacing is
        preserved.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    non_letters = re.compile(r'[^a-zA-ZäöüÄÖÜßẞ]')
    result = [parts[0]]
    for curr in parts[1:]:
        # Always compare against the last emitted token so that an earlier
        # merge ("Ka" + "bel") can take part in the next one ("Kabel" + "jau").
        prev = result[-1]
        prev_alpha = non_letters.sub('', prev)
        curr_alpha = non_letters.sub('', curr)

        should_try = bool(
            prev == prev_alpha                  # first token: pure alpha
            and prev_alpha and curr_alpha
            # second token: letters first, punctuation only at the end;
            # this keeps bracketed IPA and parenthesised notes separate
            and curr.startswith(curr_alpha)
            and prev_alpha.lower() not in _STOP_WORDS
            and curr_alpha.lower() not in _STOP_WORDS
            and min(len(prev_alpha), len(curr_alpha)) <= 3
            and len(prev_alpha) + len(curr_alpha) >= 4
        )
        if should_try:
            merged_alpha = prev_alpha + curr_alpha
            if '-' in hyph_de.inserted(merged_alpha, hyphen='-'):
                # pyphen found a break point in the merged word: drop the space
                result[-1] = prev + curr
                continue
        result.append(curr)
    return ' '.join(result)
def _syllabify_segment(segment: str, hyph_de, hyph_en) -> str:
    """Syllabify one stretch of pipe-free text that contains no [...] group."""
    # Merge pipe-gap spaces (OCR fragments from pipe splitting)
    merged = _try_merge_pipe_gaps(segment, hyph_de)

    # Split on whitespace and comma/semicolon/colon runs; the capturing group
    # keeps the separators so the text can be reassembled exactly.
    pieces = []
    for tok in re.split(r'(\s+|[,;:]+\s*)', merged):
        if not tok or re.match(r'^[\s,;:]+$', tok):
            pieces.append(tok)
            continue
        # Peel leading/trailing non-letters off the token for the lookup
        m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
        if not m:
            pieces.append(tok)
            continue
        lead, word, trail = m.groups()
        if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜßẞ]', word):
            pieces.append(tok)
            continue
        hyph = _hyphenate_word(word, hyph_de, hyph_en)
        # Words without a break point stay exactly as they were
        pieces.append(lead + hyph + trail if hyph else tok)
    return ''.join(pieces)


def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Syllabify all significant words in a text string.

    1. Leave the text untouched if IPA characters (or an unbalanced square
       bracket) occur outside balanced ``[...]`` groups.
    2. Copy balanced ``[...]`` groups through verbatim, so a transcription
       such as ``[bɪltʃøn]`` never receives syllable dividers itself.
    3. In the remaining text: strip existing ``|`` dividers, merge pipe-gap
       spaces where possible, and hyphenate each word core of length >= 3.
    4. Words without a break point stay as-is (no bad guesses).

    Args:
        text: Cell text; may be empty.
        hyph_de: German hyphenator (pyphen-style ``inserted`` method).
        hyph_en: English hyphenator, tried when the German one finds nothing.

    Returns:
        The text with ``|`` syllable dividers inserted/normalized.
    """
    if not text:
        return text

    # re.split with a capturing group alternates: even indexes hold the text
    # outside brackets, odd indexes hold the balanced [...] groups.
    segments = re.split(r'(\[[^\]]*\])', text)

    # Skip cells that contain IPA transcription characters outside brackets.
    if any(_IPA_RE.search(outside) for outside in segments[::2]):
        return text

    out = []
    for idx, segment in enumerate(segments):
        if idx % 2:
            # Bracket group: keep it byte-for-byte
            out.append(segment)
        else:
            out.append(
                _syllabify_segment(segment.replace('|', ''), hyph_de, hyph_en)
            )
    return ''.join(out)
def _iter_column_cells(zones_data):
    """Yield ``(cell, col_type)`` for each cell whose col_type starts with ``column_``."""
    for zone in zones_data:
        for cell in zone.get("cells") or []:
            # A missing or None col_type is treated as "not a content column"
            col_type = cell.get("col_type") or ""
            if col_type.startswith("column_"):
                yield cell, col_type


def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert pipe syllable dividers into dictionary cells.

    Every cell whose ``col_type`` starts with ``column_`` is passed through
    ``_syllabify_text`` and its ``text`` is replaced in place when the result
    differs.

    Pre-check: unless *force* is set, at least 1% of those cells must already
    contain ``|``; otherwise nothing is changed.  The ratio is computed over
    all content-column cells, independent of *col_filter*.
    NOTE(review): the original note says the primary guard
    (article_col_index) is checked at the call site; that is not visible here.

    Args:
        zones_data: Zone dicts, each optionally holding a ``cells`` list of
            cell dicts with ``col_type`` and ``text`` keys.
        img_bgr: Not used by this function.
        session_id: Only used in log messages.
        force: If True, skip the pipe-ratio pre-check and syllabify all
            content words regardless of whether the original has pipes.
        col_filter: If set, only process cells whose col_type is in this set.
            None means process all content columns.

    Returns:
        The number of cells modified (0 when pyphen is unavailable or the
        pre-check fails).
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed — skipping syllable insertion")
        return 0

    # Pre-check: pages with printed syllable dividers show OCR-detected pipes
    # in many cells; pages with (almost) none are skipped to avoid false
    # syllabification.
    if not force:
        column_cells = [cell for cell, _ in _iter_column_cells(zones_data)]
        if column_cells:
            # "or ''" keeps a None text from raising in the membership test,
            # matching the main loop below, which already skips empty text
            cells_with_pipes = sum(
                1 for cell in column_cells if "|" in (cell.get("text") or "")
            )
            pipe_ratio = cells_with_pipes / len(column_cells)
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion — "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    insertions = 0
    for cell, col_type in _iter_column_cells(zones_data):
        if col_filter is not None and col_type not in col_filter:
            continue
        text = cell.get("text", "")
        if not text:
            continue
        new_text = _syllabify_text(text, hyph_de, hyph_en)
        if new_text != text:
            cell["text"] = new_text
            insertions += 1

    if insertions:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, insertions,
        )
    return insertions