Files
breakpilot-lehrer/klausur-service/backend/cv_syllable_core.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

232 lines
7.6 KiB
Python

"""
Syllable Core — hyphenator init, word validation, pipe autocorrect.
Extracted from cv_syllable_detect.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# Square brackets plus IPA stress/length marks and phonetic letters.
# Cells containing any of these are meant to be skipped.
_IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]')

# Frequent German function words that must NOT be merged with neighbouring
# tokens.
_STOP_WORDS = frozenset({
    # Articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
    # Prepositions
    'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter',
    'zwischen', 'ohne', 'gegen',
    # Conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # Adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # Verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # Other
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
})

# Lazily populated module-level caches (None until first successful load).
# Hyphenators are filled in by _get_hyphenators().
_hyph_de = None
_hyph_en = None
# Spellchecker used by the pipe autocorrect; filled in by _get_spellchecker().
_spell_de = None
def _get_hyphenators():
    """Return the cached German and English pyphen hyphenators.

    The hyphenators are created on first use and stored in the module-level
    ``_hyph_de`` / ``_hyph_en`` caches so later calls are cheap.

    Returns:
        A ``(hyph_de, hyph_en)`` tuple of ``pyphen.Pyphen`` instances
        (``de_DE`` and ``en_US``), or ``(None, None)`` when the ``pyphen``
        package cannot be imported.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is not None and _hyph_en is not None:
        return _hyph_de, _hyph_en
    try:
        import pyphen
    except ImportError:
        return None, None
    # Build both hyphenators before caching either one.  If constructing the
    # second one raises, nothing is cached, so a later call can never return
    # a half-initialised ``(hyph_de, None)`` pair.
    hyph_de = pyphen.Pyphen(lang='de_DE')
    hyph_en = pyphen.Pyphen(lang='en_US')
    _hyph_de, _hyph_en = hyph_de, hyph_en
    return hyph_de, hyph_en
def _get_spellchecker():
    """Return the shared German ``SpellChecker``, creating it on first use.

    The instance is kept in the module-level ``_spell_de`` cache.  Returns
    ``None`` when the ``spellchecker`` package cannot be imported.
    """
    global _spell_de
    if _spell_de is None:
        try:
            from spellchecker import SpellChecker
        except ImportError:
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Tell whether the German or English hyphenator can split *word*.

    A word counts as known when ``inserted(word, hyphen='|')`` of either
    hyphenator (German is tried first) yields a string containing ``|``.
    Words shorter than two characters are never considered known.
    """
    if len(word) < 2:
        return False
    for hyphenator in (hyph_de, hyph_en):
        if '|' in hyphenator.inserted(word, hyphen='|'):
            return True
    return False
def _is_real_word(word: str) -> bool:
    """Report whether the spellchecker contains *word* (lower-cased lookup).

    Returns ``False`` when no spellchecker is available.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Hyphenate *word* with the German hyphenator, then the English one.

    Returns the first result that contains a ``|`` separator, or ``None``
    when neither hyphenator inserts one.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    """Repair a single word that contains OCR pipe artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR: the
    vertical stroke is frequently read as an extra character next to the
    pipe.  Candidates are validated against the ``spellchecker`` word list.

    Steps, in order:
        1. Remove every ``|``.  Results shorter than three characters are
           returned as-is (too short to validate); a result the
           spellchecker knows is returned unchanged.
        2. Delete one pipe-like character (``l``, ``I``, ``1``, ``i``,
           ``t``) at a time, left to right, and return the first candidate
           of at least three characters that the spellchecker knows.
        3. Ask the spellchecker for a correction of the lower-cased word;
           if it proposes a different word, return it with the original
           first-letter capitalisation restored.

    Returns:
        The repaired word, or ``None`` when no fix was found.
    """
    bare = word_with_pipes.replace('|', '')
    if len(bare) < 3:
        return bare  # too short to validate

    # Step 1: pipe-free form is already a dictionary word
    if _is_real_word(bare):
        return bare

    # Step 2: drop one stroke-like character at a time
    pipe_like = frozenset('lI1it')
    for pos, char in enumerate(bare):
        if char in pipe_like:
            shortened = bare[:pos] + bare[pos + 1:]
            if len(shortened) >= 3 and _is_real_word(shortened):
                return shortened

    # Step 3: fall back to the spellchecker's own suggestion
    checker = _get_spellchecker()
    if checker is None:
        return None  # could not fix
    lowered = bare.lower()
    guess = checker.correction(lowered)
    if not guess or guess == lowered:
        return None  # could not fix
    if bare[0].isupper():
        # Keep the capitalisation of the original first letter
        guess = guess[0].upper() + guess[1:]
    return guess
# Splits a token into leading non-letters, core, and trailing non-letters
# (letters = ASCII a-z/A-Z plus German umlauts and sharp s).
_PIPED_TOKEN_RE = re.compile(
    r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)'
    r'(.*?)'
    r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$'
)


def _repair_piped_token(token: str) -> Tuple[str, bool]:
    """Return ``(pipe_free_text, was_corrected)`` for one word-box text.

    ``was_corrected`` is True when the word core was repaired by
    ``_autocorrect_piped_word``; otherwise the pipes are merely removed.
    The returned text never contains ``|``.
    """
    match = _PIPED_TOKEN_RE.match(token)
    if match:
        lead, core, trail = match.groups()
        if "|" in core:
            corrected = _autocorrect_piped_word(core)
            if corrected is not None and corrected != core:
                # Punctuation around the core may carry stray pipes as well.
                return (lead + corrected + trail).replace("|", ""), True
    # No usable core or no validated correction: still drop the artifacts.
    return token.replace("|", ""), False


def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).

    For every cell whose ``col_type`` starts with ``column_``:
        1. Each word box containing ``|`` is repaired: the word core is
           validated/corrected via ``_autocorrect_piped_word`` (e.g.
           ``Zeplpelin`` -> ``Zeppelin``); if that fails, the pipes are
           simply removed so word boxes never keep the artifact.
        2. If any word was corrected, the cell text is rebuilt by joining
           the word-box texts with single spaces.
        3. Any ``|`` remaining in the cell text is removed.

    Args:
        zones_data: Zone dicts; each may hold a ``cells`` list of cell dicts
            with ``col_type``, ``text`` and ``word_boxes`` entries.  The
            dicts are mutated in place.  Missing or ``None`` entries are
            treated as empty.
        session_id: Identifier used only in the summary log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # Pipes are still stripped below; only dictionary-based validation
        # and correction are unavailable.
        logger.warning("spellchecker not available -- pipe autocorrect limited")

    modified = 0
    for zone in zones_data:
        for cell in zone.get("cells") or []:
            col_type = cell.get("col_type") or ""
            if not col_type.startswith("column_"):
                continue

            word_boxes = cell.get("word_boxes") or []
            corrected_any = False
            changed = False

            # --- Fix word boxes ---
            for wb in word_boxes:
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue
                wb["text"], was_corrected = _repair_piped_token(wb_text)
                corrected_any = corrected_any or was_corrected
                changed = True

            # --- Rebuild cell text from word boxes after a real correction ---
            if corrected_any:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )

            # --- Fallback: strip residual | from cell text ---
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                changed = True

            if changed:
                modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified