klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
232 lines
7.6 KiB
Python
232 lines
7.6 KiB
Python
"""
Syllable Core — hyphenator init, word validation, pipe autocorrect.

Extracted from cv_syllable_detect.py for modularity.

Lizenz: Apache 2.0 (kommerziell nutzbar)

DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# IPA/phonetic characters -- skip cells containing these
|
|
_IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]')
|
|
|
|
# Common German words that should NOT be merged with adjacent tokens.
|
|
_STOP_WORDS = frozenset([
|
|
# Articles
|
|
'der', 'die', 'das', 'dem', 'den', 'des',
|
|
'ein', 'eine', 'einem', 'einen', 'einer',
|
|
# Pronouns
|
|
'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
|
|
'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
|
|
# Prepositions
|
|
'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im',
|
|
'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter',
|
|
'zwischen', 'ohne', 'gegen',
|
|
# Conjunctions
|
|
'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
|
|
# Adverbs
|
|
'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
|
|
# Verbs
|
|
'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
|
|
'sein', 'haben',
|
|
# Other
|
|
'kein', 'keine', 'keinem', 'keinen', 'keiner',
|
|
])
|
|
|
|
# Cached hyphenators
|
|
_hyph_de = None
|
|
_hyph_en = None
|
|
|
|
# Cached spellchecker (for autocorrect_pipe_artifacts)
|
|
_spell_de = None
|
|
|
|
|
|
def _get_hyphenators():
|
|
"""Lazy-load pyphen hyphenators (cached across calls)."""
|
|
global _hyph_de, _hyph_en
|
|
if _hyph_de is not None:
|
|
return _hyph_de, _hyph_en
|
|
try:
|
|
import pyphen
|
|
except ImportError:
|
|
return None, None
|
|
_hyph_de = pyphen.Pyphen(lang='de_DE')
|
|
_hyph_en = pyphen.Pyphen(lang='en_US')
|
|
return _hyph_de, _hyph_en
|
|
|
|
|
|
def _get_spellchecker():
|
|
"""Lazy-load German spellchecker (cached across calls)."""
|
|
global _spell_de
|
|
if _spell_de is not None:
|
|
return _spell_de
|
|
try:
|
|
from spellchecker import SpellChecker
|
|
except ImportError:
|
|
return None
|
|
_spell_de = SpellChecker(language='de')
|
|
return _spell_de
|
|
|
|
|
|
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
|
|
"""Check whether pyphen recognises a word (DE or EN)."""
|
|
if len(word) < 2:
|
|
return False
|
|
return ('|' in hyph_de.inserted(word, hyphen='|')
|
|
or '|' in hyph_en.inserted(word, hyphen='|'))
|
|
|
|
|
|
def _is_real_word(word: str) -> bool:
    """Return True when the cached spellchecker knows *word* (case-insensitive).

    Returns False when the spellchecker package is unavailable.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
|
|
|
|
|
|
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
|
|
"""Try to hyphenate a word using DE then EN dictionary.
|
|
|
|
Returns word with | separators, or None if not recognized.
|
|
"""
|
|
hyph = hyph_de.inserted(word, hyphen='|')
|
|
if '|' in hyph:
|
|
return hyph
|
|
hyph = hyph_en.inserted(word, hyphen='|')
|
|
if '|' in hyph:
|
|
return hyph
|
|
return None
|
|
|
|
|
|
# Characters OCR commonly produces when it misreads the printed vertical
# divider stroke. Hoisted to module level so the set is built once, not on
# every call.
_PIPE_LIKE = frozenset('lI1it')


def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    """Try to correct a word that has OCR pipe artifacts.

    Printed syllable divider lines on dictionary pages confuse OCR:
    the vertical stroke is often read as an extra character (commonly
    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.

    Uses ``spellchecker`` (frequency-based word list) for validation.

    Strategy:
    1. Strip ``|`` -- if spellchecker knows the result, done.
    2. Try deleting each pipe-like character (l, I, 1, i, t).
    3. Fall back to spellchecker's own ``correction()`` method,
       preserving the original casing of the first letter.

    Args:
        word_with_pipes: Raw OCR token that may contain ``|`` characters.

    Returns:
        The corrected word; the merely pipe-stripped word when it is too
        short to validate (< 3 chars, including empty); or None when no
        fix was found.
    """
    stripped = word_with_pipes.replace('|', '')
    # len < 3 also covers the empty string, so no separate truthiness check.
    if len(stripped) < 3:
        return stripped  # too short to validate

    # Step 1: if the stripped word is already a real word, done.
    if _is_real_word(stripped):
        return stripped

    # Step 2: try deleting pipe-like characters (most likely artifacts).
    # Each candidate is re-validated, so legitimate letters are never
    # removed unless the result is itself a dictionary word.
    for idx, ch in enumerate(stripped):
        if ch not in _PIPE_LIKE:
            continue
        candidate = stripped[:idx] + stripped[idx + 1:]
        if len(candidate) >= 3 and _is_real_word(candidate):
            return candidate

    # Step 3: use spellchecker's built-in correction.
    spell = _get_spellchecker()
    if spell is not None:
        suggestion = spell.correction(stripped.lower())
        if suggestion and suggestion != stripped.lower():
            # Preserve original first-letter case (correction() lowercases).
            if stripped[0].isupper():
                suggestion = suggestion[0].upper() + suggestion[1:]
            return suggestion

    return None  # could not fix
|
|
|
|
|
|
# Splits a token into (leading non-letters, word core, trailing non-letters).
# German letters (umlauts, eszett, capital eszett) count as word characters.
# Hoisted to module level so the pattern is compiled/looked up once rather
# than per word box.
_TOKEN_SPLIT_RE = re.compile(
    r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)'
    r'(.*?)'
    r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$'
)


def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    This function:

    1. Strips ``|`` from every word in content cells.
    2. Validates with spellchecker (real dictionary lookup).
    3. If not recognised, tries deleting pipe-like characters or uses
       spellchecker's correction (e.g. ``Zeplpelin`` -> ``Zeppelin``).
    4. Updates both word-box texts and cell text.

    When the spellchecker is unavailable, word boxes are left as-is, but
    the plain pipe-stripping fallback on cell text still runs.

    Args:
        zones_data: Zone dicts carrying ``cells`` lists; each cell may have
            ``col_type``, ``text`` and ``word_boxes``. Modified in place.
        session_id: Identifier used only for logging.

    Returns:
        The number of cells modified.
    """
    spell = _get_spellchecker()
    if spell is None:
        logger.warning("spellchecker not available -- pipe autocorrect limited")

    modified = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue  # only content columns carry dictionary words

            cell_changed = False

            # --- Fix word boxes ---
            for wb in cell.get("word_boxes", []):
                wb_text = wb.get("text", "")
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing punctuation from the word core.
                m = _TOKEN_SPLIT_RE.match(wb_text)
                if not m:
                    # Only possible for texts with embedded newlines, which
                    # the pattern cannot span.
                    continue
                lead, core, trail = m.group(1), m.group(2), m.group(3)
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                wbs = cell.get("word_boxes", [])
                if wbs:
                    cell["text"] = " ".join(
                        (wb.get("text") or "") for wb in wbs
                    )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            # ("|" in text) guarantees replace() changes the string, so no
            # extra equality check is needed.
            text = cell.get("text", "")
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|