[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions
--- a/klausur-service/backend/cv_syllable_detect.py
+++ b/klausur-service/backend/cv_syllable_detect.py
@@ -1,532 +1,32 @@
 """
-Syllable divider insertion for dictionary pages.
+Syllable divider insertion for dictionary pages — barrel re-export.

-For confirmed dictionary pages (is_dictionary=True), processes all content
-column cells:
-  1. Strips existing | dividers for clean normalization
-  2. Merges pipe-gap spaces (where OCR split a word at a divider position)
-  3. Applies pyphen syllabification to each word >= 3 alpha chars (DE then EN)
-  4. Only modifies words that pyphen recognizes — garbled OCR stays as-is
-
-No CV gate needed — the dictionary detection confidence is sufficient.
-pyphen uses Hunspell/TeX hyphenation dictionaries and is very reliable.
+All implementation split into:
+  cv_syllable_core  — hyphenator init, word validation, pipe autocorrect
+  cv_syllable_merge — word gap merging, syllabification, divider insertion

 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """

-import logging
-import re
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-
-logger = logging.getLogger(__name__)
-
-# IPA/phonetic characters — skip cells containing these
-_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
-
-# Common German words that should NOT be merged with adjacent tokens.
-# These are function words that appear as standalone words between
-# headwords/definitions on dictionary pages.
-_STOP_WORDS = frozenset([
-    # Articles
-    'der', 'die', 'das', 'dem', 'den', 'des',
-    'ein', 'eine', 'einem', 'einen', 'einer',
-    # Pronouns
-    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
-    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
-    # Prepositions
-    'mit', 'von', 'zu', 'für', 'auf', 'in', 'an', 'um', 'am', 'im',
-    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', 'über', 'unter',
-    'zwischen', 'ohne', 'gegen',
-    # Conjunctions
-    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
-    # Adverbs
-    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
-    # Verbs
-    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
-    'sein', 'haben',
-    # Other
-    'kein', 'keine', 'keinem', 'keinen', 'keiner',
-])
-
-# Cached hyphenators
-_hyph_de = None
-_hyph_en = None
-
-# Cached spellchecker (for autocorrect_pipe_artifacts)
-_spell_de = None
-
-
-def _get_hyphenators():
-    """Lazy-load pyphen hyphenators (cached across calls)."""
-    global _hyph_de, _hyph_en
-    if _hyph_de is not None:
-        return _hyph_de, _hyph_en
-    try:
-        import pyphen
-    except ImportError:
-        return None, None
-    _hyph_de = pyphen.Pyphen(lang='de_DE')
-    _hyph_en = pyphen.Pyphen(lang='en_US')
-    return _hyph_de, _hyph_en
-
-
-def _get_spellchecker():
-    """Lazy-load German spellchecker (cached across calls)."""
-    global _spell_de
-    if _spell_de is not None:
-        return _spell_de
-    try:
-        from spellchecker import SpellChecker
-    except ImportError:
-        return None
-    _spell_de = SpellChecker(language='de')
-    return _spell_de
-
-
-def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
-    """Check whether pyphen recognises a word (DE or EN)."""
-    if len(word) < 2:
-        return False
-    return ('|' in hyph_de.inserted(word, hyphen='|')
-            or '|' in hyph_en.inserted(word, hyphen='|'))
-
-
-def _is_real_word(word: str) -> bool:
-    """Check whether spellchecker knows this word (case-insensitive)."""
-    spell = _get_spellchecker()
-    if spell is None:
-        return False
-    return word.lower() in spell
-
-
-def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
-    """Try to hyphenate a word using DE then EN dictionary.
-
-    Returns word with | separators, or None if not recognized.
-    """
-    hyph = hyph_de.inserted(word, hyphen='|')
-    if '|' in hyph:
-        return hyph
-    hyph = hyph_en.inserted(word, hyphen='|')
-    if '|' in hyph:
-        return hyph
-    return None
-
-
-def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
-    """Try to correct a word that has OCR pipe artifacts.
-
-    Printed syllable divider lines on dictionary pages confuse OCR:
-    the vertical stroke is often read as an extra character (commonly
-    ``l``, ``I``, ``1``, ``i``) adjacent to where the pipe appears.
-    Sometimes OCR reads one divider as ``|`` and another as a letter,
-    so the garbled character may be far from any detected pipe.
-
-    Uses ``spellchecker`` (frequency-based word list) for validation —
-    unlike pyphen which is a pattern-based hyphenator and accepts
-    nonsense strings like "Zeplpelin".
-
-    Strategy:
-        1. Strip ``|`` — if spellchecker knows the result, done.
-        2. Try deleting each pipe-like character (l, I, 1, i, t).
-           OCR inserts extra chars that resemble vertical strokes.
-        3. Fall back to spellchecker's own ``correction()`` method.
-        4. Preserve the original casing of the first letter.
-    """
-    stripped = word_with_pipes.replace('|', '')
-    if not stripped or len(stripped) < 3:
-        return stripped  # too short to validate
-
-    # Step 1: if the stripped word is already a real word, done
-    if _is_real_word(stripped):
-        return stripped
-
-    # Step 2: try deleting pipe-like characters (most likely artifacts)
-    _PIPE_LIKE = frozenset('lI1it')
-    for idx in range(len(stripped)):
-        if stripped[idx] not in _PIPE_LIKE:
-            continue
-        candidate = stripped[:idx] + stripped[idx + 1:]
-        if len(candidate) >= 3 and _is_real_word(candidate):
-            return candidate
-
-    # Step 3: use spellchecker's built-in correction
-    spell = _get_spellchecker()
-    if spell is not None:
-        suggestion = spell.correction(stripped.lower())
-        if suggestion and suggestion != stripped.lower():
-            # Preserve original first-letter case
-            if stripped[0].isupper():
-                suggestion = suggestion[0].upper() + suggestion[1:]
-            return suggestion
-
-    return None  # could not fix
-
-
-def autocorrect_pipe_artifacts(
-    zones_data: List[Dict], session_id: str,
-) -> int:
-    """Strip OCR pipe artifacts and correct garbled words in-place.
-
-    Printed syllable divider lines on dictionary scans are read by OCR
-    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
-    This function:
-
-    1. Strips ``|`` from every word in content cells.
-    2. Validates with spellchecker (real dictionary lookup).
-    3. If not recognised, tries deleting pipe-like characters or uses
-       spellchecker's correction (e.g. ``Zeplpelin`` → ``Zeppelin``).
-    4. Updates both word-box texts and cell text.
-
-    Returns the number of cells modified.
-    """
-    spell = _get_spellchecker()
-    if spell is None:
-        logger.warning("spellchecker not available — pipe autocorrect limited")
-        # Fall back: still strip pipes even without spellchecker
-        pass
-
-    modified = 0
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            ct = cell.get("col_type", "")
-            if not ct.startswith("column_"):
-                continue
-
-            cell_changed = False
-
-            # --- Fix word boxes ---
-            for wb in cell.get("word_boxes", []):
-                wb_text = wb.get("text", "")
-                if "|" not in wb_text:
-                    continue
-
-                # Separate trailing punctuation
-                m = re.match(
-                    r'^([^a-zA-ZäöüÄÖÜßẞ]*)'
-                    r'(.*?)'
-                    r'([^a-zA-ZäöüÄÖÜßẞ]*)$',
-                    wb_text,
-                )
-                if not m:
-                    continue
-                lead, core, trail = m.group(1), m.group(2), m.group(3)
-                if "|" not in core:
-                    continue
-
-                corrected = _autocorrect_piped_word(core)
-                if corrected is not None and corrected != core:
-                    wb["text"] = lead + corrected + trail
-                    cell_changed = True
-
-            # --- Rebuild cell text from word boxes ---
-            if cell_changed:
-                wbs = cell.get("word_boxes", [])
-                if wbs:
-                    cell["text"] = " ".join(
-                        (wb.get("text") or "") for wb in wbs
-                    )
-                modified += 1
-
-            # --- Fallback: strip residual | from cell text ---
-            # (covers cases where word_boxes don't exist or weren't fixed)
-            text = cell.get("text", "")
-            if "|" in text:
-                clean = text.replace("|", "")
-                if clean != text:
-                    cell["text"] = clean
-                    if not cell_changed:
-                        modified += 1
-
-    if modified:
-        logger.info(
-            "build-grid session %s: autocorrected pipe artifacts in %d cells",
-            session_id, modified,
-        )
-    return modified
-
-
-def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
-    """Merge fragments separated by single spaces where OCR split at a pipe.
-
-    Example: "Kaf fee" -> "Kaffee" (pyphen recognizes the merged word).
-    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
-
-    Guards against false merges:
-    - The FIRST token must be pure alpha (word start — no attached punctuation)
-    - The second token may have trailing punctuation (comma, period) which
-      stays attached to the merged word: "Kä" + "fer," -> "Käfer,"
-    - Common German function words (der, die, das, ...) are never merged
-    - At least one fragment must be very short (<=3 alpha chars)
-    """
-    parts = text.split(' ')
-    if len(parts) < 2:
-        return text
-
-    result = [parts[0]]
-    i = 1
-    while i < len(parts):
-        prev = result[-1]
-        curr = parts[i]
-
-        # Extract alpha-only core for lookup
-        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
-        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
-
-        # Guard 1: first token must be pure alpha (word-start fragment)
-        #          second token may have trailing punctuation
-        # Guard 2: neither alpha core can be a common German function word
-        # Guard 3: the shorter fragment must be <= 3 chars (pipe-gap signal)
-        # Guard 4: combined length must be >= 4
-        should_try = (
-            prev == prev_alpha  # first token: pure alpha (word start)
-            and prev_alpha and curr_alpha
-            and prev_alpha.lower() not in _STOP_WORDS
-            and curr_alpha.lower() not in _STOP_WORDS
-            and min(len(prev_alpha), len(curr_alpha)) <= 3
-            and len(prev_alpha) + len(curr_alpha) >= 4
-        )
-
-        if should_try:
-            merged_alpha = prev_alpha + curr_alpha
-            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
-            if '-' in hyph:
-                # pyphen recognizes merged word — collapse the space
-                result[-1] = prev + curr
-                i += 1
-                continue
-
-        result.append(curr)
-        i += 1
-
-    return ' '.join(result)
-
-
-def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
-    """Merge OCR word-gap fragments in cell texts using pyphen validation.
-
-    OCR often splits words at syllable boundaries into separate word_boxes,
-    producing text like "zerknit tert" instead of "zerknittert".  This
-    function tries to merge adjacent fragments in every content cell.
-
-    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
-    but still guarded by pyphen dictionary lookup and stop-word exclusion.
-
-    Returns the number of cells modified.
-    """
-    hyph_de, _ = _get_hyphenators()
-    if hyph_de is None:
-        return 0
-
-    modified = 0
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            ct = cell.get("col_type", "")
-            if not ct.startswith("column_"):
-                continue
-            text = cell.get("text", "")
-            if not text or " " not in text:
-                continue
-
-            # Skip IPA cells
-            text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
-            if _IPA_RE.search(text_no_brackets):
-                continue
-
-            new_text = _try_merge_word_gaps(text, hyph_de)
-            if new_text != text:
-                cell["text"] = new_text
-                modified += 1
-
-    if modified:
-        logger.info(
-            "build-grid session %s: merged word gaps in %d cells",
-            session_id, modified,
-        )
-    return modified
-
-
-def _try_merge_word_gaps(text: str, hyph_de) -> str:
-    """Merge OCR word fragments with relaxed threshold (max_short=5).
-
-    Similar to ``_try_merge_pipe_gaps`` but allows slightly longer fragments
-    (max_short=5 instead of 3).  Still requires pyphen to recognize the
-    merged word.
-    """
-    parts = text.split(' ')
-    if len(parts) < 2:
-        return text
-
-    result = [parts[0]]
-    i = 1
-    while i < len(parts):
-        prev = result[-1]
-        curr = parts[i]
-
-        prev_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', prev)
-        curr_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', curr)
-
-        should_try = (
-            prev == prev_alpha
-            and prev_alpha and curr_alpha
-            and prev_alpha.lower() not in _STOP_WORDS
-            and curr_alpha.lower() not in _STOP_WORDS
-            and min(len(prev_alpha), len(curr_alpha)) <= 5
-            and len(prev_alpha) + len(curr_alpha) >= 4
-        )
-
-        if should_try:
-            merged_alpha = prev_alpha + curr_alpha
-            hyph = hyph_de.inserted(merged_alpha, hyphen='-')
-            if '-' in hyph:
-                result[-1] = prev + curr
-                i += 1
-                continue
-
-        result.append(curr)
-        i += 1
-
-    return ' '.join(result)
-
-
-def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
-    """Syllabify all significant words in a text string.
-
-    1. Strip existing | dividers
-    2. Merge pipe-gap spaces where possible
-    3. Apply pyphen to each word >= 3 alphabetic chars
-    4. Words pyphen doesn't recognize stay as-is (no bad guesses)
-    """
-    if not text:
-        return text
-
-    # Skip cells that contain IPA transcription characters outside brackets.
-    # Bracket content like [bɪltʃøn] is programmatically inserted and should
-    # not block syllabification of the surrounding text.
-    text_no_brackets = re.sub(r'\[[^\]]*\]', '', text)
-    if _IPA_RE.search(text_no_brackets):
-        return text
-
-    # Phase 1: strip existing pipe dividers for clean normalization
-    clean = text.replace('|', '')
-
-    # Phase 2: merge pipe-gap spaces (OCR fragments from pipe splitting)
-    clean = _try_merge_pipe_gaps(clean, hyph_de)
-
-    # Phase 3: tokenize and syllabify each word
-    # Split on whitespace and comma/semicolon sequences, keeping separators
-    tokens = re.split(r'(\s+|[,;:]+\s*)', clean)
-
-    result = []
-    for tok in tokens:
-        if not tok or re.match(r'^[\s,;:]+$', tok):
-            result.append(tok)
-            continue
-
-        # Strip trailing/leading punctuation for pyphen lookup
-        m = re.match(r'^([^a-zA-ZäöüÄÖÜßẞ]*)(.*?)([^a-zA-ZäöüÄÖÜßẞ]*)$', tok)
-        if not m:
-            result.append(tok)
-            continue
-        lead, word, trail = m.group(1), m.group(2), m.group(3)
-
-        if len(word) < 3 or not re.search(r'[a-zA-ZäöüÄÖÜß]', word):
-            result.append(tok)
-            continue
-
-        hyph = _hyphenate_word(word, hyph_de, hyph_en)
-        if hyph:
-            result.append(lead + hyph + trail)
-        else:
-            result.append(tok)
-
-    return ''.join(result)
-
-
-def insert_syllable_dividers(
-    zones_data: List[Dict],
-    img_bgr: np.ndarray,
-    session_id: str,
-    *,
-    force: bool = False,
-    col_filter: Optional[set] = None,
-) -> int:
-    """Insert pipe syllable dividers into dictionary cells.
-
-    For dictionary pages: process all content column cells, strip existing
-    pipes, merge pipe-gap spaces, and re-syllabify using pyphen.
-
-    Pre-check: at least 1% of content cells must already contain ``|`` from
-    OCR.  This guards against pages with zero pipe characters (the primary
-    guard — article_col_index — is checked at the call site).
-
-    Args:
-        force: If True, skip the pipe-ratio pre-check and syllabify all
-            content words regardless of whether the original has pipe dividers.
-        col_filter: If set, only process cells whose col_type is in this set.
-            None means process all content columns.
-
-    Returns the number of cells modified.
-    """
-    hyph_de, hyph_en = _get_hyphenators()
-    if hyph_de is None:
-        logger.warning("pyphen not installed — skipping syllable insertion")
-        return 0
-
-    # Pre-check: count cells that already have | from OCR.
-    # Real dictionary pages with printed syllable dividers will have OCR-
-    # detected pipes in many cells.  Pages without syllable dividers will
-    # have zero — skip those to avoid false syllabification.
-    if not force:
-        total_col_cells = 0
-        cells_with_pipes = 0
-        for z in zones_data:
-            for cell in z.get("cells", []):
-                if cell.get("col_type", "").startswith("column_"):
-                    total_col_cells += 1
-                    if "|" in cell.get("text", ""):
-                        cells_with_pipes += 1
-
-        if total_col_cells > 0:
-            pipe_ratio = cells_with_pipes / total_col_cells
-            if pipe_ratio < 0.01:
-                logger.info(
-                    "build-grid session %s: skipping syllable insertion — "
-                    "only %.1f%% of cells have existing pipes (need >=1%%)",
-                    session_id, pipe_ratio * 100,
-                )
-                return 0
-
-    insertions = 0
-    for z in zones_data:
-        for cell in z.get("cells", []):
-            ct = cell.get("col_type", "")
-            if not ct.startswith("column_"):
-                continue
-            if col_filter is not None and ct not in col_filter:
-                continue
-            text = cell.get("text", "")
-            if not text:
-                continue
-
-            # In auto mode (force=False), only normalize cells that already
-            # have | from OCR (i.e. printed syllable dividers on the original
-            # scan).  Don't add new syllable marks to other words.
-            if not force and "|" not in text:
-                continue
-
-            new_text = _syllabify_text(text, hyph_de, hyph_en)
-            if new_text != text:
-                cell["text"] = new_text
-                insertions += 1
-
-    if insertions:
-        logger.info(
-            "build-grid session %s: syllable dividers inserted/normalized "
-            "in %d cells (pyphen)",
-            session_id, insertions,
-        )
-    return insertions
+# Core: init, validation, autocorrect
+from cv_syllable_core import (  # noqa: F401
+    _IPA_RE,
+    _STOP_WORDS,
+    _get_hyphenators,
+    _get_spellchecker,
+    _is_known_word,
+    _is_real_word,
+    _hyphenate_word,
+    _autocorrect_piped_word,
+    autocorrect_pipe_artifacts,
+)
+
+# Merge: gap merging, syllabify, insert
+from cv_syllable_merge import (  # noqa: F401
+    _try_merge_pipe_gaps,
+    merge_word_gaps_in_zones,
+    _try_merge_word_gaps,
+    _syllabify_text,
+    insert_syllable_dividers,
+)