[split-required] Split final 43 files (500-668 LOC) to complete refactoring

klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00
parent 451365a312
commit bd4b956e3c
113 changed files with 13790 additions and 14148 deletions
--- a/klausur-service/backend/cv_syllable_merge.py
+++ b/klausur-service/backend/cv_syllable_merge.py
@@ -0,0 +1,300 @@
+"""
+Syllable Merge — word gap merging, syllabification, divider insertion.
+
+Extracted from cv_syllable_detect.py for modularity.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+from cv_syllable_core import (
+    _get_hyphenators,
+    _hyphenate_word,
+    _IPA_RE,
+    _STOP_WORDS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Matches every character that is not an ASCII letter, a German umlaut or a
+# sharp s (either case).  Substituting it away leaves only a token's letters.
+_NON_LETTER_RE = re.compile(
+    r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]'
+)
+
+
+def _merge_short_fragments(text: str, hyph_de, max_short: int) -> str:
+    """Join space-separated fragments that pyphen can hyphenate as one word.
+
+    Shared implementation behind ``_try_merge_pipe_gaps`` and
+    ``_try_merge_word_gaps``; the two differ only in ``max_short``.
+
+    The text is split on single spaces and scanned left to right.  Each
+    token is compared with the last token already emitted (which may itself
+    be the product of an earlier merge, so chains such as
+    "Ka bel jau" -> "Kabel jau" -> "Kabeljau" collapse step by step).
+    A pair is glued together only when all guards hold:
+
+    - the left token consists of letters only (no attached punctuation);
+      the right token may carry punctuation, which is kept verbatim
+    - both tokens contain at least one letter
+    - neither letter core is listed in ``_STOP_WORDS``
+    - the shorter letter core has at most ``max_short`` characters
+    - the combined letter cores have at least 4 characters
+    - ``hyph_de.inserted`` finds at least one hyphenation point in the
+      combined letter cores
+
+    Args:
+        text: Cell text to process.
+        hyph_de: Hyphenator object exposing ``inserted(word, hyphen=...)``.
+        max_short: Upper bound on the length of the shorter fragment.
+
+    Returns:
+        The text with accepted pairs joined; all other spacing is untouched.
+    """
+    parts = text.split(' ')
+    if len(parts) < 2:
+        return text
+
+    merged = [parts[0]]
+    for curr in parts[1:]:
+        prev = merged[-1]
+
+        # Letter-only cores used for the guards and the pyphen lookup.
+        prev_alpha = _NON_LETTER_RE.sub('', prev)
+        curr_alpha = _NON_LETTER_RE.sub('', curr)
+
+        candidate = (
+            prev == prev_alpha  # left token: pure letters (word start)
+            and prev_alpha and curr_alpha
+            and prev_alpha.lower() not in _STOP_WORDS
+            and curr_alpha.lower() not in _STOP_WORDS
+            and min(len(prev_alpha), len(curr_alpha)) <= max_short
+            and len(prev_alpha) + len(curr_alpha) >= 4
+        )
+
+        if candidate and '-' in hyph_de.inserted(
+            prev_alpha + curr_alpha, hyphen='-'
+        ):
+            # Collapse the space; punctuation on the right token survives.
+            merged[-1] = prev + curr
+        else:
+            merged.append(curr)
+
+    return ' '.join(merged)
+
+
+def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
+    """Merge fragments separated by single spaces where OCR split at a pipe.
+
+    Example: "Kaf fee" -> "Kaffee" (pyphen finds a hyphenation point in
+    the merged word).
+    Multi-step: "Ka bel jau" -> "Kabel jau" -> "Kabeljau".
+
+    Guards against false merges:
+    - The FIRST token must be pure alpha (word start -- no attached
+      punctuation)
+    - The second token may have trailing punctuation (comma, period) which
+      stays attached to the merged word: "Ka" + "fer," -> "Kafer,"
+    - Tokens listed in ``_STOP_WORDS`` are never merged
+    - The shorter fragment must have at most 3 alpha chars
+    - The merged word must have at least 4 alpha chars
+
+    Args:
+        text: Text whose pipe dividers have already been removed.
+        hyph_de: Hyphenator object exposing ``inserted(word, hyphen=...)``.
+
+    Returns:
+        The text with accepted fragment pairs joined.
+    """
+    return _merge_short_fragments(text, hyph_de, max_short=3)
+
+
+def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
+    """Merge OCR word-gap fragments in cell texts using pyphen validation.
+
+    OCR often splits words at syllable boundaries into separate word_boxes,
+    producing text like "zerknit tert" instead of "zerknittert".  This
+    function tries to merge adjacent fragments in every content cell and
+    writes the result back into ``cell["text"]`` in place.
+
+    More permissive than ``_try_merge_pipe_gaps`` (threshold 5 instead of 3)
+    but still guarded by the pyphen check and stop-word exclusion.
+
+    Args:
+        zones_data: Zone dicts; each may hold a ``"cells"`` list of cell
+            dicts with ``"col_type"`` and ``"text"`` entries.
+        session_id: Identifier used only in the log message.
+
+    Returns:
+        The number of cells modified (0 when no German hyphenator is
+        available).
+    """
+    hyph_de, _ = _get_hyphenators()
+    if hyph_de is None:
+        return 0
+
+    modified = 0
+    for zone in zones_data:
+        for cell in zone.get("cells", []):
+            # ``or ""`` keeps a stored None from raising on startswith().
+            col_type = cell.get("col_type") or ""
+            if not col_type.startswith("column_"):
+                continue
+
+            text = cell.get("text", "")
+            # Nothing to merge without at least one space.
+            if not text or " " not in text:
+                continue
+
+            # Skip cells with IPA characters outside [...] brackets.
+            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', text)):
+                continue
+
+            new_text = _try_merge_word_gaps(text, hyph_de)
+            if new_text != text:
+                cell["text"] = new_text
+                modified += 1
+
+    if modified:
+        logger.info(
+            "build-grid session %s: merged word gaps in %d cells",
+            session_id, modified,
+        )
+    return modified
+
+
+def _try_merge_word_gaps(text: str, hyph_de) -> str:
+    """Merge OCR word fragments with the relaxed threshold (max_short=5).
+
+    Applies the same guards as ``_try_merge_pipe_gaps`` but lets the
+    shorter fragment be up to 5 letters long instead of 3.  The merged word
+    must still yield a hyphenation point from ``hyph_de``.
+
+    Args:
+        text: Cell text to process.
+        hyph_de: Hyphenator object exposing ``inserted(word, hyphen=...)``.
+
+    Returns:
+        The text with accepted fragment pairs joined.
+    """
+    return _merge_short_fragments(text, hyph_de, max_short=5)
+
+
+def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
+    """Return *text* with every eligible word re-hyphenated.
+
+    Processing order:
+
+    1. Remove all existing ``|`` dividers.
+    2. Let ``_try_merge_pipe_gaps`` rejoin fragments split by single spaces.
+    3. Split into words and separators, then pass each word core of at
+       least 3 characters to ``_hyphenate_word``.
+    4. Keep a token unchanged when ``_hyphenate_word`` returns nothing.
+
+    Empty input, and text containing IPA characters outside ``[...]``
+    brackets, is returned untouched.
+    """
+    if not text:
+        return text
+
+    # Bracketed spans are ignored when looking for IPA characters.
+    outside_brackets = re.sub(r'\[[^\]]*\]', '', text)
+    if _IPA_RE.search(outside_brackets):
+        return text
+
+    # Steps 1 + 2: drop old dividers, then close OCR pipe gaps.
+    normalized = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)
+
+    # Step 3: the capturing group keeps the separators in the token list so
+    # the text can be reassembled without losing spacing or punctuation.
+    rendered_tokens = []
+    for piece in re.split(r'(\s+|[,;:]+\s*)', normalized):
+        rendered = piece
+        if piece and not re.match(r'^[\s,;:]+$', piece):
+            # Separate non-letter prefix / suffix from the word core.
+            parsed = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', piece)
+            if parsed:
+                prefix, core, suffix = parsed.groups()
+                # NOTE: this letter class omits U+1E9E, unlike the strip
+                # pattern above.
+                if len(core) >= 3 and re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', core):
+                    divided = _hyphenate_word(core, hyph_de, hyph_en)
+                    if divided:
+                        rendered = prefix + divided + suffix
+        rendered_tokens.append(rendered)
+
+    return ''.join(rendered_tokens)
+
+
+def insert_syllable_dividers(
+    zones_data: List[Dict],
+    img_bgr: np.ndarray,
+    session_id: str,
+    *,
+    force: bool = False,
+    col_filter: Optional[set] = None,
+) -> int:
+    """Insert pipe syllable dividers into dictionary cells.
+
+    For dictionary pages: process content column cells (``col_type``
+    starting with ``"column_"``), strip existing pipes, merge pipe-gap
+    spaces, and re-syllabify via ``_syllabify_text``.  Changed text is
+    written back into ``cell["text"]`` in place.
+
+    Pre-check (skipped when ``force`` is set): at least 1% of all content
+    cells must already contain ``|`` from OCR.  This guards against pages
+    with zero pipe characters.  The ratio is computed over every content
+    cell, independent of ``col_filter``.
+
+    Args:
+        zones_data: Zone dicts; each may hold a ``"cells"`` list of cell
+            dicts with ``"col_type"`` and ``"text"`` entries.
+        img_bgr: Not used by this function; kept in the signature for
+            existing callers.
+        session_id: Identifier used only in log messages.
+        force: If True, skip the pipe-ratio pre-check and syllabify all
+            content words regardless of whether the original has pipe
+            dividers.
+        col_filter: If set, only process cells whose col_type is in this
+            set.  None means process all content columns.
+
+    Returns:
+        The number of cells modified (0 when no German hyphenator is
+        available or the pre-check fails).
+    """
+    hyph_de, hyph_en = _get_hyphenators()
+    if hyph_de is None:
+        logger.warning("pyphen not installed -- skipping syllable insertion")
+        return 0
+
+    # Collect content cells once.  ``or`` fallbacks let a stored None in
+    # "cells" / "col_type" be skipped instead of raising.
+    content_cells = [
+        cell
+        for zone in zones_data
+        for cell in (zone.get("cells") or [])
+        if (cell.get("col_type") or "").startswith("column_")
+    ]
+
+    # Pre-check: share of content cells that already have | from OCR.
+    if not force and content_cells:
+        # ``or ""`` mirrors the main loop, which also tolerates text=None.
+        cells_with_pipes = sum(
+            "|" in (cell.get("text") or "") for cell in content_cells
+        )
+        pipe_ratio = cells_with_pipes / len(content_cells)
+        if pipe_ratio < 0.01:
+            logger.info(
+                "build-grid session %s: skipping syllable insertion -- "
+                "only %.1f%% of cells have existing pipes (need >=1%%)",
+                session_id, pipe_ratio * 100,
+            )
+            return 0
+
+    insertions = 0
+    for cell in content_cells:
+        if col_filter is not None and cell["col_type"] not in col_filter:
+            continue
+
+        text = cell.get("text") or ""
+        if not text:
+            continue
+
+        # In auto mode (force=False), only normalize cells that already
+        # have | from OCR (i.e. printed syllable dividers on the original
+        # scan).  Don't add new syllable marks to other words.
+        if not force and "|" not in text:
+            continue
+
+        new_text = _syllabify_text(text, hyph_de, hyph_en)
+        if new_text != text:
+            cell["text"] = new_text
+            insertions += 1
+
+    if insertions:
+        logger.info(
+            "build-grid session %s: syllable dividers inserted/normalized "
+            "in %d cells (pyphen)",
+            session_id, insertions,
+        )
+    return insertions