[split-required] Split remaining Python monoliths (Phase 1 continued)

klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 22:47:59 +02:00
parent 0b37c5e692
commit b2a0126f14
34 changed files with 9264 additions and 9164 deletions
--- a/klausur-service/backend/cv_cell_grid_merge.py
+++ b/klausur-service/backend/cv_cell_grid_merge.py
@@ -0,0 +1,235 @@
+"""
+Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
+
+Extracted from cv_cell_grid.py.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import re
+from typing import Any, Dict, List
+
+from cv_ocr_engines import _RE_ALPHA
+
+logger = logging.getLogger(__name__)
+
+# Regex: line starts with phonetic bracket content only (no real word before it)
+_PHONETIC_ONLY_RE = re.compile(
+    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
+)
+
+
+def _is_phonetic_only_text(text: str) -> bool:
+    """Check if text consists only of phonetic transcription.
+
+    Phonetic-only patterns:
+      ['mani serva]   ->  True
+      [dance]         ->  True
+      ["a:mand]       ->  True
+      almond ['a:mand] -> False (has real word before bracket)
+      Mandel           -> False
+    """
+    t = text.strip()
+    if not t:
+        return False
+    # Must contain at least one bracket
+    if '[' not in t and ']' not in t:
+        return False
+    # Remove all bracket content and surrounding punctuation/whitespace
+    without_brackets = re.sub(r"\[.*?\]", '', t)
+    without_brackets = re.sub(r"[\[\]'\"()\s]", '', without_brackets)
+    # If nothing meaningful remains, it's phonetic-only
+    alpha_remaining = ''.join(_RE_ALPHA.findall(without_brackets))
+    return len(alpha_remaining) < 2
+
+
+def _merge_phonetic_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows that contain only phonetic transcription into previous entry.
+
+    In dictionary pages, phonetic transcription sometimes wraps to the next
+    row.  E.g.:
+      Row 28: EN="it's a money-saver"  DE="es spart Kosten"
+      Row 29: EN="['mani serva]"       DE=""
+
+    Row 29 is phonetic-only -> merge into row 28's EN field.
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        # Check if this entry is phonetic-only (EN has only phonetics, DE empty)
+        if merged and _is_phonetic_only_text(en) and not de:
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            # Append phonetic to previous entry's EN
+            if prev_en:
+                prev['english'] = prev_en + ' ' + en
+            else:
+                prev['english'] = en
+            # If there was an example, append to previous too
+            if ex:
+                prev_ex = (prev.get('example') or '').strip()
+                prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+            logger.debug(
+                f"Merged phonetic row {entry.get('row_index')} "
+                f"into previous entry: {prev['english']!r}"
+            )
+            continue
+
+        merged.append(entry)
+
+    return merged
+
+
+def _merge_wrapped_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge rows where the primary column (EN) is empty -- cell wrap continuation.
+
+    In textbook vocabulary tables, columns are often narrow, so the author
+    wraps text within a cell. OCR treats each physical line as a separate row.
+    The key indicator: if the EN column is empty but DE/example have text,
+    this row is a continuation of the previous row's cells.
+
+    Example (original textbook has ONE row):
+      Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
+      Row 3: EN=""                DE="(bei)"                        EX="part in the concert."
+      -> Merged: EN="take part (in)" DE="teilnehmen (an), mitmachen (bei)" EX="..."
+
+    Also handles the reverse case: DE empty but EN has text (wrap in EN column).
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+        ex = (entry.get('example') or '').strip()
+
+        if not merged:
+            merged.append(entry)
+            continue
+
+        prev = merged[-1]
+        prev_en = (prev.get('english') or '').strip()
+        prev_de = (prev.get('german') or '').strip()
+        prev_ex = (prev.get('example') or '').strip()
+
+        # Case 1: EN is empty -> continuation of previous row
+        if not en and (de or ex) and prev_en:
+            if de:
+                if prev_de.endswith(','):
+                    sep = ' '
+                elif prev_de.endswith(('-', '(')):
+                    sep = ''
+                else:
+                    sep = ' '
+                prev['german'] = (prev_de + sep + de).strip()
+            if ex:
+                sep = ' ' if prev_ex else ''
+                prev['example'] = (prev_ex + sep + ex).strip()
+            logger.debug(
+                f"Merged wrapped row {entry.get('row_index')} into previous "
+                f"(empty EN): DE={prev['german']!r}, EX={prev.get('example', '')!r}"
+            )
+            continue
+
+        # Case 2: DE is empty, EN has text that looks like continuation
+        if en and not de and prev_de:
+            is_paren = en.startswith('(')
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            if (is_paren or starts_lower) and len(en.split()) < 5:
+                sep = ' ' if prev_en and not prev_en.endswith((',', '-', '(')) else ''
+                prev['english'] = (prev_en + sep + en).strip()
+                if ex:
+                    sep2 = ' ' if prev_ex else ''
+                    prev['example'] = (prev_ex + sep2 + ex).strip()
+                logger.debug(
+                    f"Merged wrapped row {entry.get('row_index')} into previous "
+                    f"(empty DE): EN={prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    if len(merged) < len(entries):
+        logger.info(
+            f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
+            f"continuation rows ({len(entries)} -> {len(merged)})"
+        )
+    return merged
+
+
+def _merge_continuation_rows(
+    entries: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Merge multi-line vocabulary entries where text wraps to the next row.
+
+    A row is a continuation of the previous entry when:
+    - EN has text, but DE is empty
+    - EN starts with a lowercase letter (not a new vocab entry)
+    - Previous entry's EN does NOT end with a sentence terminator (.!?)
+    - The continuation text has fewer than 4 words (not an example sentence)
+    - The row was not already merged as phonetic
+
+    Example:
+      Row 5: EN="to put up"       DE="aufstellen"
+      Row 6: EN="with sth."       DE=""
+      -> Merged: EN="to put up with sth."  DE="aufstellen"
+    """
+    if len(entries) < 2:
+        return entries
+
+    merged: List[Dict[str, Any]] = []
+    for entry in entries:
+        en = (entry.get('english') or '').strip()
+        de = (entry.get('german') or '').strip()
+
+        if merged and en and not de:
+            # Check: not phonetic (already handled)
+            if _is_phonetic_only_text(en):
+                merged.append(entry)
+                continue
+
+            # Check: starts with lowercase
+            first_alpha = next((c for c in en if c.isalpha()), '')
+            starts_lower = first_alpha and first_alpha.islower()
+
+            # Check: fewer than 4 words (not an example sentence)
+            word_count = len(en.split())
+            is_short = word_count < 4
+
+            # Check: previous entry doesn't end with sentence terminator
+            prev = merged[-1]
+            prev_en = (prev.get('english') or '').strip()
+            prev_ends_sentence = prev_en and prev_en[-1] in '.!?'
+
+            if starts_lower and is_short and not prev_ends_sentence:
+                # Merge into previous entry
+                prev['english'] = (prev_en + ' ' + en).strip()
+                # Merge example if present
+                ex = (entry.get('example') or '').strip()
+                if ex:
+                    prev_ex = (prev.get('example') or '').strip()
+                    prev['example'] = (prev_ex + ' ' + ex).strip() if prev_ex else ex
+                logger.debug(
+                    f"Merged continuation row {entry.get('row_index')} "
+                    f"into previous entry: {prev['english']!r}"
+                )
+                continue
+
+        merged.append(entry)
+
+    return merged