Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files):
  grid/build/  — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -1,452 +1,4 @@
"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Optional

from grid_build_cell_ops import (
    _remove_bullets_and_artifacts,
    _remove_garbled_cells,
    _normalize_word_order,
    _enforce_max_columns,
)

logger = logging.getLogger(__name__)

def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.
    """
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    for z in zones_data:
        if z.get("zone_type") == "content":
            from collections import Counter as _Counter
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
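
Note: the debug log in the loop above reports cells-per-column as a sorted dict built from a Counter. A tiny standalone sketch of what that expression produces (toy data, not from the service):

    from collections import Counter

    cells = [{"col_index": 0}, {"col_index": 0}, {"col_index": 2}]
    cc = Counter(c.get("col_index") for c in cells)
    assert dict(sorted(cc.items())) == {0: 2, 2: 1}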

def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid."""
    from cv_layout import _score_dictionary_signals
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
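
Note: the per-zone results fold into a single winner by confidence. A minimal standalone sketch of that accumulation pattern (toy values, not from the service):

    detections = [
        {"is_dictionary": True, "confidence": 0.62},
        {"is_dictionary": True, "confidence": 0.87},
        {"is_dictionary": False, "confidence": 0.10},
    ]
    best = {"is_dictionary": False, "confidence": 0.0}  # same default as above
    for dd in detections:
        if dd["confidence"] > best["confidence"]:
            best = dd
    assert best["confidence"] == 0.87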

def _insert_syllable_dividers(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    session_id: str,
    syllable_mode: str,
    dict_detection: Dict[str, Any],
    en_col_type: Optional[str],
    all_content_cols: set,
    total_cols: int,
) -> int:
    """Insert syllable dividers for dictionary pages. Returns insertion count."""
    syllable_insertions = 0
    if syllable_mode == "none" or img_bgr is None:
        if syllable_mode == "none":
            for z in zones_data:
                for cell in z.get("cells", []):
                    t = cell.get("text", "")
                    if "|" in t:
                        cell["text"] = t.replace("|", "")
        return syllable_insertions

    _syllable_eligible = False
    if syllable_mode in ("all", "de", "en"):
        _syllable_eligible = True
    elif (dict_detection.get("is_dictionary")
            and dict_detection.get("article_col_index") is not None):
        _syllable_eligible = True

    _syllable_col_filter: Optional[set] = None
    if syllable_mode == "en":
        _syllable_col_filter = {en_col_type} if en_col_type else set()
    elif syllable_mode == "de":
        if en_col_type and total_cols >= 3:
            _syllable_col_filter = all_content_cols - {en_col_type}

    if _syllable_eligible:
        try:
            from cv_syllable_detect import insert_syllable_dividers
            force_syllables = (syllable_mode in ("all", "de", "en"))
            syllable_insertions = insert_syllable_dividers(
                zones_data, img_bgr, session_id,
                force=force_syllables,
                col_filter=_syllable_col_filter,
            )
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)

    return syllable_insertions
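
Note: the mode-to-column-filter derivation above is the subtle part. Here it is lifted into a standalone helper with the same semantics, as a runnable sketch for illustration (not part of the commit):

    from typing import Optional, Set

    def syllable_col_filter(
        mode: str,
        en_col_type: Optional[str],
        all_content_cols: Set[str],
        total_cols: int,
    ) -> Optional[Set[str]]:
        # None means "no filter": every content column is eligible.
        if mode == "en":
            return {en_col_type} if en_col_type else set()
        if mode == "de" and en_col_type and total_cols >= 3:
            return all_content_cols - {en_col_type}
        return None

    assert syllable_col_filter("en", "column_2", {"column_1", "column_2"}, 2) == {"column_2"}
    assert syllable_col_filter("de", "column_2", {"column_1", "column_2", "column_3"}, 3) == {"column_1", "column_3"}
    assert syllable_col_filter("de", None, {"column_1", "column_2"}, 2) is None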

def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
) -> None:
    """Split merged words using dictionary lookup."""
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                    suffix = contraction + suffix
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        pass
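
Note: the token handling above peels suffixes in a fixed order: IPA bracket first, then trailing punctuation, then a contraction. A pure-string sketch of that order, runnable on its own (illustrative only, mirrors the logic without the dictionary lookup):

    def peel(token: str):
        # Same peeling order as _split_merged_words above.
        clean, suffix_ipa = token, ""
        bp = clean.find('[')
        if bp > 0:
            clean, suffix_ipa = clean[:bp], clean[bp:]
        stripped = clean.rstrip(".,!?;:'\")")
        suffix_punct = clean[len(stripped):]
        clean = stripped
        suffix = suffix_punct + suffix_ipa
        if "'" in clean and clean.index("'") >= 2:
            ap = clean.index("'")
            suffix = clean[ap:] + suffix
            clean = clean[:ap]
        return clean, suffix

    assert peel("doesn't") == ("doesn", "'t")
    assert peel("word[vɜːd]") == ("word", "[vɜːd]")
    assert peel("Haus,") == ("Haus", ",")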

def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
    """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if text and "[" in text:
                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
                if fixed != text:
                    cell["text"] = fixed

def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all cells."""
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0

        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue

                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                elif total_cols <= 2:
                    lang = "auto"
                else:
                    lang = "auto"

                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1

        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)

def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Remove columns that have no cells assigned."""
    for z in zones_data:
        cells = z.get("cells", [])
        used_col_indices = {c.get("col_index") for c in cells}
        old_cols = z.get("columns", [])
        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
        if len(new_cols) < len(old_cols):
            old_to_new = {}
            for new_i, col in enumerate(new_cols):
                old_i = col.get("col_index", col.get("index", new_i))
                old_to_new[old_i] = new_i
                col["col_index"] = new_i
                col["index"] = new_i
                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
            for cell in cells:
                old_ci = cell.get("col_index", 0)
                cell["col_index"] = old_to_new.get(old_ci, old_ci)
                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
            z["columns"] = new_cols

def _assemble_result(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    session_id: str,
    ipa_mode: str,
    syllable_mode: str,
    ipa_target_cols: set,
    skip_ipa: bool,
    dict_detection: Dict[str, Any],
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
    syllable_insertions: int,
) -> dict:
    """Build the final result dict (Phase 6)."""
    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
    total_rows = sum(len(z.get("rows", [])) for z in zones_data)

    # Collect color statistics
    color_stats: Dict[str, int] = {}
    for z in zones_data:
        for cell in z.get("cells", []):
            for wb in cell.get("word_boxes", []):
                cn = wb.get("color_name", "black")
                color_stats[cn] = color_stats.get(cn, 0) + 1

    # Compute layout metrics
    all_content_row_heights: List[float] = []
    for z in zones_data:
        for row in z.get("rows", []):
            if not row.get("is_header", False):
                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
                if h > 0:
                    all_content_row_heights.append(h)
    avg_row_height = (
        sum(all_content_row_heights) / len(all_content_row_heights)
        if all_content_row_heights else 30.0
    )
    font_size_suggestion = max(10, int(avg_row_height * 0.6))

    return {
        "session_id": session_id,
        "image_width": img_w,
        "image_height": img_h,
        "zones": zones_data,
        "boxes_detected": boxes_detected,
        "summary": {
            "total_zones": len(zones_data),
            "total_columns": total_columns,
            "total_rows": total_rows,
            "total_cells": total_cells,
            "total_words": len(all_words),
            "recovered_colored": recovered_count,
            "color_stats": color_stats,
        },
        "formatting": {
            "bold_columns": [],
            "header_rows": [],
        },
        "layout_metrics": {
            "page_width_px": img_w,
            "page_height_px": img_h,
            "avg_row_height_px": round(avg_row_height, 1),
            "font_size_suggestion_px": font_size_suggestion,
        },
        "dictionary_detection": {
            "is_dictionary": dict_detection.get("is_dictionary", False),
            "confidence": dict_detection.get("confidence", 0.0),
            "signals": dict_detection.get("signals", {}),
            "article_col_index": dict_detection.get("article_col_index"),
            "headword_col_index": dict_detection.get("headword_col_index"),
        },
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
            "syllables_applied": syllable_insertions > 0,
        },
        "page_number": page_number_info,
        "duration_seconds": round(duration, 2),
    }
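
Note: worked example of the font-size heuristic in layout_metrics: an average content-row height of 40 px suggests int(40 * 0.6) = 24 px, and very short rows are floored at 10 px:

    assert max(10, int(40 * 0.6)) == 24
    assert max(10, int(12 * 0.6)) == 10  # 7.2 px would be unreadably small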

# Backward-compat shim -- module moved to grid/build/finalize.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("grid.build.finalize")
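
Note: this sys.modules swap is what makes the 26 shims mentioned in the commit message work: executing the old flat module replaces its own entry in sys.modules with the relocated package module, so the import system hands callers the relocated module under the old name. A minimal sketch of what callers observe, assuming the package layout named in this commit:

    import grid_build_finalize        # runs the shim above
    from grid.build import finalize   # the relocated module

    # Both names now refer to one and the same module object.
    assert grid_build_finalize is finalize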