Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files): grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize grid/editor/ — api, helpers, columns, filters, headers, zones vocab/ package (10 files): vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare vocab/ — session_store, learn_bridge 26 backward-compat shims. Internal imports relative. RAG untouched. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
213
klausur-service/backend/grid/build/core.py
Normal file
213
klausur-service/backend/grid/build/core.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Grid Build Core — the main _build_grid_core() function.
|
||||
|
||||
Extracted from grid_editor_api.py for maintainability.
|
||||
Takes merged OCR word positions and builds a structured, zone-aware grid.
|
||||
|
||||
The function delegates to phase-specific modules:
|
||||
- grid_build_zones.py — image loading, graphic/box detection, zone grids
|
||||
- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips
|
||||
- grid_build_text_ops.py — color, headings, IPA, page refs
|
||||
- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from grid.editor.filters import (
|
||||
_flatten_word_boxes,
|
||||
_get_content_bounds,
|
||||
_filter_decorative_margin,
|
||||
_filter_footer_words,
|
||||
_filter_header_junk,
|
||||
)
|
||||
|
||||
from .zones import _build_zones
|
||||
from .cleanup import _cleanup_zones
|
||||
from .text_ops import _process_text
|
||||
from .finalize import _finalize_grid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.

    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db().
        ipa_mode: "auto" (only when English headwords detected), "all"
            (force IPA on all content columns), "en" (English column only),
            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words), "en" (English only),
            "de" (German only), or "none" (skip).
        enhance: Only appears in this function's log output; presumably
            consumed by a downstream stage — TODO confirm.
        max_columns: Optional column cap, forwarded to _finalize_grid().
        min_conf: Optional minimum OCR confidence; words whose "conf"
            (default 100 when absent) is below this are dropped up front.

    Returns:
        StructuredGrid result dict.

    Raises:
        ValueError: If session data is incomplete.
    """
    t0 = time.time()

    # ── Phase 1: Input Validation & Word Filtering ──────────────────

    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")

    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")

    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")

    # 2a-pre. Apply min_conf filter if specified
    # (missing "conf" keys are treated as fully confident via the default 100)
    if min_conf and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)

    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]),
                enhance, max_columns, min_conf)

    # 2b. Filter decorative margin columns (alphabet graphics)
    # NOTE(review): presumably filters all_words in place and returns an
    # info dict — only the "found" flag is consumed here; TODO confirm.
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)

    # Read document_category from session
    document_category = session.get("document_category")

    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)

    # 2c2. Filter OCR junk from header illustrations
    # (return value unused — presumably mutates all_words in place; TODO confirm)
    _filter_header_junk(all_words, img_h, logger, session_id)

    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = []
    if structure_result:
        for er in structure_result.get("exclude_regions", []):
            exclude_rects.append({
                "x": er["x"], "y": er["y"],
                "w": er["w"], "h": er["h"],
            })
    if exclude_rects:
        before = len(all_words)
        filtered = []
        for w in all_words:
            # A word is excluded when its center point lies inside any region.
            w_cx = w["left"] + w.get("width", 0) / 2
            w_cy = w["top"] + w.get("height", 0) / 2
            inside = any(
                er["x"] <= w_cx <= er["x"] + er["w"]
                and er["y"] <= w_cy <= er["y"] + er["h"]
                for er in exclude_rects
            )
            if not inside:
                filtered.append(w)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )

    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = []
    if structure_result:
        for g in structure_result.get("graphics", []):
            graphic_rects.append({
                "x": g["x"], "y": g["y"],
                "w": g["w"], "h": g["h"],
            })
    if graphic_rects:
        before = len(all_words)
        # Same center-point-in-rect test as the user exclude regions above.
        all_words = [
            w for w in all_words
            if not any(
                gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                for gr in graphic_rects
            )
        ]
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )

    # Bounding box of the surviving words, fed into zone detection.
    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)

    # ── Phase 2: Image Processing & Zone Detection ──────────────────

    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h,
        img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]

    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────

    # NOTE(review): border_prefiltered is reassigned here but not read again
    # anywhere below — confirm the updated count is intentionally discarded.
    border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)

    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────

    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )

    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────

    duration = time.time() - t0

    result = _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
    )

    return result
|
||||
Reference in New Issue
Block a user