[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
389
klausur-service/backend/grid_editor_zones.py
Normal file
389
klausur-service/backend/grid_editor_zones.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
||||
|
||||
Split from grid_editor_helpers.py for maintainability.
|
||||
All functions are pure computation — no HTTP, DB, or session side effects.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import PageZone
|
||||
from cv_words_first import _cluster_rows, _build_cells
|
||||
|
||||
from grid_editor_columns import (
|
||||
_cluster_columns_by_alignment,
|
||||
_merge_inline_marker_columns,
|
||||
_split_cross_column_words,
|
||||
)
|
||||
from grid_editor_headers import (
|
||||
_detect_header_rows,
|
||||
_detect_colspan_cells,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vertical divider detection and zone splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
||||
|
||||
|
||||
def _detect_vertical_dividers(
|
||||
words: List[Dict],
|
||||
zone_x: int,
|
||||
zone_w: int,
|
||||
zone_y: int,
|
||||
zone_h: int,
|
||||
) -> List[float]:
|
||||
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
||||
|
||||
Returns list of divider x-positions (empty if no dividers found).
|
||||
"""
|
||||
if not words or zone_w <= 0 or zone_h <= 0:
|
||||
return []
|
||||
|
||||
# Collect pipe word_boxes
|
||||
pipes = [
|
||||
w for w in words
|
||||
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
||||
]
|
||||
if len(pipes) < 5:
|
||||
return []
|
||||
|
||||
# Cluster pipe x-centers by proximity
|
||||
tolerance = max(15, int(zone_w * 0.02))
|
||||
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
||||
|
||||
clusters: List[List[float]] = [[pipe_xs[0]]]
|
||||
for x in pipe_xs[1:]:
|
||||
if x - clusters[-1][-1] <= tolerance:
|
||||
clusters[-1].append(x)
|
||||
else:
|
||||
clusters.append([x])
|
||||
|
||||
dividers: List[float] = []
|
||||
for cluster in clusters:
|
||||
if len(cluster) < 5:
|
||||
continue
|
||||
mean_x = sum(cluster) / len(cluster)
|
||||
# Must be between 15% and 85% of zone width
|
||||
rel_pos = (mean_x - zone_x) / zone_w
|
||||
if rel_pos < 0.15 or rel_pos > 0.85:
|
||||
continue
|
||||
# Check vertical coverage: pipes must span >= 50% of zone height
|
||||
cluster_pipes = [
|
||||
w for w in pipes
|
||||
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
||||
]
|
||||
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
||||
y_span = max(ys) - min(ys) if ys else 0
|
||||
if y_span < zone_h * 0.5:
|
||||
continue
|
||||
dividers.append(mean_x)
|
||||
|
||||
return sorted(dividers)
|
||||
|
||||
|
||||
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Split a PageZone at vertical divider positions into sub-zones.

    Each sub-zone keeps the parent's vertical extent, type, box, and overlays;
    only x/width change. Sub-zones are tagged with a layout hint
    (left/middle/right of the split) and the shared vsplit group id.
    """
    edges = [zone.x, *divider_xs, zone.x + zone.width]
    n_subs = len(edges) - 1

    def _hint(pos: int) -> str:
        # Position 0 wins over "rightmost" when there is only one sub-zone,
        # matching the original evaluation order.
        if pos == 0:
            return "left_of_vsplit"
        if pos == n_subs - 1:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    pieces: List["PageZone"] = []
    for pos in range(n_subs):
        left_edge = int(edges[pos])
        right_edge = int(edges[pos + 1])
        pieces.append(PageZone(
            index=0,  # re-indexed later by the caller
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=left_edge,
            width=right_edge - left_edge,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=_hint(pos),
            vsplit_group=vsplit_group_id,
        ))

    return pieces
|
||||
|
||||
|
||||
def _merge_content_zones_across_boxes(
|
||||
zones: List,
|
||||
content_x: int,
|
||||
content_w: int,
|
||||
) -> List:
|
||||
"""Merge content zones separated by box zones into single zones.
|
||||
|
||||
Box zones become image_overlays on the merged content zone.
|
||||
Pattern: [content, box*, content] -> [merged_content with overlay]
|
||||
Box zones NOT between two content zones stay as standalone zones.
|
||||
"""
|
||||
if len(zones) < 3:
|
||||
return zones
|
||||
|
||||
# Group consecutive runs of [content, box+, content]
|
||||
result: List = []
|
||||
i = 0
|
||||
while i < len(zones):
|
||||
z = zones[i]
|
||||
if z.zone_type != "content":
|
||||
result.append(z)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Start of a potential merge group: content zone
|
||||
group_contents = [z]
|
||||
group_boxes = []
|
||||
j = i + 1
|
||||
# Absorb [box, content] pairs -- only absorb a box if it's
|
||||
# confirmed to be followed by another content zone.
|
||||
while j < len(zones):
|
||||
if (zones[j].zone_type == "box"
|
||||
and j + 1 < len(zones)
|
||||
and zones[j + 1].zone_type == "content"):
|
||||
group_boxes.append(zones[j])
|
||||
group_contents.append(zones[j + 1])
|
||||
j += 2
|
||||
else:
|
||||
break
|
||||
|
||||
if len(group_contents) >= 2 and group_boxes:
|
||||
# Merge: create one large content zone spanning all
|
||||
y_min = min(c.y for c in group_contents)
|
||||
y_max = max(c.y + c.height for c in group_contents)
|
||||
overlays = []
|
||||
for bz in group_boxes:
|
||||
overlay = {
|
||||
"y": bz.y,
|
||||
"height": bz.height,
|
||||
"x": bz.x,
|
||||
"width": bz.width,
|
||||
}
|
||||
if bz.box:
|
||||
overlay["box"] = {
|
||||
"x": bz.box.x,
|
||||
"y": bz.box.y,
|
||||
"width": bz.box.width,
|
||||
"height": bz.box.height,
|
||||
"confidence": bz.box.confidence,
|
||||
"border_thickness": bz.box.border_thickness,
|
||||
}
|
||||
overlays.append(overlay)
|
||||
|
||||
merged = PageZone(
|
||||
index=0, # re-indexed below
|
||||
zone_type="content",
|
||||
y=y_min,
|
||||
height=y_max - y_min,
|
||||
x=content_x,
|
||||
width=content_w,
|
||||
image_overlays=overlays,
|
||||
)
|
||||
result.append(merged)
|
||||
i = j
|
||||
else:
|
||||
# No merge possible -- emit just the content zone
|
||||
result.append(z)
|
||||
i += 1
|
||||
|
||||
# Re-index zones
|
||||
for idx, z in enumerate(result):
|
||||
z.index = idx
|
||||
|
||||
logger.info(
|
||||
"zone-merge: %d zones -> %d zones after merging across boxes",
|
||||
len(zones), len(result),
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.

    Pipeline: cluster rows -> detect (or reuse) columns -> split words that
    straddle column boundaries -> build cells -> detect colspans on the
    original words -> prefix cell IDs -> detect header rows -> collapse
    spanning header rows into one col-0 cell -> convert to output dicts.

    Args:
        zone_words: OCR word dicts; this code reads "left"/"top"/"width"/
            "height" and "text" keys.
        zone_x: Zone left edge in px. NOTE(review): currently unused here;
            presumably kept for signature symmetry with callers — confirm.
        zone_y: Zone top edge in px; forwarded to _detect_header_rows.
        zone_w: Zone width in px; used for per-zone column detection.
        zone_h: Zone height in px. NOTE(review): currently unused here.
        zone_index: Index used to prefix cell IDs ("Z{n}_...") and tag cells.
        img_w: Full-image width in px, for percentage conversion (0 -> pct 0).
        img_h: Full-image height in px, for percentage conversion (0 -> pct 0).
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone. Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure. Box zones always detect columns independently.
        skip_first_row_header: Forwarded to _detect_header_rows.

    Returns:
        Dict with "columns", "rows", "cells", "header_rows"; on the success
        path also "_raw_columns" (internal column dicts for propagation to
        sibling zones). Empty lists (without "_raw_columns") when the zone
        has no words or no columns/rows could be derived.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)

    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words).
    # _med_h/_y_tol here are recomputed purely for the log line; the actual
    # clustering happened inside _cluster_rows above.
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
            logger.info(
                " zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w['top'], w['left'], w['height'], w['width'],
                w.get('text', '')[:40],
            )
        for r in rows:
            logger.info(
                " zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
            )

    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)

    # Merge inline marker columns (bullets, numbering) into adjacent text.
    # Only for per-zone detection — global columns are taken as-is.
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)

    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2). Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)

    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)

    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns. _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)

    # Prefix cell IDs with zone index so IDs stay unique across zones
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index

    # Detect header rows (pass columns for spanning header detection)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue
            # Collect all word_boxes and text from all columns (left-to-right)
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            # Remove all header cells, replace with one spanning cell.
            # NOTE(review): if all_wb is empty the row's cells are dropped
            # with no replacement, even if they carried text — confirm this
            # is intended for word-box-less cells.
            cells = [c for c in cells if c["row_index"] != hri]
            if all_wb:
                # Bounding box of the merged header = union of all word boxes.
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                cells.append({
                    "cell_id": f"R{hri:02d}_C0",
                    "row_index": hri,
                    "col_index": 0,
                    "col_type": "spanning_header",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })

    # Convert columns to output format with percentages
    out_columns = []
    for col in columns:
        x_min = col["x_min"]
        x_max = col["x_max"]
        out_columns.append({
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(x_min),
            "x_max_px": round(x_max),
            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
            "bold": False,
        })

    # Convert rows to output format with percentages
    out_rows = []
    for row in rows:
        out_rows.append({
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
            "is_header": row["index"] in header_rows,
        })

    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
|
||||
Reference in New Issue
Block a user