Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

The sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,462 @@
"""
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
detection and zone-aware grid building.
Extracted from grid_build_core.py for maintainability.
"""
import logging
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_color_detect import recover_colored_text
from cv_vocab_types import PageZone
from ocr_pipeline_session_store import get_session_image
from grid_editor_helpers import (
_filter_border_strip_words,
_filter_border_ghosts,
_words_in_zone,
_PIPE_RE_VSPLIT,
_detect_vertical_dividers,
_split_zone_at_vertical_dividers,
_merge_content_zones_across_boxes,
_build_zone_grid,
)
logger = logging.getLogger(__name__)
async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    Mutates ``all_words`` in place (removal of words inside graphic
    regions, border ghosts, pipe/divider glyphs; extension with
    recovered colored text) and appends fresh CV-detected rects to
    ``graphic_rects``.

    Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr.
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None
    # 3. Load image for box detection — prefer the most processed
    # variant available (cropped > dewarped > original).
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")
    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
        if img_bgr is not None:
            # --- 3a. Detect graphic/image regions via CV and hard-filter ---
            # Only words with >= 3 significant chars guide the detector,
            # so stray OCR noise doesn't suppress graphic regions.
            sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
            fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
            if fresh_graphics:
                fresh_rects = [
                    {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                    for g in fresh_graphics
                ]
                graphic_rects.extend(fresh_rects)
                logger.info(
                    "build-grid session %s: detected %d graphic region(s) via CV",
                    session_id, len(fresh_graphics),
                )
                # Hard-filter words whose center point falls inside any
                # newly detected graphic region.
                before = len(all_words)
                all_words[:] = [
                    w for w in all_words
                    if not any(
                        gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in fresh_rects
                    )
                ]
                removed = before - len(all_words)
                if removed:
                    logger.info(
                        "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                        session_id, removed, len(fresh_rects),
                    )
            # --- Recover colored text that OCR missed (before grid building) ---
            recovered = recover_colored_text(img_bgr, all_words)
            if recovered and graphic_rects:
                # Filter recovered chars inside graphic regions
                recovered = [
                    r for r in recovered
                    if not any(
                        gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                        and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                        for gr in graphic_rects
                    )
                ]
            if recovered:
                recovered_count = len(recovered)
                all_words.extend(recovered)
                logger.info(
                    "build-grid session %s: +%d recovered colored words",
                    session_id, recovered_count,
                )
            # Detect bordered boxes
            boxes = detect_boxes(
                img_bgr,
                content_x=content_x,
                content_w=content_w,
                content_y=content_y,
                content_h=content_h,
            )
            boxes_detected = len(boxes)
            if boxes:
                # Filter border ghost words before grid building
                all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
                if ghost_count:
                    all_words[:] = all_words_new
                    logger.info(
                        "build-grid session %s: removed %d border ghost words",
                        session_id, ghost_count,
                    )
                # Split page into zones
                page_zones = split_page_into_zones(
                    content_x, content_y, content_w, content_h, boxes
                )
                # Merge content zones separated by box zones
                page_zones = _merge_content_zones_across_boxes(
                    page_zones, content_x, content_w
                )
                # 3b. Detect vertical dividers and split content zones
                page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                    page_zones, all_words
                )
                # Fold the vsplit pre-filter flag into the overall flag
                # (previously assigned but never used — FIX).
                border_prefiltered = border_prefiltered or border_prefiltered_vd
                # --- First pass: build grids per zone independently ---
                zone_grids = _build_grids_per_zone(
                    page_zones, all_words, img_w, img_h
                )
                border_prefiltered = border_prefiltered or any(
                    zg.get("_border_prefiltered") for zg in zone_grids
                )
                # --- Second pass: merge column boundaries from all content zones ---
                _merge_content_zone_columns(
                    zone_grids, all_words, content_w, img_w, img_h, session_id
                )
                # --- Build zones_data from zone_grids ---
                for zg in zone_grids:
                    pz = zg["pz"]
                    grid = zg["grid"]
                    # _raw_columns is an internal column cache; never expose it.
                    grid.pop("_raw_columns", None)
                    zone_entry: Dict[str, Any] = {
                        "zone_index": pz.index,
                        "zone_type": pz.zone_type,
                        "bbox_px": {
                            "x": pz.x, "y": pz.y,
                            "w": pz.width, "h": pz.height,
                        },
                        "bbox_pct": {
                            "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                            "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                            "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                            "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                        },
                        "border": None,
                        "word_count": len(zg["words"]),
                        **grid,
                    }
                    if pz.box:
                        zone_entry["border"] = {
                            "thickness": pz.box.border_thickness,
                            "confidence": pz.box.confidence,
                        }
                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays
                    if pz.layout_hint:
                        zone_entry["layout_hint"] = pz.layout_hint
                    if pz.vsplit_group is not None:
                        zone_entry["vsplit_group"] = pz.vsplit_group
                    zones_data.append(zone_entry)
    # 4. Fallback: no boxes detected -> single zone with all words
    if not zones_data:
        before = len(all_words)
        # Recovered single/double-char detections are likely artifacts.
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            # FIX: report the words the grid was actually built from,
            # consistent with the per-zone path above (was len(all_words)).
            "word_count": len(filtered_words),
            **grid,
        })
    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }
def _detect_and_split_vertical_dividers(
page_zones: List[PageZone],
all_words: List[Dict[str, Any]],
) -> tuple:
"""Detect vertical dividers and split content zones.
Returns (expanded_zones, border_prefiltered_from_vsplit).
"""
vsplit_group_counter = 0
expanded_zones: List = []
for pz in page_zones:
if pz.zone_type != "content":
expanded_zones.append(pz)
continue
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
divider_xs = _detect_vertical_dividers(
zone_words, pz.x, pz.width, pz.y, pz.height
)
if divider_xs:
sub_zones = _split_zone_at_vertical_dividers(
pz, divider_xs, vsplit_group_counter
)
expanded_zones.extend(sub_zones)
vsplit_group_counter += 1
# Remove pipe words so they don't appear in sub-zones
pipe_ids = set(
id(w) for w in zone_words
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
)
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
logger.info(
"build-grid: vertical split zone %d at x=%s -> %d sub-zones",
pz.index, [int(x) for x in divider_xs], len(sub_zones),
)
else:
expanded_zones.append(pz)
# Re-index zones
for i, pz in enumerate(expanded_zones):
pz.index = i
return expanded_zones, False
def _build_grids_per_zone(
page_zones: List[PageZone],
all_words: List[Dict[str, Any]],
img_w: int,
img_h: int,
) -> List[Dict[str, Any]]:
"""Build grids for each zone independently (first pass)."""
zone_grids: List[Dict] = []
for pz in page_zones:
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
if pz.zone_type == "content":
logger.info(
"build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
pz.index, pz.zone_type,
pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
len(zone_words), len(all_words),
)
# Filter recovered single-char artifacts in ALL zones
before = len(zone_words)
zone_words = [
w for w in zone_words
if not (
w.get("recovered")
and len(w.get("text", "").strip()) <= 2
)
]
removed = before - len(zone_words)
if removed:
logger.info(
"build-grid: filtered %d recovered artifacts from %s zone %d",
removed, pz.zone_type, pz.index,
)
# Filter words inside image overlay regions (merged box zones)
if pz.image_overlays:
before_ov = len(zone_words)
zone_words = [
w for w in zone_words
if not any(
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
for ov in pz.image_overlays
)
]
ov_removed = before_ov - len(zone_words)
if ov_removed:
logger.info(
"build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index,
)
zone_words, bs_removed = _filter_border_strip_words(zone_words)
bp = False
if bs_removed:
bp = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
skip_first_row_header=bool(pz.image_overlays),
)
zone_grids.append({
"pz": pz, "words": zone_words, "grid": grid,
"_border_prefiltered": bp,
})
return zone_grids
def _merge_content_zone_columns(
zone_grids: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
content_w: int,
img_w: int,
img_h: int,
session_id: str,
) -> None:
"""Second pass: merge column boundaries from all content zones.
Modifies zone_grids in place.
"""
content_zones = [
zg for zg in zone_grids
if zg["pz"].zone_type == "content"
and zg["pz"].vsplit_group is None
]
if len(content_zones) <= 1:
return
# Collect column split points (x_min of non-first columns)
all_split_xs: List[float] = []
for zg in content_zones:
raw_cols = zg["grid"].get("_raw_columns", [])
for col in raw_cols[1:]:
all_split_xs.append(col["x_min"])
if not all_split_xs:
return
all_split_xs.sort()
merge_distance = max(25, int(content_w * 0.03))
merged_xs = [all_split_xs[0]]
for x in all_split_xs[1:]:
if x - merged_xs[-1] < merge_distance:
merged_xs[-1] = (merged_xs[-1] + x) / 2
else:
merged_xs.append(x)
total_cols = len(merged_xs) + 1
max_zone_cols = max(
len(zg["grid"].get("_raw_columns", []))
for zg in content_zones
)
if total_cols < max_zone_cols:
return
cx_min = min(w["left"] for w in all_words)
cx_max = max(w["left"] + w["width"] for w in all_words)
merged_columns: List[Dict[str, Any]] = []
prev_x = cx_min
for i, sx in enumerate(merged_xs):
merged_columns.append({
"index": i,
"type": f"column_{i + 1}",
"x_min": prev_x,
"x_max": sx,
})
prev_x = sx
merged_columns.append({
"index": len(merged_xs),
"type": f"column_{len(merged_xs) + 1}",
"x_min": prev_x,
"x_max": cx_max,
})
# Re-build ALL content zones with merged columns
for zg in zone_grids:
pz = zg["pz"]
if pz.zone_type == "content":
grid = _build_zone_grid(
zg["words"], pz.x, pz.y,
pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=merged_columns,
skip_first_row_header=bool(pz.image_overlays),
)
zg["grid"] = grid
logger.info(
"build-grid session %s: union of %d content "
"zones -> %d merged columns (max single zone: %d)",
session_id, len(content_zones),
total_cols, max_zone_cols,
)