Vertical zone split: detect divider lines and create independent sub-zones

Pages with two side-by-side vocabulary columns separated by a vertical black line are now split into independent sub-zones before row/column detection. Each sub-zone gets its own rows, which prevents misalignment caused by different heading rhythms.

- _detect_vertical_dividers(): finds pipe word_boxes at consistent x positions spanning at least 50% of the zone height
- _split_zone_at_vertical_dividers(): creates left/middle/right PageZone objects with layout_hint and vsplit_group metadata
- Column union skips vsplit zones (independent column sets)
- Frontend renders vsplit zones side by side via flex layout
- PageZone gets layout_hint + vsplit_group fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 16:38:12 +01:00
parent e4fa634a63
commit 45b83560fd
4 changed files with 215 additions and 19 deletions
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -449,6 +449,108 @@ def _words_in_zone(
    return result


+# ---------------------------------------------------------------------------
+# Vertical divider detection and zone splitting
+# ---------------------------------------------------------------------------
+
+_PIPE_RE_VSPLIT = re.compile(r"^\|+$")  # text made up solely of one or more "|" characters; same pattern as the local _PIPE_RE in the later pipe clean-up
+
+
+def _detect_vertical_dividers(
+    words: List[Dict],  # word boxes; read here via the keys "text", "left", "width", "top", "height"
+    zone_x: int,  # subtracted from a cluster's mean x to get its zone-relative position
+    zone_w: int,  # scales the clustering tolerance and the 15%-85% position window
+    zone_y: int,  # NOTE(review): accepted but never read in this body
+    zone_h: int,  # reference height for the >= 50% coverage test
+) -> List[float]:
+    """Detect vertical divider lines from pipe word_boxes at consistent x.
+
+    Returns list of divider x-positions (empty if no dividers found).
+    """
+    if not words or zone_w <= 0 or zone_h <= 0:  # also guards the division by zone_w below
+        return []
+
+    # Collect pipe word_boxes: entries whose stripped text matches _PIPE_RE_VSPLIT (missing/None text counts as "")
+    pipes = [
+        w for w in words
+        if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
+    ]
+    if len(pipes) < 5:  # a cluster needs at least 5 members, so fewer pipes can never yield a divider
+        return []
+
+    # Cluster pipe x-centers by proximity: a center joins the current cluster when its gap to the previous center is <= tolerance
+    tolerance = max(15, int(zone_w * 0.02))  # 2% of the zone width, never below 15 (presumably pixels - confirm)
+    pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)  # ascending, so a single pass can group neighbours
+
+    clusters: List[List[float]] = [[pipe_xs[0]]]  # safe index: pipes holds >= 5 entries at this point
+    for x in pipe_xs[1:]:
+        if x - clusters[-1][-1] <= tolerance:  # measured against the cluster's last member, so a cluster can chain wider than tolerance
+            clusters[-1].append(x)
+        else:
+            clusters.append([x])
+
+    dividers: List[float] = []
+    for cluster in clusters:
+        if len(cluster) < 5:  # fewer than 5 pipes at roughly the same x: not accepted as a divider
+            continue
+        mean_x = sum(cluster) / len(cluster)  # this mean is the x-position reported for the divider
+        # Must be between 15% and 85% of zone width (clusters closer to either zone edge are ignored)
+        rel_pos = (mean_x - zone_x) / zone_w
+        if rel_pos < 0.15 or rel_pos > 0.85:
+            continue
+        # Check vertical coverage: pipes must span >= 50% of zone height
+        cluster_pipes = [  # re-selected by distance from mean_x, so membership may differ from `cluster` when it chained wide
+            w for w in pipes
+            if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
+        ]
+        ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]  # top and bottom edge of every selected pipe box
+        y_span = max(ys) - min(ys) if ys else 0  # topmost to bottommost edge; not clipped to the zone, and gaps between pipes are not measured
+        if y_span < zone_h * 0.5:
+            continue
+        dividers.append(mean_x)
+
+    return sorted(dividers)  # ascending x; clusters are already visited left to right
+
+
+def _split_zone_at_vertical_dividers(
+    zone: "PageZone",  # zone to split; its x, width, y, height, zone_type, box and image_overlays are read
+    divider_xs: List[float],  # split positions; must be ascending (the visible caller passes the sorted detector output)
+    vsplit_group_id: int,  # stored unchanged on every sub-zone so siblings of one split can be recognised
+) -> List["PageZone"]:
+    """Split a PageZone at vertical divider positions into sub-zones."""
+    from cv_vocab_types import PageZone  # function-scope import; presumably avoids an import cycle - confirm
+
+    boundaries = [zone.x] + divider_xs + [zone.x + zone.width]  # N dividers give N + 1 consecutive x-intervals
+    hints = []  # one layout hint per interval, in left-to-right order
+    for i in range(len(boundaries) - 1):
+        if i == 0:  # tested first, so a lone interval (empty divider_xs) is labelled "left_of_vsplit"
+            hints.append("left_of_vsplit")
+        elif i == len(boundaries) - 2:  # index of the last interval
+            hints.append("right_of_vsplit")
+        else:
+            hints.append("middle_of_vsplit")
+
+    sub_zones = []
+    for i in range(len(boundaries) - 1):
+        x_start = int(boundaries[i])  # float divider positions are truncated to int
+        x_end = int(boundaries[i + 1])  # same truncation as the next interval's x_start, so sub-zones stay contiguous
+        sub = PageZone(
+            index=0,  # placeholder; the visible caller renumbers all zones after expansion
+            zone_type=zone.zone_type,
+            y=zone.y,  # vertical extent is inherited unchanged from the parent zone
+            height=zone.height,
+            x=x_start,
+            width=x_end - x_start,
+            box=zone.box,  # same object as the parent's box (not copied)
+            image_overlays=zone.image_overlays,  # NOTE(review): every sub-zone references the parent's full overlay collection - confirm this is intended
+            layout_hint=hints[i],
+            vsplit_group=vsplit_group_id,
+        )
+        sub_zones.append(sub)
+
+    return sub_zones  # ordered left to right
+
+
 def _merge_content_zones_across_boxes(
    zones: List,
    content_x: int,
@@ -1404,11 +1506,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    page_zones, content_x, content_w
                )

+                # 3b. Detect vertical dividers and split content zones
+                vsplit_group_counter = 0
+                expanded_zones: List = []
+                for pz in page_zones:
+                    if pz.zone_type != "content":
+                        expanded_zones.append(pz)
+                        continue
+                    zone_words = _words_in_zone(
+                        all_words, pz.y, pz.height, pz.x, pz.width
+                    )
+                    divider_xs = _detect_vertical_dividers(
+                        zone_words, pz.x, pz.width, pz.y, pz.height
+                    )
+                    if divider_xs:
+                        sub_zones = _split_zone_at_vertical_dividers(
+                            pz, divider_xs, vsplit_group_counter
+                        )
+                        expanded_zones.extend(sub_zones)
+                        vsplit_group_counter += 1
+                        # Remove pipe words so they don't appear in sub-zones
+                        pipe_ids = set(
+                            id(w) for w in zone_words
+                            if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
+                        )
+                        all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
+                        logger.info(
+                            "build-grid: vertical split zone %d at x=%s → %d sub-zones",
+                            pz.index, [int(x) for x in divider_xs], len(sub_zones),
+                        )
+                    else:
+                        expanded_zones.append(pz)
+                # Re-index zones
+                for i, pz in enumerate(expanded_zones):
+                    pz.index = i
+                page_zones = expanded_zones
+
                # --- Union columns from all content zones ---
                # Each content zone detects columns independently.  Narrow
                # columns (page refs, markers) may appear in only one zone.
                # Merge column split-points from ALL content zones so every
                # zone shares the full column set.
+                # NOTE: Zones from a vertical split are independent and must
+                # NOT share columns with each other.

                # First pass: build grids per zone independently
                zone_grids: List[Dict] = []
@@ -1459,8 +1599,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})

                # Second pass: merge column boundaries from all content zones
+                # Exclude zones from vertical splits — they have independent columns.
                content_zones = [
-                    zg for zg in zone_grids if zg["pz"].zone_type == "content"
+                    zg for zg in zone_grids
+                    if zg["pz"].zone_type == "content"
+                    and zg["pz"].vsplit_group is None
                ]
                if len(content_zones) > 1:
                    # Collect column split points (x_min of non-first columns)
@@ -1564,6 +1707,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                    if pz.image_overlays:
                        zone_entry["image_overlays"] = pz.image_overlays

+                    if pz.layout_hint:
+                        zone_entry["layout_hint"] = pz.layout_hint
+                    if pz.vsplit_group is not None:
+                        zone_entry["vsplit_group"] = pz.vsplit_group
+
                    zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected → single zone with all words
@@ -1719,8 +1867,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
    # OCR reads physical vertical divider lines as "|" or "||" characters.
    # These sit at consistent x positions near column boundaries and pollute
    # cell text.  Remove them from word_boxes and rebuild cell text.
+    # NOTE: Zones from a vertical split already had pipes removed in step 3b.
    _PIPE_RE = re.compile(r"^\|+$")
    for z in zones_data:
+        if z.get("vsplit_group") is not None:
+            continue  # pipes already removed before split
        removed_pipes = 0
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []