""" Grid Editor — vertical divider detection, zone splitting/merging, zone grid building. Split from grid_editor_helpers.py for maintainability. All functions are pure computation — no HTTP, DB, or session side effects. Lizenz: Apache 2.0 (kommerziell nutzbar) DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional from cv_vocab_types import PageZone from cv_words_first import _cluster_rows, _build_cells from grid_editor_columns import ( _cluster_columns_by_alignment, _merge_inline_marker_columns, _split_cross_column_words, ) from grid_editor_headers import ( _detect_header_rows, _detect_colspan_cells, ) logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Vertical divider detection and zone splitting # --------------------------------------------------------------------------- _PIPE_RE_VSPLIT = re.compile(r"^\|+$") def _detect_vertical_dividers( words: List[Dict], zone_x: int, zone_w: int, zone_y: int, zone_h: int, ) -> List[float]: """Detect vertical divider lines from pipe word_boxes at consistent x. Returns list of divider x-positions (empty if no dividers found). """ if not words or zone_w <= 0 or zone_h <= 0: return [] # Collect pipe word_boxes pipes = [ w for w in words if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) ] if len(pipes) < 5: return [] # Cluster pipe x-centers by proximity tolerance = max(15, int(zone_w * 0.02)) pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes) clusters: List[List[float]] = [[pipe_xs[0]]] for x in pipe_xs[1:]: if x - clusters[-1][-1] <= tolerance: clusters[-1].append(x) else: clusters.append([x]) dividers: List[float] = [] for cluster in clusters: if len(cluster) < 5: continue mean_x = sum(cluster) / len(cluster) # Must be between 15% and 85% of zone width rel_pos = (mean_x - zone_x) / zone_w if rel_pos < 0.15 or rel_pos > 0.85: continue # Check vertical coverage: pipes must span >= 50% of zone height cluster_pipes = [ w for w in pipes if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance ] ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes] y_span = max(ys) - min(ys) if ys else 0 if y_span < zone_h * 0.5: continue dividers.append(mean_x) return sorted(dividers) def _split_zone_at_vertical_dividers( zone: "PageZone", divider_xs: List[float], vsplit_group_id: int, ) -> List["PageZone"]: """Split a PageZone at vertical divider positions into sub-zones.""" boundaries = [zone.x] + divider_xs + [zone.x + zone.width] hints = [] for i in range(len(boundaries) - 1): if i == 0: hints.append("left_of_vsplit") elif i == len(boundaries) - 2: hints.append("right_of_vsplit") else: hints.append("middle_of_vsplit") sub_zones = [] for i in range(len(boundaries) - 1): x_start = int(boundaries[i]) x_end = int(boundaries[i + 1]) sub = PageZone( index=0, # re-indexed later zone_type=zone.zone_type, y=zone.y, height=zone.height, x=x_start, width=x_end - x_start, box=zone.box, image_overlays=zone.image_overlays, layout_hint=hints[i], vsplit_group=vsplit_group_id, ) sub_zones.append(sub) return sub_zones def _merge_content_zones_across_boxes( zones: List, content_x: int, content_w: int, ) -> List: """Merge content zones separated by box zones into single zones. Box zones become image_overlays on the merged content zone. Pattern: [content, box*, content] -> [merged_content with overlay] Box zones NOT between two content zones stay as standalone zones. """ if len(zones) < 3: return zones # Group consecutive runs of [content, box+, content] result: List = [] i = 0 while i < len(zones): z = zones[i] if z.zone_type != "content": result.append(z) i += 1 continue # Start of a potential merge group: content zone group_contents = [z] group_boxes = [] j = i + 1 # Absorb [box, content] pairs -- only absorb a box if it's # confirmed to be followed by another content zone. while j < len(zones): if (zones[j].zone_type == "box" and j + 1 < len(zones) and zones[j + 1].zone_type == "content"): group_boxes.append(zones[j]) group_contents.append(zones[j + 1]) j += 2 else: break if len(group_contents) >= 2 and group_boxes: # Merge: create one large content zone spanning all y_min = min(c.y for c in group_contents) y_max = max(c.y + c.height for c in group_contents) overlays = [] for bz in group_boxes: overlay = { "y": bz.y, "height": bz.height, "x": bz.x, "width": bz.width, } if bz.box: overlay["box"] = { "x": bz.box.x, "y": bz.box.y, "width": bz.box.width, "height": bz.box.height, "confidence": bz.box.confidence, "border_thickness": bz.box.border_thickness, } overlays.append(overlay) merged = PageZone( index=0, # re-indexed below zone_type="content", y=y_min, height=y_max - y_min, x=content_x, width=content_w, image_overlays=overlays, ) result.append(merged) i = j else: # No merge possible -- emit just the content zone result.append(z) i += 1 # Re-index zones for idx, z in enumerate(result): z.index = idx logger.info( "zone-merge: %d zones -> %d zones after merging across boxes", len(zones), len(result), ) return result def _build_zone_grid( zone_words: List[Dict], zone_x: int, zone_y: int, zone_w: int, zone_h: int, zone_index: int, img_w: int, img_h: int, global_columns: Optional[List[Dict]] = None, skip_first_row_header: bool = False, ) -> Dict[str, Any]: """Build columns, rows, cells for a single zone from its words. Args: global_columns: If provided, use these pre-computed column boundaries instead of detecting columns per zone. Used for content zones so that all content zones (above/between/below boxes) share the same column structure. Box zones always detect columns independently. """ if not zone_words: return { "columns": [], "rows": [], "cells": [], "header_rows": [], } # Cluster rows first (needed for column alignment analysis) rows = _cluster_rows(zone_words) # Diagnostic logging for small/medium zones (box zones typically have 40-60 words) if len(zone_words) <= 60: import statistics as _st _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0] _med_h = _st.median(_heights) if _heights else 20 _y_tol = max(_med_h * 0.5, 5) logger.info( "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows", zone_index, len(zone_words), _med_h, _y_tol, len(rows), ) for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])): logger.info( " zone %d word: y=%d x=%d h=%d w=%d '%s'", zone_index, w['top'], w['left'], w['height'], w['width'], w.get('text', '')[:40], ) for r in rows: logger.info( " zone %d row %d: y_min=%d y_max=%d y_center=%.0f", zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'], ) # Use global columns if provided, otherwise detect per zone columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows) # Merge inline marker columns (bullets, numbering) into adjacent text if not global_columns: columns = _merge_inline_marker_columns(columns, zone_words) if not columns or not rows: return { "columns": [], "rows": [], "cells": [], "header_rows": [], } # Split word boxes that straddle column boundaries (e.g. "sichzie" # spanning Col 1 + Col 2). Must happen after column detection and # before cell assignment. # Keep original words for colspan detection (split destroys span info). original_zone_words = zone_words if len(columns) >= 2: zone_words = _split_cross_column_words(zone_words, columns) # Build cells cells = _build_cells(zone_words, columns, rows, img_w, img_h) # --- Detect colspan (merged cells spanning multiple columns) --- # Uses the ORIGINAL (pre-split) words to detect word-blocks that span # multiple columns. _split_cross_column_words would have destroyed # this information by cutting words at column boundaries. if len(columns) >= 2: cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h) # Prefix cell IDs with zone index for cell in cells: cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}" cell["zone_index"] = zone_index # Detect header rows (pass columns for spanning header detection) header_rows = _detect_header_rows(rows, zone_words, zone_y, columns, skip_first_row_header=skip_first_row_header) # Merge cells in spanning header rows into a single col-0 cell if header_rows and len(columns) >= 2: for hri in header_rows: header_cells = [c for c in cells if c["row_index"] == hri] if len(header_cells) <= 1: continue # Collect all word_boxes and text from all columns all_wb = [] all_text_parts = [] for hc in sorted(header_cells, key=lambda c: c["col_index"]): all_wb.extend(hc.get("word_boxes", [])) if hc.get("text", "").strip(): all_text_parts.append(hc["text"].strip()) # Remove all header cells, replace with one spanning cell cells = [c for c in cells if c["row_index"] != hri] if all_wb: x_min = min(wb["left"] for wb in all_wb) y_min = min(wb["top"] for wb in all_wb) x_max = max(wb["left"] + wb["width"] for wb in all_wb) y_max = max(wb["top"] + wb["height"] for wb in all_wb) cells.append({ "cell_id": f"R{hri:02d}_C0", "row_index": hri, "col_index": 0, "col_type": "spanning_header", "text": " ".join(all_text_parts), "confidence": 0.0, "bbox_px": {"x": x_min, "y": y_min, "w": x_max - x_min, "h": y_max - y_min}, "bbox_pct": { "x": round(x_min / img_w * 100, 2) if img_w else 0, "y": round(y_min / img_h * 100, 2) if img_h else 0, "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, }, "word_boxes": all_wb, "ocr_engine": "words_first", "is_bold": True, }) # Convert columns to output format with percentages out_columns = [] for col in columns: x_min = col["x_min"] x_max = col["x_max"] out_columns.append({ "index": col["index"], "label": col["type"], "x_min_px": round(x_min), "x_max_px": round(x_max), "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0, "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0, "bold": False, }) # Convert rows to output format with percentages out_rows = [] for row in rows: out_rows.append({ "index": row["index"], "y_min_px": round(row["y_min"]), "y_max_px": round(row["y_max"]), "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0, "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0, "is_header": row["index"] in header_rows, }) return { "columns": out_columns, "rows": out_rows, "cells": cells, "header_rows": header_rows, "_raw_columns": columns, # internal: for propagation to other zones }