Fix: Sidebar scrollable + add Eltern-Portal nav link

overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:49:44 +02:00
parent d87645ffce
commit 45287b3541
48 changed files with 6 additions and 1 deletions
--- a/klausur-service/backend/ocr/detect/box_layout.py
+++ b/klausur-service/backend/ocr/detect/box_layout.py
@@ -0,0 +1,339 @@
+"""
+Box layout classifier — detects internal layout type of embedded boxes.
+
+Classifies each box as: flowing | columnar | bullet_list | header_only
+and provides layout-appropriate grid building.
+
+Used by the Box-Grid-Review step to rebuild box zones with correct structure.
+"""
+
+import logging
+import re
+import statistics
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Bullet / list-item patterns at the start of a line
+_BULLET_RE = re.compile(
+    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
+    r'|^\d{1,2}[.)]\s'     # numbered: "1) " or "1. "
+    r'|^[a-z][.)]\s'       # lettered: "a) " or "a. "
+)
+
+
+def classify_box_layout(
+    words: List[Dict],
+    box_w: int,
+    box_h: int,
+) -> str:
+    """Classify the internal layout of a detected box.
+
+    Args:
+        words: OCR word dicts within the box (with top, left, width, height, text)
+        box_w: Box width in pixels
+        box_h: Box height in pixels
+
+    Returns:
+        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
+    """
+    if not words:
+        return "header_only"
+
+    # Group words into lines by y-proximity
+    lines = _group_into_lines(words)
+
+    # Header only: very few words or single line
+    total_words = sum(len(line) for line in lines)
+    if total_words <= 5 or len(lines) <= 1:
+        return "header_only"
+
+    # Bullet list: check if majority of lines start with bullet patterns
+    bullet_count = 0
+    for line in lines:
+        first_text = line[0].get("text", "") if line else ""
+        if _BULLET_RE.match(first_text):
+            bullet_count += 1
+        # Also check if first word IS a bullet char
+        elif first_text.strip() in ("-", "–", "—", "•", "·", "▪", "▸"):
+            bullet_count += 1
+    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
+        return "bullet_list"
+
+    # Columnar: check for multiple distinct x-clusters
+    if len(lines) >= 3 and _has_column_structure(words, box_w):
+        return "columnar"
+
+    # Default: flowing text
+    return "flowing"
+
+
+def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
+    """Group words into lines by y-proximity."""
+    if not words:
+        return []
+
+    sorted_words = sorted(words, key=lambda w: (w["top"], w["left"]))
+    heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
+    median_h = statistics.median(heights) if heights else 20
+    y_tolerance = max(median_h * 0.5, 5)
+
+    lines: List[List[Dict]] = []
+    current_line: List[Dict] = [sorted_words[0]]
+    current_y = sorted_words[0]["top"]
+
+    for w in sorted_words[1:]:
+        if abs(w["top"] - current_y) <= y_tolerance:
+            current_line.append(w)
+        else:
+            lines.append(sorted(current_line, key=lambda ww: ww["left"]))
+            current_line = [w]
+            current_y = w["top"]
+
+    if current_line:
+        lines.append(sorted(current_line, key=lambda ww: ww["left"]))
+
+    return lines
+
+
+def _has_column_structure(words: List[Dict], box_w: int) -> bool:
+    """Check if words have multiple distinct left-edge clusters (columns)."""
+    if box_w <= 0:
+        return False
+
+    lines = _group_into_lines(words)
+    if len(lines) < 3:
+        return False
+
+    # Collect left-edges of non-first words in each line
+    # (first word of each line often aligns regardless of columns)
+    left_edges = []
+    for line in lines:
+        for w in line[1:]:  # skip first word
+            left_edges.append(w["left"])
+
+    if len(left_edges) < 4:
+        return False
+
+    # Check if left edges cluster into 2+ distinct groups
+    left_edges.sort()
+    gaps = [left_edges[i + 1] - left_edges[i] for i in range(len(left_edges) - 1)]
+    if not gaps:
+        return False
+
+    median_gap = statistics.median(gaps)
+    # A column gap is typically > 15% of box width
+    column_gap_threshold = box_w * 0.15
+    large_gaps = [g for g in gaps if g > column_gap_threshold]
+
+    return len(large_gaps) >= 1
+
+
+def build_box_zone_grid(
+    zone_words: List[Dict],
+    box_x: int,
+    box_y: int,
+    box_w: int,
+    box_h: int,
+    zone_index: int,
+    img_w: int,
+    img_h: int,
+    layout_type: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Build a grid for a box zone with layout-aware processing.
+
+    If layout_type is None, auto-detects it.
+    For 'flowing' and 'bullet_list', forces single-column layout.
+    For 'columnar', uses the standard multi-column detection.
+    For 'header_only', creates a single cell.
+
+    Returns the same format as _build_zone_grid (columns, rows, cells, header_rows).
+    """
+    from grid_editor_helpers import _build_zone_grid, _cluster_rows
+
+    if not zone_words:
+        return {
+            "columns": [],
+            "rows": [],
+            "cells": [],
+            "header_rows": [],
+            "box_layout_type": layout_type or "header_only",
+            "box_grid_reviewed": False,
+        }
+
+    # Auto-detect layout if not specified
+    if not layout_type:
+        layout_type = classify_box_layout(zone_words, box_w, box_h)
+
+    logger.info(
+        "Box zone %d: layout_type=%s, %d words, %dx%d",
+        zone_index, layout_type, len(zone_words), box_w, box_h,
+    )
+
+    if layout_type == "header_only":
+        # Single cell with all text concatenated
+        all_text = " ".join(
+            w.get("text", "") for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
+        ).strip()
+        return {
+            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
+                         "x_min_px": box_x, "x_max_px": box_x + box_w,
+                         "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
+                         "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
+                         "bold": False}],
+            "rows": [{"index": 0, "row_index": 0,
+                       "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
+                       "y_min_px": box_y, "y_max_px": box_y + box_h,
+                       "y_min_pct": round(box_y / img_h * 100, 2) if img_h else 0,
+                       "y_max_pct": round((box_y + box_h) / img_h * 100, 2) if img_h else 0,
+                       "is_header": True}],
+            "cells": [{
+                "cell_id": f"Z{zone_index}_R0C0",
+                "row_index": 0,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": all_text,
+                "word_boxes": zone_words,
+            }],
+            "header_rows": [0],
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    if layout_type in ("flowing", "bullet_list"):
+        # Force single column — each line becomes one row with one cell.
+        # Detect bullet structure from indentation and merge continuation
+        # lines into the bullet they belong to.
+        lines = _group_into_lines(zone_words)
+        column = {
+            "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
+            "x_min_px": box_x, "x_max_px": box_x + box_w,
+            "x_min_pct": round(box_x / img_w * 100, 2) if img_w else 0,
+            "x_max_pct": round((box_x + box_w) / img_w * 100, 2) if img_w else 0,
+            "bold": False,
+        }
+
+        # --- Detect indentation levels ---
+        line_indents = []
+        for line_words in lines:
+            if not line_words:
+                line_indents.append(0)
+                continue
+            min_left = min(w["left"] for w in line_words)
+            line_indents.append(min_left - box_x)
+
+        # Find the minimum indent (= bullet/main level)
+        valid_indents = [ind for ind in line_indents if ind >= 0]
+        min_indent = min(valid_indents) if valid_indents else 0
+
+        # Indentation threshold: lines indented > 15px more than minimum
+        # are continuation lines belonging to the previous bullet
+        INDENT_THRESHOLD = 15
+
+        # --- Group lines into logical items (bullet + continuations) ---
+        # Each item is a list of line indices
+        items: List[List[int]] = []
+        for li, indent in enumerate(line_indents):
+            is_continuation = (indent > min_indent + INDENT_THRESHOLD) and len(items) > 0
+            if is_continuation:
+                items[-1].append(li)
+            else:
+                items.append([li])
+
+        logger.info(
+            "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
+            zone_index, len(lines), len(items),
+            [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
+        )
+
+        # --- Build rows and cells from grouped items ---
+        rows = []
+        cells = []
+        header_rows = []
+
+        for row_idx, item_line_indices in enumerate(items):
+            # Collect all words from all lines in this item
+            item_words = []
+            item_texts = []
+            for li in item_line_indices:
+                if li < len(lines):
+                    item_words.extend(lines[li])
+                    line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
+                    if line_text:
+                        item_texts.append(line_text)
+
+            if not item_words:
+                continue
+
+            y_min = min(w["top"] for w in item_words)
+            y_max = max(w["top"] + w["height"] for w in item_words)
+            y_center = (y_min + y_max) / 2
+
+            row = {
+                "index": row_idx,
+                "row_index": row_idx,
+                "y_min": y_min,
+                "y_max": y_max,
+                "y_center": y_center,
+                "y_min_px": y_min,
+                "y_max_px": y_max,
+                "y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
+                "y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
+                "is_header": False,
+            }
+            rows.append(row)
+
+            # Join multi-line text with newline for display
+            merged_text = "\n".join(item_texts)
+
+            # Add bullet marker if this is a bullet item without one
+            first_text = item_texts[0] if item_texts else ""
+            is_bullet = len(item_line_indices) > 1 or _BULLET_RE.match(first_text)
+            if is_bullet and not _BULLET_RE.match(first_text) and row_idx > 0:
+                # Continuation item without bullet — add one
+                merged_text = "• " + merged_text
+
+            cell = {
+                "cell_id": f"Z{zone_index}_R{row_idx}C0",
+                "row_index": row_idx,
+                "col_index": 0,
+                "col_type": "column_1",
+                "text": merged_text,
+                "word_boxes": item_words,
+            }
+            cells.append(cell)
+
+        # Detect header: first item if it has no continuation lines and is short
+        if len(items) >= 2:
+            first_item_texts = []
+            for li in items[0]:
+                if li < len(lines):
+                    first_item_texts.append(" ".join(w.get("text", "") for w in lines[li]).strip())
+            first_text = " ".join(first_item_texts)
+            if (len(first_text) < 40
+                    or first_text.isupper()
+                    or first_text.rstrip().endswith(':')):
+                header_rows = [0]
+
+        return {
+            "columns": [column],
+            "rows": rows,
+            "cells": cells,
+            "header_rows": header_rows,
+            "box_layout_type": layout_type,
+            "box_grid_reviewed": False,
+        }
+
+    # Columnar: use standard grid builder with independent column detection
+    result = _build_zone_grid(
+        zone_words, box_x, box_y, box_w, box_h,
+        zone_index, img_w, img_h,
+        global_columns=None,  # detect columns independently
+    )
+
+    # Colspan detection is now handled generically by _detect_colspan_cells
+    # in grid_editor_helpers.py (called inside _build_zone_grid).
+
+    result["box_layout_type"] = layout_type
+    result["box_grid_reviewed"] = False
+    return result