+ {/* Header */}
+
+
+
+ 📦 Box-Review ({boxZones.length} {boxZones.length === 1 ? 'Box' : 'Boxen'})
+
+
+ Eingebettete Boxen prüfen und korrigieren. Layout-Typ kann pro Box angepasst werden.
+
+
+
+ {dirty && (
+
+ )}
+
+
+
+
+
+ {/* Errors */}
+ {(error || buildError) && (
+
+ {error || buildError}
+
+ )}
+
+ {building && (
+
+
+
Box-Grids werden aufgebaut...
+
+ )}
+
+ {/* Box zones */}
+ {boxZones.map((zone) => (
+
+ {/* Box header */}
+
+
+
📦
+
+
+ Box {zone.zone_index + 1}
+
+
+ {zone.bbox_px.w}×{zone.bbox_px.h}px
+ {zone.cells?.length ? ` • ${zone.cells.length} Zellen` : ''}
+
+
+
+
+
+
+
+
+
+ {/* Box content — image + grid side by side */}
+
+ {/* Box image crop */}
+ {sessionId && (
+
+
+

{
+ // Fallback: hide image if endpoint doesn't exist
+ (e.target as HTMLImageElement).style.display = 'none'
+ }}
+ />
+
+
+ )}
+
+ {/* Box grid table */}
+
+ {zone.cells && zone.cells.length > 0 ? (
+
+ ) : (
+
+
Keine Zellen erkannt.
+
+
+ )}
+
+
+
+ ))}
+
+ )
+}
diff --git a/klausur-service/backend/cv_box_layout.py b/klausur-service/backend/cv_box_layout.py
new file mode 100644
index 0000000..e51d5da
--- /dev/null
+++ b/klausur-service/backend/cv_box_layout.py
@@ -0,0 +1,256 @@
+"""
+Box layout classifier — detects internal layout type of embedded boxes.
+
+Classifies each box as: flowing | columnar | bullet_list | header_only
+and provides layout-appropriate grid building.
+
+Used by the Box-Grid-Review step to rebuild box zones with correct structure.
+"""
+
+import logging
+import re
+import statistics
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Bullet / list-item patterns at the start of a line
+_BULLET_RE = re.compile(
+ r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s' # dash, bullet chars
+ r'|^\d{1,2}[.)]\s' # numbered: "1) " or "1. "
+ r'|^[a-z][.)]\s' # lettered: "a) " or "a. "
+)
+
+
def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Decide which internal layout a detected box most likely has.

    The checks run in a fixed order and the first one that applies wins:
    header-only, then bullet list, then columnar, and finally flowing text.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels
        box_h: Box height in pixels (accepted but not used by the heuristics)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    text_lines = _group_into_lines(words)
    line_count = len(text_lines)
    word_count = sum(map(len, text_lines))

    # A single line, or a handful of words, is treated as a bare heading.
    if line_count <= 1 or word_count <= 5:
        return "header_only"

    # Tokens that are a list marker all by themselves.
    lone_markers = ("-", "–", "—", "•", "·", "▪", "▸")

    def _starts_with_marker(line: List[Dict]) -> bool:
        lead = line[0].get("text", "") if line else ""
        return bool(_BULLET_RE.match(lead)) or lead.strip() in lone_markers

    marked = sum(1 for line in text_lines if _starts_with_marker(line))

    # Bullet list: at least two marked lines making up 40% or more of all lines.
    if marked >= 2 and marked >= line_count * 0.4:
        return "bullet_list"

    # Columnar: needs three or more lines plus distinct x-clusters.
    if line_count >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    return "flowing"
+
+
+def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
+ """Group words into lines by y-proximity."""
+ if not words:
+ return []
+
+ sorted_words = sorted(words, key=lambda w: (w["top"], w["left"]))
+ heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
+ median_h = statistics.median(heights) if heights else 20
+ y_tolerance = max(median_h * 0.5, 5)
+
+ lines: List[List[Dict]] = []
+ current_line: List[Dict] = [sorted_words[0]]
+ current_y = sorted_words[0]["top"]
+
+ for w in sorted_words[1:]:
+ if abs(w["top"] - current_y) <= y_tolerance:
+ current_line.append(w)
+ else:
+ lines.append(sorted(current_line, key=lambda ww: ww["left"]))
+ current_line = [w]
+ current_y = w["top"]
+
+ if current_line:
+ lines.append(sorted(current_line, key=lambda ww: ww["left"]))
+
+ return lines
+
+
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
    """Check if words have multiple distinct left-edge clusters (columns).

    The left edges of all words except the first one of each line are sorted,
    and the box counts as columnar when any two neighbouring edges are further
    apart than 15% of the box width.

    Args:
        words: OCR word dicts; ``top`` and ``left`` are read.
        box_w: Box width in pixels; non-positive widths yield ``False``.

    Returns:
        ``True`` when at least one column-sized gap exists, ``False``
        otherwise (including fewer than 3 lines or fewer than 4 usable edges).
    """
    if box_w <= 0:
        return False

    lines = _group_into_lines(words)
    if len(lines) < 3:
        return False

    # The first word of each line often aligns regardless of columns, so only
    # the remaining words carry information about column starts.
    left_edges = sorted(w["left"] for line in lines for w in line[1:])
    if len(left_edges) < 4:
        return False

    # A column gap is typically > 15% of box width.
    column_gap_threshold = box_w * 0.15
    return any(
        right - left > column_gap_threshold
        for left, right in zip(left_edges, left_edges[1:])
    )
+
+
def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If ``layout_type`` is falsy, it is auto-detected via
    ``classify_box_layout``.

    * ``header_only``: one column, one row spanning the box, and a single
      cell holding all words joined in (top, left) order.  Row 0 is reported
      as a header row.
    * ``flowing`` / ``bullet_list``: a single column with one row and one
      cell per text line.  Row 0 is reported as a header row when there are
      at least two lines and the first line is shorter than 40 characters,
      all upper-case, or ends with ``':'``.
    * any other value (normally ``columnar``): delegated to
      ``grid_editor_helpers._build_zone_grid`` with ``global_columns=None``
      so columns are detected independently for this box.

    Args:
        zone_words: OCR word dicts inside the box (top, left, height, text).
        box_x, box_y, box_w, box_h: Box bounding box in pixels.
        zone_index: Index of the zone; used in cell ids and log output.
        img_w, img_h: Image size, forwarded to ``_build_zone_grid``.
        layout_type: Optional forced layout type.

    Returns:
        A dict with ``columns``, ``rows``, ``cells`` and ``header_rows`` plus
        ``box_layout_type`` and ``box_grid_reviewed`` (always ``False``).  In
        the delegated case the dict is the result of ``_build_zone_grid``
        with the two ``box_*`` keys added.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated in reading order
        reading_order = sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
        return {
            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
            "rows": [{
                "index": 0,
                "row_index": 0,
                "y_min": box_y,
                "y_max": box_y + box_h,
                "y_center": box_y + box_h / 2,
            }],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column — each line becomes one row with one cell.
        # Empty lines are dropped up front so row indices stay contiguous.
        lines = [line for line in _group_into_lines(zone_words) if line]
        column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}
        rows = []
        cells = []

        for row_idx, line_words in enumerate(lines):
            y_min = min(w["top"] for w in line_words)
            y_max = max(w["top"] + w["height"] for w in line_words)
            rows.append({
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": (y_min + y_max) / 2,
            })
            cells.append({
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": " ".join(w.get("text", "") for w in line_words).strip(),
                "word_boxes": line_words,
            })

        # Header heuristic for the first row: short text, all-caps, or a
        # trailing ':'.  Only applied when there is something below it.
        header_rows = []
        if len(cells) >= 2:
            first_text = cells[0]["text"]
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.endswith(':')):
                header_rows = [0]

        return {
            "columns": [column],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use standard grid builder with independent column detection.
    # Imported here because only this path needs the helper module.
    from grid_editor_helpers import _build_zone_grid

    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )
    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result
diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py
index b914990..1b5e1d3 100644
--- a/klausur-service/backend/grid_editor_api.py
+++ b/klausur-service/backend/grid_editor_api.py
@@ -2181,3 +2181,117 @@ async def gutter_repair_apply(session_id: str, request: Request):
)
return result
+
+
+# ---------------------------------------------------------------------------
+# Box-Grid-Review endpoints
+# ---------------------------------------------------------------------------
+
@router.post("/sessions/{session_id}/build-box-grids")
async def build_box_grids(session_id: str, request: Request):
    """Rebuild grid structure for all box zones with layout-aware detection.

    For each zone with zone_type='box' that has a positive-size bbox and
    contains words:
    1. Auto-detect layout type (flowing / columnar / bullet_list / header_only)
       unless a valid override is supplied
    2. Build grid with layout-appropriate parameters
    3. Apply SmartSpellChecker corrections (skipped when the module cannot
       be imported)
    4. Store results back in grid_editor_result.zones[]

    Optional body: { "overrides": { "2": "bullet_list" } }
    Maps zone_index → forced layout_type.  A missing, unparsable or
    non-object body means "no overrides"; override values that are not one
    of the four known layout types are ignored with a warning.

    Raises:
        HTTPException 404: the session does not exist.
        HTTPException 400: the session has no grid_editor_result yet.

    Returns:
        Dict with session_id, box_zones_rebuilt, spell_fixes and the
        (updated) zones list.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    grid_data = session.get("grid_editor_result")
    if not grid_data:
        raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.")

    word_result = session.get("word_result") or {}
    all_words = word_result.get("cells") or word_result.get("words") or []

    # The body is optional, so a failure to parse it is deliberately tolerated.
    try:
        body = await request.json()
    except Exception:
        body = {}
    # Valid JSON is not necessarily an object (e.g. `[]` or `null`).
    layout_overrides = body.get("overrides") if isinstance(body, dict) else None
    if not isinstance(layout_overrides, dict):
        layout_overrides = {}

    from cv_box_layout import build_box_zone_grid
    from grid_editor_helpers import _words_in_zone

    valid_layouts = ("flowing", "columnar", "bullet_list", "header_only")

    img_w = grid_data.get("image_width", 0)
    img_h = grid_data.get("image_height", 0)

    zones = grid_data.get("zones", [])
    box_count = 0
    spell_fixes = 0

    # The spell checker is optional and created lazily, once, on first use.
    spell_checker = None
    spell_checker_available = True

    for z in zones:
        if z.get("zone_type") != "box":
            continue

        bbox = z.get("bbox_px", {})
        bx, by = bbox.get("x", 0), bbox.get("y", 0)
        bw, bh = bbox.get("w", 0), bbox.get("h", 0)

        if bw <= 0 or bh <= 0:
            continue

        zone_idx = z.get("zone_index", 0)

        # Filter words inside this box
        zone_words = _words_in_zone(all_words, by, bh, bx, bw)
        if not zone_words:
            logger.info("Box zone %d: no words found in bbox", zone_idx)
            continue

        # Get layout override or auto-detect
        forced_layout = layout_overrides.get(str(zone_idx))
        if forced_layout is not None and forced_layout not in valid_layouts:
            logger.warning(
                "Box zone %d: ignoring unknown layout override %r",
                zone_idx, forced_layout,
            )
            forced_layout = None

        # Build box grid
        box_grid = build_box_zone_grid(
            zone_words, bx, by, bw, bh,
            zone_idx, img_w, img_h,
            layout_type=forced_layout,
        )

        # Apply SmartSpellChecker to all box cells (best effort)
        if spell_checker_available:
            try:
                if spell_checker is None:
                    from smart_spell import SmartSpellChecker
                    spell_checker = SmartSpellChecker()
                for cell in box_grid.get("cells", []):
                    text = cell.get("text", "")
                    if not text:
                        continue
                    result = spell_checker.correct_text(text, lang="auto")
                    if result.changed:
                        cell["text"] = result.corrected
                        spell_fixes += 1
            except ImportError:
                # Do not retry for the remaining zones.
                spell_checker_available = False
                logger.warning(
                    "build-box-grids session %s: spell checker unavailable, skipping corrections",
                    session_id,
                )

        # Update zone data with new grid
        z["columns"] = box_grid["columns"]
        z["rows"] = box_grid["rows"]
        z["cells"] = box_grid["cells"]
        z["header_rows"] = box_grid.get("header_rows", [])
        z["box_layout_type"] = box_grid.get("box_layout_type", "flowing")
        z["box_grid_reviewed"] = False
        box_count += 1

    # Save updated grid back
    await update_session_db(session_id, grid_editor_result=grid_data)

    logger.info(
        "build-box-grids session %s: %d box zones rebuilt, %d spell fixes",
        session_id, box_count, spell_fixes,
    )

    return {
        "session_id": session_id,
        "box_zones_rebuilt": box_count,
        "spell_fixes": spell_fixes,
        "zones": zones,
    }