Files
breakpilot-lehrer/klausur-service/backend/cv_box_layout.py
Benjamin Admin 5da9a550bf
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 44s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m52s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
Add Box-Grid-Review step (Step 11) to OCR pipeline
New pipeline step between Gutter Repair and Ground Truth that processes
embedded boxes (grammar tips, exercises) independently from the main grid.

Backend:
- cv_box_layout.py: classify_box_layout() detects flowing/columnar/
  bullet_list/header_only layout types per box
- build_box_zone_grid(): layout-aware grid building (single-column for
  flowing text, independent columns for tabular content)
- POST /sessions/{id}/build-box-grids endpoint with SmartSpellChecker
- Layout type overridable per box via request body

Frontend:
- StepBoxGridReview.tsx: shows each box with cropped image + editable
  GridTable. Layout type dropdown per box. Auto-builds on first load.
- Auto-skip when no boxes detected on page
- Pipeline steps updated: 13 steps (0-12), Ground Truth moved to 12

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 17:26:06 +02:00

257 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Box layout classifier — detects internal layout type of embedded boxes.
Classifies each box as: flowing | columnar | bullet_list | header_only
and provides layout-appropriate grid building.
Used by the Box-Grid-Review step to rebuild box zones with correct structure.
"""
import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Bullet / list-item markers anchored at the start of a line.
# Every alternative requires one whitespace character directly after the
# marker, so the pattern matches text such as "- item" or "1) item" but not a
# bare "-" or "1)" token. The lettered alternative is lowercase-only.
_BULLET_RE = re.compile(
r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s' # hyphen, bullet, en/em dash, circle and square glyphs, middle dot
r'|^\d{1,2}[.)]\s' # numbered (one or two digits): "1) " or "1. "
r'|^[a-z][.)]\s' # lettered: "a) " or "a. "
)
def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Classify the internal layout of a detected box.

    The checks run in order and the first match wins:

    1. No words, at most 5 words in total, or at most one line -> ``header_only``.
    2. At least 2 lines, and at least 40% of all lines, start with a list
       marker -> ``bullet_list``.
    3. At least 3 lines and ``_has_column_structure`` reports columns ->
       ``columnar``.
    4. Otherwise -> ``flowing``.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels
        box_h: Box height in pixels (not used by the current heuristics)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    # Glyphs that count as a list marker when they form a whole OCR token.
    # The first nine mirror the glyphs accepted by the bullet regex. Written
    # as escapes so they survive editors and tools that drop unusual glyphs.
    # NOTE(review): the last two are private-use code points that are
    # presumably emitted for symbol-font bullets -- confirm against real OCR
    # output and extend the set if other glyphs show up.
    bare_bullets = frozenset((
        "-", "\u2013", "\u2014",              # hyphen, en dash, em dash
        "\u2022", "\u00b7",                   # bullet, middle dot
        "\u25cf", "\u25cb",                   # black / white circle
        "\u25aa", "\u25a0",                   # small / regular black square
        "\uf0b7", "\uf0a7",
    ))

    # Group words into lines by y-proximity
    lines = _group_into_lines(words)

    # Header only: very few words or single line
    total_words = sum(len(line) for line in lines)
    if total_words <= 5 or len(lines) <= 1:
        return "header_only"

    # Bullet list: count the lines that start with a list marker
    bullet_count = 0
    for line in lines:
        first_text = line[0].get("text", "") if line else ""
        # The bullet regex requires whitespace after the marker. A single OCR
        # token normally has none, so the marker is also looked for in the
        # re-joined line text ("1." + "Foo" -> "1. Foo").
        line_text = " ".join(w.get("text", "") for w in line).lstrip()
        if (
            _BULLET_RE.match(first_text)
            or _BULLET_RE.match(line_text)
            # A first token that IS a bullet glyph; an empty token is not one.
            or first_text.strip() in bare_bullets
        ):
            bullet_count += 1
    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
        return "bullet_list"

    # Columnar: check for multiple distinct x-clusters
    if len(lines) >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    # Default: flowing text
    return "flowing"
def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
    """Split OCR words into text lines based on their vertical position.

    Words are visited in (top, left) order. A word joins the current line
    while its ``top`` lies within the tolerance of the ``top`` of the word
    that opened that line; otherwise it opens a new line. The tolerance is
    half the median positive word height (20 is assumed when no word has a
    positive height), but never less than 5.

    Returns:
        The lines from top to bottom, each sorted by ``left``.
    """
    if not words:
        return []

    ordered = sorted(words, key=lambda item: (item["top"], item["left"]))

    positive_heights = [
        item["height"] for item in ordered if item.get("height", 0) > 0
    ]
    typical_height = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_height * 0.5, 5)

    buckets: List[List[Dict]] = []
    anchor_top = None  # top of the word that opened the current line
    for item in ordered:
        if buckets and abs(item["top"] - anchor_top) <= tolerance:
            buckets[-1].append(item)
            continue
        buckets.append([item])
        anchor_top = item["top"]

    return [sorted(bucket, key=lambda item: item["left"]) for bucket in buckets]
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
    """Check if words have multiple distinct left-edge clusters (columns).

    The left edges of every word except the first word of each line are
    sorted. The box counts as columnar when two neighbouring edges in that
    sorted list are more than 15% of the box width apart.

    Args:
        words: OCR word dicts within the box (``top`` and ``left`` are read).
        box_w: Box width in pixels.

    Returns:
        True when such a gap exists. False for a non-positive ``box_w``,
        fewer than 3 lines, or fewer than 4 collected left edges.
    """
    if box_w <= 0:
        return False

    lines = _group_into_lines(words)
    if len(lines) < 3:
        return False

    # Skip the first word of each line: it often aligns with the left margin
    # regardless of whether the box has columns.
    left_edges = sorted(w["left"] for line in lines for w in line[1:])
    if len(left_edges) < 4:
        return False

    # A column gap is typically > 15% of box width
    column_gap_threshold = box_w * 0.15
    return any(
        right - left > column_gap_threshold
        for left, right in zip(left_edges, left_edges[1:])
    )
def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If layout_type is None (or empty), it is auto-detected with
    ``classify_box_layout``.

    - 'header_only': one row and one cell holding all text in reading order.
    - 'flowing' / 'bullet_list': a single column where every text line becomes
      one row with one cell.
    - anything else (i.e. 'columnar'): delegated to the standard
      ``_build_zone_grid`` with independent column detection.

    Args:
        zone_words: OCR word dicts inside the box (top, left, height, text).
        box_x, box_y, box_w, box_h: Box rectangle in pixels.
        zone_index: Zone number, used in the generated cell ids.
        img_w, img_h: Page image size, forwarded to ``_build_zone_grid``.
        layout_type: Optional layout override.

    Returns:
        A dict with ``columns``, ``rows``, ``cells`` and ``header_rows``
        (intended to match the format of ``_build_zone_grid``), plus
        ``box_layout_type`` and ``box_grid_reviewed`` (always False here).
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated. Words are taken line by
        # line so that a one-pixel difference in "top" between neighbouring
        # words cannot scramble the reading order.
        reading_order = [w for line in _group_into_lines(zone_words) for w in line]
        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
        return {
            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
            "rows": [{
                "index": 0,
                "row_index": 0,
                "y_min": box_y,
                "y_max": box_y + box_h,
                "y_center": box_y + box_h / 2,
            }],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column — each line becomes one row with one cell
        lines = _group_into_lines(zone_words)
        column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}
        rows: List[Dict[str, Any]] = []
        cells: List[Dict[str, Any]] = []
        for line_words in lines:
            if not line_words:
                continue
            # Number rows by position in the output so indices stay contiguous.
            row_idx = len(rows)
            y_min = min(w["top"] for w in line_words)
            # A missing height is treated as 0, matching how the line
            # grouping tolerates words without a height.
            y_max = max(w["top"] + w.get("height", 0) for w in line_words)
            rows.append({
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": (y_min + y_max) / 2,
            })
            cells.append({
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": " ".join(w.get("text", "") for w in line_words).strip(),
                "word_boxes": line_words,
            })

        # Detect header: the first row counts as a header when its text is
        # short, all-caps, or ends with ':' (only checked with 2+ lines).
        header_rows: List[int] = []
        if len(lines) >= 2:
            first_text = " ".join(w.get("text", "") for w in lines[0]).strip()
            if (
                len(first_text) < 40
                or first_text.isupper()
                or first_text.endswith(":")
            ):
                header_rows = [0]

        return {
            "columns": [column],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use standard grid builder with independent column detection.
    # Imported here because only this branch needs the project helper.
    from grid_editor_helpers import _build_zone_grid

    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )
    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result