Files
breakpilot-lehrer/klausur-service/backend/cv_box_layout.py
Benjamin Admin c62ff7cd31
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 38s
CI / test-python-klausur (push) Failing after 2m45s
CI / test-python-agent-core (push) Successful in 38s
CI / test-nodejs-website (push) Successful in 34s
Generic colspan detection for merged cells in grids and boxes
New _detect_colspan_cells() in grid_editor_helpers.py:
- Runs after _build_cells() for every zone (content + box)
- Detects word-blocks that extend across column boundaries
- Merges affected cells into spanning_header with colspan=N
- Uses column midpoints to determine which columns are covered
- Works for full-page scans and box zones equally

Also fixes box flowing/bullet_list row height fields (y_min_px/y_max_px).

Removed duplicate spanning logic from cv_box_layout.py — now uses
the generic _detect_colspan_cells from grid_editor_helpers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 11:38:03 +02:00

266 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Box layout classifier — detects internal layout type of embedded boxes.
Classifies each box as: flowing | columnar | bullet_list | header_only
and provides layout-appropriate grid building.
Used by the Box-Grid-Review step to rebuild box zones with correct structure.
"""
import logging
import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# Bullet / list-item patterns at the start of a line
_BULLET_RE = re.compile(
r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s' # dash, bullet chars
r'|^\d{1,2}[.)]\s' # numbered: "1) " or "1. "
r'|^[a-z][.)]\s' # lettered: "a) " or "a. "
)
def classify_box_layout(
words: List[Dict],
box_w: int,
box_h: int,
) -> str:
"""Classify the internal layout of a detected box.
Args:
words: OCR word dicts within the box (with top, left, width, height, text)
box_w: Box width in pixels
box_h: Box height in pixels
Returns:
'header_only' | 'bullet_list' | 'columnar' | 'flowing'
"""
if not words:
return "header_only"
# Group words into lines by y-proximity
lines = _group_into_lines(words)
# Header only: very few words or single line
total_words = sum(len(line) for line in lines)
if total_words <= 5 or len(lines) <= 1:
return "header_only"
# Bullet list: check if majority of lines start with bullet patterns
bullet_count = 0
for line in lines:
first_text = line[0].get("text", "") if line else ""
if _BULLET_RE.match(first_text):
bullet_count += 1
# Also check if first word IS a bullet char
elif first_text.strip() in ("-", "", "", "", "·", "", ""):
bullet_count += 1
if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
return "bullet_list"
# Columnar: check for multiple distinct x-clusters
if len(lines) >= 3 and _has_column_structure(words, box_w):
return "columnar"
# Default: flowing text
return "flowing"
def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
"""Group words into lines by y-proximity."""
if not words:
return []
sorted_words = sorted(words, key=lambda w: (w["top"], w["left"]))
heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
median_h = statistics.median(heights) if heights else 20
y_tolerance = max(median_h * 0.5, 5)
lines: List[List[Dict]] = []
current_line: List[Dict] = [sorted_words[0]]
current_y = sorted_words[0]["top"]
for w in sorted_words[1:]:
if abs(w["top"] - current_y) <= y_tolerance:
current_line.append(w)
else:
lines.append(sorted(current_line, key=lambda ww: ww["left"]))
current_line = [w]
current_y = w["top"]
if current_line:
lines.append(sorted(current_line, key=lambda ww: ww["left"]))
return lines
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
"""Check if words have multiple distinct left-edge clusters (columns)."""
if box_w <= 0:
return False
lines = _group_into_lines(words)
if len(lines) < 3:
return False
# Collect left-edges of non-first words in each line
# (first word of each line often aligns regardless of columns)
left_edges = []
for line in lines:
for w in line[1:]: # skip first word
left_edges.append(w["left"])
if len(left_edges) < 4:
return False
# Check if left edges cluster into 2+ distinct groups
left_edges.sort()
gaps = [left_edges[i + 1] - left_edges[i] for i in range(len(left_edges) - 1)]
if not gaps:
return False
median_gap = statistics.median(gaps)
# A column gap is typically > 15% of box width
column_gap_threshold = box_w * 0.15
large_gaps = [g for g in gaps if g > column_gap_threshold]
return len(large_gaps) >= 1
def build_box_zone_grid(
zone_words: List[Dict],
box_x: int,
box_y: int,
box_w: int,
box_h: int,
zone_index: int,
img_w: int,
img_h: int,
layout_type: Optional[str] = None,
) -> Dict[str, Any]:
"""Build a grid for a box zone with layout-aware processing.
If layout_type is None, auto-detects it.
For 'flowing' and 'bullet_list', forces single-column layout.
For 'columnar', uses the standard multi-column detection.
For 'header_only', creates a single cell.
Returns the same format as _build_zone_grid (columns, rows, cells, header_rows).
"""
from grid_editor_helpers import _build_zone_grid, _cluster_rows
if not zone_words:
return {
"columns": [],
"rows": [],
"cells": [],
"header_rows": [],
"box_layout_type": layout_type or "header_only",
"box_grid_reviewed": False,
}
# Auto-detect layout if not specified
if not layout_type:
layout_type = classify_box_layout(zone_words, box_w, box_h)
logger.info(
"Box zone %d: layout_type=%s, %d words, %dx%d",
zone_index, layout_type, len(zone_words), box_w, box_h,
)
if layout_type == "header_only":
# Single cell with all text concatenated
all_text = " ".join(
w.get("text", "") for w in sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
).strip()
return {
"columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
"rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
"cells": [{
"cell_id": f"Z{zone_index}_R0C0",
"row_index": 0,
"col_index": 0,
"col_type": "column_1",
"text": all_text,
"word_boxes": zone_words,
}],
"header_rows": [0],
"box_layout_type": layout_type,
"box_grid_reviewed": False,
}
if layout_type in ("flowing", "bullet_list"):
# Force single column — each line becomes one row with one cell
lines = _group_into_lines(zone_words)
column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}
rows = []
cells = []
for row_idx, line_words in enumerate(lines):
if not line_words:
continue
y_min = min(w["top"] for w in line_words)
y_max = max(w["top"] + w["height"] for w in line_words)
y_center = (y_min + y_max) / 2
row = {
"index": row_idx,
"row_index": row_idx,
"y_min": y_min,
"y_max": y_max,
"y_center": y_center,
# GridTable expects _px and _pct variants
"y_min_px": y_min,
"y_max_px": y_max,
"y_min_pct": round(y_min / img_h * 100, 2) if img_h else 0,
"y_max_pct": round(y_max / img_h * 100, 2) if img_h else 0,
}
rows.append(row)
line_text = " ".join(w.get("text", "") for w in line_words).strip()
cell = {
"cell_id": f"Z{zone_index}_R{row_idx}C0",
"row_index": row_idx,
"col_index": 0,
"col_type": "column_1",
"text": line_text,
"word_boxes": line_words,
}
cells.append(cell)
# Detect header: first row if it's notably different (bold, larger, or short)
header_rows = []
if len(lines) >= 2:
first_line = lines[0]
first_text = " ".join(w.get("text", "") for w in first_line).strip()
# Header heuristic: short text, or all-caps, or ends with ':'
if (len(first_text) < 40
or first_text.isupper()
or first_text.rstrip().endswith(':')):
header_rows = [0]
return {
"columns": [column],
"rows": rows,
"cells": cells,
"header_rows": header_rows,
"box_layout_type": layout_type,
"box_grid_reviewed": False,
}
# Columnar: use standard grid builder with independent column detection
result = _build_zone_grid(
zone_words, box_x, box_y, box_w, box_h,
zone_index, img_w, img_h,
global_columns=None, # detect columns independently
)
# Colspan detection is now handled generically by _detect_colspan_cells
# in grid_editor_helpers.py (called inside _build_zone_grid).
result["box_layout_type"] = layout_type
result["box_grid_reviewed"] = False
return result