Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 44s
CI / test-go-edu-search (push) Successful in 43s
CI / test-python-klausur (push) Failing after 2m52s
CI / test-python-agent-core (push) Successful in 36s
CI / test-nodejs-website (push) Successful in 37s
New pipeline step between Gutter Repair and Ground Truth that processes
embedded boxes (grammar tips, exercises) independently from the main grid.
Backend:
- cv_box_layout.py: classify_box_layout() detects flowing/columnar/
bullet_list/header_only layout types per box
- build_box_zone_grid(): layout-aware grid building (single-column for
flowing text, independent columns for tabular content)
- POST /sessions/{id}/build-box-grids endpoint with SmartSpellChecker
- Layout type overridable per box via request body
Frontend:
- StepBoxGridReview.tsx: shows each box with cropped image + editable
GridTable. Layout type dropdown per box. Auto-builds on first load.
- Auto-skip when no boxes detected on page
- Pipeline steps updated: 13 steps (0-12), Ground Truth moved to 12
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
257 lines
8.2 KiB
Python
257 lines
8.2 KiB
Python
"""
|
||
Box layout classifier — detects internal layout type of embedded boxes.
|
||
|
||
Classifies each box as: flowing | columnar | bullet_list | header_only
|
||
and provides layout-appropriate grid building.
|
||
|
||
Used by the Box-Grid-Review step to rebuild box zones with correct structure.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
import statistics
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Bullet / list-item patterns at the start of a line
|
||
_BULLET_RE = re.compile(
|
||
r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s' # dash, bullet chars
|
||
r'|^\d{1,2}[.)]\s' # numbered: "1) " or "1. "
|
||
r'|^[a-z][.)]\s' # lettered: "a) " or "a. "
|
||
)
|
||
|
||
|
||
def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Classify the internal layout of a detected box.

    Checks are applied in order and the first hit wins:

    1. ``header_only`` - no words, at most 5 words, or at most one text line.
    2. ``bullet_list`` - at least 2 lines, and at least 40% of all lines,
       start with a list marker.
    3. ``columnar``    - at least 3 lines and ``_has_column_structure`` is true.
    4. ``flowing``     - everything else.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels (forwarded to the column check)
        box_h: Box height in pixels (accepted for interface symmetry; not
            used by the current heuristics)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    # Group words into lines by y-proximity
    lines = _group_into_lines(words)

    # Header only: very few words or single line
    total_words = sum(len(line) for line in lines)
    if total_words <= 5 or len(lines) <= 1:
        return "header_only"

    # Bullet list: count the lines that open with a list marker.
    bullet_count = 0
    for line in lines:
        if not line:
            continue
        first_text = line[0].get("text") or ""
        # _BULLET_RE demands whitespace after the marker.  A single OCR word
        # such as "1." or "a)" carries no whitespace, so the pattern has to be
        # applied to the reconstructed line ("1. Foo"), not to the first word
        # alone.  The joined text starts with the first word, so anything the
        # first word matched on its own still matches here.
        line_text = " ".join((w.get("text") or "") for w in line).lstrip()
        if _BULLET_RE.match(line_text):
            bullet_count += 1
        # Also count a first word that IS a bullet glyph on its own
        # (covers glyphs that are not part of _BULLET_RE, e.g. "▸").
        elif first_text.strip() in ("-", "–", "—", "•", "·", "▪", "▸"):
            bullet_count += 1

    # 40% rather than a strict majority: headings and wrapped continuation
    # lines inside a list do not start with a marker.
    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
        return "bullet_list"

    # Columnar: check for multiple distinct x-clusters
    if len(lines) >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    # Default: flowing text
    return "flowing"
|
||
|
||
|
||
def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
|
||
"""Group words into lines by y-proximity."""
|
||
if not words:
|
||
return []
|
||
|
||
sorted_words = sorted(words, key=lambda w: (w["top"], w["left"]))
|
||
heights = [w["height"] for w in sorted_words if w.get("height", 0) > 0]
|
||
median_h = statistics.median(heights) if heights else 20
|
||
y_tolerance = max(median_h * 0.5, 5)
|
||
|
||
lines: List[List[Dict]] = []
|
||
current_line: List[Dict] = [sorted_words[0]]
|
||
current_y = sorted_words[0]["top"]
|
||
|
||
for w in sorted_words[1:]:
|
||
if abs(w["top"] - current_y) <= y_tolerance:
|
||
current_line.append(w)
|
||
else:
|
||
lines.append(sorted(current_line, key=lambda ww: ww["left"]))
|
||
current_line = [w]
|
||
current_y = w["top"]
|
||
|
||
if current_line:
|
||
lines.append(sorted(current_line, key=lambda ww: ww["left"]))
|
||
|
||
return lines
|
||
|
||
|
||
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
|
||
"""Check if words have multiple distinct left-edge clusters (columns)."""
|
||
if box_w <= 0:
|
||
return False
|
||
|
||
lines = _group_into_lines(words)
|
||
if len(lines) < 3:
|
||
return False
|
||
|
||
# Collect left-edges of non-first words in each line
|
||
# (first word of each line often aligns regardless of columns)
|
||
left_edges = []
|
||
for line in lines:
|
||
for w in line[1:]: # skip first word
|
||
left_edges.append(w["left"])
|
||
|
||
if len(left_edges) < 4:
|
||
return False
|
||
|
||
# Check if left edges cluster into 2+ distinct groups
|
||
left_edges.sort()
|
||
gaps = [left_edges[i + 1] - left_edges[i] for i in range(len(left_edges) - 1)]
|
||
if not gaps:
|
||
return False
|
||
|
||
median_gap = statistics.median(gaps)
|
||
# A column gap is typically > 15% of box width
|
||
column_gap_threshold = box_w * 0.15
|
||
large_gaps = [g for g in gaps if g > column_gap_threshold]
|
||
|
||
return len(large_gaps) >= 1
|
||
|
||
|
||
def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If ``layout_type`` is falsy it is auto-detected with
    ``classify_box_layout``.

    * ``header_only``: one column, one row spanning the whole box and a single
      cell holding all words joined in reading order.
    * ``flowing`` / ``bullet_list``: one column; every detected text line
      becomes one row with one cell.  Row 0 is marked as header when its text
      is shorter than 40 characters, all upper-case, or ends with ``:``.
    * anything else (``columnar``): delegated to
      ``grid_editor_helpers._build_zone_grid`` with ``global_columns=None`` so
      columns are detected independently for this box.

    Args:
        zone_words: OCR word dicts located inside the box.
        box_x, box_y, box_w, box_h: Box rectangle in pixels.
        zone_index: Zone number; used for cell ids (``Z{zone}_R{row}C{col}``)
            and logging.
        img_w, img_h: Page image size, forwarded to the columnar builder.
        layout_type: Optional explicit layout overriding auto-detection.

    Returns:
        Dict with ``columns``, ``rows``, ``cells``, ``header_rows`` plus
        ``box_layout_type`` and ``box_grid_reviewed`` (always False here).
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated.  Words are ordered via the
        # line grouping rather than a raw (top, left) sort: OCR words on the
        # same line usually differ by a pixel or two in ``top``, and a raw
        # sort would then scramble their left-to-right order.
        reading_order = [w for line in _group_into_lines(zone_words) for w in line]
        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
        return {
            "columns": [{"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}],
            "rows": [{"index": 0, "row_index": 0, "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2}],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column — each line becomes one row with one cell.
        # Filtering (instead of skipping inside the loop) keeps row indices
        # contiguous and aligned with positions in ``rows`` / ``cells``.
        lines = [line for line in _group_into_lines(zone_words) if line]
        column = {"col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1"}
        rows = []
        cells = []

        for row_idx, line_words in enumerate(lines):
            y_min = min(w["top"] for w in line_words)
            y_max = max(w["top"] + w["height"] for w in line_words)
            rows.append({
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": (y_min + y_max) / 2,
            })
            cells.append({
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": " ".join(w.get("text", "") for w in line_words).strip(),
                "word_boxes": line_words,
            })

        # Header heuristic for the first row: short text, or all-caps,
        # or ends with ':'.  Only applied when there is a body below it.
        header_rows = []
        if len(lines) >= 2:
            first_text = cells[0]["text"]
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.endswith(':')):
                header_rows = [0]

        return {
            "columns": [column],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use standard grid builder with independent column detection.
    # Imported here so the other layouts do not depend on grid_editor_helpers
    # being importable.
    from grid_editor_helpers import _build_zone_grid

    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )
    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result
|