klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
390 lines
13 KiB
Python
"""
|
|
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
|
|
|
Split from grid_editor_helpers.py for maintainability.
|
|
All functions are pure computation — no HTTP, DB, or session side effects.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from cv_vocab_types import PageZone
|
|
from cv_words_first import _cluster_rows, _build_cells
|
|
|
|
from grid_editor_columns import (
|
|
_cluster_columns_by_alignment,
|
|
_merge_inline_marker_columns,
|
|
_split_cross_column_words,
|
|
)
|
|
from grid_editor_headers import (
|
|
_detect_header_rows,
|
|
_detect_colspan_cells,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Vertical divider detection and zone splitting
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
|
|
|
|
|
def _detect_vertical_dividers(
|
|
words: List[Dict],
|
|
zone_x: int,
|
|
zone_w: int,
|
|
zone_y: int,
|
|
zone_h: int,
|
|
) -> List[float]:
|
|
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
|
|
|
Returns list of divider x-positions (empty if no dividers found).
|
|
"""
|
|
if not words or zone_w <= 0 or zone_h <= 0:
|
|
return []
|
|
|
|
# Collect pipe word_boxes
|
|
pipes = [
|
|
w for w in words
|
|
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
|
]
|
|
if len(pipes) < 5:
|
|
return []
|
|
|
|
# Cluster pipe x-centers by proximity
|
|
tolerance = max(15, int(zone_w * 0.02))
|
|
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
|
|
|
clusters: List[List[float]] = [[pipe_xs[0]]]
|
|
for x in pipe_xs[1:]:
|
|
if x - clusters[-1][-1] <= tolerance:
|
|
clusters[-1].append(x)
|
|
else:
|
|
clusters.append([x])
|
|
|
|
dividers: List[float] = []
|
|
for cluster in clusters:
|
|
if len(cluster) < 5:
|
|
continue
|
|
mean_x = sum(cluster) / len(cluster)
|
|
# Must be between 15% and 85% of zone width
|
|
rel_pos = (mean_x - zone_x) / zone_w
|
|
if rel_pos < 0.15 or rel_pos > 0.85:
|
|
continue
|
|
# Check vertical coverage: pipes must span >= 50% of zone height
|
|
cluster_pipes = [
|
|
w for w in pipes
|
|
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
|
]
|
|
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
|
y_span = max(ys) - min(ys) if ys else 0
|
|
if y_span < zone_h * 0.5:
|
|
continue
|
|
dividers.append(mean_x)
|
|
|
|
return sorted(dividers)
|
|
|
|
|
|
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Split a PageZone at vertical divider positions into sub-zones."""
    edges = [zone.x] + divider_xs + [zone.x + zone.width]
    n_parts = len(edges) - 1
    last = n_parts - 1

    def _hint(pos: int) -> str:
        # Leftmost segment wins over rightmost when there is only one part.
        if pos == 0:
            return "left_of_vsplit"
        if pos == last:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    parts: List["PageZone"] = []
    for pos in range(n_parts):
        left_edge = int(edges[pos])
        right_edge = int(edges[pos + 1])
        parts.append(PageZone(
            index=0,  # re-indexed later
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=left_edge,
            width=right_edge - left_edge,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=_hint(pos),
            vsplit_group=vsplit_group_id,
        ))
    return parts
|
|
|
|
|
|
def _merge_content_zones_across_boxes(
|
|
zones: List,
|
|
content_x: int,
|
|
content_w: int,
|
|
) -> List:
|
|
"""Merge content zones separated by box zones into single zones.
|
|
|
|
Box zones become image_overlays on the merged content zone.
|
|
Pattern: [content, box*, content] -> [merged_content with overlay]
|
|
Box zones NOT between two content zones stay as standalone zones.
|
|
"""
|
|
if len(zones) < 3:
|
|
return zones
|
|
|
|
# Group consecutive runs of [content, box+, content]
|
|
result: List = []
|
|
i = 0
|
|
while i < len(zones):
|
|
z = zones[i]
|
|
if z.zone_type != "content":
|
|
result.append(z)
|
|
i += 1
|
|
continue
|
|
|
|
# Start of a potential merge group: content zone
|
|
group_contents = [z]
|
|
group_boxes = []
|
|
j = i + 1
|
|
# Absorb [box, content] pairs -- only absorb a box if it's
|
|
# confirmed to be followed by another content zone.
|
|
while j < len(zones):
|
|
if (zones[j].zone_type == "box"
|
|
and j + 1 < len(zones)
|
|
and zones[j + 1].zone_type == "content"):
|
|
group_boxes.append(zones[j])
|
|
group_contents.append(zones[j + 1])
|
|
j += 2
|
|
else:
|
|
break
|
|
|
|
if len(group_contents) >= 2 and group_boxes:
|
|
# Merge: create one large content zone spanning all
|
|
y_min = min(c.y for c in group_contents)
|
|
y_max = max(c.y + c.height for c in group_contents)
|
|
overlays = []
|
|
for bz in group_boxes:
|
|
overlay = {
|
|
"y": bz.y,
|
|
"height": bz.height,
|
|
"x": bz.x,
|
|
"width": bz.width,
|
|
}
|
|
if bz.box:
|
|
overlay["box"] = {
|
|
"x": bz.box.x,
|
|
"y": bz.box.y,
|
|
"width": bz.box.width,
|
|
"height": bz.box.height,
|
|
"confidence": bz.box.confidence,
|
|
"border_thickness": bz.box.border_thickness,
|
|
}
|
|
overlays.append(overlay)
|
|
|
|
merged = PageZone(
|
|
index=0, # re-indexed below
|
|
zone_type="content",
|
|
y=y_min,
|
|
height=y_max - y_min,
|
|
x=content_x,
|
|
width=content_w,
|
|
image_overlays=overlays,
|
|
)
|
|
result.append(merged)
|
|
i = j
|
|
else:
|
|
# No merge possible -- emit just the content zone
|
|
result.append(z)
|
|
i += 1
|
|
|
|
# Re-index zones
|
|
for idx, z in enumerate(result):
|
|
z.index = idx
|
|
|
|
logger.info(
|
|
"zone-merge: %d zones -> %d zones after merging across boxes",
|
|
len(zones), len(result),
|
|
)
|
|
return result
|
|
|
|
|
|
def _build_zone_grid(
|
|
zone_words: List[Dict],
|
|
zone_x: int,
|
|
zone_y: int,
|
|
zone_w: int,
|
|
zone_h: int,
|
|
zone_index: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
global_columns: Optional[List[Dict]] = None,
|
|
skip_first_row_header: bool = False,
|
|
) -> Dict[str, Any]:
|
|
"""Build columns, rows, cells for a single zone from its words.
|
|
|
|
Args:
|
|
global_columns: If provided, use these pre-computed column boundaries
|
|
instead of detecting columns per zone. Used for content zones so
|
|
that all content zones (above/between/below boxes) share the same
|
|
column structure. Box zones always detect columns independently.
|
|
"""
|
|
if not zone_words:
|
|
return {
|
|
"columns": [],
|
|
"rows": [],
|
|
"cells": [],
|
|
"header_rows": [],
|
|
}
|
|
|
|
# Cluster rows first (needed for column alignment analysis)
|
|
rows = _cluster_rows(zone_words)
|
|
|
|
# Diagnostic logging for small/medium zones (box zones typically have 40-60 words)
|
|
if len(zone_words) <= 60:
|
|
import statistics as _st
|
|
_heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
|
|
_med_h = _st.median(_heights) if _heights else 20
|
|
_y_tol = max(_med_h * 0.5, 5)
|
|
logger.info(
|
|
"zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
|
|
zone_index, len(zone_words), _med_h, _y_tol, len(rows),
|
|
)
|
|
for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
|
|
logger.info(
|
|
" zone %d word: y=%d x=%d h=%d w=%d '%s'",
|
|
zone_index, w['top'], w['left'], w['height'], w['width'],
|
|
w.get('text', '')[:40],
|
|
)
|
|
for r in rows:
|
|
logger.info(
|
|
" zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
|
|
zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
|
|
)
|
|
|
|
# Use global columns if provided, otherwise detect per zone
|
|
columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)
|
|
|
|
# Merge inline marker columns (bullets, numbering) into adjacent text
|
|
if not global_columns:
|
|
columns = _merge_inline_marker_columns(columns, zone_words)
|
|
|
|
if not columns or not rows:
|
|
return {
|
|
"columns": [],
|
|
"rows": [],
|
|
"cells": [],
|
|
"header_rows": [],
|
|
}
|
|
|
|
# Split word boxes that straddle column boundaries (e.g. "sichzie"
|
|
# spanning Col 1 + Col 2). Must happen after column detection and
|
|
# before cell assignment.
|
|
# Keep original words for colspan detection (split destroys span info).
|
|
original_zone_words = zone_words
|
|
if len(columns) >= 2:
|
|
zone_words = _split_cross_column_words(zone_words, columns)
|
|
|
|
# Build cells
|
|
cells = _build_cells(zone_words, columns, rows, img_w, img_h)
|
|
|
|
# --- Detect colspan (merged cells spanning multiple columns) ---
|
|
# Uses the ORIGINAL (pre-split) words to detect word-blocks that span
|
|
# multiple columns. _split_cross_column_words would have destroyed
|
|
# this information by cutting words at column boundaries.
|
|
if len(columns) >= 2:
|
|
cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)
|
|
|
|
# Prefix cell IDs with zone index
|
|
for cell in cells:
|
|
cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
|
|
cell["zone_index"] = zone_index
|
|
|
|
# Detect header rows (pass columns for spanning header detection)
|
|
header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
|
|
skip_first_row_header=skip_first_row_header)
|
|
|
|
# Merge cells in spanning header rows into a single col-0 cell
|
|
if header_rows and len(columns) >= 2:
|
|
for hri in header_rows:
|
|
header_cells = [c for c in cells if c["row_index"] == hri]
|
|
if len(header_cells) <= 1:
|
|
continue
|
|
# Collect all word_boxes and text from all columns
|
|
all_wb = []
|
|
all_text_parts = []
|
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
all_wb.extend(hc.get("word_boxes", []))
|
|
if hc.get("text", "").strip():
|
|
all_text_parts.append(hc["text"].strip())
|
|
# Remove all header cells, replace with one spanning cell
|
|
cells = [c for c in cells if c["row_index"] != hri]
|
|
if all_wb:
|
|
x_min = min(wb["left"] for wb in all_wb)
|
|
y_min = min(wb["top"] for wb in all_wb)
|
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
cells.append({
|
|
"cell_id": f"R{hri:02d}_C0",
|
|
"row_index": hri,
|
|
"col_index": 0,
|
|
"col_type": "spanning_header",
|
|
"text": " ".join(all_text_parts),
|
|
"confidence": 0.0,
|
|
"bbox_px": {"x": x_min, "y": y_min,
|
|
"w": x_max - x_min, "h": y_max - y_min},
|
|
"bbox_pct": {
|
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"word_boxes": all_wb,
|
|
"ocr_engine": "words_first",
|
|
"is_bold": True,
|
|
})
|
|
|
|
# Convert columns to output format with percentages
|
|
out_columns = []
|
|
for col in columns:
|
|
x_min = col["x_min"]
|
|
x_max = col["x_max"]
|
|
out_columns.append({
|
|
"index": col["index"],
|
|
"label": col["type"],
|
|
"x_min_px": round(x_min),
|
|
"x_max_px": round(x_max),
|
|
"x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
|
|
"bold": False,
|
|
})
|
|
|
|
# Convert rows to output format with percentages
|
|
out_rows = []
|
|
for row in rows:
|
|
out_rows.append({
|
|
"index": row["index"],
|
|
"y_min_px": round(row["y_min"]),
|
|
"y_max_px": round(row["y_max"]),
|
|
"y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
|
|
"y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
|
|
"is_header": row["index"] in header_rows,
|
|
})
|
|
|
|
return {
|
|
"columns": out_columns,
|
|
"rows": out_rows,
|
|
"cells": cells,
|
|
"header_rows": header_rows,
|
|
"_raw_columns": columns, # internal: for propagation to other zones
|
|
}
|