[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths): - grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones) - cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab) - worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes) - legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion) - cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel) - cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel) - rbac.py, admin_api.py, routes/eh.py remain (next batch) backend-lehrer (1 monolith): - classroom_engine/repository.py (1,705 → 7 files by domain) All re-export barrels preserve backward compatibility. Zero import errors verified. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
389
klausur-service/backend/grid_editor_zones.py
Normal file
389
klausur-service/backend/grid_editor_zones.py
Normal file
@@ -0,0 +1,389 @@
|
||||
"""
|
||||
Grid Editor — vertical divider detection, zone splitting/merging, zone grid building.
|
||||
|
||||
Split from grid_editor_helpers.py for maintainability.
|
||||
All functions are pure computation — no HTTP, DB, or session side effects.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import PageZone
|
||||
from cv_words_first import _cluster_rows, _build_cells
|
||||
|
||||
from grid_editor_columns import (
|
||||
_cluster_columns_by_alignment,
|
||||
_merge_inline_marker_columns,
|
||||
_split_cross_column_words,
|
||||
)
|
||||
from grid_editor_headers import (
|
||||
_detect_header_rows,
|
||||
_detect_colspan_cells,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vertical divider detection and zone splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
||||
|
||||
|
||||
def _detect_vertical_dividers(
|
||||
words: List[Dict],
|
||||
zone_x: int,
|
||||
zone_w: int,
|
||||
zone_y: int,
|
||||
zone_h: int,
|
||||
) -> List[float]:
|
||||
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
||||
|
||||
Returns list of divider x-positions (empty if no dividers found).
|
||||
"""
|
||||
if not words or zone_w <= 0 or zone_h <= 0:
|
||||
return []
|
||||
|
||||
# Collect pipe word_boxes
|
||||
pipes = [
|
||||
w for w in words
|
||||
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
||||
]
|
||||
if len(pipes) < 5:
|
||||
return []
|
||||
|
||||
# Cluster pipe x-centers by proximity
|
||||
tolerance = max(15, int(zone_w * 0.02))
|
||||
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
||||
|
||||
clusters: List[List[float]] = [[pipe_xs[0]]]
|
||||
for x in pipe_xs[1:]:
|
||||
if x - clusters[-1][-1] <= tolerance:
|
||||
clusters[-1].append(x)
|
||||
else:
|
||||
clusters.append([x])
|
||||
|
||||
dividers: List[float] = []
|
||||
for cluster in clusters:
|
||||
if len(cluster) < 5:
|
||||
continue
|
||||
mean_x = sum(cluster) / len(cluster)
|
||||
# Must be between 15% and 85% of zone width
|
||||
rel_pos = (mean_x - zone_x) / zone_w
|
||||
if rel_pos < 0.15 or rel_pos > 0.85:
|
||||
continue
|
||||
# Check vertical coverage: pipes must span >= 50% of zone height
|
||||
cluster_pipes = [
|
||||
w for w in pipes
|
||||
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
||||
]
|
||||
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
||||
y_span = max(ys) - min(ys) if ys else 0
|
||||
if y_span < zone_h * 0.5:
|
||||
continue
|
||||
dividers.append(mean_x)
|
||||
|
||||
return sorted(dividers)
|
||||
|
||||
|
||||
def _split_zone_at_vertical_dividers(
    zone: "PageZone",
    divider_xs: List[float],
    vsplit_group_id: int,
) -> List["PageZone"]:
    """Split a PageZone at vertical divider positions into sub-zones.

    Each sub-zone keeps the parent's vertical extent, type, box, and overlays;
    only x/width change. Sub-zones are tagged with a layout hint
    (left/middle/right of the split) and the shared vsplit group id.
    """
    edges = [zone.x, *divider_xs, zone.x + zone.width]
    n_subs = len(edges) - 1

    def _hint(pos: int) -> str:
        # Position 0 wins over "rightmost" when there is only one sub-zone,
        # matching the original evaluation order.
        if pos == 0:
            return "left_of_vsplit"
        if pos == n_subs - 1:
            return "right_of_vsplit"
        return "middle_of_vsplit"

    pieces: List["PageZone"] = []
    for pos in range(n_subs):
        left_edge = int(edges[pos])
        right_edge = int(edges[pos + 1])
        pieces.append(PageZone(
            index=0,  # re-indexed later by the caller
            zone_type=zone.zone_type,
            y=zone.y,
            height=zone.height,
            x=left_edge,
            width=right_edge - left_edge,
            box=zone.box,
            image_overlays=zone.image_overlays,
            layout_hint=_hint(pos),
            vsplit_group=vsplit_group_id,
        ))

    return pieces
|
||||
|
||||
|
||||
def _merge_content_zones_across_boxes(
|
||||
zones: List,
|
||||
content_x: int,
|
||||
content_w: int,
|
||||
) -> List:
|
||||
"""Merge content zones separated by box zones into single zones.
|
||||
|
||||
Box zones become image_overlays on the merged content zone.
|
||||
Pattern: [content, box*, content] -> [merged_content with overlay]
|
||||
Box zones NOT between two content zones stay as standalone zones.
|
||||
"""
|
||||
if len(zones) < 3:
|
||||
return zones
|
||||
|
||||
# Group consecutive runs of [content, box+, content]
|
||||
result: List = []
|
||||
i = 0
|
||||
while i < len(zones):
|
||||
z = zones[i]
|
||||
if z.zone_type != "content":
|
||||
result.append(z)
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Start of a potential merge group: content zone
|
||||
group_contents = [z]
|
||||
group_boxes = []
|
||||
j = i + 1
|
||||
# Absorb [box, content] pairs -- only absorb a box if it's
|
||||
# confirmed to be followed by another content zone.
|
||||
while j < len(zones):
|
||||
if (zones[j].zone_type == "box"
|
||||
and j + 1 < len(zones)
|
||||
and zones[j + 1].zone_type == "content"):
|
||||
group_boxes.append(zones[j])
|
||||
group_contents.append(zones[j + 1])
|
||||
j += 2
|
||||
else:
|
||||
break
|
||||
|
||||
if len(group_contents) >= 2 and group_boxes:
|
||||
# Merge: create one large content zone spanning all
|
||||
y_min = min(c.y for c in group_contents)
|
||||
y_max = max(c.y + c.height for c in group_contents)
|
||||
overlays = []
|
||||
for bz in group_boxes:
|
||||
overlay = {
|
||||
"y": bz.y,
|
||||
"height": bz.height,
|
||||
"x": bz.x,
|
||||
"width": bz.width,
|
||||
}
|
||||
if bz.box:
|
||||
overlay["box"] = {
|
||||
"x": bz.box.x,
|
||||
"y": bz.box.y,
|
||||
"width": bz.box.width,
|
||||
"height": bz.box.height,
|
||||
"confidence": bz.box.confidence,
|
||||
"border_thickness": bz.box.border_thickness,
|
||||
}
|
||||
overlays.append(overlay)
|
||||
|
||||
merged = PageZone(
|
||||
index=0, # re-indexed below
|
||||
zone_type="content",
|
||||
y=y_min,
|
||||
height=y_max - y_min,
|
||||
x=content_x,
|
||||
width=content_w,
|
||||
image_overlays=overlays,
|
||||
)
|
||||
result.append(merged)
|
||||
i = j
|
||||
else:
|
||||
# No merge possible -- emit just the content zone
|
||||
result.append(z)
|
||||
i += 1
|
||||
|
||||
# Re-index zones
|
||||
for idx, z in enumerate(result):
|
||||
z.index = idx
|
||||
|
||||
logger.info(
|
||||
"zone-merge: %d zones -> %d zones after merging across boxes",
|
||||
len(zones), len(result),
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def _build_zone_grid(
    zone_words: List[Dict],
    zone_x: int,
    zone_y: int,
    zone_w: int,
    zone_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    global_columns: Optional[List[Dict]] = None,
    skip_first_row_header: bool = False,
) -> Dict[str, Any]:
    """Build columns, rows, cells for a single zone from its words.

    Pipeline: cluster rows -> detect (or reuse) columns -> split words that
    straddle column boundaries -> build cells -> detect colspans on the
    original words -> prefix cell IDs -> detect header rows -> collapse
    spanning header rows into one col-0 cell -> convert to output dicts.

    Args:
        zone_words: OCR word dicts; this code reads "left"/"top"/"width"/
            "height" and "text" keys.
        zone_x: Zone left edge in px. NOTE(review): currently unused here;
            presumably kept for signature symmetry with callers — confirm.
        zone_y: Zone top edge in px; forwarded to _detect_header_rows.
        zone_w: Zone width in px; used for per-zone column detection.
        zone_h: Zone height in px. NOTE(review): currently unused here.
        zone_index: Index used to prefix cell IDs ("Z{n}_...") and tag cells.
        img_w: Full-image width in px, for percentage conversion (0 -> pct 0).
        img_h: Full-image height in px, for percentage conversion (0 -> pct 0).
        global_columns: If provided, use these pre-computed column boundaries
            instead of detecting columns per zone. Used for content zones so
            that all content zones (above/between/below boxes) share the same
            column structure. Box zones always detect columns independently.
        skip_first_row_header: Forwarded to _detect_header_rows.

    Returns:
        Dict with "columns", "rows", "cells", "header_rows"; on the success
        path also "_raw_columns" (internal column dicts for propagation to
        sibling zones). Empty lists (without "_raw_columns") when the zone
        has no words or no columns/rows could be derived.
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Cluster rows first (needed for column alignment analysis)
    rows = _cluster_rows(zone_words)

    # Diagnostic logging for small/medium zones (box zones typically have 40-60 words).
    # _med_h/_y_tol here are recomputed purely for the log line; the actual
    # clustering happened inside _cluster_rows above.
    if len(zone_words) <= 60:
        import statistics as _st
        _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0]
        _med_h = _st.median(_heights) if _heights else 20
        _y_tol = max(_med_h * 0.5, 5)
        logger.info(
            "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows",
            zone_index, len(zone_words), _med_h, _y_tol, len(rows),
        )
        for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])):
            logger.info(
                " zone %d word: y=%d x=%d h=%d w=%d '%s'",
                zone_index, w['top'], w['left'], w['height'], w['width'],
                w.get('text', '')[:40],
            )
        for r in rows:
            logger.info(
                " zone %d row %d: y_min=%d y_max=%d y_center=%.0f",
                zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'],
            )

    # Use global columns if provided, otherwise detect per zone
    columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows)

    # Merge inline marker columns (bullets, numbering) into adjacent text.
    # Only for per-zone detection — global columns are taken as-is.
    if not global_columns:
        columns = _merge_inline_marker_columns(columns, zone_words)

    if not columns or not rows:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
        }

    # Split word boxes that straddle column boundaries (e.g. "sichzie"
    # spanning Col 1 + Col 2). Must happen after column detection and
    # before cell assignment.
    # Keep original words for colspan detection (split destroys span info).
    original_zone_words = zone_words
    if len(columns) >= 2:
        zone_words = _split_cross_column_words(zone_words, columns)

    # Build cells
    cells = _build_cells(zone_words, columns, rows, img_w, img_h)

    # --- Detect colspan (merged cells spanning multiple columns) ---
    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
    # multiple columns. _split_cross_column_words would have destroyed
    # this information by cutting words at column boundaries.
    if len(columns) >= 2:
        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)

    # Prefix cell IDs with zone index so IDs stay unique across zones
    for cell in cells:
        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
        cell["zone_index"] = zone_index

    # Detect header rows (pass columns for spanning header detection)
    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
                                      skip_first_row_header=skip_first_row_header)

    # Merge cells in spanning header rows into a single col-0 cell
    if header_rows and len(columns) >= 2:
        for hri in header_rows:
            header_cells = [c for c in cells if c["row_index"] == hri]
            if len(header_cells) <= 1:
                continue
            # Collect all word_boxes and text from all columns (left-to-right)
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())
            # Remove all header cells, replace with one spanning cell.
            # NOTE(review): if all_wb is empty the row's cells are dropped
            # with no replacement, even if they carried text — confirm this
            # is intended for word-box-less cells.
            cells = [c for c in cells if c["row_index"] != hri]
            if all_wb:
                # Bounding box of the merged header = union of all word boxes.
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                cells.append({
                    "cell_id": f"R{hri:02d}_C0",
                    "row_index": hri,
                    "col_index": 0,
                    "col_type": "spanning_header",
                    "text": " ".join(all_text_parts),
                    "confidence": 0.0,
                    "bbox_px": {"x": x_min, "y": y_min,
                                "w": x_max - x_min, "h": y_max - y_min},
                    "bbox_pct": {
                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                    },
                    "word_boxes": all_wb,
                    "ocr_engine": "words_first",
                    "is_bold": True,
                })

    # Convert columns to output format with percentages
    out_columns = []
    for col in columns:
        x_min = col["x_min"]
        x_max = col["x_max"]
        out_columns.append({
            "index": col["index"],
            "label": col["type"],
            "x_min_px": round(x_min),
            "x_max_px": round(x_max),
            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
            "bold": False,
        })

    # Convert rows to output format with percentages
    out_rows = []
    for row in rows:
        out_rows.append({
            "index": row["index"],
            "y_min_px": round(row["y_min"]),
            "y_max_px": round(row["y_max"]),
            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
            "is_header": row["index"] in header_rows,
        })

    return {
        "columns": out_columns,
        "rows": out_rows,
        "cells": cells,
        "header_rows": header_rows,
        "_raw_columns": columns,  # internal: for propagation to other zones
    }
|
||||
Reference in New Issue
Block a user