# Refactor note (from commit message):
# klausur-service (7 monoliths):
# - grid_editor_helpers.py (1,737 -> 5 files: columns, filters, headers, zones)
# - cv_cell_grid.py (1,675 -> 7 files: build, legacy, streaming, merge, vocab)
# - worksheet_editor_api.py (1,305 -> 4 files: models, AI, reconstruct, routes)
# - legal_corpus_ingestion.py (1,280 -> 3 files: registry, chunking, ingestion)
# - cv_review.py (1,248 -> 4 files: pipeline, spell, LLM, barrel)
# - cv_preprocessing.py (1,166 -> 3 files: deskew, dewarp, barrel)
# - rbac.py, admin_api.py, routes/eh.py remain (next batch)
# backend-lehrer (1 monolith):
# - classroom_engine/repository.py (1,705 -> 7 files by domain)
# All re-export barrels preserve backward compatibility.
"""
|
|
Grid Editor — header/heading detection and colspan (merged cell) detection.
|
|
Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects.
|
|
Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from cv_ocr_engines import _text_has_garbled_ipa
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int:
|
|
"""Detect heading rows by color + height after color annotation.
|
|
|
|
A row is a heading if:
|
|
1. ALL word_boxes have color_name != 'black' (typically 'blue')
|
|
2. Mean word height > 1.2x median height of all words in the zone
|
|
|
|
Detected heading rows are merged into a single spanning cell.
|
|
Returns count of headings detected.
|
|
"""
|
|
heading_count = 0
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
rows = z.get("rows", [])
|
|
columns = z.get("columns", [])
|
|
if not cells or not rows or len(columns) < 2:
|
|
continue
|
|
|
|
# Compute median word height across the zone
|
|
all_heights = []
|
|
for cell in cells:
|
|
for wb in cell.get("word_boxes") or []:
|
|
h = wb.get("height", 0)
|
|
if h > 0:
|
|
all_heights.append(h)
|
|
if not all_heights:
|
|
continue
|
|
all_heights_sorted = sorted(all_heights)
|
|
median_h = all_heights_sorted[len(all_heights_sorted) // 2]
|
|
|
|
heading_row_indices = []
|
|
for row in rows:
|
|
if row.get("is_header"):
|
|
continue # already detected as header
|
|
ri = row["index"]
|
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
row_wbs = [
|
|
wb for cell in row_cells
|
|
for wb in cell.get("word_boxes") or []
|
|
]
|
|
if not row_wbs:
|
|
continue
|
|
|
|
# Condition 1: ALL words are non-black
|
|
all_colored = all(
|
|
wb.get("color_name", "black") != "black"
|
|
for wb in row_wbs
|
|
)
|
|
if not all_colored:
|
|
continue
|
|
|
|
# Condition 2: mean height > 1.2x median
|
|
mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs)
|
|
if mean_h <= median_h * 1.2:
|
|
continue
|
|
|
|
heading_row_indices.append(ri)
|
|
|
|
# Merge heading cells into spanning cells
|
|
for hri in heading_row_indices:
|
|
header_cells = [c for c in cells if c.get("row_index") == hri]
|
|
if len(header_cells) <= 1:
|
|
# Single cell -- just mark it as heading
|
|
if header_cells:
|
|
header_cells[0]["col_type"] = "heading"
|
|
heading_count += 1
|
|
# Mark row as header
|
|
for row in rows:
|
|
if row["index"] == hri:
|
|
row["is_header"] = True
|
|
continue
|
|
|
|
# Collect all word_boxes and text from all columns
|
|
all_wb = []
|
|
all_text_parts = []
|
|
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
|
all_wb.extend(hc.get("word_boxes", []))
|
|
if hc.get("text", "").strip():
|
|
all_text_parts.append(hc["text"].strip())
|
|
|
|
# Remove all cells for this row, replace with one spanning cell
|
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
|
|
|
if all_wb:
|
|
x_min = min(wb["left"] for wb in all_wb)
|
|
y_min = min(wb["top"] for wb in all_wb)
|
|
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
|
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
|
|
|
# Use the actual starting col_index from the first cell
|
|
first_col = min(hc["col_index"] for hc in header_cells)
|
|
zone_idx = z.get("zone_index", 0)
|
|
z["cells"].append({
|
|
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
|
|
"zone_index": zone_idx,
|
|
"row_index": hri,
|
|
"col_index": first_col,
|
|
"col_type": "heading",
|
|
"text": " ".join(all_text_parts),
|
|
"confidence": 0.0,
|
|
"bbox_px": {"x": x_min, "y": y_min,
|
|
"w": x_max - x_min, "h": y_max - y_min},
|
|
"bbox_pct": {
|
|
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
|
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
|
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
|
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"word_boxes": all_wb,
|
|
"ocr_engine": "words_first",
|
|
"is_bold": True,
|
|
})
|
|
|
|
# Mark row as header
|
|
for row in rows:
|
|
if row["index"] == hri:
|
|
row["is_header"] = True
|
|
heading_count += 1
|
|
|
|
return heading_count
|
|
|
|
|
|
def _detect_heading_rows_by_single_cell(
    zones_data: List[Dict], img_w: int, img_h: int,
) -> int:
    """Detect heading rows that have only a single content cell.

    Black headings like "Theme" have normal color and height, so they are
    missed by ``_detect_heading_rows_by_color``. The distinguishing signal
    is that they occupy only one column while normal vocabulary rows fill
    at least 2-3 columns.

    A row qualifies as a heading if:
    1. It is not already marked as a header/heading.
    2. It has exactly ONE cell whose col_type starts with ``column_``
       (excluding column_1 / page_ref which only carries page numbers).
    3. That single cell is NOT in the last column (continuation/example
       lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4).
    4. The text does not start with ``[`` (IPA continuation).
    5. The zone has >=3 columns and >=5 rows (avoids false positives in
       tiny zones).
    6. The majority of rows in the zone have >=2 content cells (ensures
       we are in a multi-column vocab layout).

    Detected heading rows are merged into one spanning cell (same cell
    shape as ``_detect_heading_rows_by_color``) and the row is flagged
    ``is_header``. Mutates ``zones_data`` in place.

    Args:
        zones_data: Zone dicts carrying "cells", "rows" and "columns" lists.
        img_w: Page width in pixels (0 yields zeroed percent bboxes).
        img_h: Page height in pixels (0 yields zeroed percent bboxes).

    Returns:
        Count of heading rows detected across all zones.
    """
    heading_count = 0

    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        columns = z.get("columns", [])
        # Rule 5: skip tiny zones entirely.
        if len(columns) < 3 or len(rows) < 5:
            continue

        # Determine the last col_index (example/sentence column)
        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
        if not col_indices:
            continue
        last_col = col_indices[-1]

        # Count content cells per row (column_* but not column_1/page_ref).
        # Exception: column_1 cells that contain a dictionary article word
        # (die/der/das etc.) ARE content -- they appear in dictionary layouts
        # where the leftmost column holds grammatical articles.
        # NOTE(review): this set is rebuilt on every zone iteration; it could
        # be hoisted to module level.
        _ARTICLE_WORDS = {
            "die", "der", "das", "dem", "den", "des", "ein", "eine",
            "the", "a", "an",
        }
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            if ct == "column_1":
                # Only article words make column_1 count as content.
                ctext = (cell.get("text") or "").strip().lower()
                if ctext not in _ARTICLE_WORDS:
                    continue
            ri = cell.get("row_index", -1)
            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1

        # Rule 6: majority of rows must have >=2 content cells.
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
        if multi_col_rows < len(rows) * 0.4:
            continue

        # Exclude first and last non-header rows -- these are typically
        # page numbers or footer text, not headings.
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if len(non_header_rows) < 3:
            continue
        first_ri = non_header_rows[0]["index"]
        last_ri = non_header_rows[-1]["index"]

        heading_row_indices: List[int] = []
        for row in rows:
            # Rule 1: skip rows already marked as headers.
            if row.get("is_header"):
                continue
            ri = row["index"]
            if ri == first_ri or ri == last_ri:
                continue
            row_cells = [c for c in cells if c.get("row_index") == ri]
            # Same content definition as the per-row counting above.
            content_cells = [
                c for c in row_cells
                if c.get("col_type", "").startswith("column_")
                and (c.get("col_type") != "column_1"
                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
            ]
            # Rule 2: exactly one content cell.
            if len(content_cells) != 1:
                continue
            cell = content_cells[0]
            # Rule 3: not in the last column (continuation/example lines).
            if cell.get("col_index") == last_col:
                continue
            text = (cell.get("text") or "").strip()
            # Rule 4: empty text or IPA continuation starting with "[".
            if not text or text.startswith("["):
                continue
            # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
            if text.startswith("("):
                continue
            # Single cell NOT in the first content column is likely a
            # continuation/overflow line, not a heading. Real headings
            # ("Theme 1", "Unit 3: ...") appear in the first or second
            # content column.
            first_content_col = col_indices[0] if col_indices else 0
            if cell.get("col_index", 0) > first_content_col + 1:
                continue
            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
            # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]")
            # NOTE(review): rebuilt per candidate row; could be hoisted.
            _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b")
            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
                continue
            # Guard: dictionary section headings are short (1-4 alpha chars
            # like "A", "Ab", "Zi", "Sch"). Longer text that starts
            # lowercase is a regular vocabulary word (e.g. "zentral") that
            # happens to appear alone in its row.
            alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text)
            if len(alpha_only) > 4 and text[0].islower():
                continue
            heading_row_indices.append(ri)

        # Guard: if >25% of eligible rows would become headings, the
        # heuristic is misfiring (e.g. sparse single-column layout where
        # most rows naturally have only 1 content cell).
        eligible_rows = len(non_header_rows) - 2  # minus first/last excluded
        if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25:
            logger.debug(
                "Skipping single-cell heading detection for zone %s: "
                "%d/%d rows would be headings (>25%%)",
                z.get("zone_index"), len(heading_row_indices), eligible_rows,
            )
            continue

        for hri in heading_row_indices:
            header_cells = [c for c in cells if c.get("row_index") == hri]
            if not header_cells:
                continue

            # Collect all word_boxes and text, left to right.
            all_wb = []
            all_text_parts = []
            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
                all_wb.extend(hc.get("word_boxes", []))
                if hc.get("text", "").strip():
                    all_text_parts.append(hc["text"].strip())

            first_col_idx = min(hc["col_index"] for hc in header_cells)

            # Remove old cells for this row, add spanning heading cell.
            # Rebinds z["cells"]; the local `cells` list (used for lookups
            # above) intentionally keeps the original snapshot.
            z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]

            if all_wb:
                x_min = min(wb["left"] for wb in all_wb)
                y_min = min(wb["top"] for wb in all_wb)
                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
            else:
                # Fallback to first cell bbox when no word boxes exist.
                bp = header_cells[0].get("bbox_px", {})
                x_min = bp.get("x", 0)
                y_min = bp.get("y", 0)
                x_max = x_min + bp.get("w", 0)
                y_max = y_min + bp.get("h", 0)

            zone_idx = z.get("zone_index", 0)
            z["cells"].append({
                "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
                "zone_index": zone_idx,
                "row_index": hri,
                "col_index": first_col_idx,
                "col_type": "heading",
                "text": " ".join(all_text_parts),
                "confidence": 0.0,
                "bbox_px": {"x": x_min, "y": y_min,
                            "w": x_max - x_min, "h": y_max - y_min},
                "bbox_pct": {
                    "x": round(x_min / img_w * 100, 2) if img_w else 0,
                    "y": round(y_min / img_h * 100, 2) if img_h else 0,
                    "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                    "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                },
                "word_boxes": all_wb,
                "ocr_engine": "words_first",
                # Black single-cell headings are not assumed bold (contrast
                # with the color-based detector which sets is_bold=True).
                "is_bold": False,
            })

            # Flag the source row so later passes skip it.
            for row in rows:
                if row["index"] == hri:
                    row["is_header"] = True
            heading_count += 1

    return heading_count
|
|
|
|
|
|
def _detect_header_rows(
|
|
rows: List[Dict],
|
|
zone_words: List[Dict],
|
|
zone_y: int,
|
|
columns: Optional[List[Dict]] = None,
|
|
skip_first_row_header: bool = False,
|
|
) -> List[int]:
|
|
"""Detect header rows: first-row heuristic + spanning header detection.
|
|
|
|
A "spanning header" is a row whose words stretch across multiple column
|
|
boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns).
|
|
"""
|
|
if len(rows) < 2:
|
|
return []
|
|
|
|
headers = []
|
|
|
|
if not skip_first_row_header:
|
|
first_row = rows[0]
|
|
second_row = rows[1]
|
|
|
|
# Gap between first and second row > 0.5x average row height
|
|
avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows)
|
|
gap = second_row["y_min"] - first_row["y_max"]
|
|
if gap > avg_h * 0.5:
|
|
headers.append(0)
|
|
|
|
# Also check if first row words are taller than average (bold/header text)
|
|
all_heights = [w["height"] for w in zone_words]
|
|
median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20
|
|
first_row_words = [
|
|
w for w in zone_words
|
|
if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"]
|
|
]
|
|
if first_row_words:
|
|
first_h = max(w["height"] for w in first_row_words)
|
|
if first_h > median_h * 1.3:
|
|
if 0 not in headers:
|
|
headers.append(0)
|
|
|
|
# Note: Spanning-header detection (rows spanning all columns) has been
|
|
# disabled because it produces too many false positives on vocabulary
|
|
# worksheets where IPA transcriptions or short entries naturally span
|
|
# multiple columns with few words. The first-row heuristic above is
|
|
# sufficient for detecting real headers.
|
|
|
|
return headers
|
|
|
|
|
|
def _detect_colspan_cells(
    zone_words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    cells: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict]:
    """Detect and merge cells that span multiple columns (colspan).

    A word-block (PaddleOCR phrase) that extends significantly past a column
    boundary into the next column indicates a merged cell. This replaces
    the incorrectly split cells with a single cell spanning multiple columns.

    Works for both full-page scans and box zones.

    Args:
        zone_words: Original OCR word-blocks for the zone.
        columns: Column dicts with "index", "x_min", "x_max".
        rows: Row dicts (passed to ``_assign_word_to_row``).
        cells: Current per-column cell dicts.
        img_w: Page width in pixels (0 yields zeroed percent bboxes).
        img_h: Page height in pixels (0 yields zeroed percent bboxes).

    Returns:
        A new cell list: non-spanning cells are passed through unchanged;
        cells covered by a spanning word-block are collapsed into one
        ``spanning_header`` cell per span (input ``cells`` is not mutated,
        though pass-through cell dicts are shared).
    """
    if len(columns) < 2 or not zone_words or not rows:
        return cells

    # Deferred import avoids a module-level import cycle with cv_words_first.
    from cv_words_first import _assign_word_to_row

    # Column boundaries (midpoints between adjacent columns)
    # NOTE(review): col_boundaries is computed but never used below --
    # candidate for removal, kept here to preserve the code byte-for-byte.
    col_boundaries = []
    for ci in range(len(columns) - 1):
        col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2)

    def _cols_covered(w_left: float, w_right: float) -> List[int]:
        """Return list of column indices that a word-block covers."""
        covered = []
        for col in columns:
            col_mid = (col["x_min"] + col["x_max"]) / 2
            # Word covers a column if it extends past the column's midpoint
            if w_left < col_mid < w_right:
                covered.append(col["index"])
            # Also include column if word starts within it
            elif col["x_min"] <= w_left < col["x_max"]:
                covered.append(col["index"])
        return sorted(set(covered))

    # Group original word-blocks by row
    row_word_blocks: Dict[int, List[Dict]] = {}
    for w in zone_words:
        ri = _assign_word_to_row(w, rows)
        row_word_blocks.setdefault(ri, []).append(w)

    # For each row, check if any word-block spans multiple columns
    rows_to_merge: Dict[int, List[Dict]] = {}  # row_index -> list of spanning word-blocks

    for ri, wblocks in row_word_blocks.items():
        spanning = []
        for w in wblocks:
            w_left = w["left"]
            w_right = w_left + w["width"]
            covered = _cols_covered(w_left, w_right)
            if len(covered) >= 2:
                spanning.append({"word": w, "cols": covered})
        if spanning:
            rows_to_merge[ri] = spanning

    if not rows_to_merge:
        return cells

    # Merge cells for spanning rows
    new_cells = []
    for cell in cells:
        ri = cell.get("row_index", -1)
        if ri not in rows_to_merge:
            # Row has no spanning block: keep the cell untouched.
            new_cells.append(cell)
            continue

        # Check if this cell's column is part of a spanning block
        ci = cell.get("col_index", -1)
        is_part_of_span = False
        for span in rows_to_merge[ri]:
            if ci in span["cols"]:
                is_part_of_span = True
                # Only emit the merged cell for the FIRST column in the span
                # (cells in the other covered columns are dropped).
                if ci == span["cols"][0]:
                    # Use the ORIGINAL word-block text (not the split cell texts
                    # which may have broken words like "euros a" + "nd cents")
                    orig_word = span["word"]
                    merged_text = orig_word.get("text", "").strip()
                    all_wb = [orig_word]

                    # Compute merged bbox
                    # NOTE(review): all_wb is always a one-element list here,
                    # so the else branch below is unreachable as written.
                    if all_wb:
                        x_min = min(wb["left"] for wb in all_wb)
                        y_min = min(wb["top"] for wb in all_wb)
                        x_max = max(wb["left"] + wb["width"] for wb in all_wb)
                        y_max = max(wb["top"] + wb["height"] for wb in all_wb)
                    else:
                        x_min = y_min = x_max = y_max = 0

                    new_cells.append({
                        "cell_id": cell["cell_id"],
                        "row_index": ri,
                        "col_index": span["cols"][0],
                        "col_type": "spanning_header",
                        "colspan": len(span["cols"]),
                        "text": merged_text,
                        "confidence": cell.get("confidence", 0),
                        "bbox_px": {"x": x_min, "y": y_min,
                                    "w": x_max - x_min, "h": y_max - y_min},
                        "bbox_pct": {
                            "x": round(x_min / img_w * 100, 2) if img_w else 0,
                            "y": round(y_min / img_h * 100, 2) if img_h else 0,
                            "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
                            "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
                        },
                        "word_boxes": all_wb,
                        "ocr_engine": cell.get("ocr_engine", ""),
                        "is_bold": cell.get("is_bold", False),
                    })
                    logger.info(
                        "colspan detected: row %d, cols %s -> merged %d cells (%r)",
                        ri, span["cols"], len(span["cols"]), merged_text[:50],
                    )
                # First matching span wins; remaining spans are not checked.
                break
        if not is_part_of_span:
            new_cells.append(cell)

    return new_cells
|