Files
breakpilot-lehrer/klausur-service/backend/grid_build_cell_ops.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

306 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)
logger = logging.getLogger(__name__)
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.  Mutates ``zones_data`` in place:
    word_boxes are filtered/merged per cell, the cell's ``text`` is rebuilt
    from the survivors (unless flagged IPA-corrected), and cells left with
    neither word_boxes nor text are dropped from their zone.

    :param zones_data: zone dicts, each with a ``cells`` list whose cells
        carry ``word_boxes`` dicts (``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf``, ``color_name``).
    """
    # Plain alphabetic word (Latin incl. accented \u00c0-\u024f range,
    # hyphens allowed) with optional trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Symbols that are always OCR noise when they stand alone.
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue  # nothing to de-duplicate or merge in a 0/1-word cell
            to_remove: set = set()  # indices into wbs slated for deletion
            # Rule (a): tiny coloured symbols — non-black colour AND small
            # area AND low confidence must all hold before deletion.
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)
            # Rule (a2): isolated non-alphanumeric symbols (<= 2 chars with
            # no letters/digits/umlauts) from the fixed noise set.
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)
            # Rule (b) + (c): overlap and duplicate detection between
            # horizontally adjacent boxes, compared left-to-right.
            to_merge: List[Tuple[int, int]] = []  # (left idx, right idx) pairs
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Horizontal overlap as a fraction of the narrower box.
                overlap_pct = overlap / min_w if min_w > 0 else 0
                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()
                    # Syllable-split words: moderate overlap between two
                    # alphabetic fragments -> merge instead of deleting.
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue
                    # High overlap with short prefix: two different words
                    # where one is a short (<= 4 char) fragment -> merge.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    if overlap_pct <= 0.40:
                        continue  # mild overlap: neither merge nor delete
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    # Very high overlap: prefer the IPA-dictionary word
                    # (superscripts/digits/slashes stripped before lookup).
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue
                    # Tie-breakers: lower confidence loses; on equal
                    # confidence the taller box is dropped.
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)
                # Rule (c): near-adjacent identical blue duplicates — keep
                # the higher-confidence copy (ties drop the left one).
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
            # Execute merges first (syllable-split words): the surviving
            # left box absorbs the right box's text and bounding box.
            # merge_parent chases chains a<-b<-c so later fragments land
            # on the leftmost survivor.
            if to_merge:
                merge_parent: Dict[int, int] = {}  # consumed idx -> surviving idx
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue  # never merge with a box already deleted
                    if mi2 in merge_parent:
                        continue  # right box already consumed by a merge
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Union of the two bounding boxes.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # A merge is not a removal: pre-compensate so the
                    # `bullet_removed += len(to_remove)` below skips mi2.
                    bullet_removed -= 1
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Rebuild cell text from survivors unless a prior IPA
                # correction already fixed it (don't clobber that result).
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    if bullet_removed:
        # Drop cells emptied by the removals above.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).

    Mutates ``zones_data`` in place.  A cell is classified as an artifact
    when its punctuation-stripped text is empty, letter-free junk outside a
    small allow-list, a 1-2 char non-word, a short unknown all-caps token,
    or a short digit/letter mix that is not a page reference (``p.12``).
    Rows that lose all of their cells are dropped afterwards.

    :param zones_data: zone dicts with ``cells`` and ``rows`` lists.
    """
    # Short German + English function words that must survive the
    # length-based artifact checks below.
    _COMMON_SHORT_WORDS = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    # Matches text containing no letters at all (non-word chars, digits,
    # whitespace only).
    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
    artifact_cells_removed = 0
    for z in zones_data:
        before = len(z.get("cells", []))
        kept = []
        for cell in z.get("cells", []):
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")  # drop trailing punctuation/quotes
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                # Letter-free text is junk unless it is a meaningful marker.
                # NOTE(review): the last two allow-list entries render as
                # empty strings here — the file carries an "ambiguous
                # Unicode" warning, so they may be invisible characters;
                # verify against the original bytes before editing.
                if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '', ''):
                    is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
                  and not re.match(r'^[pPsS]\.?\d+$', core)):
                # Short digit+letter mixes are OCR noise, except page
                # references such as "p.12" / "S.3".
                is_artifact = True
            if is_artifact:
                kept.append(None)  # placeholder; filtered out below
            else:
                kept.append(cell)
        z["cells"] = [c for c in kept if c is not None]
        artifact_cells_removed += before - len(z["cells"])
    if artifact_cells_removed:
        # Drop rows whose every cell was removed above.
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
"""Normalise word_box order to reading order (Step 5j)."""
wb_reordered = 0
for z in zones_data:
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
if len(wbs) < 2:
continue
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
sorted_wbs = [w for line in lines for w in line]
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
cell["word_boxes"] = sorted_wbs
wb_reordered += 1
if wb_reordered:
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
def _enforce_max_columns(
zones_data: List[Dict[str, Any]],
max_columns: int,
) -> None:
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
for z in zones_data:
if z.get("zone_type") != "content":
continue
cols = z.get("columns", [])
cells = z.get("cells", [])
if len(cols) <= max_columns:
continue
logger.info(
"max_columns=%d: zone %s has %d columns -> merging",
max_columns, z.get("zone_index"), len(cols),
)
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
while len(cols) > max_columns:
narrowest = cols_by_width.pop(0)
ni = narrowest["index"]
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
if pos + 1 < len(sorted_by_x):
merge_target = sorted_by_x[pos + 1]
elif pos > 0:
merge_target = sorted_by_x[pos - 1]
else:
break
ti = merge_target["index"]
merge_target["x_min_px"] = min(
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
)
merge_target["x_max_px"] = max(
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
)
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
for cell in cells:
if cell.get("col_index") == ni:
cell["col_index"] = ti
existing = next(
(c for c in cells if c["col_index"] == ti
and c["row_index"] == cell["row_index"]
and c is not cell),
None,
)
if existing:
existing["text"] = (
(existing.get("text", "") + " " + cell.get("text", "")).strip()
)
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
cell["_merged"] = True
z["cells"] = [c for c in cells if not c.get("_merged")]
cells = z["cells"]
cols.remove(narrowest)
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
# Re-index columns 0..N-1
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
old_idx = col["index"]
col["index"] = new_idx
for cell in cells:
if cell.get("col_index") == old_idx:
cell["col_index"] = new_idx
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))