Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
The sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
305
klausur-service/backend/grid_build_cell_ops.py
Normal file
305
klausur-service/backend/grid_build_cell_ops.py
Normal file
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
|
||||
garbled cell cleanup, word-box reordering, and max_columns enforcement.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.

    Mutates ``zones_data`` in place:
      * drops word_boxes flagged by rules (a)/(a2)/(b)/(c) below,
      * merges horizontally-overlapping alphabetic fragments into one box,
      * rebuilds ``cell["text"]`` from the surviving boxes (skipped when the
        cell is marked ``_ipa_corrected``),
      * finally drops cells left with neither word_boxes nor text.
    """
    # A single alphabetic word (Latin incl. Latin-1/Extended accents and
    # hyphens), optionally followed by trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}

    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            # Indices into `wbs` scheduled for deletion this cell.
            to_remove: set = set()

            # Rule (a): tiny coloured symbols — non-black, area < 200 px²,
            # low OCR confidence (< 85).
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

            # Rule (a2): isolated non-alphanumeric symbols — 1–2 chars with
            # no letter/digit, and explicitly on the removal symbol list.
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)

            # Rule (b) + (c): overlap and duplicate detection.
            # Compare each box with its right neighbour in x order.
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                # Horizontal extents of the pair; overlap is measured as a
                # fraction of the narrower box's width.
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                overlap_pct = overlap / min_w if min_w > 0 else 0

                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()

                    # Syllable-split words: moderate overlap (20–75%) between
                    # two alphabetic fragments -> merge into one word.
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue

                    # High overlap with short prefix: different words where
                    # one (sans punctuation) is <= 4 chars -> also a split.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue

                    # 20–40% overlap that matched neither merge rule is
                    # tolerated as-is.
                    if overlap_pct <= 0.40:
                        continue

                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)

                    # Very high overlap (> 90%) with differing text: prefer
                    # the variant found in the IPA dictionary (superscript
                    # digits and '/' stripped before lookup).
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue

                    # Otherwise drop the lower-confidence box; on a tie,
                    # drop the taller one (tall boxes here tend to be
                    # artifacts spanning lines).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)

                # Rule (c): near-touching (< 6 px gap) blue duplicates with
                # identical text -> keep only the higher-confidence copy.
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

            # Execute merges first (syllable-split words).  `merge_parent`
            # chains transitive merges (a -> b -> c) so fragments always
            # collapse into the ultimate surviving box.
            if to_merge:
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    # Never merge into/from a box already slated for removal,
                    # and never merge the same right-hand fragment twice.
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Left fragment loses trailing punctuation before the
                    # pieces are concatenated.
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Merged geometry is the bounding box of both fragments.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # The absorbed fragment will be counted via `to_remove`
                    # below; decrement so merges don't inflate the removal
                    # total (the word survives, just as one box).
                    bullet_removed -= 1

            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Rebuild the cell text unless IPA correction already fixed
                # it (that text must not be clobbered).
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)

    if bullet_removed:
        # Drop cells that ended up completely empty after the purge.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||
|
||||
|
||||
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
|
||||
_COMMON_SHORT_WORDS = {
|
||||
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
|
||||
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
|
||||
"die", "der", "das", "dem", "den", "des", "ein", "und",
|
||||
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
|
||||
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
|
||||
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
|
||||
"on", "or", "so", "to", "up", "us", "we",
|
||||
"the", "and", "but", "for", "not",
|
||||
}
|
||||
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
|
||||
artifact_cells_removed = 0
|
||||
|
||||
for z in zones_data:
|
||||
before = len(z.get("cells", []))
|
||||
kept = []
|
||||
for cell in z.get("cells", []):
|
||||
text = (cell.get("text") or "").strip()
|
||||
core = text.rstrip(".,;:!?'\"")
|
||||
is_artifact = False
|
||||
if not core:
|
||||
is_artifact = True
|
||||
elif _PURE_JUNK_RE.match(core):
|
||||
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
|
||||
is_artifact = True
|
||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
||||
is_artifact = True
|
||||
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||
is_artifact = True
|
||||
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
|
||||
and not re.match(r'^[pPsS]\.?\d+$', core)):
|
||||
is_artifact = True
|
||||
if is_artifact:
|
||||
kept.append(None)
|
||||
else:
|
||||
kept.append(cell)
|
||||
z["cells"] = [c for c in kept if c is not None]
|
||||
artifact_cells_removed += before - len(z["cells"])
|
||||
|
||||
if artifact_cells_removed:
|
||||
for z in zones_data:
|
||||
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
|
||||
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
|
||||
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
|
||||
|
||||
|
||||
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Normalise word_box order to reading order (Step 5j)."""
|
||||
wb_reordered = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) < 2:
|
||||
continue
|
||||
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
|
||||
sorted_wbs = [w for line in lines for w in line]
|
||||
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
|
||||
cell["word_boxes"] = sorted_wbs
|
||||
wb_reordered += 1
|
||||
if wb_reordered:
|
||||
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
||||
|
||||
|
||||
def _enforce_max_columns(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
max_columns: int,
|
||||
) -> None:
|
||||
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
cols = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if len(cols) <= max_columns:
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
"max_columns=%d: zone %s has %d columns -> merging",
|
||||
max_columns, z.get("zone_index"), len(cols),
|
||||
)
|
||||
|
||||
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
|
||||
|
||||
while len(cols) > max_columns:
|
||||
narrowest = cols_by_width.pop(0)
|
||||
ni = narrowest["index"]
|
||||
|
||||
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
|
||||
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
|
||||
if pos + 1 < len(sorted_by_x):
|
||||
merge_target = sorted_by_x[pos + 1]
|
||||
elif pos > 0:
|
||||
merge_target = sorted_by_x[pos - 1]
|
||||
else:
|
||||
break
|
||||
|
||||
ti = merge_target["index"]
|
||||
|
||||
merge_target["x_min_px"] = min(
|
||||
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
|
||||
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
|
||||
)
|
||||
merge_target["x_max_px"] = max(
|
||||
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
|
||||
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
|
||||
)
|
||||
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
|
||||
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
|
||||
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
|
||||
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == ni:
|
||||
cell["col_index"] = ti
|
||||
existing = next(
|
||||
(c for c in cells if c["col_index"] == ti
|
||||
and c["row_index"] == cell["row_index"]
|
||||
and c is not cell),
|
||||
None,
|
||||
)
|
||||
if existing:
|
||||
existing["text"] = (
|
||||
(existing.get("text", "") + " " + cell.get("text", "")).strip()
|
||||
)
|
||||
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
|
||||
cell["_merged"] = True
|
||||
|
||||
z["cells"] = [c for c in cells if not c.get("_merged")]
|
||||
cells = z["cells"]
|
||||
cols.remove(narrowest)
|
||||
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
|
||||
|
||||
# Re-index columns 0..N-1
|
||||
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
|
||||
old_idx = col["index"]
|
||||
col["index"] = new_idx
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == old_idx:
|
||||
cell["col_index"] = new_idx
|
||||
|
||||
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
||||
Reference in New Issue
Block a user