Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

A sed replacement left orphaned hostname references in the story page
and stray empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,305 @@
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)
logger = logging.getLogger(__name__)
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.

    Mutates ``zones_data`` in place:
      * drops word_boxes flagged by rules (a)/(a2)/(b)/(c) below;
      * merges horizontally-overlapping alphabetic fragments into one box;
      * rebuilds each touched cell's ``text`` from the surviving boxes
        (unless the cell is marked ``_ipa_corrected``);
      * finally prunes cells left with neither word_boxes nor text.
    """
    # A plain (possibly hyphenated) Latin-script word, optionally followed
    # by trailing sentence punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    # Isolated symbols that are always OCR noise when standing alone.
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                # A single box has nothing to overlap/duplicate with.
                continue
            to_remove: set = set()
            # Rule (a): tiny coloured symbols — small (<200 px²), non-black,
            # low-confidence boxes are treated as bullet glyphs.
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)
            # Rule (a2): isolated non-alphanumeric symbols from the
            # known-junk set (≤2 chars, no letters/digits at all).
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)
            # Rule (b) + (c): overlap and duplicate detection.
            # Boxes are walked left-to-right; only adjacent pairs (in x-order)
            # are compared.
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                # Overlap as a fraction of the narrower box's width.
                overlap_pct = overlap / min_w if min_w > 0 else 0
                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()
                    # Syllable-split words: moderate overlap between two
                    # alphabetic fragments -> merge into one word later.
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue
                    # High overlap with short prefix: two different short
                    # alphabetic tokens stacked on each other — likely one
                    # word split with a truncated prefix; merge them too.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue
                    if overlap_pct <= 0.40:
                        # 20–40% overlap without a merge case: leave both.
                        continue
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    # Very high overlap: prefer the IPA-dictionary word —
                    # keep whichever duplicate is a known dictionary entry.
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue
                    # Otherwise drop the lower-confidence box; on a tie,
                    # drop the taller one (taller box is assumed noisier).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    # Rule (c): near-touching blue duplicates of the same
                    # text — keep the higher-confidence copy.
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)
            # Execute merges first (syllable-split words).
            # merge_parent maps an absorbed index to the surviving index so
            # that chained merges (a<-b, b<-c) land in the final survivor.
            if to_merge:
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        # Never merge into/from a box already slated for removal.
                        continue
                    if mi2 in merge_parent:
                        # Right-hand box was already absorbed elsewhere.
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Merged geometry = bounding box of both fragments.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # The absorbed box lands in to_remove below; pre-decrement
                    # so merges are not double-counted as removals in the log.
                    bullet_removed -= 1
            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Rebuild cell text unless an IPA correction already fixed it.
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)
    if bullet_removed:
        # Drop cells that lost all content during cleanup.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
_COMMON_SHORT_WORDS = {
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
"die", "der", "das", "dem", "den", "des", "ein", "und",
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
"on", "or", "so", "to", "up", "us", "we",
"the", "and", "but", "for", "not",
}
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
artifact_cells_removed = 0
for z in zones_data:
before = len(z.get("cells", []))
kept = []
for cell in z.get("cells", []):
text = (cell.get("text") or "").strip()
core = text.rstrip(".,;:!?'\"")
is_artifact = False
if not core:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '', ''):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
and not re.match(r'^[pPsS]\.?\d+$', core)):
is_artifact = True
if is_artifact:
kept.append(None)
else:
kept.append(cell)
z["cells"] = [c for c in kept if c is not None]
artifact_cells_removed += before - len(z["cells"])
if artifact_cells_removed:
for z in zones_data:
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Normalise word_box order to reading order (Step 5j).

    Regroups each cell's word_boxes into visual lines (15 px y-tolerance)
    and flattens them left-to-right, top-to-bottom. Cells are only touched
    when the resulting order actually differs from the stored order.
    """
    cells_reordered = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            boxes = cell.get("word_boxes") or []
            if len(boxes) < 2:
                continue  # single box: already in order
            reading_order = []
            for line in _group_words_into_lines(boxes, y_tolerance_px=15):
                reading_order.extend(line)
            # Compare by identity — dict equality could hide distinct boxes.
            if list(map(id, reading_order)) != list(map(id, boxes)):
                cell["word_boxes"] = reading_order
                cells_reordered += 1
    if cells_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", cells_reordered)
def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k).

    For each ``content`` zone with more than ``max_columns`` columns,
    repeatedly absorbs the narrowest column into its right neighbour
    (or left neighbour when it is the rightmost). Column extents, cell
    ``col_index`` values, and colliding cells' text/word_boxes are merged
    in place; columns are re-indexed 0..N-1 at the end.
    """
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue
        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )
        # Merge queue, narrowest first (by pixel width).
        # NOTE(review): this queue is never re-sorted after a merge widens
        # the target column, so later pops may not be the true narrowest
        # remaining column — confirm whether that is intended.
        cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
        while len(cols) > max_columns:
            narrowest = cols_by_width.pop(0)
            ni = narrowest["index"]
            # Locate the narrowest column's position in left-to-right order
            # to pick a geometric neighbour as the merge target.
            sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
            pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
            if pos + 1 < len(sorted_by_x):
                merge_target = sorted_by_x[pos + 1]
            elif pos > 0:
                merge_target = sorted_by_x[pos - 1]
            else:
                # Only one column left — nothing to merge into.
                break
            ti = merge_target["index"]
            # Grow the target's pixel extent to cover the absorbed column.
            merge_target["x_min_px"] = min(
                merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
            )
            merge_target["x_max_px"] = max(
                merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
            )
            # Keep percentage extents in sync when both columns carry them.
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
            # Re-home cells of the absorbed column; when the target already
            # has a cell in the same row, concatenate text and word_boxes
            # and mark the moved cell for deletion.
            for cell in cells:
                if cell.get("col_index") == ni:
                    cell["col_index"] = ti
                    existing = next(
                        (c for c in cells if c["col_index"] == ti
                         and c["row_index"] == cell["row_index"]
                         and c is not cell),
                        None,
                    )
                    if existing:
                        existing["text"] = (
                            (existing.get("text", "") + " " + cell.get("text", "")).strip()
                        )
                        existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
                        cell["_merged"] = True
            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            cols.remove(narrowest)
            cols_by_width = [c for c in cols_by_width if c["index"] != ni]
        # Re-index columns 0..N-1 (left-to-right) and remap cell col_index.
        for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
            old_idx = col["index"]
            col["index"] = new_idx
            for cell in cells:
                if cell.get("col_index") == old_idx:
                    cell["col_index"] = new_idx
        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))