Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
The sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
305
klausur-service/backend/grid_build_cell_ops.py
Normal file
305
klausur-service/backend/grid_build_cell_ops.py
Normal file
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
|
||||
garbled cell cleanup, word-box reordering, and max_columns enforcement.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.

    Mutates ``zones_data`` in place:
      * drops word_boxes flagged by rules (a)/(a2)/(b)/(c) below,
      * merges horizontally-overlapping alphabetic fragments into one box,
      * rebuilds ``cell["text"]`` from the surviving boxes (skipped when the
        cell is marked ``_ipa_corrected``),
      * finally drops cells left with neither word_boxes nor text.
    """
    # A single alphabetic word (Latin incl. Latin-1/Extended accents and
    # hyphens), optionally followed by trailing punctuation.
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}

    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            # Indices into `wbs` scheduled for deletion this cell.
            to_remove: set = set()

            # Rule (a): tiny coloured symbols — non-black, area < 200 px²,
            # low OCR confidence (< 85).
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

            # Rule (a2): isolated non-alphanumeric symbols — 1–2 chars with
            # no letter/digit, and explicitly on the removal symbol list.
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)

            # Rule (b) + (c): overlap and duplicate detection.
            # Compare each box with its right neighbour in x order.
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                # Horizontal extents of the pair; overlap is measured as a
                # fraction of the narrower box's width.
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                overlap_pct = overlap / min_w if min_w > 0 else 0

                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()

                    # Syllable-split words: moderate overlap (20–75%) between
                    # two alphabetic fragments -> merge into one word.
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue

                    # High overlap with short prefix: different words where
                    # one (sans punctuation) is <= 4 chars -> also a split.
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue

                    # 20–40% overlap that matched neither merge rule is
                    # tolerated as-is.
                    if overlap_pct <= 0.40:
                        continue

                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)

                    # Very high overlap (> 90%) with differing text: prefer
                    # the variant found in the IPA dictionary (superscript
                    # digits and '/' stripped before lookup).
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue

                    # Otherwise drop the lower-confidence box; on a tie,
                    # drop the taller one (tall boxes here tend to be
                    # artifacts spanning lines).
                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)

                # Rule (c): near-touching (< 6 px gap) blue duplicates with
                # identical text -> keep only the higher-confidence copy.
                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

            # Execute merges first (syllable-split words).  `merge_parent`
            # chains transitive merges (a -> b -> c) so fragments always
            # collapse into the ultimate surviving box.
            if to_merge:
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    # Never merge into/from a box already slated for removal,
                    # and never merge the same right-hand fragment twice.
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    # Left fragment loses trailing punctuation before the
                    # pieces are concatenated.
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    # Merged geometry is the bounding box of both fragments.
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    # The absorbed fragment will be counted via `to_remove`
                    # below; decrement so merges don't inflate the removal
                    # total (the word survives, just as one box).
                    bullet_removed -= 1

            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                # Rebuild the cell text unless IPA correction already fixed
                # it (that text must not be clobbered).
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)

    if bullet_removed:
        # Drop cells that ended up completely empty after the purge.
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||
|
||||
|
||||
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
|
||||
_COMMON_SHORT_WORDS = {
|
||||
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
|
||||
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
|
||||
"die", "der", "das", "dem", "den", "des", "ein", "und",
|
||||
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
|
||||
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
|
||||
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
|
||||
"on", "or", "so", "to", "up", "us", "we",
|
||||
"the", "and", "but", "for", "not",
|
||||
}
|
||||
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
|
||||
artifact_cells_removed = 0
|
||||
|
||||
for z in zones_data:
|
||||
before = len(z.get("cells", []))
|
||||
kept = []
|
||||
for cell in z.get("cells", []):
|
||||
text = (cell.get("text") or "").strip()
|
||||
core = text.rstrip(".,;:!?'\"")
|
||||
is_artifact = False
|
||||
if not core:
|
||||
is_artifact = True
|
||||
elif _PURE_JUNK_RE.match(core):
|
||||
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
|
||||
is_artifact = True
|
||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
||||
is_artifact = True
|
||||
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||
is_artifact = True
|
||||
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
|
||||
and not re.match(r'^[pPsS]\.?\d+$', core)):
|
||||
is_artifact = True
|
||||
if is_artifact:
|
||||
kept.append(None)
|
||||
else:
|
||||
kept.append(cell)
|
||||
z["cells"] = [c for c in kept if c is not None]
|
||||
artifact_cells_removed += before - len(z["cells"])
|
||||
|
||||
if artifact_cells_removed:
|
||||
for z in zones_data:
|
||||
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
|
||||
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
|
||||
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
|
||||
|
||||
|
||||
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Normalise word_box order to reading order (Step 5j)."""
|
||||
wb_reordered = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) < 2:
|
||||
continue
|
||||
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
|
||||
sorted_wbs = [w for line in lines for w in line]
|
||||
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
|
||||
cell["word_boxes"] = sorted_wbs
|
||||
wb_reordered += 1
|
||||
if wb_reordered:
|
||||
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
||||
|
||||
|
||||
def _enforce_max_columns(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
max_columns: int,
|
||||
) -> None:
|
||||
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
cols = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if len(cols) <= max_columns:
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
"max_columns=%d: zone %s has %d columns -> merging",
|
||||
max_columns, z.get("zone_index"), len(cols),
|
||||
)
|
||||
|
||||
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
|
||||
|
||||
while len(cols) > max_columns:
|
||||
narrowest = cols_by_width.pop(0)
|
||||
ni = narrowest["index"]
|
||||
|
||||
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
|
||||
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
|
||||
if pos + 1 < len(sorted_by_x):
|
||||
merge_target = sorted_by_x[pos + 1]
|
||||
elif pos > 0:
|
||||
merge_target = sorted_by_x[pos - 1]
|
||||
else:
|
||||
break
|
||||
|
||||
ti = merge_target["index"]
|
||||
|
||||
merge_target["x_min_px"] = min(
|
||||
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
|
||||
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
|
||||
)
|
||||
merge_target["x_max_px"] = max(
|
||||
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
|
||||
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
|
||||
)
|
||||
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
|
||||
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
|
||||
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
|
||||
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == ni:
|
||||
cell["col_index"] = ti
|
||||
existing = next(
|
||||
(c for c in cells if c["col_index"] == ti
|
||||
and c["row_index"] == cell["row_index"]
|
||||
and c is not cell),
|
||||
None,
|
||||
)
|
||||
if existing:
|
||||
existing["text"] = (
|
||||
(existing.get("text", "") + " " + cell.get("text", "")).strip()
|
||||
)
|
||||
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
|
||||
cell["_merged"] = True
|
||||
|
||||
z["cells"] = [c for c in cells if not c.get("_merged")]
|
||||
cells = z["cells"]
|
||||
cols.remove(narrowest)
|
||||
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
|
||||
|
||||
# Re-index columns 0..N-1
|
||||
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
|
||||
old_idx = col["index"]
|
||||
col["index"] = new_idx
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == old_idx:
|
||||
cell["col_index"] = new_idx
|
||||
|
||||
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
||||
Reference in New Issue
Block a user