Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

The sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,390 @@
"""
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
divider removal, connector normalization, border strip detection, and
alphabet sidebar removal.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _words_to_reading_order_text
logger = logging.getLogger(__name__)
_PIPE_RE = re.compile(r"^\|+$")
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Run all Phase-3 cleanup passes over *zones_data* in place.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging.

    Returns:
        Updated border_prefiltered flag.
    """
    # Pass order matters: noise removal first, so the geometric passes
    # (border strips, alphabet sidebars) operate on denoised zones.
    in_place_passes = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in in_place_passes:
        cleanup_pass(zones_data)
    # Border-strip pass owns the pre-filtered flag and may skip itself.
    border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return border_prefiltered
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
"""Remove rows where ALL cells contain only short, low-confidence text.
Also removes 'oversized stub' rows and 'scattered debris' rows.
"""
_JUNK_CONF_THRESHOLD = 50
_JUNK_MAX_TEXT_LEN = 3
for z in zones_data:
cells = z.get("cells", [])
rows = z.get("rows", [])
if not cells or not rows:
continue
# Compute median word height across the zone for oversized detection
all_wb_heights = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
junk_row_indices = set()
for row in rows:
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if not row_cells:
continue
row_wbs = [
wb for cell in row_cells
for wb in cell.get("word_boxes") or []
]
# Rule 1: ALL word_boxes are low-conf AND short text
all_junk = True
for wb in row_wbs:
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 0)
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
all_junk = False
break
if all_junk and row_wbs:
junk_row_indices.add(ri)
continue
# Rule 2: oversized stub -- <=3 words, short total text,
# and word height > 1.8x median
if len(row_wbs) <= 3:
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
has_page_ref = any(
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
for wb in row_wbs
)
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
junk_row_indices.add(ri)
continue
# Rule 3: scattered debris -- rows with only tiny fragments
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
if longest <= 2:
junk_row_indices.add(ri)
continue
if junk_row_indices:
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
logger.info(
"build-grid: removed %d junk rows from zone %d: %s",
len(junk_row_indices), z["zone_index"],
sorted(junk_row_indices),
)
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove individual cells with a single very-short, low-conf word."""
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
artifact_ids = set()
for cell in cells:
wbs = cell.get("word_boxes") or []
if len(wbs) != 1:
continue
wb = wbs[0]
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 100)
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
artifact_ids.add(cell.get("cell_id"))
if artifact_ids:
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
logger.info(
"build-grid: removed %d artifact cells from zone %d: %s",
len(artifact_ids), z.get("zone_index", 0),
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
)
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wh = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
if not all_wh:
continue
med_h = sorted(all_wh)[len(all_wh) // 2]
oversized_threshold = med_h * 3
removed_oversized = 0
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
if len(filtered) < len(wbs):
removed_oversized += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_oversized:
z["cells"] = [c for c in cells if c.get("word_boxes")]
logger.info(
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
removed_oversized, oversized_threshold, z.get("zone_index", 0),
)
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
"""Remove pipe-character word_boxes (column divider artifacts)."""
for z in zones_data:
if z.get("vsplit_group") is not None:
continue # pipes already removed before split
removed_pipes = 0
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"build-grid: removed %d pipe-divider word_boxes from zone %d",
removed_pipes, z.get("zone_index", 0),
)
# Strip pipe chars ONLY from cell edges (OCR artifacts).
# Preserve pipes embedded in words as syllable separators.
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if "|" in text:
cleaned = text.strip("|").strip()
if cleaned != text.strip():
cell["text"] = cleaned
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Normalize narrow connector columns where OCR appends noise chars.
In synonym dictionaries a narrow column repeats the same word
(e.g. "oder") in every row. OCR sometimes appends noise chars.
"""
for z in zones_data:
cols = z.get("columns", [])
cells = z.get("cells", [])
if not cols or not cells:
continue
for col in cols:
ci = col.get("index")
col_cells = [c for c in cells if c.get("col_index") == ci]
if len(col_cells) < 3:
continue
text_counts: Dict[str, int] = {}
for c in col_cells:
t = (c.get("text") or "").strip()
if t:
text_counts[t] = text_counts.get(t, 0) + 1
if not text_counts:
continue
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
dominant_count = text_counts[dominant_text]
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
continue
fixed = 0
for c in col_cells:
t = (c.get("text") or "").strip()
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
c["text"] = dominant_text
wbs = c.get("word_boxes") or []
if len(wbs) == 1:
wbs[0]["text"] = dominant_text
fixed += 1
if fixed:
logger.info(
"build-grid: normalized %d outlier cells in connector column %d "
"(dominant='%s') zone %d",
fixed, ci, dominant_text, z.get("zone_index", 0),
)
def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
) -> bool:
    """Detect and remove page-border decoration strips.

    A "strip" is a small cluster of word_boxes (<20% of the zone's boxes)
    separated from the main text mass by a horizontal gap of more than
    30px, on either the left or the right edge. Zones are modified in
    place; emptied cells are discarded.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: If True, an earlier step already filtered
            border words and this pass is skipped entirely.

    Returns:
        Updated border_prefiltered flag.
    """
    border_strip_removed = 0
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        # Collect (left_x, word_box, cell) triples so matched boxes can be
        # traced back to their owning cell for removal.
        all_wbs_with_cell: list = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
        # Too few boxes to tell a strip apart from the text body.
        if len(all_wbs_with_cell) < 10:
            continue
        all_wbs_with_cell.sort(key=lambda t: t[0])
        total = len(all_wbs_with_cell)
        # -- Left-edge scan --
        # Walk left-to-right, tracking the running right edge of everything
        # seen so far; the first gap > 30px to the next box marks the end
        # of a potential left-side strip.
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
            )
            gap = all_wbs_with_cell[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break
        # -- Right-edge scan --
        # Mirror image: walk right-to-left, tracking the running left edge,
        # and stop at the first > 30px gap to the preceding box's right edge.
        right_strip_count = 0
        right_gap = 0
        running_left = all_wbs_with_cell[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, all_wbs_with_cell[gi][0])
            prev_right = (
                all_wbs_with_cell[gi - 1][0]
                + all_wbs_with_cell[gi - 1][1].get("width", 0)
            )
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break
        # Accept a candidate strip only if it is small relative to the zone
        # (<20% of all boxes); the left side takes precedence over the right.
        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            # Identify strip boxes by object identity (dicts are unhashable).
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
        if not strip_wbs:
            continue
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                border_strip_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        # Drop cells left with neither word boxes nor text.
        z["cells"] = [c for c in cells
                      if (c.get("word_boxes") or c.get("text", "").strip())]
        # NOTE(review): border_strip_removed accumulates across zones, so
        # this per-zone log line reports a cumulative count — confirm intended.
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            border_strip_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )
    # NOTE(review): the flag is returned unchanged even when strips were
    # removed here — confirm callers expect that.
    return border_prefiltered
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
"""Remove decorative edge columns (alphabet sidebar safety net).
Dictionary pages have A-Z letter sidebars that OCR reads as single-
character word_boxes.
"""
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell)
col_types_ordered = sorted(col_cells.keys())
if len(col_types_ordered) < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if len(edge_cells_list) < 3:
continue
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
avg_len = sum(len(t) for t in texts) / len(texts)
single_char = sum(1 for t in texts if len(t) <= 1)
single_ratio = single_char / len(texts)
if avg_len > 1.5:
continue
if single_ratio < 0.7:
continue
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
avg_len, single_ratio * 100,
)
break # only remove one edge per zone