Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

A sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,489 @@
"""
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
parenthesis fix, IPA phonetic correction, page ref extraction, and
slash-IPA conversion.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from cv_color_detect import detect_word_colors
from cv_ocr_engines import (
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
_lookup_ipa,
)
from grid_editor_helpers import (
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
logger = logging.getLogger(__name__)
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None to skip color annotation).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode ("none" disables all IPA handling).
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)
    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)
    # 5b. Fix unmatched parentheses in cell text — OCR commonly drops an
    # opening parenthesis at the start of a cell.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text
    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")
    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True
    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)
    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
    # 5f. Strip a trailing IPA bracket group from heading cells
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text", "")
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            if stripped and stripped != text:
                cell["text"] = stripped
    # 5g. Extract page_ref cells and footer rows.  Capture any
    # page_number_info dict the helper hands back so metadata built inside
    # it (e.g. when None was passed in) is not lost; fall back to the
    # passed-in value when the helper returns None.
    returned_info = _extract_page_refs_and_footers(zones_data, page_number_info)
    page_number_info = returned_info or page_number_info
    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)
    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
def _run_ipa_correction(
all_cells: List[Dict],
total_cols: int,
ipa_mode: str,
zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
en_col_type = None
all_content_cols: set = set()
# Detect English headword column via IPA signals
col_ipa_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
txt = cell.get("text", "") or ""
if txt.strip():
all_content_cols.add(ct)
if '[' in txt or _text_has_garbled_ipa(txt):
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
if col_ipa_count:
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
elif ipa_mode == "all":
col_cell_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
if col_cell_count:
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode
en_ipa_target_cols: set = set()
de_ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
en_ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
if en_col_type:
en_ipa_target_cols.add(en_col_type)
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
# --- Strip IPA from columns NOT in the target set ---
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
if strip_en_ipa or ipa_mode == "none":
strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
for cell in all_cells:
ct = cell.get("col_type", "")
if ct not in strip_cols:
continue
text = cell.get("text", "")
if "[" in text:
stripped = _SQUARE_BRACKET_RE.sub("", text)
if stripped != text:
cell["text"] = stripped.strip()
cell["_ipa_corrected"] = True
# --- English IPA (Britfone + eng_to_ipa) ---
if en_ipa_target_cols:
for cell in all_cells:
ct = cell.get("col_type")
if ct in en_ipa_target_cols:
cell["_orig_col_type"] = ct
cell["col_type"] = "column_en"
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells:
orig = cell.pop("_orig_col_type", None)
if orig:
cell["col_type"] = orig
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# --- German IPA (wiki-pronunciation-dict + epitran) ---
if de_ipa_target_cols:
from cv_ipa_german import insert_german_ipa
insert_german_ipa(all_cells, de_ipa_target_cols)
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
# Mark cells whose text was changed by IPA correction
for cell in all_cells:
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# 5d. Fix IPA continuation cells
skip_ipa = (ipa_mode == "none")
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
ipa_cont_fixed = 0
for z in ([] if skip_ipa else zones_data):
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
if idx == 0:
continue
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
for cell in row_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_text = (cell.get("text") or "").strip()
if not cell_text:
wb_texts = [w.get("text", "")
for w in cell.get("word_boxes", [])]
cell_text = " ".join(wb_texts).strip()
if not cell_text:
continue
is_bracketed = (
cell_text.startswith('[') and cell_text.endswith(']')
)
if is_bracketed:
if not _text_has_garbled_ipa(cell_text):
continue
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
else:
content_cells_in_row = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_cells_in_row) != 1:
continue
if not _text_has_garbled_ipa(cell_text):
continue
if any(c in _REAL_IPA_CHARS for c in cell_text):
continue
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
if len(_words_in_text) >= 3:
continue
# Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"]
prev_same_col = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == ct
]
if not prev_same_col:
continue
prev_text = prev_same_col[0].get("text", "")
fixed = fix_ipa_continuation_cell(
cell_text, prev_text, pronunciation="british",
)
if fixed != cell_text:
cell["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d %s: '%s' -> '%s'",
ri, ct, cell_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
return en_col_type, ipa_target_cols, all_content_cols
def _extract_page_refs_and_footers(
zones_data: List[Dict[str, Any]],
page_number_info: Optional[Dict],
) -> None:
"""Extract page_ref cells and footer rows from content zones.
Modifies zones_data in place. Updates page_number_info if a page number
footer is found.
"""
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
for z in zones_data:
if z.get("zone_type") != "content":
continue
cells = z.get("cells", [])
rows = z.get("rows", [])
if not rows:
continue
# Extract column_1 cells that look like page references
page_refs = []
page_ref_cell_ids = set()
for cell in cells:
if cell.get("col_type") != "column_1":
continue
text = (cell.get("text") or "").strip()
if not text:
continue
if not _PAGE_REF_RE.match(text):
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,
"bbox_pct": cell.get("bbox_pct", {}),
})
page_ref_cell_ids.add(cell.get("cell_id"))
# Detect footer: last non-header row if it has only 1 cell
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
last_row = non_header_rows[-1]
last_ri = last_row["index"]
last_cells = [c for c in z["cells"]
if c.get("row_index") == last_ri]
if len(last_cells) == 1:
text = (last_cells[0].get("text") or "").strip()
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
has_commas = ',' in text
text_words = set(text.lower().split())
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
is_page_number = len(text) <= 20 or is_written_number
if (text and not has_real_ipa and not has_commas
and is_page_number
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Classify footer rows
page_number_footers = []
other_footers = []
for fr in footer_rows:
ft = fr["text"].strip()
digits = "".join(c for c in ft if c.isdigit())
if digits and re.match(r'^[\d\s.]+$', ft):
page_number_footers.append(fr)
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
page_number_footers.append(fr)
else:
other_footers.append(fr)
# Remove page-number footer rows from grid entirely
if page_number_footers:
pn_ris = {fr["row_index"] for fr in page_number_footers}
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
pn_text = page_number_footers[0]["text"].strip()
pn_digits = "".join(c for c in pn_text if c.isdigit())
if not page_number_info:
page_number_info = {
"text": pn_text,
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
}
if pn_digits:
page_number_info["number"] = int(pn_digits)
# Mark remaining footer rows
if other_footers:
footer_ris = {fr["row_index"] for fr in other_footers}
for r in z["rows"]:
if r["index"] in footer_ris:
r["is_footer"] = True
for c in z["cells"]:
if c.get("row_index") in footer_ris:
c["col_type"] = "footer"
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
len(page_refs), len(footer_rows), len(page_number_footers),
z.get("zone_index", 0),
)
if page_refs:
z["page_refs"] = page_refs
if other_footers:
z["footer"] = other_footers
def _convert_slash_ipa(
zones_data: List[Dict[str, Any]],
skip_ipa: bool,
en_col_type: Optional[str],
) -> None:
"""Convert slash-delimited IPA to bracket notation.
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
"""
_SLASH_IPA_RE = re.compile(
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
slash_ipa_fixed = 0
for z in ([] if skip_ipa else zones_data):
for cell in z.get("cells", []):
if en_col_type and cell.get("col_type") != en_col_type:
continue
text = cell.get("text", "")
if "/" not in text:
continue
def _replace_slash_ipa(m: re.Match) -> str:
nonlocal slash_ipa_fixed
headword = m.group(1)
ocr_ipa = m.group(2)
inner_raw = ocr_ipa.strip("/").strip()
if _SLASH_IPA_REJECT_RE.search(inner_raw):
return m.group(0)
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
if ipa:
slash_ipa_fixed += 1
return f"{headword} [{ipa}]"
inner = inner_raw.lstrip("'").strip()
if inner:
slash_ipa_fixed += 1
return f"{headword} [{inner}]"
return m.group(0)
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
def _replace_trailing_slash(m: re.Match) -> str:
nonlocal slash_ipa_fixed
inner = m.group(1).strip("/").strip().lstrip("'").strip()
if _SLASH_IPA_REJECT_RE.search(inner):
return m.group(0)
if inner:
slash_ipa_fixed += 1
return f" [{inner}]"
return m.group(0)
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip()
if not _SLASH_IPA_REJECT_RE.search(inner):
inner = inner.lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
if new_text != text:
cell["text"] = new_text
if slash_ipa_fixed:
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)