Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

A sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,489 @@
"""
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
parenthesis fix, IPA phonetic correction, page ref extraction, and
slash-IPA conversion.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from cv_color_detect import detect_word_colors
from cv_ocr_engines import (
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
_lookup_ipa,
)
from grid_editor_helpers import (
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
logger = logging.getLogger(__name__)
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None to skip color annotation).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode ("none" disables all IPA handling).
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)
    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)
    # 5b. Fix unmatched parentheses in cell text — OCR commonly drops an
    # opening parenthesis at the start of a cell.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text
    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")
    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True
    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)
    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
    # 5f. Strip a trailing IPA bracket group from heading cells
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text", "")
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            if stripped and stripped != text:
                cell["text"] = stripped
    # 5g. Extract page_ref cells and footer rows.  Capture any
    # page_number_info dict the helper hands back so metadata built inside
    # it (e.g. when None was passed in) is not lost; fall back to the
    # passed-in value when the helper returns None.
    returned_info = _extract_page_refs_and_footers(zones_data, page_number_info)
    page_number_info = returned_info or page_number_info
    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)
    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
def _run_ipa_correction(
all_cells: List[Dict],
total_cols: int,
ipa_mode: str,
zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
en_col_type = None
all_content_cols: set = set()
# Detect English headword column via IPA signals
col_ipa_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
txt = cell.get("text", "") or ""
if txt.strip():
all_content_cols.add(ct)
if '[' in txt or _text_has_garbled_ipa(txt):
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
if col_ipa_count:
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
elif ipa_mode == "all":
col_cell_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
if col_cell_count:
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode
en_ipa_target_cols: set = set()
de_ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
en_ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
if en_col_type:
en_ipa_target_cols.add(en_col_type)
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
# --- Strip IPA from columns NOT in the target set ---
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
if strip_en_ipa or ipa_mode == "none":
strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
for cell in all_cells:
ct = cell.get("col_type", "")
if ct not in strip_cols:
continue
text = cell.get("text", "")
if "[" in text:
stripped = _SQUARE_BRACKET_RE.sub("", text)
if stripped != text:
cell["text"] = stripped.strip()
cell["_ipa_corrected"] = True
# --- English IPA (Britfone + eng_to_ipa) ---
if en_ipa_target_cols:
for cell in all_cells:
ct = cell.get("col_type")
if ct in en_ipa_target_cols:
cell["_orig_col_type"] = ct
cell["col_type"] = "column_en"
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
fix_cell_phonetics(all_cells, pronunciation="british")
for cell in all_cells:
orig = cell.pop("_orig_col_type", None)
if orig:
cell["col_type"] = orig
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# --- German IPA (wiki-pronunciation-dict + epitran) ---
if de_ipa_target_cols:
from cv_ipa_german import insert_german_ipa
insert_german_ipa(all_cells, de_ipa_target_cols)
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
# Mark cells whose text was changed by IPA correction
for cell in all_cells:
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# 5d. Fix IPA continuation cells
skip_ipa = (ipa_mode == "none")
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
ipa_cont_fixed = 0
for z in ([] if skip_ipa else zones_data):
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
if idx == 0:
continue
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
for cell in row_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_text = (cell.get("text") or "").strip()
if not cell_text:
wb_texts = [w.get("text", "")
for w in cell.get("word_boxes", [])]
cell_text = " ".join(wb_texts).strip()
if not cell_text:
continue
is_bracketed = (
cell_text.startswith('[') and cell_text.endswith(']')
)
if is_bracketed:
if not _text_has_garbled_ipa(cell_text):
continue
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
else:
content_cells_in_row = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_cells_in_row) != 1:
continue
if not _text_has_garbled_ipa(cell_text):
continue
if any(c in _REAL_IPA_CHARS for c in cell_text):
continue
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
if len(_words_in_text) >= 3:
continue
# Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"]
prev_same_col = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == ct
]
if not prev_same_col:
continue
prev_text = prev_same_col[0].get("text", "")
fixed = fix_ipa_continuation_cell(
cell_text, prev_text, pronunciation="british",
)
if fixed != cell_text:
cell["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d %s: '%s' -> '%s'",
ri, ct, cell_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
return en_col_type, ipa_target_cols, all_content_cols
def _extract_page_refs_and_footers(
zones_data: List[Dict[str, Any]],
page_number_info: Optional[Dict],
) -> None:
"""Extract page_ref cells and footer rows from content zones.
Modifies zones_data in place. Updates page_number_info if a page number
footer is found.
"""
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
for z in zones_data:
if z.get("zone_type") != "content":
continue
cells = z.get("cells", [])
rows = z.get("rows", [])
if not rows:
continue
# Extract column_1 cells that look like page references
page_refs = []
page_ref_cell_ids = set()
for cell in cells:
if cell.get("col_type") != "column_1":
continue
text = (cell.get("text") or "").strip()
if not text:
continue
if not _PAGE_REF_RE.match(text):
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,
"bbox_pct": cell.get("bbox_pct", {}),
})
page_ref_cell_ids.add(cell.get("cell_id"))
# Detect footer: last non-header row if it has only 1 cell
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
last_row = non_header_rows[-1]
last_ri = last_row["index"]
last_cells = [c for c in z["cells"]
if c.get("row_index") == last_ri]
if len(last_cells) == 1:
text = (last_cells[0].get("text") or "").strip()
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
has_commas = ',' in text
text_words = set(text.lower().split())
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
is_page_number = len(text) <= 20 or is_written_number
if (text and not has_real_ipa and not has_commas
and is_page_number
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Classify footer rows
page_number_footers = []
other_footers = []
for fr in footer_rows:
ft = fr["text"].strip()
digits = "".join(c for c in ft if c.isdigit())
if digits and re.match(r'^[\d\s.]+$', ft):
page_number_footers.append(fr)
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
page_number_footers.append(fr)
else:
other_footers.append(fr)
# Remove page-number footer rows from grid entirely
if page_number_footers:
pn_ris = {fr["row_index"] for fr in page_number_footers}
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
pn_text = page_number_footers[0]["text"].strip()
pn_digits = "".join(c for c in pn_text if c.isdigit())
if not page_number_info:
page_number_info = {
"text": pn_text,
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
}
if pn_digits:
page_number_info["number"] = int(pn_digits)
# Mark remaining footer rows
if other_footers:
footer_ris = {fr["row_index"] for fr in other_footers}
for r in z["rows"]:
if r["index"] in footer_ris:
r["is_footer"] = True
for c in z["cells"]:
if c.get("row_index") in footer_ris:
c["col_type"] = "footer"
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
len(page_refs), len(footer_rows), len(page_number_footers),
z.get("zone_index", 0),
)
if page_refs:
z["page_refs"] = page_refs
if other_footers:
z["footer"] = other_footers
def _convert_slash_ipa(
zones_data: List[Dict[str, Any]],
skip_ipa: bool,
en_col_type: Optional[str],
) -> None:
"""Convert slash-delimited IPA to bracket notation.
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
"""
_SLASH_IPA_RE = re.compile(
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
)
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
slash_ipa_fixed = 0
for z in ([] if skip_ipa else zones_data):
for cell in z.get("cells", []):
if en_col_type and cell.get("col_type") != en_col_type:
continue
text = cell.get("text", "")
if "/" not in text:
continue
def _replace_slash_ipa(m: re.Match) -> str:
nonlocal slash_ipa_fixed
headword = m.group(1)
ocr_ipa = m.group(2)
inner_raw = ocr_ipa.strip("/").strip()
if _SLASH_IPA_REJECT_RE.search(inner_raw):
return m.group(0)
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
if ipa:
slash_ipa_fixed += 1
return f"{headword} [{ipa}]"
inner = inner_raw.lstrip("'").strip()
if inner:
slash_ipa_fixed += 1
return f"{headword} [{inner}]"
return m.group(0)
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
def _replace_trailing_slash(m: re.Match) -> str:
nonlocal slash_ipa_fixed
inner = m.group(1).strip("/").strip().lstrip("'").strip()
if _SLASH_IPA_REJECT_RE.search(inner):
return m.group(0)
if inner:
slash_ipa_fixed += 1
return f" [{inner}]"
return m.group(0)
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip()
if not _SLASH_IPA_REJECT_RE.search(inner):
inner = inner.lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
if new_text != text:
cell["text"] = new_text
if slash_ipa_fixed:
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)