Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
490 lines
19 KiB
Python
490 lines
19 KiB
Python
"""
|
|
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
|
|
parenthesis fix, IPA phonetic correction, page ref extraction, and
|
|
slash-IPA conversion.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
from cv_color_detect import detect_word_colors
|
|
from cv_ocr_engines import (
|
|
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
|
|
_lookup_ipa,
|
|
)
|
|
from grid_editor_helpers import (
|
|
_detect_heading_rows_by_color,
|
|
_detect_heading_rows_by_single_cell,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _process_text(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    ipa_mode: str,
    page_number_info: Optional[Dict],
) -> Dict[str, Any]:
    """Run color annotation, heading detection, IPA correction, and page refs.

    Args:
        zones_data: List of zone dicts (modified in place).
        img_bgr: BGR image array (or None).
        img_w: Image width.
        img_h: Image height.
        ipa_mode: IPA processing mode ("none" disables IPA handling).
        page_number_info: Existing page number metadata (may be None).

    Returns:
        Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
        skip_ipa, page_number_info.
    """
    # 5. Color annotation on final word_boxes in cells
    if img_bgr is not None:
        all_wb: List[Dict] = []
        for z in zones_data:
            for cell in z.get("cells", []):
                all_wb.extend(cell.get("word_boxes", []))
        detect_word_colors(img_bgr, all_wb)

    # 5a. Heading detection by color + height
    heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
    if heading_count:
        logger.info("Detected %d heading rows by color+height", heading_count)

    # 5b. Fix unmatched parentheses in cell text: a ")" with no matching "("
    # suggests OCR dropped the opening paren at the left cell edge.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if ")" in text and "(" not in text:
                cell["text"] = "(" + text

    # 5c. IPA phonetic correction
    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    en_col_type = None
    ipa_target_cols: set = set()
    all_content_cols: set = set()
    skip_ipa = (ipa_mode == "none")

    # When ipa_mode=none, strip ALL square brackets from ALL content columns
    if skip_ipa:
        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if not ct.startswith("column_"):
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    if not skip_ipa and total_cols >= 3:
        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
            all_cells, total_cols, ipa_mode, zones_data
        )
    elif not skip_ipa:
        # Collect all_content_cols even when <3 cols (needed by finalize)
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                all_content_cols.add(ct)

    # 5e. Heading detection by single-cell rows
    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
    if single_heading_count:
        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)

    # 5f. Strip trailing IPA brackets from headings
    for z in zones_data:
        for cell in z.get("cells", []):
            if cell.get("col_type") != "heading":
                continue
            text = cell.get("text", "")
            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
            if stripped and stripped != text:
                cell["text"] = stripped

    # 5g. Extract page_ref cells and footer rows.
    # BUG FIX: capture the helper's result. When page_number_info starts out
    # as None, a rebind inside the helper is invisible to this caller, so the
    # discovered page number was previously lost. The "or" fallback keeps the
    # passed-in dict if the helper yields nothing (or still returns None).
    page_number_info = (
        _extract_page_refs_and_footers(zones_data, page_number_info)
        or page_number_info
    )

    # 5h. Convert slash-delimited IPA to bracket notation
    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)

    return {
        "en_col_type": en_col_type,
        "ipa_target_cols": ipa_target_cols,
        "all_content_cols": all_content_cols,
        "skip_ipa": skip_ipa,
        "page_number_info": page_number_info,
    }
|
|
|
|
|
|
def _run_ipa_correction(
|
|
all_cells: List[Dict],
|
|
total_cols: int,
|
|
ipa_mode: str,
|
|
zones_data: List[Dict[str, Any]],
|
|
) -> Tuple[Optional[str], set, set]:
|
|
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
|
|
en_col_type = None
|
|
all_content_cols: set = set()
|
|
|
|
# Detect English headword column via IPA signals
|
|
col_ipa_count: Dict[str, int] = {}
|
|
for cell in all_cells:
|
|
ct = cell.get("col_type", "")
|
|
if not ct.startswith("column_"):
|
|
continue
|
|
txt = cell.get("text", "") or ""
|
|
if txt.strip():
|
|
all_content_cols.add(ct)
|
|
if '[' in txt or _text_has_garbled_ipa(txt):
|
|
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
|
|
if col_ipa_count:
|
|
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
|
|
elif ipa_mode == "all":
|
|
col_cell_count: Dict[str, int] = {}
|
|
for cell in all_cells:
|
|
ct = cell.get("col_type", "")
|
|
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
|
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
|
|
if col_cell_count:
|
|
en_col_type = max(col_cell_count, key=col_cell_count.get)
|
|
|
|
# Decide which columns to process based on ipa_mode
|
|
en_ipa_target_cols: set = set()
|
|
de_ipa_target_cols: set = set()
|
|
if ipa_mode in ("auto", "en"):
|
|
if en_col_type:
|
|
en_ipa_target_cols.add(en_col_type)
|
|
elif ipa_mode == "de":
|
|
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
|
|
elif ipa_mode == "all":
|
|
if en_col_type:
|
|
en_ipa_target_cols.add(en_col_type)
|
|
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
|
|
|
|
# --- Strip IPA from columns NOT in the target set ---
|
|
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
|
|
strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
|
|
if strip_en_ipa or ipa_mode == "none":
|
|
strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
|
|
for cell in all_cells:
|
|
ct = cell.get("col_type", "")
|
|
if ct not in strip_cols:
|
|
continue
|
|
text = cell.get("text", "")
|
|
if "[" in text:
|
|
stripped = _SQUARE_BRACKET_RE.sub("", text)
|
|
if stripped != text:
|
|
cell["text"] = stripped.strip()
|
|
cell["_ipa_corrected"] = True
|
|
|
|
# --- English IPA (Britfone + eng_to_ipa) ---
|
|
if en_ipa_target_cols:
|
|
for cell in all_cells:
|
|
ct = cell.get("col_type")
|
|
if ct in en_ipa_target_cols:
|
|
cell["_orig_col_type"] = ct
|
|
cell["col_type"] = "column_en"
|
|
_pre_ipa = {id(c): c.get("text", "") for c in all_cells}
|
|
fix_cell_phonetics(all_cells, pronunciation="british")
|
|
for cell in all_cells:
|
|
orig = cell.pop("_orig_col_type", None)
|
|
if orig:
|
|
cell["col_type"] = orig
|
|
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
|
cell["_ipa_corrected"] = True
|
|
|
|
# --- German IPA (wiki-pronunciation-dict + epitran) ---
|
|
if de_ipa_target_cols:
|
|
from cv_ipa_german import insert_german_ipa
|
|
insert_german_ipa(all_cells, de_ipa_target_cols)
|
|
|
|
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
|
|
|
|
# Mark cells whose text was changed by IPA correction
|
|
for cell in all_cells:
|
|
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
|
|
cell["_ipa_corrected"] = True
|
|
|
|
# 5d. Fix IPA continuation cells
|
|
skip_ipa = (ipa_mode == "none")
|
|
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
|
ipa_cont_fixed = 0
|
|
for z in ([] if skip_ipa else zones_data):
|
|
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
|
|
z_cells = z.get("cells", [])
|
|
for idx, row in enumerate(rows_sorted):
|
|
if idx == 0:
|
|
continue
|
|
ri = row["index"]
|
|
row_cells = [c for c in z_cells if c.get("row_index") == ri]
|
|
for cell in row_cells:
|
|
ct = cell.get("col_type", "")
|
|
if not ct.startswith("column_"):
|
|
continue
|
|
cell_text = (cell.get("text") or "").strip()
|
|
if not cell_text:
|
|
wb_texts = [w.get("text", "")
|
|
for w in cell.get("word_boxes", [])]
|
|
cell_text = " ".join(wb_texts).strip()
|
|
if not cell_text:
|
|
continue
|
|
|
|
is_bracketed = (
|
|
cell_text.startswith('[') and cell_text.endswith(']')
|
|
)
|
|
|
|
if is_bracketed:
|
|
if not _text_has_garbled_ipa(cell_text):
|
|
continue
|
|
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
|
|
continue
|
|
else:
|
|
content_cells_in_row = [
|
|
c for c in row_cells
|
|
if c.get("col_type", "").startswith("column_")
|
|
and c.get("col_type") != "column_1"
|
|
]
|
|
if len(content_cells_in_row) != 1:
|
|
continue
|
|
if not _text_has_garbled_ipa(cell_text):
|
|
continue
|
|
if any(c in _REAL_IPA_CHARS for c in cell_text):
|
|
continue
|
|
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
|
|
if len(_words_in_text) >= 3:
|
|
continue
|
|
|
|
# Find headword in previous row, same column
|
|
prev_ri = rows_sorted[idx - 1]["index"]
|
|
prev_same_col = [
|
|
c for c in z_cells
|
|
if c.get("row_index") == prev_ri
|
|
and c.get("col_type") == ct
|
|
]
|
|
if not prev_same_col:
|
|
continue
|
|
prev_text = prev_same_col[0].get("text", "")
|
|
fixed = fix_ipa_continuation_cell(
|
|
cell_text, prev_text, pronunciation="british",
|
|
)
|
|
if fixed != cell_text:
|
|
cell["text"] = fixed
|
|
ipa_cont_fixed += 1
|
|
logger.info(
|
|
"IPA continuation R%d %s: '%s' -> '%s'",
|
|
ri, ct, cell_text, fixed,
|
|
)
|
|
if ipa_cont_fixed:
|
|
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
|
|
|
|
return en_col_type, ipa_target_cols, all_content_cols
|
|
|
|
|
|
def _extract_page_refs_and_footers(
|
|
zones_data: List[Dict[str, Any]],
|
|
page_number_info: Optional[Dict],
|
|
) -> None:
|
|
"""Extract page_ref cells and footer rows from content zones.
|
|
|
|
Modifies zones_data in place. Updates page_number_info if a page number
|
|
footer is found.
|
|
"""
|
|
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
|
|
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
|
|
_NUMBER_WORDS = {
|
|
"one", "two", "three", "four", "five", "six", "seven",
|
|
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
|
|
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
|
|
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
|
|
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
|
|
"einhundert", "zweihundert", "dreihundert", "vierhundert",
|
|
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
|
|
}
|
|
|
|
for z in zones_data:
|
|
if z.get("zone_type") != "content":
|
|
continue
|
|
cells = z.get("cells", [])
|
|
rows = z.get("rows", [])
|
|
if not rows:
|
|
continue
|
|
|
|
# Extract column_1 cells that look like page references
|
|
page_refs = []
|
|
page_ref_cell_ids = set()
|
|
for cell in cells:
|
|
if cell.get("col_type") != "column_1":
|
|
continue
|
|
text = (cell.get("text") or "").strip()
|
|
if not text:
|
|
continue
|
|
if not _PAGE_REF_RE.match(text):
|
|
continue
|
|
page_refs.append({
|
|
"row_index": cell.get("row_index"),
|
|
"text": text,
|
|
"bbox_pct": cell.get("bbox_pct", {}),
|
|
})
|
|
page_ref_cell_ids.add(cell.get("cell_id"))
|
|
|
|
# Detect footer: last non-header row if it has only 1 cell
|
|
footer_rows = []
|
|
non_header_rows = [r for r in rows if not r.get("is_header")]
|
|
if non_header_rows:
|
|
last_row = non_header_rows[-1]
|
|
last_ri = last_row["index"]
|
|
last_cells = [c for c in z["cells"]
|
|
if c.get("row_index") == last_ri]
|
|
if len(last_cells) == 1:
|
|
text = (last_cells[0].get("text") or "").strip()
|
|
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
|
|
has_commas = ',' in text
|
|
text_words = set(text.lower().split())
|
|
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
|
|
is_page_number = len(text) <= 20 or is_written_number
|
|
if (text and not has_real_ipa and not has_commas
|
|
and is_page_number
|
|
and last_cells[0].get("col_type") != "heading"):
|
|
footer_rows.append({
|
|
"row_index": last_ri,
|
|
"text": text,
|
|
"bbox_pct": last_cells[0].get("bbox_pct", {}),
|
|
})
|
|
|
|
# Classify footer rows
|
|
page_number_footers = []
|
|
other_footers = []
|
|
for fr in footer_rows:
|
|
ft = fr["text"].strip()
|
|
digits = "".join(c for c in ft if c.isdigit())
|
|
if digits and re.match(r'^[\d\s.]+$', ft):
|
|
page_number_footers.append(fr)
|
|
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
|
|
page_number_footers.append(fr)
|
|
else:
|
|
other_footers.append(fr)
|
|
|
|
# Remove page-number footer rows from grid entirely
|
|
if page_number_footers:
|
|
pn_ris = {fr["row_index"] for fr in page_number_footers}
|
|
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
|
|
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
|
|
pn_text = page_number_footers[0]["text"].strip()
|
|
pn_digits = "".join(c for c in pn_text if c.isdigit())
|
|
if not page_number_info:
|
|
page_number_info = {
|
|
"text": pn_text,
|
|
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
|
|
}
|
|
if pn_digits:
|
|
page_number_info["number"] = int(pn_digits)
|
|
|
|
# Mark remaining footer rows
|
|
if other_footers:
|
|
footer_ris = {fr["row_index"] for fr in other_footers}
|
|
for r in z["rows"]:
|
|
if r["index"] in footer_ris:
|
|
r["is_footer"] = True
|
|
for c in z["cells"]:
|
|
if c.get("row_index") in footer_ris:
|
|
c["col_type"] = "footer"
|
|
|
|
if page_refs or footer_rows:
|
|
logger.info(
|
|
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
|
|
len(page_refs), len(footer_rows), len(page_number_footers),
|
|
z.get("zone_index", 0),
|
|
)
|
|
|
|
if page_refs:
|
|
z["page_refs"] = page_refs
|
|
if other_footers:
|
|
z["footer"] = other_footers
|
|
|
|
|
|
def _convert_slash_ipa(
|
|
zones_data: List[Dict[str, Any]],
|
|
skip_ipa: bool,
|
|
en_col_type: Optional[str],
|
|
) -> None:
|
|
"""Convert slash-delimited IPA to bracket notation.
|
|
|
|
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
|
|
"""
|
|
_SLASH_IPA_RE = re.compile(
|
|
r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1)
|
|
r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars
|
|
)
|
|
_STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
|
|
_SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
|
|
slash_ipa_fixed = 0
|
|
|
|
for z in ([] if skip_ipa else zones_data):
|
|
for cell in z.get("cells", []):
|
|
if en_col_type and cell.get("col_type") != en_col_type:
|
|
continue
|
|
text = cell.get("text", "")
|
|
if "/" not in text:
|
|
continue
|
|
|
|
def _replace_slash_ipa(m: re.Match) -> str:
|
|
nonlocal slash_ipa_fixed
|
|
headword = m.group(1)
|
|
ocr_ipa = m.group(2)
|
|
inner_raw = ocr_ipa.strip("/").strip()
|
|
if _SLASH_IPA_REJECT_RE.search(inner_raw):
|
|
return m.group(0)
|
|
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
|
|
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
|
|
if ipa:
|
|
slash_ipa_fixed += 1
|
|
return f"{headword} [{ipa}]"
|
|
inner = inner_raw.lstrip("'").strip()
|
|
if inner:
|
|
slash_ipa_fixed += 1
|
|
return f"{headword} [{inner}]"
|
|
return m.group(0)
|
|
|
|
new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
|
|
|
|
_AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
|
|
|
|
def _replace_trailing_slash(m: re.Match) -> str:
|
|
nonlocal slash_ipa_fixed
|
|
inner = m.group(1).strip("/").strip().lstrip("'").strip()
|
|
if _SLASH_IPA_REJECT_RE.search(inner):
|
|
return m.group(0)
|
|
if inner:
|
|
slash_ipa_fixed += 1
|
|
return f" [{inner}]"
|
|
return m.group(0)
|
|
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
|
|
|
|
if new_text == text:
|
|
m = _STANDALONE_SLASH_IPA_RE.match(text)
|
|
if m:
|
|
inner = m.group(1).strip()
|
|
if not _SLASH_IPA_REJECT_RE.search(inner):
|
|
inner = inner.lstrip("'").strip()
|
|
if inner:
|
|
new_text = "[" + inner + "]" + text[m.end():]
|
|
slash_ipa_fixed += 1
|
|
|
|
if new_text != text:
|
|
cell["text"] = new_text
|
|
|
|
if slash_ipa_fixed:
|
|
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
|