Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
453 lines
16 KiB
Python
453 lines
16 KiB
Python
"""
|
|
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
|
|
dictionary detection, syllable dividers, spell checking, empty column
|
|
removal, and result assembly.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from grid_build_cell_ops import (
|
|
_remove_bullets_and_artifacts,
|
|
_remove_garbled_cells,
|
|
_normalize_word_order,
|
|
_enforce_max_columns,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.

    Returns the final result dict produced by _assemble_result.
    """
    # NOTE: counted before cell cleanup and max_columns enforcement, so this
    # reflects the column layout as originally detected.
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge (best-effort: failure is logged, not fatal) ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction (best-effort) ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    # Import hoisted out of the loop: it is loop-invariant.
    from collections import Counter as _Counter
    for z in zones_data:
        if z.get("zone_type") == "content":
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
|
|
|
|
|
|
def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid.

    Builds pseudo ColumnGeometry objects from each multi-column content
    zone and scores them with _score_dictionary_signals; the result with
    the highest confidence across zones wins.

    Returns:
        Dict with at least "is_dictionary" (bool) and "confidence" (float);
        defaults to not-a-dictionary when detection is skipped or fails.
    """
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        # Both optional imports live inside the try so a missing module
        # degrades gracefully to the default result instead of raising
        # (previously only cv_vocab_types was guarded).
        from cv_layout import _score_dictionary_signals
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            # Need at least two columns and enough cells for a meaningful score.
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    # Cells without word boxes contribute one synthetic word
                    # built from the cell-level text and bbox.
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                # Keep the most confident zone-level verdict.
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
|
|
|
|
|
|
def _insert_syllable_dividers(
|
|
zones_data: List[Dict[str, Any]],
|
|
img_bgr: Any,
|
|
session_id: str,
|
|
syllable_mode: str,
|
|
dict_detection: Dict[str, Any],
|
|
en_col_type: Optional[str],
|
|
all_content_cols: set,
|
|
total_cols: int,
|
|
) -> int:
|
|
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
|
|
syllable_insertions = 0
|
|
if syllable_mode == "none" or img_bgr is None:
|
|
if syllable_mode == "none":
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
t = cell.get("text", "")
|
|
if "|" in t:
|
|
cell["text"] = t.replace("|", "")
|
|
return syllable_insertions
|
|
|
|
_syllable_eligible = False
|
|
if syllable_mode in ("all", "de", "en"):
|
|
_syllable_eligible = True
|
|
elif (dict_detection.get("is_dictionary")
|
|
and dict_detection.get("article_col_index") is not None):
|
|
_syllable_eligible = True
|
|
|
|
_syllable_col_filter: Optional[set] = None
|
|
if syllable_mode == "en":
|
|
_syllable_col_filter = {en_col_type} if en_col_type else set()
|
|
elif syllable_mode == "de":
|
|
if en_col_type and total_cols >= 3:
|
|
_syllable_col_filter = all_content_cols - {en_col_type}
|
|
|
|
if _syllable_eligible:
|
|
try:
|
|
from cv_syllable_detect import insert_syllable_dividers
|
|
force_syllables = (syllable_mode in ("all", "de", "en"))
|
|
syllable_insertions = insert_syllable_dividers(
|
|
zones_data, img_bgr, session_id,
|
|
force=force_syllables,
|
|
col_filter=_syllable_col_filter,
|
|
)
|
|
except Exception as e:
|
|
logger.warning("Syllable insertion failed: %s", e)
|
|
|
|
return syllable_insertions
|
|
|
|
|
|
def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
) -> None:
    """Split merged words using dictionary lookup.

    Mutates cell["text"] in place. For each whitespace token, strips an IPA
    bracket suffix, trailing punctuation, and an apostrophe-contraction (in
    that order), asks _try_split_merged_word whether the remaining alphabetic
    core is two words run together, and reattaches the stripped suffixes to
    the split result. No-op when cv_review / its spell backend is missing.
    """
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            # Spell backend missing: splitting would be guesswork, bail out.
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    # 1) Detach a trailing IPA transcription like "word[ipa]".
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    # 2) Detach trailing punctuation.
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    # 3) Detach a contraction ("don't" -> "don" + "'t");
                    # index >= 2 avoids stripping leading-apostrophe forms.
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                        suffix = contraction + suffix
                    # Only try to split plausible alphabetic cores.
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            # Reattach contraction + punctuation + IPA suffix.
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        # cv_review unavailable: silently skip this best-effort pass.
        pass
|
|
|
|
|
|
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
|
|
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
text = cell.get("text", "")
|
|
if text and "[" in text:
|
|
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
|
|
if fixed != text:
|
|
cell["text"] = fixed
|
|
|
|
|
|
def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all cells.

    Mutates cell["text"] in place for cells the checker corrects. Only
    non-empty cells whose col_type starts with "column_" are checked.
    A missing smart_spell module downgrades to a debug log; any other
    error is logged as a warning. Never raises.
    """
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0

        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue

                # With >= 3 columns and a known English column the language
                # can be pinned per column; otherwise let the checker
                # auto-detect. (Collapsed a redundant elif/else pair that
                # both assigned "auto".)
                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                else:
                    lang = "auto"

                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1

        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)
|
|
|
|
|
|
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove columns that have no cells assigned."""
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
used_col_indices = {c.get("col_index") for c in cells}
|
|
old_cols = z.get("columns", [])
|
|
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
|
|
if len(new_cols) < len(old_cols):
|
|
old_to_new = {}
|
|
for new_i, col in enumerate(new_cols):
|
|
old_i = col.get("col_index", col.get("index", new_i))
|
|
old_to_new[old_i] = new_i
|
|
col["col_index"] = new_i
|
|
col["index"] = new_i
|
|
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
|
|
for cell in cells:
|
|
old_ci = cell.get("col_index", 0)
|
|
cell["col_index"] = old_to_new.get(old_ci, old_ci)
|
|
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
|
|
z["columns"] = new_cols
|
|
|
|
|
|
def _assemble_result(
|
|
zones_data: List[Dict[str, Any]],
|
|
all_words: List[Dict[str, Any]],
|
|
img_w: int,
|
|
img_h: int,
|
|
session_id: str,
|
|
ipa_mode: str,
|
|
syllable_mode: str,
|
|
ipa_target_cols: set,
|
|
skip_ipa: bool,
|
|
dict_detection: Dict[str, Any],
|
|
page_number_info: Optional[Dict],
|
|
boxes_detected: int,
|
|
recovered_count: int,
|
|
duration: float,
|
|
syllable_insertions: int,
|
|
) -> dict:
|
|
"""Build the final result dict (Phase 6)."""
|
|
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
|
|
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
|
|
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
|
|
|
|
# Collect color statistics
|
|
color_stats: Dict[str, int] = {}
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
for wb in cell.get("word_boxes", []):
|
|
cn = wb.get("color_name", "black")
|
|
color_stats[cn] = color_stats.get(cn, 0) + 1
|
|
|
|
# Compute layout metrics
|
|
all_content_row_heights: List[float] = []
|
|
for z in zones_data:
|
|
for row in z.get("rows", []):
|
|
if not row.get("is_header", False):
|
|
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
|
|
if h > 0:
|
|
all_content_row_heights.append(h)
|
|
avg_row_height = (
|
|
sum(all_content_row_heights) / len(all_content_row_heights)
|
|
if all_content_row_heights else 30.0
|
|
)
|
|
font_size_suggestion = max(10, int(avg_row_height * 0.6))
|
|
|
|
return {
|
|
"session_id": session_id,
|
|
"image_width": img_w,
|
|
"image_height": img_h,
|
|
"zones": zones_data,
|
|
"boxes_detected": boxes_detected,
|
|
"summary": {
|
|
"total_zones": len(zones_data),
|
|
"total_columns": total_columns,
|
|
"total_rows": total_rows,
|
|
"total_cells": total_cells,
|
|
"total_words": len(all_words),
|
|
"recovered_colored": recovered_count,
|
|
"color_stats": color_stats,
|
|
},
|
|
"formatting": {
|
|
"bold_columns": [],
|
|
"header_rows": [],
|
|
},
|
|
"layout_metrics": {
|
|
"page_width_px": img_w,
|
|
"page_height_px": img_h,
|
|
"avg_row_height_px": round(avg_row_height, 1),
|
|
"font_size_suggestion_px": font_size_suggestion,
|
|
},
|
|
"dictionary_detection": {
|
|
"is_dictionary": dict_detection.get("is_dictionary", False),
|
|
"confidence": dict_detection.get("confidence", 0.0),
|
|
"signals": dict_detection.get("signals", {}),
|
|
"article_col_index": dict_detection.get("article_col_index"),
|
|
"headword_col_index": dict_detection.get("headword_col_index"),
|
|
},
|
|
"processing_modes": {
|
|
"ipa_mode": ipa_mode,
|
|
"syllable_mode": syllable_mode,
|
|
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
|
"syllables_applied": syllable_insertions > 0,
|
|
},
|
|
"page_number": page_number_info,
|
|
"duration_seconds": round(duration, 2),
|
|
}
|