Files
breakpilot-lehrer/klausur-service/backend/grid_build_finalize.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

453 lines
16 KiB
Python

"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from grid_build_cell_ops import (
_remove_bullets_and_artifacts,
_remove_garbled_cells,
_normalize_word_order,
_enforce_max_columns,
)
logger = logging.getLogger(__name__)
def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.

    Steps run in a fixed order: cell-level cleanup first, then grid-level
    detection and corrections, then empty-column pruning, then assembly.
    ``zones_data`` is mutated in place throughout.

    Args:
        zones_data: Mutable zone dicts (cells/columns/rows); modified in place.
        all_words: All detected word dicts; used only for the summary count.
        img_bgr: Source page image (or None); consumed by syllable detection.
        img_w: Page width in pixels.
        img_h: Page height in pixels.
        session_id: Session identifier, used in log messages.
        max_columns: If truthy and > 0, merge narrowest columns down to this.
        ipa_mode: IPA processing mode, passed through to the result.
        syllable_mode: Syllable mode ("none"/"all"/"de"/"en"/auto-detect).
        en_col_type: col_type of the English column, if known.
        ipa_target_cols: Columns targeted for IPA (reported in result).
        all_content_cols: All content col_types; used for syllable filtering.
        skip_ipa: When True, the result reports ipa_applied=False.
        document_category: Hint fed into dictionary detection.
        margin_strip_detected: Hint fed into dictionary detection.
        page_number_info: Detected page-number info, passed through.
        boxes_detected: Count of detected boxes, passed through.
        recovered_count: Count of recovered colored words, passed through.
        duration: Pipeline duration in seconds, passed through.

    Returns:
        The final result dict produced by _assemble_result.
    """
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)
    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)
    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)
    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)
    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )
    # --- Word-gap merge (best-effort: failure only logs a warning) ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)
    # --- Pipe auto-correction (best-effort) ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)
    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )
    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)
    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)
    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)
    # --- Debug log cell counts per column (before empty-column removal) ---
    for z in zones_data:
        if z.get("zone_type") == "content":
            from collections import Counter as _Counter
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )
    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)
    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)
    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid.

    For each zone with at least 2 columns and 10 cells, builds pseudo
    ColumnGeometry objects from the zone's cells/word_boxes and scores
    them with cv_layout._score_dictionary_signals. The detection with
    the highest confidence across all zones wins.

    Args:
        zones_data: Zone dicts with "cells" and "columns".
        img_w: Page width in pixels (for column width ratios).
        img_h: Page height in pixels (pseudo-geometry height).
        document_category: Category hint forwarded to the scorer.
        margin_strip_detected: Margin-strip hint forwarded to the scorer.

    Returns:
        Detection dict (at least "is_dictionary" and "confidence");
        the default {"is_dictionary": False, "confidence": 0.0} when
        no zone qualifies or detection fails.
    """
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        # Both imports live inside the try so that a missing/broken CV
        # module degrades to the default result with a warning — consistent
        # with the other best-effort steps in this file — instead of
        # raising ImportError out of the grid build.
        from cv_layout import _score_dictionary_signals
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            # Too little data to score reliably — skip this zone.
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    # Cells without word boxes still contribute their text
                    # as one pseudo-word derived from the cell bbox.
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                # Keep the best-scoring zone's detection.
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
def _insert_syllable_dividers(
zones_data: List[Dict[str, Any]],
img_bgr: Any,
session_id: str,
syllable_mode: str,
dict_detection: Dict[str, Any],
en_col_type: Optional[str],
all_content_cols: set,
total_cols: int,
) -> int:
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
syllable_insertions = 0
if syllable_mode == "none" or img_bgr is None:
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
return syllable_insertions
_syllable_eligible = False
if syllable_mode in ("all", "de", "en"):
_syllable_eligible = True
elif (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None):
_syllable_eligible = True
_syllable_col_filter: Optional[set] = None
if syllable_mode == "en":
_syllable_col_filter = {en_col_type} if en_col_type else set()
elif syllable_mode == "de":
if en_col_type and total_cols >= 3:
_syllable_col_filter = all_content_cols - {en_col_type}
if _syllable_eligible:
try:
from cv_syllable_detect import insert_syllable_dividers
force_syllables = (syllable_mode in ("all", "de", "en"))
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
force=force_syllables,
col_filter=_syllable_col_filter,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
return syllable_insertions
def _split_merged_words(
zones_data: List[Dict[str, Any]],
session_id: str,
) -> None:
"""Split merged words using dictionary lookup."""
try:
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
if not _SPELL_AVAILABLE:
return
split_count = 0
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if not text:
continue
parts = []
changed = False
for token in text.split():
clean = token
bracket_pos = clean.find('[')
suffix_ipa = ""
if bracket_pos > 0:
suffix_ipa = clean[bracket_pos:]
clean = clean[:bracket_pos]
suffix_punct = ""
stripped = clean.rstrip(".,!?;:'\")")
if stripped != clean:
suffix_punct = clean[len(stripped):]
clean = stripped
suffix = suffix_punct + suffix_ipa
contraction = ""
if "'" in clean and clean.index("'") >= 2:
apos_pos = clean.index("'")
contraction = clean[apos_pos:]
clean = clean[:apos_pos]
suffix = contraction + suffix
if len(clean) >= 4 and clean.isalpha():
split = _try_split_merged_word(clean)
if split:
parts.append(split + suffix)
changed = True
continue
parts.append(token)
if changed:
cell["text"] = " ".join(parts)
split_count += 1
if split_count:
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
except ImportError:
pass
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if text and "[" in text:
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
if fixed != text:
cell["text"] = fixed
def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all content-column cells.

    Cell text is corrected in place; a debug message is logged when the
    checker module is unavailable, a warning on any other failure.
    """
    try:
        from smart_spell import SmartSpellChecker
        checker = SmartSpellChecker()
        fixed = 0
        for zone in zones_data:
            for cell in zone.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                col_type = cell.get("col_type", "")
                # Only spell-check regular content columns.
                if not col_type.startswith("column_"):
                    continue
                # With >=3 columns and a known English column the language
                # is decided per column; otherwise let the checker detect.
                if total_cols >= 3 and en_col_type:
                    lang = "en" if col_type == en_col_type else "de"
                else:
                    lang = "auto"
                outcome = checker.correct_text(text, lang=lang)
                if outcome.changed:
                    cell["text"] = outcome.corrected
                    fixed += 1
        if fixed:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, fixed,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Remove columns that have no cells assigned."""
for z in zones_data:
cells = z.get("cells", [])
used_col_indices = {c.get("col_index") for c in cells}
old_cols = z.get("columns", [])
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
if len(new_cols) < len(old_cols):
old_to_new = {}
for new_i, col in enumerate(new_cols):
old_i = col.get("col_index", col.get("index", new_i))
old_to_new[old_i] = new_i
col["col_index"] = new_i
col["index"] = new_i
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
for cell in cells:
old_ci = cell.get("col_index", 0)
cell["col_index"] = old_to_new.get(old_ci, old_ci)
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
z["columns"] = new_cols
def _assemble_result(
zones_data: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
img_w: int,
img_h: int,
session_id: str,
ipa_mode: str,
syllable_mode: str,
ipa_target_cols: set,
skip_ipa: bool,
dict_detection: Dict[str, Any],
page_number_info: Optional[Dict],
boxes_detected: int,
recovered_count: int,
duration: float,
syllable_insertions: int,
) -> dict:
"""Build the final result dict (Phase 6)."""
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
# Collect color statistics
color_stats: Dict[str, int] = {}
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
cn = wb.get("color_name", "black")
color_stats[cn] = color_stats.get(cn, 0) + 1
# Compute layout metrics
all_content_row_heights: List[float] = []
for z in zones_data:
for row in z.get("rows", []):
if not row.get("is_header", False):
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
if h > 0:
all_content_row_heights.append(h)
avg_row_height = (
sum(all_content_row_heights) / len(all_content_row_heights)
if all_content_row_heights else 30.0
)
font_size_suggestion = max(10, int(avg_row_height * 0.6))
return {
"session_id": session_id,
"image_width": img_w,
"image_height": img_h,
"zones": zones_data,
"boxes_detected": boxes_detected,
"summary": {
"total_zones": len(zones_data),
"total_columns": total_columns,
"total_rows": total_rows,
"total_cells": total_cells,
"total_words": len(all_words),
"recovered_colored": recovered_count,
"color_stats": color_stats,
},
"formatting": {
"bold_columns": [],
"header_rows": [],
},
"layout_metrics": {
"page_width_px": img_w,
"page_height_px": img_h,
"avg_row_height_px": round(avg_row_height, 1),
"font_size_suggestion_px": font_size_suggestion,
},
"dictionary_detection": {
"is_dictionary": dict_detection.get("is_dictionary", False),
"confidence": dict_detection.get("confidence", 0.0),
"signals": dict_detection.get("signals", {}),
"article_col_index": dict_detection.get("article_col_index"),
"headword_col_index": dict_detection.get("headword_col_index"),
},
"processing_modes": {
"ipa_mode": ipa_mode,
"syllable_mode": syllable_mode,
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
"syllables_applied": syllable_insertions > 0,
},
"page_number": page_number_info,
"duration_seconds": round(duration, 2),
}