Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files):
  grid/build/  — core, zones, cleanup, text_ops, cell_ops, finalize
  grid/editor/ — api, helpers, columns, filters, headers, zones

vocab/ package (10 files):
  vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare
  vocab/           — session_store, learn_bridge

26 backward-compat shims. Internal imports relative. RAG untouched.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
@@ -1,452 +1,4 @@
"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Optional

from grid_build_cell_ops import (
    _remove_bullets_and_artifacts,
    _remove_garbled_cells,
    _normalize_word_order,
    _enforce_max_columns,
)

logger = logging.getLogger(__name__)

def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.
    """
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    for z in zones_data:
        if z.get("zone_type") == "content":
            from collections import Counter as _Counter
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )
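
Note: the debug log in the loop above reports cells-per-column as a sorted dict built from a Counter. A tiny standalone sketch of what that expression produces (toy data, not from the service):

    from collections import Counter

    cells = [{"col_index": 0}, {"col_index": 0}, {"col_index": 2}]
    cc = Counter(c.get("col_index") for c in cells)
    assert dict(sorted(cc.items())) == {0: 2, 2: 1}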

def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid."""
    from cv_layout import _score_dictionary_signals
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection
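
Note: the per-zone results fold into a single winner by confidence. A minimal standalone sketch of that accumulation pattern (toy values, not from the service):

    detections = [
        {"is_dictionary": True, "confidence": 0.62},
        {"is_dictionary": True, "confidence": 0.87},
        {"is_dictionary": False, "confidence": 0.10},
    ]
    best = {"is_dictionary": False, "confidence": 0.0}  # same default as above
    for dd in detections:
        if dd["confidence"] > best["confidence"]:
            best = dd
    assert best["confidence"] == 0.87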

def _insert_syllable_dividers(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    session_id: str,
    syllable_mode: str,
    dict_detection: Dict[str, Any],
    en_col_type: Optional[str],
    all_content_cols: set,
    total_cols: int,
) -> int:
    """Insert syllable dividers for dictionary pages. Returns insertion count."""
    syllable_insertions = 0
    if syllable_mode == "none" or img_bgr is None:
        if syllable_mode == "none":
            for z in zones_data:
                for cell in z.get("cells", []):
                    t = cell.get("text", "")
                    if "|" in t:
                        cell["text"] = t.replace("|", "")
        return syllable_insertions

    _syllable_eligible = False
    if syllable_mode in ("all", "de", "en"):
        _syllable_eligible = True
    elif (dict_detection.get("is_dictionary")
            and dict_detection.get("article_col_index") is not None):
        _syllable_eligible = True

    _syllable_col_filter: Optional[set] = None
    if syllable_mode == "en":
        _syllable_col_filter = {en_col_type} if en_col_type else set()
    elif syllable_mode == "de":
        if en_col_type and total_cols >= 3:
            _syllable_col_filter = all_content_cols - {en_col_type}

    if _syllable_eligible:
        try:
            from cv_syllable_detect import insert_syllable_dividers
            force_syllables = (syllable_mode in ("all", "de", "en"))
            syllable_insertions = insert_syllable_dividers(
                zones_data, img_bgr, session_id,
                force=force_syllables,
                col_filter=_syllable_col_filter,
            )
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)

    return syllable_insertions
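
Note: the mode-to-column-filter derivation above is the subtle part. Here it is lifted into a standalone helper with the same semantics, as a runnable sketch for illustration (not part of the commit):

    from typing import Optional, Set

    def syllable_col_filter(
        mode: str,
        en_col_type: Optional[str],
        all_content_cols: Set[str],
        total_cols: int,
    ) -> Optional[Set[str]]:
        # None means "no filter": every content column is eligible.
        if mode == "en":
            return {en_col_type} if en_col_type else set()
        if mode == "de" and en_col_type and total_cols >= 3:
            return all_content_cols - {en_col_type}
        return None

    assert syllable_col_filter("en", "column_2", {"column_1", "column_2"}, 2) == {"column_2"}
    assert syllable_col_filter("de", "column_2", {"column_1", "column_2", "column_3"}, 3) == {"column_1", "column_3"}
    assert syllable_col_filter("de", None, {"column_1", "column_2"}, 2) is None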

def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
) -> None:
    """Split merged words using dictionary lookup."""
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                    suffix = contraction + suffix
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        pass
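
Note: the token handling above peels suffixes in a fixed order: IPA bracket first, then trailing punctuation, then a contraction. A pure-string sketch of that order, runnable on its own (illustrative only, mirrors the logic without the dictionary lookup):

    def peel(token: str):
        # Same peeling order as _split_merged_words above.
        clean, suffix_ipa = token, ""
        bp = clean.find('[')
        if bp > 0:
            clean, suffix_ipa = clean[:bp], clean[bp:]
        stripped = clean.rstrip(".,!?;:'\")")
        suffix_punct = clean[len(stripped):]
        clean = stripped
        suffix = suffix_punct + suffix_ipa
        if "'" in clean and clean.index("'") >= 2:
            ap = clean.index("'")
            suffix = clean[ap:] + suffix
            clean = clean[:ap]
        return clean, suffix

    assert peel("doesn't") == ("doesn", "'t")
    assert peel("word[vɜːd]") == ("word", "[vɜːd]")
    assert peel("Haus,") == ("Haus", ",")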

def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
    """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if text and "[" in text:
                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
                if fixed != text:
                    cell["text"] = fixed

def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all cells."""
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0

        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue

                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                elif total_cols <= 2:
                    lang = "auto"
                else:
                    lang = "auto"

                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1

        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)

def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Remove columns that have no cells assigned."""
    for z in zones_data:
        cells = z.get("cells", [])
        used_col_indices = {c.get("col_index") for c in cells}
        old_cols = z.get("columns", [])
        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
        if len(new_cols) < len(old_cols):
            old_to_new = {}
            for new_i, col in enumerate(new_cols):
                old_i = col.get("col_index", col.get("index", new_i))
                old_to_new[old_i] = new_i
                col["col_index"] = new_i
                col["index"] = new_i
                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
            for cell in cells:
                old_ci = cell.get("col_index", 0)
                cell["col_index"] = old_to_new.get(old_ci, old_ci)
                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
            z["columns"] = new_cols

def _assemble_result(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    session_id: str,
    ipa_mode: str,
    syllable_mode: str,
    ipa_target_cols: set,
    skip_ipa: bool,
    dict_detection: Dict[str, Any],
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
    syllable_insertions: int,
) -> dict:
    """Build the final result dict (Phase 6)."""
    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
    total_rows = sum(len(z.get("rows", [])) for z in zones_data)

    # Collect color statistics
    color_stats: Dict[str, int] = {}
    for z in zones_data:
        for cell in z.get("cells", []):
            for wb in cell.get("word_boxes", []):
                cn = wb.get("color_name", "black")
                color_stats[cn] = color_stats.get(cn, 0) + 1

    # Compute layout metrics
    all_content_row_heights: List[float] = []
    for z in zones_data:
        for row in z.get("rows", []):
            if not row.get("is_header", False):
                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
                if h > 0:
                    all_content_row_heights.append(h)
    avg_row_height = (
        sum(all_content_row_heights) / len(all_content_row_heights)
        if all_content_row_heights else 30.0
    )
    font_size_suggestion = max(10, int(avg_row_height * 0.6))

    return {
        "session_id": session_id,
        "image_width": img_w,
        "image_height": img_h,
        "zones": zones_data,
        "boxes_detected": boxes_detected,
        "summary": {
            "total_zones": len(zones_data),
            "total_columns": total_columns,
            "total_rows": total_rows,
            "total_cells": total_cells,
            "total_words": len(all_words),
            "recovered_colored": recovered_count,
            "color_stats": color_stats,
        },
        "formatting": {
            "bold_columns": [],
            "header_rows": [],
        },
        "layout_metrics": {
            "page_width_px": img_w,
            "page_height_px": img_h,
            "avg_row_height_px": round(avg_row_height, 1),
            "font_size_suggestion_px": font_size_suggestion,
        },
        "dictionary_detection": {
            "is_dictionary": dict_detection.get("is_dictionary", False),
            "confidence": dict_detection.get("confidence", 0.0),
            "signals": dict_detection.get("signals", {}),
            "article_col_index": dict_detection.get("article_col_index"),
            "headword_col_index": dict_detection.get("headword_col_index"),
        },
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
            "syllables_applied": syllable_insertions > 0,
        },
        "page_number": page_number_info,
        "duration_seconds": round(duration, 2),
    }
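
Note: worked example of the font-size heuristic in layout_metrics: an average content-row height of 40 px suggests int(40 * 0.6) = 24 px, and very short rows are floored at 10 px:

    assert max(10, int(40 * 0.6)) == 24
    assert max(10, int(12 * 0.6)) == 10  # 7.2 px would be unreadably small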

# Backward-compat shim -- module moved to grid/build/finalize.py
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("grid.build.finalize")
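
Note: this sys.modules swap is what makes the 26 shims mentioned in the commit message work: executing the old flat module replaces its own entry in sys.modules with the relocated package module, so the import system hands callers the relocated module under the old name. A minimal sketch of what callers observe, assuming the package layout named in this commit:

    import grid_build_finalize        # runs the shim above
    from grid.build import finalize   # the relocated module

    # Both names now refer to one and the same module object.
    assert grid_build_finalize is finalize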