Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
214 lines
7.8 KiB
Python
214 lines
7.8 KiB
Python
"""
|
|
Grid Build Core — the main _build_grid_core() function.
|
|
|
|
Extracted from grid_editor_api.py for maintainability.
|
|
Takes merged OCR word positions and builds a structured, zone-aware grid.
|
|
|
|
The function delegates to phase-specific modules:
|
|
- grid_build_zones.py — image loading, graphic/box detection, zone grids
|
|
- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips
|
|
- grid_build_text_ops.py — color, headings, IPA, page refs
|
|
- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from grid_editor_helpers import (
|
|
_flatten_word_boxes,
|
|
_get_content_bounds,
|
|
_filter_decorative_margin,
|
|
_filter_footer_words,
|
|
_filter_header_junk,
|
|
)
|
|
|
|
from grid_build_zones import _build_zones
|
|
from grid_build_cleanup import _cleanup_zones
|
|
from grid_build_text_ops import _process_text
|
|
from grid_build_finalize import _finalize_grid
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _rects_from_structure(
    structure_result: Optional[Dict[str, Any]],
    key: str,
) -> List[Dict[str, int]]:
    """Return the x/y/w/h rectangles stored under *key* in the structure result.

    A missing structure result, a missing key, or a key explicitly set to
    ``None`` all yield an empty list. Only the four geometry keys are copied.
    """
    if not structure_result:
        return []
    return [
        {"x": r["x"], "y": r["y"], "w": r["w"], "h": r["h"]}
        for r in structure_result.get(key) or []
    ]


def _word_center_in_rects(
    word: Dict[str, Any],
    rects: List[Dict[str, int]],
) -> bool:
    """Return True if the word box's center lies inside any rect (edges inclusive)."""
    cx = word["left"] + word.get("width", 0) / 2
    cy = word["top"] + word.get("height", 0) / 2
    return any(
        r["x"] <= cx <= r["x"] + r["w"] and r["y"] <= cy <= r["y"] + r["h"]
        for r in rects
    )


async def _build_grid_core(
    session_id: str,
    session: dict,
    *,
    ipa_mode: str = "auto",
    syllable_mode: str = "auto",
    enhance: bool = True,
    max_columns: Optional[int] = None,
    min_conf: Optional[int] = None,
) -> dict:
    """Core grid building logic — pure computation, no HTTP or DB side effects.

    Args:
        session_id: Session identifier (for logging and image loading).
        session: Full session dict from get_session_db(). This function reads
            the keys "word_result", "document_category" and
            "structure_result" from it.
        ipa_mode: "auto" (only when English headwords detected), "all"
            (force IPA on all content columns), "en" (English column only),
            "de" (German/definition columns only), or "none" (skip entirely).
        syllable_mode: "auto" (only when original has pipe dividers),
            "all" (force syllabification on all words), "en" (English only),
            "de" (German only), or "none" (skip).
        enhance: Only written to the log line inside this function; it does
            not influence any step performed here.
        max_columns: Forwarded unchanged to _finalize_grid().
        min_conf: When set to a positive value, word boxes whose "conf" is
            below it are dropped. Boxes without a "conf" entry count as 100.

    Returns:
        StructuredGrid result dict as produced by _finalize_grid().

    Raises:
        ValueError: If the session has no word_result or it has no cells, if
            the image width/height is missing or zero, or if flattening the
            cells yields no word boxes.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    started = time.perf_counter()

    # ── Phase 1: Input Validation & Word Filtering ──────────────────

    # 1. Validate and load word results
    word_result = session.get("word_result")
    if not word_result or not word_result.get("cells"):
        raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")

    img_w = word_result.get("image_width", 0)
    img_h = word_result.get("image_height", 0)
    if not img_w or not img_h:
        raise ValueError("Missing image dimensions in word_result")

    # 2. Flatten all word boxes from cells
    all_words = _flatten_word_boxes(word_result["cells"])
    if not all_words:
        raise ValueError("No word boxes found in cells")

    # 2a-pre. Apply min_conf filter if specified
    if min_conf is not None and min_conf > 0:
        before = len(all_words)
        all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
        removed = before - len(all_words)
        if removed:
            logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
                        session_id, min_conf, removed, before)

    logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
                session_id, len(all_words), len(word_result["cells"]),
                enhance, max_columns, min_conf)

    # 2b. Filter decorative margin columns (alphabet graphics)
    margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
    margin_strip_detected = margin_strip_info.get("found", False)

    # Read document_category from session
    document_category = session.get("document_category")

    # 2c. Filter footer rows (page numbers at the very bottom)
    page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)

    # 2c2. Filter OCR junk from header illustrations
    _filter_header_junk(all_words, img_h, logger, session_id)

    # 2d. Filter words inside user-defined exclude regions
    structure_result = session.get("structure_result")
    exclude_rects = _rects_from_structure(structure_result, "exclude_regions")
    if exclude_rects:
        kept = [w for w in all_words if not _word_center_in_rects(w, exclude_rects)]
        removed = len(all_words) - len(kept)
        if removed:
            all_words = kept
            logger.info(
                "build-grid session %s: removed %d words inside %d user exclude region(s)",
                session_id, removed, len(exclude_rects),
            )

    # 2e. Hard-filter words inside graphic/image regions from structure step
    graphic_rects: List[Dict[str, int]] = _rects_from_structure(structure_result, "graphics")
    if graphic_rects:
        before = len(all_words)
        all_words = [w for w in all_words if not _word_center_in_rects(w, graphic_rects)]
        removed = before - len(all_words)
        if removed:
            logger.info(
                "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
                session_id, removed, len(graphic_rects),
            )

    content_x, content_y, content_w, content_h = _get_content_bounds(all_words)

    # ── Phase 2: Image Processing & Zone Detection ──────────────────

    zone_result = await _build_zones(
        session_id, session, all_words, graphic_rects,
        content_x, content_y, content_w, content_h,
        img_w, img_h,
    )
    zones_data = zone_result["zones_data"]
    boxes_detected = zone_result["boxes_detected"]
    recovered_count = zone_result["recovered_count"]
    border_prefiltered = zone_result["border_prefiltered"]
    img_bgr = zone_result["img_bgr"]

    # ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────

    # The value returned by _cleanup_zones is not used by any later step in
    # this function, so it is not re-bound here.
    _cleanup_zones(zones_data, border_prefiltered, session_id)

    # ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────

    text_result = _process_text(
        zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
    )

    # ── Phase 5b+6: Finalize & Result Assembly ──────────────────────

    # NOTE: the duration is taken before _finalize_grid runs (it is an input
    # to that call), so the finalize step itself is not included.
    duration = time.perf_counter() - started

    return _finalize_grid(
        zones_data=zones_data,
        all_words=all_words,
        img_bgr=img_bgr,
        img_w=img_w,
        img_h=img_h,
        session_id=session_id,
        max_columns=max_columns,
        ipa_mode=ipa_mode,
        syllable_mode=syllable_mode,
        en_col_type=text_result["en_col_type"],
        ipa_target_cols=text_result["ipa_target_cols"],
        all_content_cols=text_result["all_content_cols"],
        skip_ipa=text_result["skip_ipa"],
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
        page_number_info=text_result["page_number_info"],
        boxes_detected=boxes_detected,
        recovered_count=recovered_count,
        duration=duration,
    )