"""
|
|
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
|
|
detection and zone-aware grid building.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import cv2
|
|
import numpy as np
|
|
|
|
from cv_box_detect import detect_boxes, split_page_into_zones
|
|
from cv_graphic_detect import detect_graphic_elements
|
|
from cv_color_detect import recover_colored_text
|
|
from cv_vocab_types import PageZone
|
|
from ocr_pipeline_session_store import get_session_image
|
|
|
|
from grid_editor_helpers import (
|
|
_filter_border_strip_words,
|
|
_filter_border_ghosts,
|
|
_words_in_zone,
|
|
_PIPE_RE_VSPLIT,
|
|
_detect_vertical_dividers,
|
|
_split_zone_at_vertical_dividers,
|
|
_merge_content_zones_across_boxes,
|
|
_build_zone_grid,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def _build_zones(
|
|
session_id: str,
|
|
session: dict,
|
|
all_words: List[Dict[str, Any]],
|
|
graphic_rects: List[Dict[str, int]],
|
|
content_x: int,
|
|
content_y: int,
|
|
content_w: int,
|
|
content_h: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
) -> Dict[str, Any]:
|
|
"""Load image, detect graphics/boxes, build zone-aware grids.
|
|
|
|
Returns a dict with keys:
|
|
zones_data, boxes_detected, recovered_count, border_prefiltered,
|
|
img_bgr, all_words (modified in-place but returned for clarity).
|
|
"""
|
|
zones_data: List[Dict[str, Any]] = []
|
|
boxes_detected = 0
|
|
recovered_count = 0
|
|
border_prefiltered = False
|
|
img_bgr = None
|
|
|
|
# 3. Load image for box detection
|
|
img_png = await get_session_image(session_id, "cropped")
|
|
if not img_png:
|
|
img_png = await get_session_image(session_id, "dewarped")
|
|
if not img_png:
|
|
img_png = await get_session_image(session_id, "original")
|
|
|
|
if img_png:
|
|
# Decode image for color detection + box detection
|
|
arr = np.frombuffer(img_png, dtype=np.uint8)
|
|
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
|
|
|
if img_bgr is not None:
|
|
# --- 3a. Detect graphic/image regions via CV and hard-filter ---
|
|
sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
|
|
fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
|
|
if fresh_graphics:
|
|
fresh_rects = [
|
|
{"x": g.x, "y": g.y, "w": g.width, "h": g.height}
|
|
for g in fresh_graphics
|
|
]
|
|
graphic_rects.extend(fresh_rects)
|
|
logger.info(
|
|
"build-grid session %s: detected %d graphic region(s) via CV",
|
|
session_id, len(fresh_graphics),
|
|
)
|
|
# Hard-filter words inside newly detected graphic regions
|
|
before = len(all_words)
|
|
all_words[:] = [
|
|
w for w in all_words
|
|
if not any(
|
|
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
|
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
|
for gr in fresh_rects
|
|
)
|
|
]
|
|
removed = before - len(all_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
|
|
session_id, removed, len(fresh_rects),
|
|
)
|
|
|
|
# --- Recover colored text that OCR missed (before grid building) ---
|
|
recovered = recover_colored_text(img_bgr, all_words)
|
|
if recovered and graphic_rects:
|
|
# Filter recovered chars inside graphic regions
|
|
recovered = [
|
|
r for r in recovered
|
|
if not any(
|
|
gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
|
and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
|
for gr in graphic_rects
|
|
)
|
|
]
|
|
if recovered:
|
|
recovered_count = len(recovered)
|
|
all_words.extend(recovered)
|
|
logger.info(
|
|
"build-grid session %s: +%d recovered colored words",
|
|
session_id, recovered_count,
|
|
)
|
|
|
|
# Detect bordered boxes
|
|
boxes = detect_boxes(
|
|
img_bgr,
|
|
content_x=content_x,
|
|
content_w=content_w,
|
|
content_y=content_y,
|
|
content_h=content_h,
|
|
)
|
|
boxes_detected = len(boxes)
|
|
|
|
if boxes:
|
|
# Filter border ghost words before grid building
|
|
all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
|
|
if ghost_count:
|
|
all_words[:] = all_words_new
|
|
logger.info(
|
|
"build-grid session %s: removed %d border ghost words",
|
|
session_id, ghost_count,
|
|
)
|
|
|
|
# Split page into zones
|
|
page_zones = split_page_into_zones(
|
|
content_x, content_y, content_w, content_h, boxes
|
|
)
|
|
|
|
# Merge content zones separated by box zones
|
|
page_zones = _merge_content_zones_across_boxes(
|
|
page_zones, content_x, content_w
|
|
)
|
|
|
|
# 3b. Detect vertical dividers and split content zones
|
|
page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
|
|
page_zones, all_words
|
|
)
|
|
|
|
# --- First pass: build grids per zone independently ---
|
|
zone_grids = _build_grids_per_zone(
|
|
page_zones, all_words, img_w, img_h
|
|
)
|
|
border_prefiltered = border_prefiltered or any(
|
|
zg.get("_border_prefiltered") for zg in zone_grids
|
|
)
|
|
|
|
# --- Second pass: merge column boundaries from all content zones ---
|
|
_merge_content_zone_columns(
|
|
zone_grids, all_words, content_w, img_w, img_h, session_id
|
|
)
|
|
|
|
# --- Build zones_data from zone_grids ---
|
|
for zg in zone_grids:
|
|
pz = zg["pz"]
|
|
grid = zg["grid"]
|
|
grid.pop("_raw_columns", None)
|
|
|
|
zone_entry: Dict[str, Any] = {
|
|
"zone_index": pz.index,
|
|
"zone_type": pz.zone_type,
|
|
"bbox_px": {
|
|
"x": pz.x, "y": pz.y,
|
|
"w": pz.width, "h": pz.height,
|
|
},
|
|
"bbox_pct": {
|
|
"x": round(pz.x / img_w * 100, 2) if img_w else 0,
|
|
"y": round(pz.y / img_h * 100, 2) if img_h else 0,
|
|
"w": round(pz.width / img_w * 100, 2) if img_w else 0,
|
|
"h": round(pz.height / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"border": None,
|
|
"word_count": len(zg["words"]),
|
|
**grid,
|
|
}
|
|
|
|
if pz.box:
|
|
zone_entry["border"] = {
|
|
"thickness": pz.box.border_thickness,
|
|
"confidence": pz.box.confidence,
|
|
}
|
|
|
|
if pz.image_overlays:
|
|
zone_entry["image_overlays"] = pz.image_overlays
|
|
|
|
if pz.layout_hint:
|
|
zone_entry["layout_hint"] = pz.layout_hint
|
|
if pz.vsplit_group is not None:
|
|
zone_entry["vsplit_group"] = pz.vsplit_group
|
|
|
|
zones_data.append(zone_entry)
|
|
|
|
# 4. Fallback: no boxes detected -> single zone with all words
|
|
if not zones_data:
|
|
before = len(all_words)
|
|
filtered_words = [
|
|
w for w in all_words
|
|
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
|
|
]
|
|
removed = before - len(filtered_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
|
|
session_id, removed,
|
|
)
|
|
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
|
|
if bs_removed:
|
|
border_prefiltered = True
|
|
logger.info(
|
|
"build-grid session %s: pre-filtered %d border-strip words",
|
|
session_id, bs_removed,
|
|
)
|
|
grid = _build_zone_grid(
|
|
filtered_words, content_x, content_y, content_w, content_h,
|
|
0, img_w, img_h,
|
|
)
|
|
grid.pop("_raw_columns", None)
|
|
zones_data.append({
|
|
"zone_index": 0,
|
|
"zone_type": "content",
|
|
"bbox_px": {
|
|
"x": content_x, "y": content_y,
|
|
"w": content_w, "h": content_h,
|
|
},
|
|
"bbox_pct": {
|
|
"x": round(content_x / img_w * 100, 2) if img_w else 0,
|
|
"y": round(content_y / img_h * 100, 2) if img_h else 0,
|
|
"w": round(content_w / img_w * 100, 2) if img_w else 0,
|
|
"h": round(content_h / img_h * 100, 2) if img_h else 0,
|
|
},
|
|
"border": None,
|
|
"word_count": len(all_words),
|
|
**grid,
|
|
})
|
|
|
|
return {
|
|
"zones_data": zones_data,
|
|
"boxes_detected": boxes_detected,
|
|
"recovered_count": recovered_count,
|
|
"border_prefiltered": border_prefiltered,
|
|
"img_bgr": img_bgr,
|
|
}
|
|
|
|
|
|
def _detect_and_split_vertical_dividers(
|
|
page_zones: List[PageZone],
|
|
all_words: List[Dict[str, Any]],
|
|
) -> tuple:
|
|
"""Detect vertical dividers and split content zones.
|
|
|
|
Returns (expanded_zones, border_prefiltered_from_vsplit).
|
|
"""
|
|
vsplit_group_counter = 0
|
|
expanded_zones: List = []
|
|
for pz in page_zones:
|
|
if pz.zone_type != "content":
|
|
expanded_zones.append(pz)
|
|
continue
|
|
zone_words = _words_in_zone(
|
|
all_words, pz.y, pz.height, pz.x, pz.width
|
|
)
|
|
divider_xs = _detect_vertical_dividers(
|
|
zone_words, pz.x, pz.width, pz.y, pz.height
|
|
)
|
|
if divider_xs:
|
|
sub_zones = _split_zone_at_vertical_dividers(
|
|
pz, divider_xs, vsplit_group_counter
|
|
)
|
|
expanded_zones.extend(sub_zones)
|
|
vsplit_group_counter += 1
|
|
# Remove pipe words so they don't appear in sub-zones
|
|
pipe_ids = set(
|
|
id(w) for w in zone_words
|
|
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
|
)
|
|
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
|
|
logger.info(
|
|
"build-grid: vertical split zone %d at x=%s -> %d sub-zones",
|
|
pz.index, [int(x) for x in divider_xs], len(sub_zones),
|
|
)
|
|
else:
|
|
expanded_zones.append(pz)
|
|
# Re-index zones
|
|
for i, pz in enumerate(expanded_zones):
|
|
pz.index = i
|
|
return expanded_zones, False
|
|
|
|
|
|
def _build_grids_per_zone(
|
|
page_zones: List[PageZone],
|
|
all_words: List[Dict[str, Any]],
|
|
img_w: int,
|
|
img_h: int,
|
|
) -> List[Dict[str, Any]]:
|
|
"""Build grids for each zone independently (first pass)."""
|
|
zone_grids: List[Dict] = []
|
|
|
|
for pz in page_zones:
|
|
zone_words = _words_in_zone(
|
|
all_words, pz.y, pz.height, pz.x, pz.width
|
|
)
|
|
if pz.zone_type == "content":
|
|
logger.info(
|
|
"build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
|
|
pz.index, pz.zone_type,
|
|
pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
|
|
len(zone_words), len(all_words),
|
|
)
|
|
# Filter recovered single-char artifacts in ALL zones
|
|
before = len(zone_words)
|
|
zone_words = [
|
|
w for w in zone_words
|
|
if not (
|
|
w.get("recovered")
|
|
and len(w.get("text", "").strip()) <= 2
|
|
)
|
|
]
|
|
removed = before - len(zone_words)
|
|
if removed:
|
|
logger.info(
|
|
"build-grid: filtered %d recovered artifacts from %s zone %d",
|
|
removed, pz.zone_type, pz.index,
|
|
)
|
|
# Filter words inside image overlay regions (merged box zones)
|
|
if pz.image_overlays:
|
|
before_ov = len(zone_words)
|
|
zone_words = [
|
|
w for w in zone_words
|
|
if not any(
|
|
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
|
|
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
|
|
for ov in pz.image_overlays
|
|
)
|
|
]
|
|
ov_removed = before_ov - len(zone_words)
|
|
if ov_removed:
|
|
logger.info(
|
|
"build-grid: filtered %d words inside image overlays from zone %d",
|
|
ov_removed, pz.index,
|
|
)
|
|
zone_words, bs_removed = _filter_border_strip_words(zone_words)
|
|
bp = False
|
|
if bs_removed:
|
|
bp = True
|
|
logger.info(
|
|
"build-grid: pre-filtered %d border-strip words from zone %d",
|
|
bs_removed, pz.index,
|
|
)
|
|
grid = _build_zone_grid(
|
|
zone_words, pz.x, pz.y, pz.width, pz.height,
|
|
pz.index, img_w, img_h,
|
|
skip_first_row_header=bool(pz.image_overlays),
|
|
)
|
|
zone_grids.append({
|
|
"pz": pz, "words": zone_words, "grid": grid,
|
|
"_border_prefiltered": bp,
|
|
})
|
|
|
|
return zone_grids
|
|
|
|
|
|
def _merge_content_zone_columns(
|
|
zone_grids: List[Dict[str, Any]],
|
|
all_words: List[Dict[str, Any]],
|
|
content_w: int,
|
|
img_w: int,
|
|
img_h: int,
|
|
session_id: str,
|
|
) -> None:
|
|
"""Second pass: merge column boundaries from all content zones.
|
|
|
|
Modifies zone_grids in place.
|
|
"""
|
|
content_zones = [
|
|
zg for zg in zone_grids
|
|
if zg["pz"].zone_type == "content"
|
|
and zg["pz"].vsplit_group is None
|
|
]
|
|
if len(content_zones) <= 1:
|
|
return
|
|
|
|
# Collect column split points (x_min of non-first columns)
|
|
all_split_xs: List[float] = []
|
|
for zg in content_zones:
|
|
raw_cols = zg["grid"].get("_raw_columns", [])
|
|
for col in raw_cols[1:]:
|
|
all_split_xs.append(col["x_min"])
|
|
|
|
if not all_split_xs:
|
|
return
|
|
|
|
all_split_xs.sort()
|
|
merge_distance = max(25, int(content_w * 0.03))
|
|
merged_xs = [all_split_xs[0]]
|
|
for x in all_split_xs[1:]:
|
|
if x - merged_xs[-1] < merge_distance:
|
|
merged_xs[-1] = (merged_xs[-1] + x) / 2
|
|
else:
|
|
merged_xs.append(x)
|
|
|
|
total_cols = len(merged_xs) + 1
|
|
max_zone_cols = max(
|
|
len(zg["grid"].get("_raw_columns", []))
|
|
for zg in content_zones
|
|
)
|
|
|
|
if total_cols < max_zone_cols:
|
|
return
|
|
|
|
cx_min = min(w["left"] for w in all_words)
|
|
cx_max = max(w["left"] + w["width"] for w in all_words)
|
|
merged_columns: List[Dict[str, Any]] = []
|
|
prev_x = cx_min
|
|
for i, sx in enumerate(merged_xs):
|
|
merged_columns.append({
|
|
"index": i,
|
|
"type": f"column_{i + 1}",
|
|
"x_min": prev_x,
|
|
"x_max": sx,
|
|
})
|
|
prev_x = sx
|
|
merged_columns.append({
|
|
"index": len(merged_xs),
|
|
"type": f"column_{len(merged_xs) + 1}",
|
|
"x_min": prev_x,
|
|
"x_max": cx_max,
|
|
})
|
|
|
|
# Re-build ALL content zones with merged columns
|
|
for zg in zone_grids:
|
|
pz = zg["pz"]
|
|
if pz.zone_type == "content":
|
|
grid = _build_zone_grid(
|
|
zg["words"], pz.x, pz.y,
|
|
pz.width, pz.height,
|
|
pz.index, img_w, img_h,
|
|
global_columns=merged_columns,
|
|
skip_first_row_header=bool(pz.image_overlays),
|
|
)
|
|
zg["grid"] = grid
|
|
logger.info(
|
|
"build-grid session %s: union of %d content "
|
|
"zones -> %d merged columns (max single zone: %d)",
|
|
session_id, len(content_zones),
|
|
total_cols, max_zone_cols,
|
|
)
|