Vertical zone split: detect divider lines and create independent sub-zones
Pages with two side-by-side vocabulary columns separated by a vertical black line are now split into independent sub-zones before row/column detection. Each sub-zone gets its own rows, which prevents misalignment when the two sides have different heading rhythms.
- _detect_vertical_dividers(): finds pipe word_boxes at a consistent x position that span at least 50% of the zone height
- _split_zone_at_vertical_dividers(): creates left/right PageZone objects with layout_hint and vsplit_group metadata
- Column union skips vsplit zones (they keep independent column sets)
- Frontend renders vsplit zones side by side via a flex layout
- PageZone gets layout_hint + vsplit_group fields
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -449,6 +449,108 @@ def _words_in_zone(
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vertical divider detection and zone splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PIPE_RE_VSPLIT = re.compile(r"^\|+$")
|
||||
|
||||
|
||||
def _detect_vertical_dividers(
|
||||
words: List[Dict],
|
||||
zone_x: int,
|
||||
zone_w: int,
|
||||
zone_y: int,
|
||||
zone_h: int,
|
||||
) -> List[float]:
|
||||
"""Detect vertical divider lines from pipe word_boxes at consistent x.
|
||||
|
||||
Returns list of divider x-positions (empty if no dividers found).
|
||||
"""
|
||||
if not words or zone_w <= 0 or zone_h <= 0:
|
||||
return []
|
||||
|
||||
# Collect pipe word_boxes
|
||||
pipes = [
|
||||
w for w in words
|
||||
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
||||
]
|
||||
if len(pipes) < 5:
|
||||
return []
|
||||
|
||||
# Cluster pipe x-centers by proximity
|
||||
tolerance = max(15, int(zone_w * 0.02))
|
||||
pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes)
|
||||
|
||||
clusters: List[List[float]] = [[pipe_xs[0]]]
|
||||
for x in pipe_xs[1:]:
|
||||
if x - clusters[-1][-1] <= tolerance:
|
||||
clusters[-1].append(x)
|
||||
else:
|
||||
clusters.append([x])
|
||||
|
||||
dividers: List[float] = []
|
||||
for cluster in clusters:
|
||||
if len(cluster) < 5:
|
||||
continue
|
||||
mean_x = sum(cluster) / len(cluster)
|
||||
# Must be between 15% and 85% of zone width
|
||||
rel_pos = (mean_x - zone_x) / zone_w
|
||||
if rel_pos < 0.15 or rel_pos > 0.85:
|
||||
continue
|
||||
# Check vertical coverage: pipes must span >= 50% of zone height
|
||||
cluster_pipes = [
|
||||
w for w in pipes
|
||||
if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance
|
||||
]
|
||||
ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes]
|
||||
y_span = max(ys) - min(ys) if ys else 0
|
||||
if y_span < zone_h * 0.5:
|
||||
continue
|
||||
dividers.append(mean_x)
|
||||
|
||||
return sorted(dividers)
|
||||
|
||||
|
||||
def _split_zone_at_vertical_dividers(
|
||||
zone: "PageZone",
|
||||
divider_xs: List[float],
|
||||
vsplit_group_id: int,
|
||||
) -> List["PageZone"]:
|
||||
"""Split a PageZone at vertical divider positions into sub-zones."""
|
||||
from cv_vocab_types import PageZone
|
||||
|
||||
boundaries = [zone.x] + divider_xs + [zone.x + zone.width]
|
||||
hints = []
|
||||
for i in range(len(boundaries) - 1):
|
||||
if i == 0:
|
||||
hints.append("left_of_vsplit")
|
||||
elif i == len(boundaries) - 2:
|
||||
hints.append("right_of_vsplit")
|
||||
else:
|
||||
hints.append("middle_of_vsplit")
|
||||
|
||||
sub_zones = []
|
||||
for i in range(len(boundaries) - 1):
|
||||
x_start = int(boundaries[i])
|
||||
x_end = int(boundaries[i + 1])
|
||||
sub = PageZone(
|
||||
index=0, # re-indexed later
|
||||
zone_type=zone.zone_type,
|
||||
y=zone.y,
|
||||
height=zone.height,
|
||||
x=x_start,
|
||||
width=x_end - x_start,
|
||||
box=zone.box,
|
||||
image_overlays=zone.image_overlays,
|
||||
layout_hint=hints[i],
|
||||
vsplit_group=vsplit_group_id,
|
||||
)
|
||||
sub_zones.append(sub)
|
||||
|
||||
return sub_zones
|
||||
|
||||
|
||||
def _merge_content_zones_across_boxes(
|
||||
zones: List,
|
||||
content_x: int,
|
||||
@@ -1404,11 +1506,49 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
page_zones, content_x, content_w
|
||||
)
|
||||
|
||||
# 3b. Detect vertical dividers and split content zones
|
||||
vsplit_group_counter = 0
|
||||
expanded_zones: List = []
|
||||
for pz in page_zones:
|
||||
if pz.zone_type != "content":
|
||||
expanded_zones.append(pz)
|
||||
continue
|
||||
zone_words = _words_in_zone(
|
||||
all_words, pz.y, pz.height, pz.x, pz.width
|
||||
)
|
||||
divider_xs = _detect_vertical_dividers(
|
||||
zone_words, pz.x, pz.width, pz.y, pz.height
|
||||
)
|
||||
if divider_xs:
|
||||
sub_zones = _split_zone_at_vertical_dividers(
|
||||
pz, divider_xs, vsplit_group_counter
|
||||
)
|
||||
expanded_zones.extend(sub_zones)
|
||||
vsplit_group_counter += 1
|
||||
# Remove pipe words so they don't appear in sub-zones
|
||||
pipe_ids = set(
|
||||
id(w) for w in zone_words
|
||||
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
|
||||
)
|
||||
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
|
||||
logger.info(
|
||||
"build-grid: vertical split zone %d at x=%s → %d sub-zones",
|
||||
pz.index, [int(x) for x in divider_xs], len(sub_zones),
|
||||
)
|
||||
else:
|
||||
expanded_zones.append(pz)
|
||||
# Re-index zones
|
||||
for i, pz in enumerate(expanded_zones):
|
||||
pz.index = i
|
||||
page_zones = expanded_zones
|
||||
|
||||
# --- Union columns from all content zones ---
|
||||
# Each content zone detects columns independently. Narrow
|
||||
# columns (page refs, markers) may appear in only one zone.
|
||||
# Merge column split-points from ALL content zones so every
|
||||
# zone shares the full column set.
|
||||
# NOTE: Zones from a vertical split are independent and must
|
||||
# NOT share columns with each other.
|
||||
|
||||
# First pass: build grids per zone independently
|
||||
zone_grids: List[Dict] = []
|
||||
@@ -1459,8 +1599,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
zone_grids.append({"pz": pz, "words": zone_words, "grid": grid})
|
||||
|
||||
# Second pass: merge column boundaries from all content zones
|
||||
# Exclude zones from vertical splits — they have independent columns.
|
||||
content_zones = [
|
||||
zg for zg in zone_grids if zg["pz"].zone_type == "content"
|
||||
zg for zg in zone_grids
|
||||
if zg["pz"].zone_type == "content"
|
||||
and zg["pz"].vsplit_group is None
|
||||
]
|
||||
if len(content_zones) > 1:
|
||||
# Collect column split points (x_min of non-first columns)
|
||||
@@ -1564,6 +1707,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if pz.image_overlays:
|
||||
zone_entry["image_overlays"] = pz.image_overlays
|
||||
|
||||
if pz.layout_hint:
|
||||
zone_entry["layout_hint"] = pz.layout_hint
|
||||
if pz.vsplit_group is not None:
|
||||
zone_entry["vsplit_group"] = pz.vsplit_group
|
||||
|
||||
zones_data.append(zone_entry)
|
||||
|
||||
# 4. Fallback: no boxes detected → single zone with all words
|
||||
@@ -1719,8 +1867,11 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
# OCR reads physical vertical divider lines as "|" or "||" characters.
|
||||
# These sit at consistent x positions near column boundaries and pollute
|
||||
# cell text. Remove them from word_boxes and rebuild cell text.
|
||||
# NOTE: Zones from a vertical split already had pipes removed in step 3b.
|
||||
_PIPE_RE = re.compile(r"^\|+$")
|
||||
for z in zones_data:
|
||||
if z.get("vsplit_group") is not None:
|
||||
continue # pipes already removed before split
|
||||
removed_pipes = 0
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
|
||||
Reference in New Issue
Block a user