overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
459 lines
18 KiB
Python
459 lines
18 KiB
Python
"""
Post-processing refinements for column geometry.

Extracted from cv_layout_columns.py — contains:
- _detect_sub_columns() (sub-column detection via left-edge alignment)
- _split_broad_columns() (broad column splitting via word-coverage gaps)
- expand_narrow_columns() (narrow column expansion into whitespace)

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
|
||
|
||
import logging
|
||
import statistics
|
||
from typing import Dict, List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
from cv_vocab_types import ColumnGeometry
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _detect_sub_columns(
|
||
geometries: List[ColumnGeometry],
|
||
content_w: int,
|
||
left_x: int = 0,
|
||
top_y: int = 0,
|
||
header_y: Optional[int] = None,
|
||
footer_y: Optional[int] = None,
|
||
_edge_tolerance: int = 8,
|
||
_min_col_start_ratio: float = 0.10,
|
||
) -> List[ColumnGeometry]:
|
||
"""Split columns that contain internal sub-columns based on left-edge alignment.
|
||
|
||
For each column, clusters word left-edges into alignment bins (within
|
||
``_edge_tolerance`` px). The leftmost bin whose word count reaches
|
||
``_min_col_start_ratio`` of the column total is treated as the true column
|
||
start. Any words to the left of that bin form a sub-column, provided they
|
||
number >= 2 and < 35 % of total.
|
||
|
||
Word ``left`` values are relative to the content ROI (offset by *left_x*),
|
||
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
|
||
bridges the two coordinate systems.
|
||
|
||
If *header_y* / *footer_y* are provided (absolute y-coordinates), words
|
||
in header/footer regions are excluded from alignment clustering to avoid
|
||
polluting the bins with page numbers or chapter titles. Word ``top``
|
||
values are relative to *top_y*.
|
||
|
||
Returns a new list of ColumnGeometry — potentially longer than the input.
|
||
"""
|
||
if content_w <= 0:
|
||
return geometries
|
||
|
||
result: List[ColumnGeometry] = []
|
||
for geo in geometries:
|
||
# Only consider wide-enough columns with enough words
|
||
if geo.width_ratio < 0.15 or geo.word_count < 5:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# Collect left-edges of confident words, excluding header/footer
|
||
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
|
||
min_top_rel = (header_y - top_y) if header_y is not None else None
|
||
max_top_rel = (footer_y - top_y) if footer_y is not None else None
|
||
|
||
confident = [w for w in geo.words
|
||
if w.get('conf', 0) >= 30
|
||
and (min_top_rel is None or w['top'] >= min_top_rel)
|
||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||
if len(confident) < 3:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# --- Cluster left-edges into alignment bins ---
|
||
sorted_edges = sorted(w['left'] for w in confident)
|
||
bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge)
|
||
cur = [sorted_edges[0]]
|
||
for i in range(1, len(sorted_edges)):
|
||
if sorted_edges[i] - cur[-1] <= _edge_tolerance:
|
||
cur.append(sorted_edges[i])
|
||
else:
|
||
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
|
||
cur = [sorted_edges[i]]
|
||
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
|
||
|
||
# --- Find the leftmost bin qualifying as a real column start ---
|
||
total = len(confident)
|
||
min_count = max(3, int(total * _min_col_start_ratio))
|
||
col_start_bin = None
|
||
for b in bins:
|
||
if b[1] >= min_count:
|
||
col_start_bin = b
|
||
break
|
||
|
||
if col_start_bin is None:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# Words to the left of the column-start bin are sub-column candidates
|
||
split_threshold = col_start_bin[2] - _edge_tolerance
|
||
sub_words = [w for w in geo.words if w['left'] < split_threshold]
|
||
main_words = [w for w in geo.words if w['left'] >= split_threshold]
|
||
|
||
# Count only body words (excluding header/footer) for the threshold check
|
||
# so that header/footer words don't artificially trigger a split.
|
||
sub_body = [w for w in sub_words
|
||
if (min_top_rel is None or w['top'] >= min_top_rel)
|
||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# --- Guard against inline markers (bullet points, numbering) ---
|
||
# Bullet points like "1.", "2.", "•", "-" sit close to the main
|
||
# column text and are part of the cell, not a separate column.
|
||
# Only split if the horizontal gap between the rightmost sub-word
|
||
# and the main column start is large enough.
|
||
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
|
||
gap_to_main = col_start_bin[2] - max_sub_right # px gap
|
||
median_heights = [w.get('height', 20) for w in confident]
|
||
med_h = statistics.median(median_heights) if median_heights else 20
|
||
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
|
||
if gap_to_main < min_gap:
|
||
logger.debug(
|
||
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
|
||
"(likely inline markers, not a sub-column)",
|
||
geo.index, gap_to_main, min_gap)
|
||
result.append(geo)
|
||
continue
|
||
|
||
# --- Build two sub-column geometries ---
|
||
# Word 'left' values are relative to left_x; geo.x is absolute.
|
||
# Convert the split position from relative to absolute coordinates.
|
||
max_sub_left = max(w['left'] for w in sub_words)
|
||
split_rel = (max_sub_left + col_start_bin[2]) // 2
|
||
split_abs = split_rel + left_x
|
||
|
||
sub_x = geo.x
|
||
sub_width = split_abs - geo.x
|
||
main_x = split_abs
|
||
main_width = (geo.x + geo.width) - split_abs
|
||
|
||
if sub_width <= 0 or main_width <= 0:
|
||
result.append(geo)
|
||
continue
|
||
|
||
sub_geo = ColumnGeometry(
|
||
index=0,
|
||
x=sub_x,
|
||
y=geo.y,
|
||
width=sub_width,
|
||
height=geo.height,
|
||
word_count=len(sub_words),
|
||
words=sub_words,
|
||
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
|
||
is_sub_column=True,
|
||
)
|
||
main_geo = ColumnGeometry(
|
||
index=0,
|
||
x=main_x,
|
||
y=geo.y,
|
||
width=main_width,
|
||
height=geo.height,
|
||
word_count=len(main_words),
|
||
words=main_words,
|
||
width_ratio=main_width / content_w if content_w > 0 else 0.0,
|
||
is_sub_column=True,
|
||
)
|
||
|
||
result.append(sub_geo)
|
||
result.append(main_geo)
|
||
|
||
logger.info(
|
||
f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
|
||
f"(rel={split_rel}), sub={len(sub_words)} words, "
|
||
f"main={len(main_words)} words, "
|
||
f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
|
||
)
|
||
|
||
# Re-index by left-to-right order
|
||
result.sort(key=lambda g: g.x)
|
||
for i, g in enumerate(result):
|
||
g.index = i
|
||
|
||
return result
|
||
|
||
|
||
def _split_broad_columns(
|
||
geometries: List[ColumnGeometry],
|
||
content_w: int,
|
||
left_x: int = 0,
|
||
_broad_threshold: float = 0.35,
|
||
_min_gap_px: int = 15,
|
||
_min_words_per_split: int = 5,
|
||
) -> List[ColumnGeometry]:
|
||
"""Split overly broad columns that contain two language blocks (EN+DE).
|
||
|
||
Uses word-coverage gap analysis: builds a per-pixel coverage array from the
|
||
words inside each broad column, finds the largest horizontal gap, and splits
|
||
the column at that gap.
|
||
|
||
Args:
|
||
geometries: Column geometries from _detect_sub_columns.
|
||
content_w: Width of the content area in pixels.
|
||
left_x: Left edge of content ROI in absolute image coordinates.
|
||
_broad_threshold: Minimum width_ratio to consider a column "broad".
|
||
_min_gap_px: Minimum gap width (pixels) to trigger a split.
|
||
_min_words_per_split: Both halves must have at least this many words.
|
||
|
||
Returns:
|
||
Updated list of ColumnGeometry (possibly with more columns).
|
||
"""
|
||
result: List[ColumnGeometry] = []
|
||
|
||
logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
|
||
f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
|
||
|
||
for geo in geometries:
|
||
if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# Build word-coverage array (per pixel within column)
|
||
col_left_rel = geo.x - left_x # column left in content-relative coords
|
||
coverage = np.zeros(geo.width, dtype=np.float32)
|
||
|
||
for wd in geo.words:
|
||
# wd['left'] is relative to left_x (content ROI)
|
||
wl = wd['left'] - col_left_rel
|
||
wr = wl + wd.get('width', 0)
|
||
wl = max(0, int(wl))
|
||
wr = min(geo.width, int(wr))
|
||
if wr > wl:
|
||
coverage[wl:wr] += 1.0
|
||
|
||
# Light smoothing (kernel=3px) to avoid noise
|
||
if len(coverage) > 3:
|
||
kernel = np.ones(3, dtype=np.float32) / 3.0
|
||
coverage = np.convolve(coverage, kernel, mode='same')
|
||
|
||
# Normalise to [0, 1]
|
||
cmax = coverage.max()
|
||
if cmax > 0:
|
||
coverage /= cmax
|
||
|
||
# Find INTERNAL gaps where coverage < 0.5
|
||
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
|
||
low_mask = coverage < 0.5
|
||
all_gaps = []
|
||
_gs = None
|
||
for px in range(len(low_mask)):
|
||
if low_mask[px]:
|
||
if _gs is None:
|
||
_gs = px
|
||
else:
|
||
if _gs is not None:
|
||
all_gaps.append((_gs, px, px - _gs))
|
||
_gs = None
|
||
if _gs is not None:
|
||
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
||
|
||
# Filter: only internal gaps (not touching column edges)
|
||
_edge_margin = 10 # pixels from edge to ignore
|
||
internal_gaps = [g for g in all_gaps
|
||
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
|
||
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
|
||
|
||
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
|
||
f"{[g for g in all_gaps if g[2] >= 5]}, "
|
||
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
|
||
f"best={best_gap}")
|
||
|
||
if best_gap is None or best_gap[2] < _min_gap_px:
|
||
result.append(geo)
|
||
continue
|
||
|
||
gap_center = (best_gap[0] + best_gap[1]) // 2
|
||
|
||
# Split words by midpoint relative to gap
|
||
left_words = []
|
||
right_words = []
|
||
for wd in geo.words:
|
||
wl = wd['left'] - col_left_rel
|
||
mid = wl + wd.get('width', 0) / 2.0
|
||
if mid < gap_center:
|
||
left_words.append(wd)
|
||
else:
|
||
right_words.append(wd)
|
||
|
||
if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
|
||
result.append(geo)
|
||
continue
|
||
|
||
# Build two new ColumnGeometry objects
|
||
split_x_abs = geo.x + gap_center
|
||
left_w = gap_center
|
||
right_w = geo.width - gap_center
|
||
|
||
left_geo = ColumnGeometry(
|
||
index=0,
|
||
x=geo.x,
|
||
y=geo.y,
|
||
width=left_w,
|
||
height=geo.height,
|
||
word_count=len(left_words),
|
||
words=left_words,
|
||
width_ratio=left_w / content_w if content_w else 0,
|
||
is_sub_column=True,
|
||
)
|
||
right_geo = ColumnGeometry(
|
||
index=0,
|
||
x=split_x_abs,
|
||
y=geo.y,
|
||
width=right_w,
|
||
height=geo.height,
|
||
word_count=len(right_words),
|
||
words=right_words,
|
||
width_ratio=right_w / content_w if content_w else 0,
|
||
is_sub_column=True,
|
||
)
|
||
|
||
logger.info(
|
||
f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
|
||
f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
|
||
f"left={len(left_words)} words (w={left_w}), "
|
||
f"right={len(right_words)} words (w={right_w})"
|
||
)
|
||
|
||
result.append(left_geo)
|
||
result.append(right_geo)
|
||
|
||
# Re-index left-to-right
|
||
result.sort(key=lambda g: g.x)
|
||
for i, g in enumerate(result):
|
||
g.index = i
|
||
|
||
return result
|
||
|
||
|
||
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Expand narrow columns into adjacent whitespace gaps.

    Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. Each narrow column is grown
    toward its left and right neighbours, stopping ``_MIN_WORD_MARGIN`` px
    short of the nearest word in the neighbour. A neighbour without words may
    be taken over up to ``_MIN_WORD_MARGIN`` px from its far edge.

    After an expansion the column's words are re-selected from *word_dicts*
    (by ``left`` falling inside the new span), and any neighbour that now
    overlaps is shrunk and has its words re-selected the same way.

    NOTE(review): an earlier description of this function mentioned a
    "40% of the gap" cap; no such cap exists in this implementation.

    Must be called AFTER _detect_sub_columns() so that sub-column splits
    (which create the narrowest columns) have already happened.

    Args:
        geometries: Columns in left-to-right order; neighbours are looked up
            by list position. Modified in place.
        content_w: Width of the content area in pixels. If ``<= 0`` no column
            can be classified as narrow and the input is returned unchanged.
        left_x: Left edge of the content ROI in absolute image coordinates;
            word ``left`` values are relative to it.
        word_dicts: All words of the page, used to re-assign words to columns
            whose boundaries changed.

    Returns:
        The same *geometries* list (its elements possibly mutated).
    """
    _NARROW_THRESHOLD_PCT = 10.0
    _MIN_WORD_MARGIN = 4

    # With a non-positive content width the percentage below is undefined;
    # previously the diagnostic log divided by zero here. Nothing would be
    # expanded anyway, so bail out early.
    if len(geometries) < 2 or content_w <= 0:
        return geometries

    def _refresh(col) -> None:
        # Recompute derived fields after col.x / col.width changed.
        lo = col.x - left_x
        hi = lo + col.width
        col.words = [wd for wd in word_dicts if lo <= wd['left'] < hi]
        col.word_count = len(col.words)
        col.width_ratio = col.width / content_w

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    last = len(geometries) - 1
    for i, g in enumerate(geometries):
        orig_pct = g.width / content_w * 100
        if orig_pct >= _NARROW_THRESHOLD_PCT:
            continue

        left_nb = geometries[i - 1] if i > 0 else None
        right_nb = geometries[i + 1] if i < last else None
        expanded = False

        # --- try expanding to the LEFT ---
        if left_nb is not None:
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_word_rights = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_word_rights:
                safe_left_abs = left_x + max(nb_word_rights) + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                g.width += g.x - safe_left_abs
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if right_nb is not None:
            nb_word_lefts = [wd['left'] for wd in right_nb.words]
            if nb_word_lefts:
                safe_right_abs = left_x + min(nb_word_lefts) - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            if safe_right_abs > g.x + g.width:
                g.width = safe_right_abs - g.x
                expanded = True

        if not expanded:
            continue

        _refresh(g)
        logger.info(
            "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
            i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

        # --- Shrink overlapping neighbors to match new boundaries ---
        # Left neighbor: its right edge must not exceed our new left edge
        if left_nb is not None and left_nb.x + left_nb.width > g.x:
            left_nb.width = max(0, g.x - left_nb.x)
            _refresh(left_nb)

        # Right neighbor: its left edge must not be before our new right edge
        my_right = g.x + g.width
        if right_nb is not None and right_nb.x < my_right:
            old_right_edge = right_nb.x + right_nb.width
            right_nb.x = my_right
            right_nb.width = max(0, old_right_edge - my_right)
            _refresh(right_nb)

    return geometries
|