overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
353 lines
14 KiB
Python
353 lines
14 KiB
Python
"""
|
||
Row geometry detection for document layout analysis.
|
||
|
||
Provides horizontal whitespace-gap analysis to detect text rows,
|
||
word-center grid regularization, and fallback word-grouping.
|
||
|
||
Extracted from cv_layout.py.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, List
|
||
|
||
import numpy as np
|
||
|
||
try:
|
||
import cv2
|
||
except ImportError:
|
||
cv2 = None # type: ignore[assignment]
|
||
|
||
from cv_vocab_types import RowGeometry
|
||
from cv_ocr_word_assembly import _group_words_into_lines
|
||
from cv_layout_row_regularize import _regularize_row_grid
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
# =============================================================================
|
||
# Row Geometry Detection (horizontal whitespace-gap analysis)
|
||
# =============================================================================
|
||
|
||
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Algorithm overview (two phases):

    Phase 1 — Gap-based detection (Steps 1–6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink density across the content width. Only pixels within/near
         Tesseract word bounding boxes contribute (word_mask), so that
         images/illustrations don't merge adjacent text rows.
      2. Smooth the projection and find contiguous regions below a
         threshold (= gaps / horizontal whitespace between text lines).
         The threshold is 15% of the median non-zero density.
      3. Validate gaps against word bounding boxes — discard any gap
         that overlaps a word, or shift the gap boundary to avoid the word.
      4. Build rows from the spans between validated gaps.
      5. Detect header/footer rows: gaps in the top/bottom 15% of the
         page that are >= 2x the median gap size mark section boundaries.

    Phase 2 — Word-center regularization (_regularize_row_grid, Step 7):
      For each word, compute its vertical center (top + height/2).
      Group words into line clusters by Y-proximity (tolerance = 40% of
      the median gap-based row height).
      For each cluster, the line center = median of all word centers.
      The "pitch" = distance between consecutive line centers.
      Section breaks are detected where the pitch exceeds 1.8x the median.
      Within each section, row boundaries are placed at the midpoints
      between consecutive line centers:
        - Row top = midpoint to previous line center (or center - pitch/2 for first)
        - Row bottom = midpoint to next line center (or center + pitch/2 for last)
      This ensures rows tile without gaps or overlaps.

    Fallback:
      If < 2 gaps are found (very dense or uniform text), falls back to
      _build_rows_from_word_grouping() which groups words by Y proximity.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    # For each y-pixel row, sum ink density across the content width.
    # A word-coverage mask ensures only pixels near Tesseract words contribute,
    # so that illustrations/images don't inflate the density and merge rows.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    # BUGFIX: cv2 is an optional dependency (set to None when the import at
    # module top fails), so don't call it unconditionally. For uint8 binary
    # images, NumPy's element-wise AND is identical to cv2.bitwise_and.
    if cv2 is not None:
        masked_strip = cv2.bitwise_and(content_strip, word_mask)
    else:
        masked_strip = np.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj

    # --- Step 2: Smoothing + gap threshold ---
    # Smooth the projection to reduce noise, then threshold at 15% of the
    # median non-zero density. Pixels below this threshold are considered
    # "gap" (horizontal whitespace between text lines).
    # MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y, is_gap in enumerate(in_gap):
        if is_gap:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap that runs to the bottom edge of the content area.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A gap is valid only if no word's bounding box overlaps it vertically.
    # If a word overlaps, try to shift the gap boundary above or below the
    # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between consecutive gaps. The gap midpoints define
    # where one row ends and the next begins. Each row's height extends
    # from the end of the previous gap to the start of the next gap.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type from the row's vertical midpoint relative to
        # the detected header/footer boundaries.
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words whose vertical center falls inside this row
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Refine the gap-based rows using word vertical centers. For each word,
    # compute center_y = top + height/2. Group into line clusters, compute
    # the pitch (distance between consecutive line centers), and place row
    # boundaries at the midpoints between centers. This gives more precise
    # and evenly-spaced rows than the gap-based approach alone.
    # Also detects section breaks (headings, paragraphs) where the pitch
    # exceeds 1.8x the median, and handles each section independently.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
|
||
|
||
|
||
def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
    content_w: int, content_h: int,
) -> List['RowGeometry']:
    """Fallback: build rows by grouping words by Y position.

    Uses _group_words_into_lines() with a generous tolerance.
    No header/footer detection in fallback mode.
    """
    if not word_dicts:
        return []

    # Generous vertical tolerance: at least 20px, scaled with page height.
    tolerance_px = max(20, content_h // 100)
    grouped_lines = _group_words_into_lines(word_dicts, y_tolerance_px=tolerance_px)

    result: List['RowGeometry'] = []
    for line_idx, members in enumerate(grouped_lines):
        if not members:
            continue

        # Row extent = tight vertical bounding box over all member words.
        tops = [m['top'] for m in members]
        bottoms = [m['top'] + m['height'] for m in members]
        top_rel = min(tops)

        result.append(RowGeometry(
            index=line_idx,
            x=left_x,
            y=top_y + top_rel,
            width=content_w,
            height=max(bottoms) - top_rel,
            word_count=len(members),
            words=members,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(result)} rows from word grouping")
    return result
|