"""
|
|
Core column detection: gap-based geometry and clustering fallback.
|
|
|
|
Extracted from the original cv_layout_columns.py — contains:
|
|
- _detect_columns_by_clustering() (fallback clustering)
|
|
- _build_geometries_from_starts() (geometry construction)
|
|
- detect_column_geometry() (main column detection)
|
|
|
|
Post-processing (sub-columns, broad-column split, narrow expansion)
|
|
lives in cv_layout_column_refine.py.
|
|
Legacy projection-profile layout lives in cv_layout_analyze.py.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
from typing import Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds

logger = logging.getLogger(__name__)

# OpenCV and Tesseract/PIL are optional at import time: they are replaced
# with None placeholders when unavailable, so consumers must check for None
# before use instead of assuming the modules imported successfully.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]

# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---

def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    """
    tol = max(10, int(content_w * 0.01))
    ordered = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    # Sweep left-to-right and group (edge_x, word_index) pairs: each pair
    # joins the current cluster while it lies within `tol` of the cluster's
    # rightmost member, otherwise it opens a new cluster.
    groups: List[List[Tuple[int, int]]] = [[ordered[0]]]
    for pair in ordered[1:]:
        if pair[0] - groups[-1][-1][0] <= tol:
            groups[-1].append(pair)
        else:
            groups.append([pair])

    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    # Summarize each multi-member cluster: mean/min/max x plus how much of
    # the content height its words span (singletons carry no evidence).
    cluster_infos = []
    for group in groups:
        if len(group) < 2:
            continue
        edges = [edge for edge, _ in group]
        tops = [word_dicts[idx]['top'] for _, idx in group]
        span = max(tops) - min(tops)
        coverage = span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(edges)),
            'count': len(edges),
            'min_edge': min(edges),
            'max_edge': max(edges),
            'y_min': min(tops),
            'y_max': max(tops),
            'y_coverage': coverage,
        })

    # Primary clusters cover a large vertical span; secondary ones cover
    # less but compensate with a minimum word count.
    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_ids = {id(c) for c in primary}
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_ids
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Fuse neighbouring clusters whose mean x positions are closer than
    # ~6% of the content width (count-weighted mean, widened edge span).
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for cand in significant[1:]:
        last = merged[-1]
        if cand['mean_x'] - last['mean_x'] < merge_distance:
            combined = last['count'] + cand['count']
            last['mean_x'] = (last['mean_x'] * last['count']
                              + cand['mean_x'] * cand['count']) // combined
            last['count'] = combined
            last['min_edge'] = min(last['min_edge'], cand['min_edge'])
            last['max_edge'] = max(last['max_edge'], cand['max_edge'])
        else:
            merged.append(cand.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Convert each merged cluster's leftmost edge (minus a small margin)
    # to an absolute column start, then build the geometries.
    margin_px = max(6, int(content_w * 0.003))
    starts = [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged]
    return _build_geometries_from_starts(
        starts,
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )

def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
    geometries: List[ColumnGeometry] = []
    n_cols = len(col_starts)
    for idx, (start_x, _count) in enumerate(col_starts):
        # Each column spans from its own start to the next column's start;
        # the last one runs to the right content boundary.
        end_x = col_starts[idx + 1][0] if idx + 1 < n_cols else right_x
        width = end_x - start_x

        rel_left = start_x - left_x
        rel_right = rel_left + width
        members = [wd for wd in word_dicts
                   if rel_left <= wd['left'] < rel_right]

        geometries.append(ColumnGeometry(
            index=idx,
            x=start_x,
            y=top_y,
            width=width,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)

def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    # Guard: cv2 / pytesseract / PIL are optional imports replaced with None
    # placeholders at module import time. Without this check a missing
    # dependency crashes with an opaque AttributeError instead of the
    # documented graceful None return.
    if cv2 is None or pytesseract is None or Image is None:
        logger.warning("ColumnGeometry: cv2/pytesseract/PIL not available")
        return None

    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small content area means bounds detection failed;
    # fall back to the full image.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    # Keep only confident, non-empty words; record left edges for clustering.
    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s)
            # Find the tightest word boundaries within the gap region
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps
    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries
    # Column starts at the end of a gap, ends at the start of the next gap
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        # NOTE: comprehension variable renamed wd (was w) to avoid shadowing
        # the image-width local `w` in the reader's eye.
        n_words_in_col = sum(1 for wd in word_dicts
                             if col_left_rel <= wd['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [wd for wd in word_dicts
                     if col_left_rel <= wd['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) with zero or no words. These are not
    # real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)