fix: Seite an Sub-Headern segmentieren, groesstes Segment fuer Projektion
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
Statt Full-Width-Zeilen zu maskieren, wird die Seite jetzt an grossen horizontalen Luecken (Sub-Header, Kapitelgrenzen) in Segmente unterteilt. Das groesste Segment wird fuer die vertikale Projektion verwendet. Dadurch stoeren Illustrationen und Ueberschriften die Spaltenerkennung nicht mehr.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2131,45 +2131,75 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
|
||||
logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
|
||||
|
||||
# --- Step 2b: Mask out full-width rows (sub-headers, colored bands) ---
|
||||
# Rows where ink spans nearly the full content width distort the vertical
|
||||
# projection by filling in column gaps. Detect them via horizontal density
|
||||
# and zero them out before computing v_proj.
|
||||
# --- Step 2b: Segment by sub-headers ---
|
||||
# Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
|
||||
# text bands that pollute the vertical projection. We detect large
|
||||
# horizontal gaps (= whitespace rows separating sections) and use only
|
||||
# the tallest content segment for the projection. This makes column
|
||||
# detection immune to sub-headers, illustrations, and section dividers.
|
||||
content_strip = inv[top_y:bottom_y, left_x:right_x]
|
||||
h_proj_row = np.sum(content_strip, axis=1).astype(float)
|
||||
h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row
|
||||
|
||||
FULLWIDTH_THRESHOLD = 0.40 # normal text ~10-25%; full-width bands 40%+
|
||||
fullwidth_mask = h_proj_row_norm > FULLWIDTH_THRESHOLD
|
||||
# Find horizontal gaps (near-empty rows)
|
||||
H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty"
|
||||
h_in_gap = h_proj_row_norm < H_GAP_THRESH
|
||||
H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px
|
||||
|
||||
# Only mask contiguous bands (>=3 rows), not isolated noisy rows
|
||||
masked_strip = content_strip.copy()
|
||||
n_masked = 0
|
||||
band_start = None
|
||||
for y_idx in range(len(fullwidth_mask)):
|
||||
if fullwidth_mask[y_idx]:
|
||||
if band_start is None:
|
||||
band_start = y_idx
|
||||
h_gaps: List[Tuple[int, int]] = []
|
||||
h_gap_start = None
|
||||
for y_idx in range(len(h_in_gap)):
|
||||
if h_in_gap[y_idx]:
|
||||
if h_gap_start is None:
|
||||
h_gap_start = y_idx
|
||||
else:
|
||||
if band_start is not None:
|
||||
band_height = y_idx - band_start
|
||||
if band_height >= 3:
|
||||
masked_strip[band_start:y_idx, :] = 0
|
||||
n_masked += band_height
|
||||
band_start = None
|
||||
if band_start is not None:
|
||||
band_height = len(fullwidth_mask) - band_start
|
||||
if band_height >= 3:
|
||||
masked_strip[band_start:len(fullwidth_mask), :] = 0
|
||||
n_masked += band_height
|
||||
if h_gap_start is not None:
|
||||
if y_idx - h_gap_start >= H_MIN_GAP:
|
||||
h_gaps.append((h_gap_start, y_idx))
|
||||
h_gap_start = None
|
||||
if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
|
||||
h_gaps.append((h_gap_start, len(h_in_gap)))
|
||||
|
||||
if n_masked > 0:
|
||||
logger.info(f"ColumnGeometry: masked {n_masked} full-width rows "
|
||||
f"({n_masked * 100 / content_h:.1f}% of content height)")
|
||||
# Identify "large" gaps (significantly bigger than median) that indicate
|
||||
# section boundaries (sub-headers, chapter titles).
|
||||
if len(h_gaps) >= 3:
|
||||
gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
|
||||
median_gap_h = gap_sizes[len(gap_sizes) // 2]
|
||||
large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
|
||||
large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
|
||||
else:
|
||||
large_gaps = h_gaps
|
||||
|
||||
# Build content segments between large gaps and pick the tallest
|
||||
seg_boundaries = [0]
|
||||
for gs, ge in large_gaps:
|
||||
seg_boundaries.append(gs)
|
||||
seg_boundaries.append(ge)
|
||||
seg_boundaries.append(content_h)
|
||||
|
||||
segments = []
|
||||
for i in range(0, len(seg_boundaries) - 1, 2):
|
||||
seg_top = seg_boundaries[i]
|
||||
seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
|
||||
seg_height = seg_bot - seg_top
|
||||
if seg_height > 20: # ignore tiny fragments
|
||||
segments.append((seg_top, seg_bot, seg_height))
|
||||
|
||||
if segments:
|
||||
segments.sort(key=lambda s: s[2], reverse=True)
|
||||
best_seg = segments[0]
|
||||
proj_strip = content_strip[best_seg[0]:best_seg[1], :]
|
||||
effective_h = best_seg[2]
|
||||
if len(segments) > 1:
|
||||
logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
|
||||
f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
|
||||
f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
|
||||
else:
|
||||
proj_strip = content_strip
|
||||
effective_h = content_h
|
||||
|
||||
# --- Step 3: Vertical projection profile ---
|
||||
effective_h = content_h - n_masked
|
||||
v_proj = np.sum(masked_strip, axis=0).astype(float)
|
||||
v_proj = np.sum(proj_strip, axis=0).astype(float)
|
||||
v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj
|
||||
|
||||
# Smooth the projection to avoid noise-induced micro-gaps
|
||||
|
||||
Reference in New Issue
Block a user