feat(ocr-pipeline): generic header/footer detection via projection gap analysis
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s
Replace the trivial top_y/bottom_y threshold check with horizontal projection gap analysis that finds large whitespace gaps separating header/footer content from the main body. This correctly detects headers (e.g. "VOCABULARY" banners) and footers (page numbers) even when _find_content_bounds includes them in the content area. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -921,22 +921,15 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
|
|||||||
width=w, height=content_h
|
width=w, height=content_h
|
||||||
))
|
))
|
||||||
|
|
||||||
# Add header/footer info
|
# Add header/footer info (gap-based detection with fallback)
|
||||||
if top_y > 10:
|
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
|
||||||
regions.append(PageRegion(
|
|
||||||
type='header', x=0, y=0,
|
|
||||||
width=w, height=top_y
|
|
||||||
))
|
|
||||||
if bottom_y < h - 10:
|
|
||||||
regions.append(PageRegion(
|
|
||||||
type='footer', x=0, y=bottom_y,
|
|
||||||
width=w, height=h - bottom_y
|
|
||||||
))
|
|
||||||
|
|
||||||
|
has_header = any(r.type == 'header' for r in regions)
|
||||||
|
has_footer = any(r.type == 'footer' for r in regions)
|
||||||
col_count = len([r for r in regions if r.type.startswith('column')])
|
col_count = len([r for r in regions if r.type.startswith('column')])
|
||||||
logger.info(f"Layout: {col_count} columns, "
|
logger.info(f"Layout: {col_count} columns, "
|
||||||
f"header={'yes' if top_y > 10 else 'no'}, "
|
f"header={'yes' if has_header else 'no'}, "
|
||||||
f"footer={'yes' if bottom_y < h - 10 else 'no'}")
|
f"footer={'yes' if has_footer else 'no'}")
|
||||||
|
|
||||||
return regions
|
return regions
|
||||||
|
|
||||||
@@ -2076,7 +2069,8 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
|||||||
img_h: int,
|
img_h: int,
|
||||||
bottom_y: int,
|
bottom_y: int,
|
||||||
left_x: int = 0,
|
left_x: int = 0,
|
||||||
right_x: int = 0) -> List[PageRegion]:
|
right_x: int = 0,
|
||||||
|
inv: Optional[np.ndarray] = None) -> List[PageRegion]:
|
||||||
"""Classify column types using a 3-level fallback chain.
|
"""Classify column types using a 3-level fallback chain.
|
||||||
|
|
||||||
Level 1: Content-based (language + role scoring)
|
Level 1: Content-based (language + role scoring)
|
||||||
@@ -2159,20 +2153,20 @@ def classify_column_types(geometries: List[ColumnGeometry],
|
|||||||
regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
|
regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
|
||||||
if regions is not None:
|
if regions is not None:
|
||||||
logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
|
logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
|
||||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
|
||||||
return _with_margins(ignore_regions + regions)
|
return _with_margins(ignore_regions + regions)
|
||||||
|
|
||||||
# --- Level 2: Position + language enhanced ---
|
# --- Level 2: Position + language enhanced ---
|
||||||
regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
|
regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
|
||||||
if regions is not None:
|
if regions is not None:
|
||||||
logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
|
logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
|
||||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
|
||||||
return _with_margins(ignore_regions + regions)
|
return _with_margins(ignore_regions + regions)
|
||||||
|
|
||||||
# --- Level 3: Pure position fallback (old code, no regression) ---
|
# --- Level 3: Pure position fallback (old code, no regression) ---
|
||||||
logger.info("ClassifyColumns: Level 3 (position fallback)")
|
logger.info("ClassifyColumns: Level 3 (position fallback)")
|
||||||
regions = _classify_by_position_fallback(geometries, content_w, content_h)
|
regions = _classify_by_position_fallback(geometries, content_w, content_h)
|
||||||
_add_header_footer(regions, top_y, bottom_y, img_w, img_h)
|
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
|
||||||
return _with_margins(ignore_regions + regions)
|
return _with_margins(ignore_regions + regions)
|
||||||
|
|
||||||
|
|
||||||
@@ -2534,12 +2528,127 @@ def _classify_by_position_fallback(geometries: List[ColumnGeometry],
|
|||||||
return regions
|
return regions
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_header_footer_gaps(
    inv: np.ndarray,
    img_w: int,
    img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
    """Detect header/footer boundaries via horizontal projection gap analysis.

    Builds a smoothed per-row ink-density profile of the page, collects the
    runs of (nearly) empty rows, and looks for a run that is much taller than
    the typical one and whose midpoint lies in the top / bottom 20% of the
    page.  Such a run is taken to separate header / footer content from the
    main body.

    Only *interior* gaps are considered, i.e. gaps with content on both
    sides.  A run of empty rows touching the first or last image row is a
    page margin: it separates nothing, and if it were allowed to compete it
    could beat the real header/body gap and place the boundary above the
    header banner (or below the page number).  Margins are also left out of
    the median so they do not inflate the "typical gap" estimate.

    Args:
        inv: Inverted page image, one row per image line.  The profile is
            normalised by ``img_w * 255``, so this presumably expects a 2-D
            uint8 image with ink = 255 -- TODO confirm against callers.
        img_w: Image width in pixels (used for normalisation only).
        img_h: Image height in pixels (used for zone limits, kernel size and
            minimum gap height).

    Returns:
        ``(header_y, footer_y)`` as absolute y-coordinates.
        ``header_y`` is the bottom edge of the separating gap, i.e. the
        bottom edge of the header region (``None`` if no header detected).
        ``footer_y`` is the top edge of the separating gap, i.e. the top
        edge of the footer region (``None`` if no footer detected).
    """
    HEADER_FOOTER_ZONE = 0.20
    GAP_MULTIPLIER = 2.0

    # Smoothing window: roughly 0.5% of the page height, forced odd so the
    # 'same' convolution below stays centred on each row.
    kernel_size = max(3, img_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1

    # Degenerate input: np.convolve rejects empty arrays, and with fewer rows
    # than the kernel mode='same' returns a profile longer than the image.
    n_rows = inv.shape[0] if inv.ndim else 0
    if img_h <= 0 or inv.size == 0 or n_rows < kernel_size:
        return None, None

    # Step 1: Horizontal projection over full image width
    h_proj = np.sum(inv, axis=1).astype(float)
    h_proj_norm = h_proj / (img_w * 255) if img_w > 0 else h_proj

    # Step 2: Smoothing (moving average; edges are implicitly zero-padded)
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Step 3: Gap threshold -- 15% of the median non-empty row density,
    # with a small absolute floor for very sparse pages.
    positive = h_smooth[h_smooth > 0]
    median_density = float(np.median(positive)) if positive.size > 0 else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, img_h // 500)

    # Step 4: Collect contiguous gaps as (start, end_exclusive).  Padding
    # with False on both sides guarantees every run has a rising and a
    # falling edge, so the edge positions pair up as start/end.
    padded = np.concatenate(([False], in_gap, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    last_row = len(in_gap)
    interior_gaps: List[Tuple[int, int]] = [
        (int(gs), int(ge))
        for gs, ge in zip(edges[0::2], edges[1::2])
        if ge - gs >= MIN_GAP_HEIGHT
        and gs > 0 and ge < last_row  # drop page margins (see docstring)
    ]

    if not interior_gaps:
        return None, None

    # Step 5: Compute median gap size and large-gap threshold
    gap_sizes = [ge - gs for gs, ge in interior_gaps]
    median_gap = float(np.median(gap_sizes))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    # Step 6: Find largest qualifying gap in header / footer zones
    header_zone_limit = int(img_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(img_h * (1.0 - HEADER_FOOTER_ZONE))

    header_y: Optional[int] = None
    footer_y: Optional[int] = None
    best_header_size = 0
    best_footer_size = 0

    for gs, ge in interior_gaps:
        gap_size = ge - gs
        if gap_size <= large_gap_threshold:
            continue
        gap_mid = (gs + ge) / 2
        # Strict '>' keeps the first (topmost) gap when sizes tie.
        if gap_mid < header_zone_limit and gap_size > best_header_size:
            best_header_size = gap_size
            header_y = ge  # bottom edge of gap
        if gap_mid > footer_zone_start and gap_size > best_footer_size:
            best_footer_size = gap_size
            footer_y = gs  # top edge of gap

    if header_y is not None:
        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
    if footer_y is not None:
        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")

    return header_y, footer_y
|
||||||
|
|
||||||
|
|
||||||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
|
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
|
||||||
img_w: int, img_h: int) -> None:
|
img_w: int, img_h: int,
|
||||||
"""Add header/footer regions in-place."""
|
inv: Optional[np.ndarray] = None) -> None:
|
||||||
if top_y > 10:
|
"""Add header/footer regions in-place.
|
||||||
|
|
||||||
|
When *inv* is provided, uses gap-based detection to find header/footer
|
||||||
|
boundaries. Falls back to simple top_y/bottom_y check otherwise.
|
||||||
|
"""
|
||||||
|
header_y: Optional[int] = None
|
||||||
|
footer_y: Optional[int] = None
|
||||||
|
|
||||||
|
if inv is not None:
|
||||||
|
header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
|
||||||
|
|
||||||
|
# Gap-based header
|
||||||
|
if header_y is not None and header_y > 10:
|
||||||
|
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=header_y))
|
||||||
|
elif top_y > 10:
|
||||||
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
|
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
|
||||||
if bottom_y < img_h - 10:
|
|
||||||
|
# Gap-based footer
|
||||||
|
if footer_y is not None and footer_y < img_h - 10:
|
||||||
|
regions.append(PageRegion(type='footer', x=0, y=footer_y, width=img_w, height=img_h - footer_y))
|
||||||
|
elif bottom_y < img_h - 10:
|
||||||
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
|
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
|
||||||
|
|
||||||
|
|
||||||
@@ -2576,7 +2685,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
|
|
||||||
# Phase B: Content-based classification
|
# Phase B: Content-based classification
|
||||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||||
left_x=left_x, right_x=right_x)
|
left_x=left_x, right_x=right_x, inv=_inv)
|
||||||
|
|
||||||
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
|
||||||
methods = set(r.classification_method for r in regions if r.classification_method)
|
methods = set(r.classification_method for r in regions if r.classification_method)
|
||||||
|
|||||||
@@ -700,7 +700,7 @@ async def detect_columns(session_id: str):
|
|||||||
|
|
||||||
# Phase B: Content-based classification
|
# Phase B: Content-based classification
|
||||||
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y,
|
||||||
left_x=left_x, right_x=right_x)
|
left_x=left_x, right_x=right_x, inv=inv)
|
||||||
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ from cv_vocab_pipeline import (
|
|||||||
_find_content_bounds,
|
_find_content_bounds,
|
||||||
_filter_narrow_runs,
|
_filter_narrow_runs,
|
||||||
_build_margin_regions,
|
_build_margin_regions,
|
||||||
|
_detect_header_footer_gaps,
|
||||||
analyze_layout,
|
analyze_layout,
|
||||||
_group_words_into_lines,
|
_group_words_into_lines,
|
||||||
match_lines_to_vocab,
|
match_lines_to_vocab,
|
||||||
@@ -989,6 +990,106 @@ class TestMarginRegions:
|
|||||||
assert m.classification_method == 'content_bounds'
|
assert m.classification_method == 'content_bounds'
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================
|
||||||
|
# Header/Footer Gap Detection
|
||||||
|
# =============================================
|
||||||
|
|
||||||
|
class TestHeaderFooterGapDetection:
    """Exercise _detect_header_footer_gaps() on synthetic inverted pages."""

    def _make_inv(self, height: int, width: int, bands: list) -> np.ndarray:
        """Return a black uint8 canvas with full-width white (255) row bands.

        Args:
            height: Number of rows in the canvas.
            width: Number of columns in the canvas.
            bands: Iterable of (y_start, y_end) row ranges to paint white;
                y_end is exclusive (plain slice semantics).
        """
        canvas = np.zeros((height, width), dtype=np.uint8)
        for top, bottom in bands:
            canvas[top:bottom, :] = 255
        return canvas

    def _make_body_with_lines(self, h, w, body_start, body_end,
                              line_h=15, gap_h=12):
        """Return (top, bottom) bands imitating evenly spaced text lines.

        Lines are line_h rows tall and separated by gap_h blank rows; a line
        is emitted only if it fits entirely before body_end.  gap_h has to be
        large enough for the blank rows to survive the detector's smoothing
        (kernel ~ h//200).  h and w are accepted but not used.
        """
        pitch = line_h + gap_h
        lines = []
        top = body_start
        while True:
            bottom = top + line_h
            if bottom > body_end:
                break
            lines.append((top, bottom))
            top += pitch
        return lines

    def test_header_gap_detected(self):
        """Banner at the top, a tall blank band, then body -> header_y in the band."""
        h, w = 2000, 800
        banner = [(20, 80)]
        # Rows 80-300 stay blank (220px), far taller than the 12px line gaps.
        # Body text runs from 300 almost to the bottom, so no footer gap exists.
        body = self._make_body_with_lines(h, w, 300, 1990)
        inv = self._make_inv(h, w, banner + body)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert 80 <= header_y <= 310

    def test_footer_gap_detected(self):
        """Body, a tall blank band, then a page number -> footer_y in the band."""
        h, w = 2000, 800
        # Body text starts right at the top (no header gap) and stops before 1600.
        bands = self._make_body_with_lines(h, w, 10, 1600)
        # Rows up to 1880 stay blank; the page number occupies 1880-1920.
        bands.append((1880, 1920))
        inv = self._make_inv(h, w, bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert footer_y is not None
        assert 1580 <= footer_y <= 1890

    def test_both_header_and_footer(self):
        """Header, gap, body, gap, footer -> both boundaries are reported."""
        h, w = 2000, 800
        bands = [(10, 60)]                                      # header
        # blank band 60-250
        bands += self._make_body_with_lines(h, w, 250, 1700)    # body
        # blank band up to 1900
        bands.append((1900, 1970))                              # footer
        inv = self._make_inv(h, w, bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is not None
        assert footer_y is not None
        assert 60 <= header_y <= 260
        assert 1690 <= footer_y <= 1910

    def test_no_gaps_returns_none(self):
        """A page that is ink from top to bottom yields (None, None)."""
        h, w = 1000, 800
        inv = self._make_inv(h, w, [(0, 1000)])

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        assert header_y is None
        assert footer_y is None

    def test_small_gaps_ignored(self):
        """Uniform line spacing with no outlier gap yields (None, None)."""
        h, w = 1000, 800
        # 15px of content followed by 5px of blank, repeated down the page.
        bands = [(top, top + 15) for top in range(0, 1000, 20)]
        inv = self._make_inv(h, w, bands)

        header_y, footer_y = _detect_header_footer_gaps(inv, w, h)

        # No gap stands out against the others, so nothing qualifies.
        assert header_y is None
        assert footer_y is None
|
||||||
|
|
||||||
|
|
||||||
# =============================================
|
# =============================================
|
||||||
# RUN TESTS
|
# RUN TESTS
|
||||||
# =============================================
|
# =============================================
|
||||||
|
|||||||
Reference in New Issue
Block a user