feat(ocr-pipeline): distinguish header/footer vs margin_top/margin_bottom
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m0s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 19s

Check for actual ink content in detected top/bottom regions:
- 'header'/'footer' when text is present (e.g. title, page number)
- 'margin_top'/'margin_bottom' when the region is empty page margin

Also update all skip-type sets and color maps for the new types.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 16:55:41 +01:00
parent f615c5f66d
commit c8981423d4
3 changed files with 134 additions and 27 deletions

View File

@@ -98,7 +98,7 @@ ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'o
@dataclass
class PageRegion:
"""A detected region on the page."""
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer'
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
x: int
y: int
width: int
@@ -924,12 +924,10 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
# Add header/footer info (gap-based detection with fallback)
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
has_header = any(r.type == 'header' for r in regions)
has_footer = any(r.type == 'footer' for r in regions)
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
col_count = len([r for r in regions if r.type.startswith('column')])
logger.info(f"Layout: {col_count} columns, "
f"header={'yes' if has_header else 'no'}, "
f"footer={'yes' if has_footer else 'no'}")
logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
return regions
@@ -2042,7 +2040,8 @@ def _build_margin_regions(
# Right margin: from end of last content column to image edge
non_margin = [r for r in all_regions
if r.type not in ('margin_left', 'margin_right', 'header', 'footer')]
if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
'margin_top', 'margin_bottom')]
if non_margin:
last_col_end = max(r.x + r.width for r in non_margin)
else:
@@ -2625,13 +2624,37 @@ def _detect_header_footer_gaps(
return header_y, footer_y
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
                        min_density: float = 0.005) -> bool:
    """Check whether a horizontal strip contains meaningful ink.

    The strip spans the full image width between rows ``y_start`` and
    ``y_end``. Its ink density is the sum of all pixel values divided by
    ``rows * cols * 255``, i.e. the fraction of white pixels when *inv*
    only holds the values 0 and 255.

    Args:
        inv: Inverted binarized image (white-on-black).
        y_start: Top of the region (inclusive).
        y_end: Bottom of the region (exclusive).
        min_density: Fraction of white pixels required to count as content.

    Returns:
        True if the region contains text/graphics, False if it is an empty
        margin or if the requested range does not overlap the image.
    """
    # Clamp to the image bounds. Without this, a negative y_start wraps
    # around in NumPy slicing and a y_start past the last row yields an
    # empty strip, which previously ended in a ZeroDivisionError.
    y_start = max(0, y_start)
    y_end = min(inv.shape[0], y_end)
    if y_start >= y_end:
        return False

    strip = inv[y_start:y_end, :]
    pixel_count = strip.shape[0] * strip.shape[1]
    if pixel_count == 0:
        # Zero-width image: nothing to measure, so treat as empty margin.
        return False

    density = float(np.sum(strip)) / (pixel_count * 255)
    return density > min_density
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
img_w: int, img_h: int,
inv: Optional[np.ndarray] = None) -> None:
"""Add header/footer regions in-place.
"""Add header/footer/margin regions in-place.
When *inv* is provided, uses gap-based detection to find header/footer
boundaries. Falls back to simple top_y/bottom_y check otherwise.
Uses gap-based detection when *inv* is provided, otherwise falls back
to simple top_y/bottom_y bounds.
Region types depend on whether there is actual content (text/graphics):
- 'header' / 'footer' — region contains text (e.g. title, page number)
- 'margin_top' / 'margin_bottom' — region is empty page margin
"""
header_y: Optional[int] = None
footer_y: Optional[int] = None
@@ -2639,17 +2662,28 @@ def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
if inv is not None:
header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
# Gap-based header
if header_y is not None and header_y > 10:
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=header_y))
elif top_y > 10:
regions.append(PageRegion(type='header', x=0, y=0, width=img_w, height=top_y))
# --- Top region ---
top_boundary = header_y if header_y is not None and header_y > 10 else (
top_y if top_y > 10 else None
)
if top_boundary is not None:
has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
rtype = 'header' if has_content else 'margin_top'
regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
f"(has_content={has_content})")
# Gap-based footer
if footer_y is not None and footer_y < img_h - 10:
regions.append(PageRegion(type='footer', x=0, y=footer_y, width=img_w, height=img_h - footer_y))
elif bottom_y < img_h - 10:
regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=img_w, height=img_h - bottom_y))
# --- Bottom region ---
bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
bottom_y if bottom_y < img_h - 10 else None
)
if bottom_boundary is not None:
has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
rtype = 'footer' if has_content else 'margin_bottom'
regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
height=img_h - bottom_boundary))
logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
f"height={img_h - bottom_boundary}px (has_content={has_content})")
# --- Main Entry Point ---
@@ -2690,7 +2724,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
methods = set(r.classification_method for r in regions if r.classification_method)
logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer')]}")
f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")
return regions
@@ -3797,7 +3831,7 @@ def build_cell_grid(
return [], []
# Use columns only — skip ignore, header, footer, page margins, page_ref
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
logger.warning("build_cell_grid: no usable columns found")
@@ -3959,7 +3993,7 @@ def build_cell_grid_streaming(
if not content_rows:
return
_skip_types = {'column_ignore', 'header', 'footer', 'page_ref', 'margin_left', 'margin_right'}
_skip_types = {'column_ignore', 'header', 'footer', 'margin_top', 'margin_bottom', 'page_ref', 'margin_left', 'margin_right'}
relevant_cols = [c for c in column_regions if c.type not in _skip_types]
if not relevant_cols:
return
@@ -4444,7 +4478,7 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
"""
results: Dict[str, List[Dict]] = {}
_ocr_skip = {'header', 'footer', 'margin_left', 'margin_right'}
_ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
for region in regions:
if region.type in _ocr_skip:
continue # Skip non-content regions