Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
480 lines
16 KiB
Python
480 lines
16 KiB
Python
"""
|
||
Document type detection, image preparation, content bounds, and header/footer detection.
|
||
|
||
Extracted from cv_layout.py — these are the "input-side" helpers that run before
|
||
column/row geometry analysis.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import List, Optional, Tuple
|
||
|
||
import numpy as np
|
||
|
||
from cv_vocab_types import (
|
||
DocumentTypeResult,
|
||
PageRegion,
|
||
)
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
try:
|
||
import cv2
|
||
except ImportError:
|
||
cv2 = None # type: ignore[assignment]
|
||
|
||
|
||
# =============================================================================
|
||
# Document Type Detection
|
||
# =============================================================================
|
||
|
||
def _scan_profile_gaps(profile: np.ndarray, threshold: float,
                       min_width: int) -> List[Tuple[int, int]]:
    """Find low-density runs ("gaps") in a 1-D projection profile.

    A maximal run of consecutive positions whose profile value is below
    *threshold* is reported as a (start, end) pair, but only if it spans
    at least *min_width* positions. A run still open at the end of the
    profile is discarded: it is an outer margin, not an internal separator.
    """
    gaps: List[Tuple[int, int]] = []
    run_start: Optional[int] = None
    for pos, value in enumerate(profile):
        if value < threshold:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            if pos - run_start >= min_width:
                gaps.append((run_start, pos))
            run_start = None
    return gaps


def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Args:
        ocr_img: Binarized grayscale image (pixels < 128 are treated as ink).
        img_bgr: BGR color image. Currently unused; kept for API
            compatibility with callers.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    if ocr_img is None or ocr_img.size == 0:
        # Degenerate input: fall back to the safest pipeline.
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Count dark pixels per column (x-axis); valleys in the profile are gaps.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth with a box filter (~1% of width, forced odd so it stays centered).
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # A gap must be at least 1% of the width and carry < 5% of peak density.
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    vert_gaps = [(start, end, end - start)
                 for start, end in _scan_profile_gaps(vert_smooth, gap_threshold, min_gap_width)]
    gap_count = len(vert_gaps)

    # Filter out margin gaps (within 10% of image edges) — those are page
    # margins, not column separators.
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)
    row_gap_count = len(_scan_profile_gaps(horiz_smooth, h_gap_threshold, min_row_gap))

    # --- 3. Text density distribution (4×4 grid) ---
    # Per-cell ink fraction; the spread helps characterize the layout.
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                densities.append(float(np.count_nonzero(cell < 128)) / cell.size)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → table
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density)
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
|
||
|
||
|
||
# =============================================================================
|
||
# Image Creation (Dual Image Preparation)
|
||
# =============================================================================
|
||
|
||
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Create a binarized image optimized for Tesseract OCR.

    Pipeline: grayscale → background normalization (divide by a heavy
    Gaussian blur to flatten uneven lighting) → adaptive Gaussian
    threshold → light median-filter denoise.

    Args:
        img: BGR image.

    Returns:
        Binary image with dark text on a white background
        (THRESH_BINARY output of the adaptive threshold).

    Raises:
        RuntimeError: If OpenCV (cv2) is not installed.
    """
    # cv2 is imported at module level inside a try/except and may be None;
    # fail with a clear message instead of an AttributeError on None.
    if cv2 is None:
        raise RuntimeError("create_ocr_image requires OpenCV (cv2), which is not installed")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Background normalization: dividing by a strongly blurred copy removes
    # slow illumination gradients while preserving text edges.
    bg = cv2.GaussianBlur(gray, (51, 51), 0)
    normalized = cv2.divide(gray, bg, scale=255)

    # Adaptive binarization (31-pixel neighborhood, offset 10).
    binary = cv2.adaptiveThreshold(
        normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # Light denoise to suppress salt-and-pepper speckles.
    denoised = cv2.medianBlur(binary, 3)

    return denoised
|
||
|
||
|
||
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Produce a contrast-enhanced grayscale copy of *img* for layout analysis.

    Applies CLAHE (clip limit 2.0, 8×8 tile grid) to the grayscale
    conversion of the BGR input.

    Args:
        img: BGR image.

    Returns:
        Enhanced grayscale image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(grayscale)
|
||
|
||
|
||
# =============================================================================
|
||
# Content Bounds Detection
|
||
# =============================================================================
|
||
|
||
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
|
||
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
|
||
out = mask.copy()
|
||
n = len(out)
|
||
i = 0
|
||
while i < n:
|
||
if out[i]:
|
||
start = i
|
||
while i < n and out[i]:
|
||
i += 1
|
||
if (i - start) < min_width:
|
||
out[start:i] = False
|
||
else:
|
||
i += 1
|
||
return out
|
||
|
||
|
||
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
|
||
"""Find the bounding box of actual text content (excluding page margins).
|
||
|
||
Scan artefacts (thin black lines at page edges) are filtered out by
|
||
discarding contiguous projection runs narrower than 1 % of the image
|
||
dimension (min 5 px).
|
||
|
||
Returns:
|
||
Tuple of (left_x, right_x, top_y, bottom_y).
|
||
"""
|
||
h, w = inv.shape[:2]
|
||
threshold = 0.005
|
||
|
||
# --- Horizontal projection for top/bottom ---
|
||
h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
|
||
h_mask = h_proj > threshold
|
||
min_h_run = max(5, h // 100)
|
||
h_mask = _filter_narrow_runs(h_mask, min_h_run)
|
||
|
||
top_y = 0
|
||
for y in range(h):
|
||
if h_mask[y]:
|
||
top_y = max(0, y - 5)
|
||
break
|
||
|
||
bottom_y = h
|
||
for y in range(h - 1, 0, -1):
|
||
if h_mask[y]:
|
||
bottom_y = min(h, y + 5)
|
||
break
|
||
|
||
# --- Vertical projection for left/right margins ---
|
||
v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
|
||
v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
|
||
v_mask = v_proj_norm > threshold
|
||
min_v_run = max(5, w // 100)
|
||
v_mask = _filter_narrow_runs(v_mask, min_v_run)
|
||
|
||
left_x = 0
|
||
for x in range(w):
|
||
if v_mask[x]:
|
||
left_x = max(0, x - 2)
|
||
break
|
||
|
||
right_x = w
|
||
for x in range(w - 1, 0, -1):
|
||
if v_mask[x]:
|
||
right_x = min(w, x + 2)
|
||
break
|
||
|
||
return left_x, right_x, top_y, bottom_y
|
||
|
||
|
||
# =============================================================================
|
||
# Header / Footer Detection
|
||
# =============================================================================
|
||
|
||
def _detect_header_footer_gaps(
    inv: np.ndarray,
    img_w: int,
    img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
    """Detect header/footer boundaries via horizontal projection gap analysis.

    Scans the full-page inverted image for large horizontal gaps in the top/bottom
    20% that separate header/footer content from the main body. A gap qualifies
    as a separator only when it is clearly larger than the typical inter-line
    gap (median gap × 2) and does not touch a page edge.

    Args:
        inv: Inverted binarized page image (white ink on black).
        img_w: Page width in pixels (not read here; kept for the caller's API).
        img_h: Page height in pixels — used to clamp off dewarp padding rows.

    Returns:
        (header_y, footer_y) — absolute y-coordinates.
        header_y = bottom edge of header region (None if no header detected).
        footer_y = top edge of footer region (None if no footer detected).
    """
    HEADER_FOOTER_ZONE = 0.20  # fraction of page height searched at top/bottom
    GAP_MULTIPLIER = 2.0       # separator must exceed median gap × this factor

    # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
    actual_h = min(inv.shape[0], img_h)
    roi = inv[:actual_h, :]
    h_proj = np.sum(roi, axis=1).astype(float)
    proj_w = roi.shape[1]
    # Normalize to ink fraction per row; guard against a zero-width image.
    h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj

    # Step 2: Smoothing with a box filter (forced odd so it stays centered)
    kernel_size = max(3, actual_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Step 3: Gap threshold — relative to the median of non-empty rows, with
    # an absolute floor so near-blank pages don't get a threshold of ~0.
    positive = h_smooth[h_smooth > 0]
    median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, actual_h // 500)  # ignore hairline dips

    # Step 4: Collect contiguous gaps
    raw_gaps: List[Tuple[int, int]] = []
    gap_start: Optional[int] = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    # Close a gap still open at the bottom of the scanned region.
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    if not raw_gaps:
        return None, None

    # Step 5: Compute median gap size and large-gap threshold
    gap_sizes = [g[1] - g[0] for g in raw_gaps]
    median_gap = float(np.median(gap_sizes))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    # Step 6: Find largest qualifying gap in header / footer zones
    # A separator gap must have content on BOTH sides — edge-touching gaps
    # (e.g. dewarp padding at bottom) are not valid separators.
    EDGE_MARGIN = max(5, actual_h // 400)
    header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))

    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    best_header_size = 0
    for gs, ge in raw_gaps:
        if gs <= EDGE_MARGIN:
            continue  # skip gaps touching the top edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        # Keep the largest gap whose midpoint falls inside the header zone.
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if gap_size > best_header_size:
                best_header_size = gap_size
                header_y = ge  # bottom edge of gap

    best_footer_size = 0
    for gs, ge in raw_gaps:
        if ge >= actual_h - EDGE_MARGIN:
            continue  # skip gaps touching the bottom edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        # Keep the largest gap whose midpoint falls inside the footer zone.
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if gap_size > best_footer_size:
                best_footer_size = gap_size
                footer_y = gs  # top edge of gap

    if header_y is not None:
        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
    if footer_y is not None:
        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")

    return header_y, footer_y
|
||
|
||
|
||
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||
min_density: float = 0.005) -> bool:
|
||
"""Check whether a horizontal strip contains meaningful ink.
|
||
|
||
Args:
|
||
inv: Inverted binarized image (white-on-black).
|
||
y_start: Top of the region (inclusive).
|
||
y_end: Bottom of the region (exclusive).
|
||
min_density: Fraction of white pixels required to count as content.
|
||
|
||
Returns:
|
||
True if the region contains text/graphics, False if empty margin.
|
||
"""
|
||
if y_start >= y_end:
|
||
return False
|
||
strip = inv[y_start:y_end, :]
|
||
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||
return density > min_density
|
||
|
||
|
||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append header/footer or empty-margin regions to *regions* in place.

    When *inv* (inverted binary page) is supplied, gap-based detection
    locates the boundaries; otherwise the simple top_y/bottom_y content
    bounds are used as a fallback.

    Region types depend on whether the strip actually contains ink:
    - 'header' / 'footer' — region contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — region is empty page margin
    """
    if inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
    else:
        header_y, footer_y = None, None

    # Pick the top boundary: prefer the gap-detected header edge, fall back
    # to the content bound; anything within 10 px of the edge is ignored.
    if header_y is not None and header_y > 10:
        top_boundary: Optional[int] = header_y
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # Same selection logic for the bottom boundary.
    if footer_y is not None and footer_y < img_h - 10:
        bottom_boundary: Optional[int] = footer_y
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
|