# NOTE(review): stripped pasted web-view artifacts (a commit message and
# "275 lines / 9.7 KiB / Python" file-stat lines) that preceded the module
# docstring and made the file unparseable.
"""
|
|
Layout analysis for OCR vocabulary pages — orchestration and re-exports.
|
|
|
|
This module provides the high-level entry points for layout analysis and
|
|
re-exports all functions from sub-modules for backward compatibility.
|
|
|
|
Sub-modules:
|
|
- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
|
|
- cv_layout_analyze: Legacy projection-based layout analysis
|
|
- cv_layout_columns: Core column geometry detection
|
|
- cv_layout_column_refine: Sub-column, broad-column, expand operations
|
|
- cv_layout_rows: Row geometry detection
|
|
- cv_layout_row_regularize: Row grid regularization
|
|
- cv_layout_scoring: Language/role scoring, dictionary signals
|
|
- cv_layout_classify: Column type classification (Phase B)
|
|
- cv_layout_classify_position: Position-based classification fallbacks
|
|
"""
|
|
|
|
import logging
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion

logger = logging.getLogger(__name__)


# ── Re-exports (backward compatibility) ───────────────────────────────────

from cv_layout_detection import ( # noqa: F401
    detect_document_type,
    create_ocr_image,
    create_layout_image,
    _filter_narrow_runs,
    _find_content_bounds,
    _detect_header_footer_gaps,
    _region_has_content,
    _add_header_footer,
)

from cv_layout_analyze import ( # noqa: F401
    analyze_layout,
)

from cv_layout_columns import ( # noqa: F401
    detect_column_geometry,
    _detect_columns_by_clustering,
    _build_geometries_from_starts,
)

from cv_layout_column_refine import ( # noqa: F401
    _detect_sub_columns,
    _split_broad_columns,
    expand_narrow_columns,
)

from cv_layout_rows import ( # noqa: F401
    detect_row_geometry,
    _build_rows_from_word_grouping,
)

from cv_layout_row_regularize import ( # noqa: F401
    _regularize_row_grid,
)

from cv_layout_scoring import ( # noqa: F401
    _score_language,
    _score_role,
    _score_dictionary_signals,
    _classify_dictionary_columns,
)

from cv_layout_classify import ( # noqa: F401
    _build_margin_regions,
    positional_column_regions,
    classify_column_types,
    _classify_by_content,
)

from cv_layout_classify_position import ( # noqa: F401
    _classify_by_position_enhanced,
    _classify_by_position_fallback,
)
# ── Orchestration Functions ───────────────────────────────────────────────

def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect columns using two-phase approach: geometry then content classification.

    Phase A: detect_column_geometry() — clustering word positions into columns.
    Phase B: classify_column_types() — content-based type assignment with fallback.

    Falls back to projection-based analyze_layout() if geometry detection fails.

    Args:
        ocr_img: Page image used for geometry detection; only its
            height/width are read directly here.
        dewarped_bgr: Dewarped colour page image, forwarded to the geometry
            detector and to the projection-profile fallback path.

    Returns:
        List of PageRegion objects (columns, page_ref, header/footer and
        margin regions) as produced by positional_column_regions().
    """
    h, w = ocr_img.shape[:2]

    result = detect_column_geometry(ocr_img, dewarped_bgr)

    if result is None:
        # Phase A failed — fall back to the legacy projection-profile analysis.
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
    content_w = right_x - left_x

    # Header/footer gap detection needs the inverted ink image; skip when absent.
    header_y, footer_y = _detect_header_footer_gaps(_inv, w, h) if _inv is not None else (None, None)

    # Refine geometry: first split merged sub-columns, then split over-broad ones.
    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)

    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    content_h = bottom_y - top_y
    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    # Summary logging: count column-like regions and the classification methods used.
    col_count = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    methods = set(r.classification_method for r in regions if r.classification_method)
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{[(r.type, r.x, r.width, r.classification_confidence) for r in regions if r.type not in ('header','footer','margin_top','margin_bottom')]}")

    return regions
|
def detect_column_geometry_zoned(
    ocr_img: np.ndarray,
    dewarped_bgr: np.ndarray,
) -> Optional[Tuple[
    List[ColumnGeometry],
    int, int, int, int,
    List[Dict],
    np.ndarray,
    List[Dict],
    List[DetectedBox],
]]:
    """Zone-aware column geometry detection.

    1. Finds content bounds.
    2. Runs box detection.
    3. If boxes found: splits page into zones, runs detect_column_geometry()
       per content zone on the corresponding sub-image.
    4. If no boxes: delegates entirely to detect_column_geometry().

    Args:
        ocr_img: Page image fed to column geometry detection.
        dewarped_bgr: Dewarped colour page image fed to box detection and
            geometry detection.

    Returns:
        None when base geometry detection fails; otherwise a 9-tuple of
        (geometries, left_x, right_x, top_y, bottom_y, word_dicts,
        inverted_image, zones_data, boxes).
    """
    # Local import keeps cv_box_detect out of this module's import cycle.
    from cv_box_detect import detect_boxes, split_page_into_zones

    # Base (whole-page) geometry detection; everything below refines it.
    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        return None

    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)

    if not boxes:
        # No boxed regions: the entire content area is a single content zone.
        zone_data = [{
            "index": 0, "zone_type": "content",
            "y": top_y, "height": content_h,
            "x": left_x, "width": content_w, "columns": [],
        }]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, boxes)

    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

    # Keep only non-box content strips tall enough (>= 40 px) to analyse.
    content_strips: List[Tuple[int, int]] = []
    for zone in zones:
        if zone.zone_type == 'content' and zone.height >= 40:
            content_strips.append((zone.y, zone.y + zone.height))

    if not content_strips:
        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
        zone_data = [{"index": 0, "zone_type": "content", "y": top_y,
                      "height": content_h, "x": left_x, "width": content_w, "columns": []}]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, boxes)

    # Stack the content strips into one synthetic image so column detection
    # only sees text outside the detected boxes.
    ocr_strips = [ocr_img[ys:ye, :] for ys, ye in content_strips]
    bgr_strips = [dewarped_bgr[ys:ye, :] for ys, ye in content_strips]
    combined_ocr = np.vstack(ocr_strips)
    combined_bgr = np.vstack(bgr_strips)

    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")

    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
    if combined_result is not None:
        # Only the geometries of the combined image are used; its bounds,
        # words and inverted image are intentionally discarded.
        combined_geoms, _c_lx, _c_rx, _c_ty, _c_by, _c_words, _c_inv = combined_result
    else:
        logger.info("ZonedColumns: combined image column detection failed, using original")
        combined_geoms = geometries

    # Cumulative offsets map y in the stacked image back to absolute page y.
    strip_offsets: List[Tuple[int, int, int]] = []
    cum_y = 0
    for ys, ye in content_strips:
        h = ye - ys
        strip_offsets.append((cum_y, h, ys))
        cum_y += h

    def _combined_y_to_abs(cy: int) -> int:
        # Find the strip containing combined-image row cy and translate it.
        for c_start, s_h, abs_start in strip_offsets:
            if cy < c_start + s_h:
                return abs_start + (cy - c_start)
        # Past the last strip: clamp to the end of the final strip.
        last_c, last_h, last_abs = strip_offsets[-1]
        return last_abs + last_h

    if combined_result is not None:
        # Rewrite combined-image column geometry into absolute page coordinates.
        for g in combined_geoms:
            abs_y = _combined_y_to_abs(g.y)
            abs_y_end = _combined_y_to_abs(g.y + g.height)
            g.y = abs_y
            g.height = abs_y_end - abs_y

    if word_dicts:
        # Enrich geometries with words whose centre lies outside every box.
        # NOTE(review): word coords look content-relative (left_x/top_y added
        # back to get absolute centres) — confirm against the producer.
        content_words = []
        for w in word_dicts:
            w_abs_cx = w['left'] + left_x + w['width'] / 2
            w_abs_cy = w['top'] + top_y + w['height'] / 2
            inside_box = any(
                box.x <= w_abs_cx <= box.x + box.width
                and box.y <= w_abs_cy <= box.y + box.height
                for box in boxes
            )
            if not inside_box:
                content_words.append(w)

        target_geoms = combined_geoms if combined_result is not None else geometries
        for g in target_geoms:
            # Assign each content word to the column whose x-span holds its centre.
            g_left_rel = g.x - left_x
            g_right_rel = g_left_rel + g.width
            g.words = [
                w for w in content_words
                if g_left_rel <= w['left'] + w['width'] / 2 < g_right_rel
            ]
            g.word_count = len(g.words)

        excluded_count = len(word_dicts) - len(content_words)
        if excluded_count:
            logger.info(
                "ZonedColumns: enriched geometries with %d content words "
                "(excluded %d box-interior words)",
                len(content_words), excluded_count,
            )

    # Serialize zones (and the box each one wraps, when present) for callers.
    zones_data: List[Dict] = []
    for zone in zones:
        zone_dict: Dict = {
            "index": zone.index,
            "zone_type": zone.zone_type,
            "y": zone.y,
            "height": zone.height,
            "x": zone.x,
            "width": zone.width,
            "columns": [],
        }
        if zone.box is not None:
            zone_dict["box"] = {
                "x": zone.box.x, "y": zone.box.y,
                "width": zone.box.width, "height": zone.box.height,
                "confidence": zone.box.confidence,
                "border_thickness": zone.box.border_thickness,
            }
        zones_data.append(zone_dict)

    # Prefer combined-image geometries; fall back if that list is empty.
    all_geometries = combined_geoms if combined_geoms else geometries

    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
                f"{len(all_geometries)} total columns (combined-image approach)")

    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)