Files
breakpilot-lehrer/klausur-service/backend/ocr/layout/layout.py
Benjamin Admin 45287b3541 Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable.
Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:49:44 +02:00

275 lines
9.7 KiB
Python

"""
Layout analysis for OCR vocabulary pages — orchestration and re-exports.
This module provides the high-level entry points for layout analysis and
re-exports all functions from sub-modules for backward compatibility.
Sub-modules:
- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
- cv_layout_analyze: Legacy projection-based layout analysis
- cv_layout_columns: Core column geometry detection
- cv_layout_column_refine: Sub-column, broad-column, expand operations
- cv_layout_rows: Row geometry detection
- cv_layout_row_regularize: Row grid regularization
- cv_layout_scoring: Language/role scoring, dictionary signals
- cv_layout_classify: Column type classification (Phase B)
- cv_layout_classify_position: Position-based classification fallbacks
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion
logger = logging.getLogger(__name__)
# ── Re-exports (backward compatibility) ───────────────────────────────────
from cv_layout_detection import ( # noqa: F401
detect_document_type,
create_ocr_image,
create_layout_image,
_filter_narrow_runs,
_find_content_bounds,
_detect_header_footer_gaps,
_region_has_content,
_add_header_footer,
)
from cv_layout_analyze import ( # noqa: F401
analyze_layout,
)
from cv_layout_columns import ( # noqa: F401
detect_column_geometry,
_detect_columns_by_clustering,
_build_geometries_from_starts,
)
from cv_layout_column_refine import ( # noqa: F401
_detect_sub_columns,
_split_broad_columns,
expand_narrow_columns,
)
from cv_layout_rows import ( # noqa: F401
detect_row_geometry,
_build_rows_from_word_grouping,
)
from cv_layout_row_regularize import ( # noqa: F401
_regularize_row_grid,
)
from cv_layout_scoring import ( # noqa: F401
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify import ( # noqa: F401
_build_margin_regions,
positional_column_regions,
classify_column_types,
_classify_by_content,
)
from cv_layout_classify_position import ( # noqa: F401
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
# ── Orchestration Functions ───────────────────────────────────────────────
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Build page regions from word-position based column geometry.

    Processing order:

    1. ``detect_column_geometry()`` yields column geometries plus the content
       bounds (left/right/top/bottom).
    2. ``_detect_sub_columns()`` and ``_split_broad_columns()`` refine the
       geometries.
    3. ``positional_column_regions()`` converts the refined geometries into
       the returned regions.

    If step 1 returns ``None`` the function logs the failure and returns the
    result of ``analyze_layout()`` on an image made by
    ``create_layout_image()`` instead.

    Args:
        ocr_img: Image handed to column detection; the first two entries of
            its ``shape`` are read as (height, width).
        dewarped_bgr: Image handed to column detection and, on fallback, to
            ``create_layout_image()``.

    Returns:
        The regions from ``positional_column_regions()``, or from
        ``analyze_layout()`` when geometry detection failed.
    """
    img_h, img_w = ocr_img.shape[:2]

    detection = detect_column_geometry(ocr_img, dewarped_bgr)
    if detection is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    columns, left_x, right_x, top_y, bottom_y, _words, inverted = detection
    content_w = right_x - left_x

    # Header/footer gaps can only be looked for when an inverted image exists.
    if inverted is None:
        header_y, footer_y = None, None
    else:
        header_y, footer_y = _detect_header_footer_gaps(inverted, img_w, img_h)

    columns = _detect_sub_columns(
        columns,
        content_w,
        left_x=left_x,
        top_y=top_y,
        header_y=header_y,
        footer_y=footer_y,
    )
    columns = _split_broad_columns(columns, content_w, left_x=left_x)

    content_h = bottom_y - top_y
    regions = positional_column_regions(columns, content_w, content_h, left_x)

    # Everything below only feeds the summary log line.
    col_count = sum(
        1 for r in regions
        if r.type.startswith('column') or r.type == 'page_ref'
    )
    methods = {r.classification_method for r in regions if r.classification_method}
    summary = [
        (r.type, r.x, r.width, r.classification_confidence)
        for r in regions
        if r.type not in ('header', 'footer', 'margin_top', 'margin_bottom')
    ]
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{summary}")
    return regions
def detect_column_geometry_zoned(
    ocr_img: np.ndarray,
    dewarped_bgr: np.ndarray,
) -> Optional[Tuple[
    List[ColumnGeometry],
    int, int, int, int,
    List[Dict],
    np.ndarray,
    List[Dict],
    List[DetectedBox],
]]:
    """Zone-aware column geometry detection.

    1. Runs ``detect_column_geometry()`` on the full page to obtain content
       bounds, word dicts and a baseline set of column geometries.
    2. Runs ``detect_boxes()`` inside those content bounds.
    3. If no boxes are found, or no ``content`` zone is at least 40 rows high,
       returns the full-page result together with a single page-wide
       ``content`` zone.
    4. Otherwise stacks the row strips of all qualifying content zones into
       one combined image, runs ``detect_column_geometry()`` once on that
       image, and maps the resulting ``y``/``height`` values back to page
       coordinates. If detection on the combined image fails, the full-page
       geometries are used instead.
    5. Re-assigns to each geometry the full-page words whose centre lies
       outside every detected box, selected by horizontal centre.

    Args:
        ocr_img: Image used for column detection; sliced by row per zone.
        dewarped_bgr: Image used for box detection and column detection;
            sliced with the same row ranges as ``ocr_img``.

    Returns:
        ``None`` when the full-page column detection fails. Otherwise the
        tuple ``(geometries, left_x, right_x, top_y, bottom_y, word_dicts,
        inv, zones_data, boxes)``, where ``word_dicts`` and ``inv`` always
        come from the full-page detection and ``zones_data`` is a list of
        plain dicts describing each zone.
    """
    # Imported inside the function, exactly as the original did.
    from cv_box_detect import detect_boxes, split_page_into_zones

    geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if geo_result is None:
        return None

    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    def _single_zone_result(found_boxes):
        """Return the full-page result wrapped with one page-wide content zone."""
        zone_data = [{
            "index": 0, "zone_type": "content",
            "y": top_y, "height": content_h,
            "x": left_x, "width": content_w, "columns": [],
        }]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, zone_data, found_boxes)

    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)
    if not boxes:
        return _single_zone_result(boxes)

    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

    # Row ranges (start, end) of the content zones large enough to analyse.
    content_strips: List[Tuple[int, int]] = [
        (zone.y, zone.y + zone.height)
        for zone in zones
        if zone.zone_type == 'content' and zone.height >= 40
    ]
    if not content_strips:
        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
        return _single_zone_result(boxes)

    # Stack the content strips vertically so detection sees the page with the
    # box rows removed.
    combined_ocr = np.vstack([ocr_img[ys:ye, :] for ys, ye in content_strips])
    combined_bgr = np.vstack([dewarped_bgr[ys:ye, :] for ys, ye in content_strips])
    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")

    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
    if combined_result is not None:
        # Only the geometries of the combined run are used; bounds, words and
        # the inverted image keep coming from the full-page run.
        combined_geoms = combined_result[0]
    else:
        logger.info("ZonedColumns: combined image column detection failed, using original")
        combined_geoms = geometries

    # Per strip: (start row in combined image, strip height, start row on page).
    strip_offsets: List[Tuple[int, int, int]] = []
    cum_y = 0
    for ys, ye in content_strips:
        strip_h = ye - ys
        strip_offsets.append((cum_y, strip_h, ys))
        cum_y += strip_h

    def _combined_y_to_abs(cy: int, exclusive_end: bool = False) -> int:
        """Map a row of the combined image back to a page row.

        With ``exclusive_end=True`` the value is treated as the exclusive
        bottom edge of a span, so a value equal to a strip's end maps to that
        strip's page end instead of jumping to the start of the next strip.
        """
        for c_start, s_h, abs_start in strip_offsets:
            c_end = c_start + s_h
            if cy < c_end or (exclusive_end and cy == c_end):
                return abs_start + (cy - c_start)
        # Past the last strip: clamp to the page end of the last strip.
        _, last_h, last_abs = strip_offsets[-1]
        return last_abs + last_h

    if combined_result is not None:
        for g in combined_geoms:
            abs_y = _combined_y_to_abs(g.y)
            # The bottom edge is exclusive; only treat it that way for spans
            # with positive height so empty spans keep start == end.
            abs_y_end = _combined_y_to_abs(g.y + g.height, exclusive_end=g.height > 0)
            g.y = abs_y
            g.height = abs_y_end - abs_y

    if word_dicts:
        # Keep only words whose centre lies outside every box. Word positions
        # are offset by (left_x, top_y) before comparing with box coordinates.
        content_words = []
        for word in word_dicts:
            w_abs_cx = word['left'] + left_x + word['width'] / 2
            w_abs_cy = word['top'] + top_y + word['height'] / 2
            inside_box = any(
                box.x <= w_abs_cx <= box.x + box.width
                and box.y <= w_abs_cy <= box.y + box.height
                for box in boxes
            )
            if not inside_box:
                content_words.append(word)

        target_geoms = combined_geoms if combined_result is not None else geometries
        for g in target_geoms:
            # Compare in the same left_x-relative frame the words use.
            g_left_rel = g.x - left_x
            g_right_rel = g_left_rel + g.width
            g.words = [
                word for word in content_words
                if g_left_rel <= word['left'] + word['width'] / 2 < g_right_rel
            ]
            g.word_count = len(g.words)

        excluded_count = len(word_dicts) - len(content_words)
        if excluded_count:
            logger.info(
                "ZonedColumns: enriched geometries with %d content words "
                "(excluded %d box-interior words)",
                len(content_words), excluded_count,
            )

    # Serialise the zones into plain dicts for the caller.
    zones_data: List[Dict] = []
    for zone in zones:
        zone_dict: Dict = {
            "index": zone.index,
            "zone_type": zone.zone_type,
            "y": zone.y,
            "height": zone.height,
            "x": zone.x,
            "width": zone.width,
            "columns": [],
        }
        if zone.box is not None:
            zone_dict["box"] = {
                "x": zone.box.x, "y": zone.box.y,
                "width": zone.box.width, "height": zone.box.height,
                "confidence": zone.box.confidence,
                "border_thickness": zone.box.border_thickness,
            }
        zones_data.append(zone_dict)

    # An empty combined result falls back to the full-page geometries.
    all_geometries = combined_geoms if combined_geoms else geometries
    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
                f"{len(all_geometries)} total columns (combined-image approach)")
    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)