feat(ocr-pipeline): word-based 5-column detection for vocabulary pages

Replace projection-profile layout analysis with clustering of Tesseract
word bounding boxes to detect 5-column vocabulary layouts (page_ref, EN, DE,
markers, examples). Falls back to projection profiles when fewer than 3
clusters are found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-26 23:08:14 +01:00
parent aa06ae0f61
commit cf27a95308
4 changed files with 235 additions and 13 deletions

View File

@@ -29,6 +29,7 @@ from pydantic import BaseModel
from cv_vocab_pipeline import (
analyze_layout,
analyze_layout_by_words,
create_ocr_image,
deskew_image,
deskew_image_by_word_alignment,
@@ -639,15 +640,11 @@ async def detect_columns(session_id: str):
t0 = time.time()
# Prepare images for analyze_layout
gray = cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2GRAY)
# CLAHE-enhanced for layout analysis
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
layout_img = clahe.apply(gray)
# Binarized for text density
# Binarized image for layout analysis
ocr_img = create_ocr_image(dewarped_bgr)
regions = analyze_layout(layout_img, ocr_img)
# Word-based detection (with automatic fallback to projection profiles)
regions = analyze_layout_by_words(ocr_img, dewarped_bgr)
duration = time.time() - t0
columns = [asdict(r) for r in regions]
@@ -740,11 +737,13 @@ async def _get_columns_overlay(session_id: str) -> Response:
if img is None:
raise HTTPException(status_code=500, detail="Failed to decode image")
# Color map for region types
# Color map for region types (BGR)
colors = {
"column_en": (255, 180, 0), # Blue (BGR)
"column_en": (255, 180, 0), # Blue
"column_de": (0, 200, 0), # Green
"column_example": (0, 140, 255), # Orange
"page_ref": (200, 0, 200), # Purple
"column_marker": (0, 0, 220), # Red
"header": (128, 128, 128), # Gray
"footer": (128, 128, 128), # Gray
}