feat(ocr-pipeline): word-based 5-column detection for vocabulary pages
Replace projection-profile layout analysis with Tesseract word bounding-box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when fewer than 3 clusters are found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -29,6 +29,7 @@ from pydantic import BaseModel
|
||||
|
||||
from cv_vocab_pipeline import (
|
||||
analyze_layout,
|
||||
analyze_layout_by_words,
|
||||
create_ocr_image,
|
||||
deskew_image,
|
||||
deskew_image_by_word_alignment,
|
||||
@@ -639,15 +640,11 @@ async def detect_columns(session_id: str):
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Prepare images for analyze_layout
|
||||
gray = cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2GRAY)
|
||||
# CLAHE-enhanced for layout analysis
|
||||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
||||
layout_img = clahe.apply(gray)
|
||||
# Binarized for text density
|
||||
# Binarized image for layout analysis
|
||||
ocr_img = create_ocr_image(dewarped_bgr)
|
||||
|
||||
regions = analyze_layout(layout_img, ocr_img)
|
||||
# Word-based detection (with automatic fallback to projection profiles)
|
||||
regions = analyze_layout_by_words(ocr_img, dewarped_bgr)
|
||||
duration = time.time() - t0
|
||||
|
||||
columns = [asdict(r) for r in regions]
|
||||
@@ -740,11 +737,13 @@ async def _get_columns_overlay(session_id: str) -> Response:
|
||||
if img is None:
|
||||
raise HTTPException(status_code=500, detail="Failed to decode image")
|
||||
|
||||
# Color map for region types
|
||||
# Color map for region types (BGR)
|
||||
colors = {
|
||||
"column_en": (255, 180, 0), # Blue (BGR)
|
||||
"column_en": (255, 180, 0), # Blue
|
||||
"column_de": (0, 200, 0), # Green
|
||||
"column_example": (0, 140, 255), # Orange
|
||||
"page_ref": (200, 0, 200), # Purple
|
||||
"column_marker": (0, 0, 220), # Red
|
||||
"header": (128, 128, 128), # Gray
|
||||
"footer": (128, 128, 128), # Gray
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user