feat(ocr-pipeline): word-based 5-column detection for vocabulary pages

Replace projection-profile layout analysis with Tesseract word bounding-box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when fewer than 3 clusters are found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 23:08:14 +01:00
parent aa06ae0f61
commit cf27a95308
4 changed files with 235 additions and 13 deletions
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -29,6 +29,7 @@ from pydantic import BaseModel

 from cv_vocab_pipeline import (
    analyze_layout,
+    analyze_layout_by_words,
    create_ocr_image,
    deskew_image,
    deskew_image_by_word_alignment,
@@ -639,15 +640,11 @@ async def detect_columns(session_id: str):

    t0 = time.time()

-    # Prepare images for analyze_layout
-    gray = cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2GRAY)
-    # CLAHE-enhanced for layout analysis
-    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
-    layout_img = clahe.apply(gray)
-    # Binarized for text density
+    # Binarized image for layout analysis
    ocr_img = create_ocr_image(dewarped_bgr)

-    regions = analyze_layout(layout_img, ocr_img)
+    # Word-based detection (with automatic fallback to projection profiles)
+    regions = analyze_layout_by_words(ocr_img, dewarped_bgr)
    duration = time.time() - t0

    columns = [asdict(r) for r in regions]
@@ -740,11 +737,13 @@ async def _get_columns_overlay(session_id: str) -> Response:
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")

-    # Color map for region types
+    # Color map for region types (BGR)
    colors = {
-        "column_en": (255, 180, 0),      # Blue (BGR)
+        "column_en": (255, 180, 0),      # Blue
        "column_de": (0, 200, 0),         # Green
        "column_example": (0, 140, 255),  # Orange
+        "page_ref": (200, 0, 200),        # Purple
+        "column_marker": (0, 0, 220),     # Red
        "header": (128, 128, 128),        # Gray
        "footer": (128, 128, 128),        # Gray
    }