From cf27a9530837e7f81fde6696c674f748c8887a0a Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 26 Feb 2026 23:08:14 +0100 Subject: [PATCH] feat(ocr-pipeline): word-based 5-column detection for vocabulary pages Replace projection-profile layout analysis with Tesseract word bounding box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when < 3 clusters. Co-Authored-By: Claude Opus 4.6 --- .../app/(admin)/ai/ocr-pipeline/types.ts | 2 +- .../ocr-pipeline/ColumnControls.tsx | 8 +- klausur-service/backend/cv_vocab_pipeline.py | 221 +++++++++++++++++- klausur-service/backend/ocr_pipeline_api.py | 17 +- 4 files changed, 235 insertions(+), 13 deletions(-) diff --git a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts index 9f2f66d..df349b7 100644 --- a/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts +++ b/admin-lehrer/app/(admin)/ai/ocr-pipeline/types.ts @@ -64,7 +64,7 @@ export interface DewarpGroundTruth { } export interface PageRegion { - type: 'column_en' | 'column_de' | 'column_example' | 'header' | 'footer' + type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' | 'column_marker' | 'header' | 'footer' x: number y: number width: number diff --git a/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx b/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx index 52a808c..cc5a706 100644 --- a/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx +++ b/admin-lehrer/components/ocr-pipeline/ColumnControls.tsx @@ -15,6 +15,8 @@ const TYPE_COLORS: Record = { column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400', column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400', column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400', + page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400', + column_marker: 'bg-red-100 
# =============================================================================
# Stage 5b: Word-Based Layout Analysis (5-Column Detection)
# =============================================================================

def _parse_word_confidence(raw) -> float:
    """Convert one Tesseract ``conf`` cell to a float; return -1.0 if unusable.

    Accepts ints, floats and numeric strings (including float-formatted
    strings such as ``"96.06"``). Anything that cannot be parsed, and NaN,
    is mapped to -1.0 so the caller's threshold check rejects it.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return -1.0
    # NaN compares False against every threshold; treat it as "rejected".
    return value if value == value else -1.0


def _fallback_to_projection_layout(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> "List[PageRegion]":
    """Run the projection-profile layout analysis on the same page."""
    layout_img = create_layout_image(dewarped_bgr)
    return analyze_layout(layout_img, ocr_img)


def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> "List[PageRegion]":
    """Detect columns by clustering left-aligned word positions from Tesseract.

    Intended for vocabulary pages with up to 5 columns (page_ref, EN, DE,
    markers, examples): column starts are located where the left edges of
    recognised words cluster.

    Steps:
      1. Determine the content bounds from the inverted ``ocr_img``; if the
         detected content is smaller than 30% of the page in either
         dimension, the whole image is used instead.
      2. Run ``pytesseract.image_to_data`` on the content ROI of
         ``dewarped_bgr`` and keep the left edge of every non-empty word
         with confidence >= 30.
      3. Cluster the sorted left edges (gap tolerance ~1% of content width),
         keep clusters with >= 2 words, and merge clusters whose centres are
         closer than twice the tolerance.
      4. Turn each cluster into a column that ends where the next one starts
         and assign region types by width / word-count rules.

    Args:
        ocr_img: Binarized grayscale image; used for the content bounds and
            passed to the fallback analysis.
        dewarped_bgr: BGR image of the same page; the content ROI is cut from
            it and handed to Tesseract.

    Returns:
        Column regions sorted by x position, followed by optional ``header``
        and ``footer`` regions. Falls back to ``analyze_layout()`` when
        Tesseract fails, fewer than 5 words are usable, or fewer than 3
        clusters remain (before or after merging).
    """
    h, w = ocr_img.shape[:2]

    # --- Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small content box: use the full image instead.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"LayoutByWords: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Get word bounding boxes from Tesseract ---
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        # Best-effort: any OCR failure degrades to the projection-based path.
        logger.warning(f"LayoutByWords: Tesseract image_to_data failed: {e}, falling back")
        return _fallback_to_projection_layout(ocr_img, dewarped_bgr)

    # Left edges (relative to the content ROI) of recognised words.
    # Confidence is parsed as a float because, depending on the
    # Tesseract/pytesseract versions, it may arrive as an int, a float or a
    # float-formatted string; an integer-only check would reject every word.
    left_edges = [
        int(raw_left)
        for raw_conf, raw_text, raw_left in zip(data['conf'], data['text'], data['left'])
        if _parse_word_confidence(raw_conf) >= 30 and str(raw_text).strip()
    ]

    if len(left_edges) < 5:
        logger.warning(f"LayoutByWords: only {len(left_edges)} words detected, falling back")
        return _fallback_to_projection_layout(ocr_img, dewarped_bgr)

    logger.info(f"LayoutByWords: {len(left_edges)} words detected in content area")

    # --- Cluster left edges ---
    tolerance = max(10, int(content_w * 0.01))  # ~1% of content width
    sorted_edges = sorted(left_edges)

    # Single-linkage clustering on the sorted edges: a gap larger than
    # `tolerance` between neighbouring edges starts a new cluster.
    clusters = []
    current_cluster = [sorted_edges[0]]
    for edge in sorted_edges[1:]:
        if edge - current_cluster[-1] <= tolerance:
            current_cluster.append(edge)
        else:
            clusters.append(current_cluster)
            current_cluster = [edge]
    clusters.append(current_cluster)

    # Keep only clusters with >= 2 words: (center_x, count, min_edge, max_edge)
    significant = [(int(np.mean(c)), len(c), min(c), max(c)) for c in clusters if len(c) >= 2]
    significant.sort(key=lambda s: s[0])

    logger.info(f"LayoutByWords: {len(significant)} significant clusters "
                f"(from {len(clusters)} total): "
                f"{[(s[0]+left_x, s[1]) for s in significant[:10]]}")

    if len(significant) < 3:
        logger.info("LayoutByWords: < 3 clusters, falling back to projection-based layout")
        return _fallback_to_projection_layout(ocr_img, dewarped_bgr)

    # --- Merge clusters that are very close (within 2*tolerance) ---
    merged = [significant[0]]
    for s in significant[1:]:
        prev = merged[-1]
        if s[0] - prev[0] < 2 * tolerance:
            # Merge: count-weighted average position, summed counts,
            # combined edge extent.
            total = prev[1] + s[1]
            avg_x = (prev[0] * prev[1] + s[0] * s[1]) // total
            merged[-1] = (avg_x, total, min(prev[2], s[2]), max(prev[3], s[3]))
        else:
            merged.append(s)

    logger.info(f"LayoutByWords: {len(merged)} clusters after merging: "
                f"{[(m[0]+left_x, m[1]) for m in merged]}")

    if len(merged) < 3:
        logger.info("LayoutByWords: < 3 merged clusters, falling back")
        return _fallback_to_projection_layout(ocr_img, dewarped_bgr)

    # --- Derive column boundaries ---
    # Small safety margin before each cluster's leftmost word
    # (0.5% of content width, at least 5px).
    margin_px = max(5, int(content_w * 0.005))

    # Absolute (page-coordinate) start of each column plus its word count.
    col_starts = [
        (max(0, left_x + min_edge - margin_px), count)
        for _center_x, count, min_edge, _max_edge in merged
    ]

    # Each column runs up to the next column's start; the last one runs to
    # the right content bound.
    col_defs = []  # (abs_x, width, word_count)
    for i, (start_x, count) in enumerate(col_starts):
        end_x = col_starts[i + 1][0] if i + 1 < len(col_starts) else right_x
        col_defs.append((start_x, end_x - start_x, count))

    logger.info(f"LayoutByWords: column definitions: "
                f"{[(d[0], d[1], d[2]) for d in col_defs]}")

    # --- Assign types based on rules ---
    regions = []
    untyped = list(range(len(col_defs)))  # indices not yet assigned

    def add_region(region_type: str, idx: int) -> None:
        """Append a full-content-height region for column ``idx``."""
        regions.append(PageRegion(
            type=region_type, x=col_defs[idx][0], y=top_y,
            width=col_defs[idx][1], height=content_h
        ))

    # Rule 1: Leftmost narrow column (< 12% width) → page_ref
    if col_defs[0][1] < content_w * 0.12:
        add_region('page_ref', 0)
        untyped.remove(0)
        logger.info(f"LayoutByWords: col 0 → page_ref (width={col_defs[0][1]}px, "
                    f"{col_defs[0][1]*100/content_w:.1f}%)")

    # Rule 2: Narrow column with few words (< 8% width, <= 8 words) → column_marker
    for i in list(untyped):  # iterate over a copy; `untyped` is mutated below
        _col_x, col_w, word_count = col_defs[i]
        if col_w < content_w * 0.08 and word_count <= 8:
            add_region('column_marker', i)
            untyped.remove(i)
            logger.info(f"LayoutByWords: col {i} → column_marker (width={col_w}px, "
                        f"{col_w*100/content_w:.1f}%, words={word_count})")

    # Rule 3: With at least three columns left, the rightmost → column_example
    if len(untyped) >= 3:
        last_idx = untyped.pop()
        add_region('column_example', last_idx)
        logger.info(f"LayoutByWords: col {last_idx} → column_example")

    # Rule 4: First remaining → column_en, second → column_de
    if len(untyped) >= 2:
        en_idx, de_idx = untyped[0], untyped[1]
        add_region('column_en', en_idx)
        add_region('column_de', de_idx)
        untyped = untyped[2:]
        logger.info(f"LayoutByWords: col {en_idx} → column_en, col {de_idx} → column_de")
    elif len(untyped) == 1:
        # Only one left — call it column_en
        add_region('column_en', untyped[0])
        untyped = []

    # Any remaining untyped columns get the generic column_example type
    for idx in untyped:
        add_region('column_example', idx)

    # Sort by x position for consistent output
    regions.sort(key=lambda r: r.x)

    # Add header/footer spanning the full page width
    if top_y > 10:
        regions.append(PageRegion(type='header', x=0, y=0, width=w, height=top_y))
    if bottom_y < h - 10:
        regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=w, height=h - bottom_y))

    n_columns = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    logger.info(f"LayoutByWords: {n_columns} columns detected: "
                f"{[(r.type, r.x, r.width) for r in regions if r.type not in ('header','footer')]}")

    return regions
a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index bcf155f..d7cfd1a 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -29,6 +29,7 @@ from pydantic import BaseModel from cv_vocab_pipeline import ( analyze_layout, + analyze_layout_by_words, create_ocr_image, deskew_image, deskew_image_by_word_alignment, @@ -639,15 +640,11 @@ async def detect_columns(session_id: str): t0 = time.time() - # Prepare images for analyze_layout - gray = cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2GRAY) - # CLAHE-enhanced for layout analysis - clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) - layout_img = clahe.apply(gray) - # Binarized for text density + # Binarized image for layout analysis ocr_img = create_ocr_image(dewarped_bgr) - regions = analyze_layout(layout_img, ocr_img) + # Word-based detection (with automatic fallback to projection profiles) + regions = analyze_layout_by_words(ocr_img, dewarped_bgr) duration = time.time() - t0 columns = [asdict(r) for r in regions] @@ -740,11 +737,13 @@ async def _get_columns_overlay(session_id: str) -> Response: if img is None: raise HTTPException(status_code=500, detail="Failed to decode image") - # Color map for region types + # Color map for region types (BGR) colors = { - "column_en": (255, 180, 0), # Blue (BGR) + "column_en": (255, 180, 0), # Blue "column_de": (0, 200, 0), # Green "column_example": (0, 140, 255), # Orange + "page_ref": (200, 0, 200), # Purple + "column_marker": (0, 0, 220), # Red "header": (128, 128, 128), # Gray "footer": (128, 128, 128), # Gray }