feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions

View File

@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
render_image_high_res,
render_pdf_high_res,
)
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
create_session_db,
delete_all_sessions_db,
@@ -1859,6 +1860,7 @@ async def detect_words(
pronunciation: str = "british",
stream: bool = False,
skip_heal_gaps: bool = False,
grid_method: str = "v2",
):
"""Build word grid from columns × rows, OCR each cell.
@@ -1868,6 +1870,9 @@ async def detect_words(
stream: false (default) for JSON response, true for SSE streaming
skip_heal_gaps: false (default). When true, cells keep exact row geometry
positions without gap-healing expansion. Better for overlay rendering.
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
'v2' uses pre-detected columns/rows (top-down).
'words_first' clusters words bottom-up (no column/row detection needed).
"""
if session_id not in _cache:
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
@@ -1902,7 +1907,7 @@ async def detect_words(
"duration_seconds": 0,
}
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
if not row_result or not row_result.get("rows"):
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
raise HTTPException(status_code=400, detail="Row detection must be completed first")
# Convert column dicts back to PageRegion objects
@@ -1983,6 +1988,102 @@ async def detect_words(
if excluded:
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
# --- Words-First path: bottom-up grid from word boxes ---
if grid_method == "words_first":
t0 = time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if not wf_word_dicts:
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
# Convert word coordinates to absolute image coordinates if needed
# (detect_column_geometry returns words relative to content ROI)
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
duration = time.time() - t0
# Apply IPA phonetic fixes
fix_cell_phonetics(cells, pronunciation=pronunciation)
# Add zone_index for backward compat
for cell in cells:
cell.setdefault("zone_index", 0)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
n_cols = len(columns_meta)
used_engine = "words_first"
word_result = {
"cells": cells,
"grid_shape": {
"rows": n_rows,
"cols": n_cols,
"total_cells": len(cells),
},
"columns_used": columns_meta,
"layout": "vocab" if is_vocab else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"grid_method": "words_first",
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
},
}
if is_vocab or 'column_text' in col_types:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries
word_result["entries"] = entries
word_result["entry_count"] = len(entries)
word_result["summary"]["total_entries"] = len(entries)
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline: words-first session {session_id}: "
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
await _append_pipeline_log(session_id, "words", {
"grid_method": "words_first",
"total_cells": len(cells),
"non_empty_cells": word_result["summary"]["non_empty_cells"],
"ocr_engine": used_engine,
"layout": word_result["layout"],
}, duration_ms=int(duration * 1000))
return {"session_id": session_id, **word_result}
if stream:
# Cell-First OCR v2: use batch-then-stream approach instead of
# per-cell streaming. The parallel ThreadPoolExecutor in
@@ -2001,7 +2102,7 @@ async def detect_words(
},
)
# --- Non-streaming path ---
# --- Non-streaming path (grid_method=v2) ---
t0 = time.time()
# Create binarized OCR image (for Tesseract)