feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions
--- a/klausur-service/backend/cv_words_first.py
+++ b/klausur-service/backend/cv_words_first.py
@@ -0,0 +1,282 @@
+"""
+Words-First Grid Builder (Bottom-Up).
+
+Builds a cell grid from Tesseract word_boxes directly, without requiring
+pre-detected columns or rows.  Algorithm:
+
+  1. Cluster words into columns by X-gap analysis
+  2. Cluster words into rows by Y-proximity
+  3. Build cells at (column, row) intersections
+
+Returns the same (cells, columns_meta) format as build_cell_grid_v2().
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import logging
+import statistics
+from typing import Any, Dict, List, Tuple
+
+from cv_ocr_engines import (
+    _group_words_into_lines,
+    _words_to_reading_order_text,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# 1. Column clustering
+# ---------------------------------------------------------------------------
+
+def _cluster_columns(
+    words: List[Dict],
+    img_w: int,
+    min_gap_pct: float = 3.0,
+) -> List[Dict[str, Any]]:
+    """Cluster words into columns by finding large horizontal gaps.
+
+    Returns a list of column dicts:
+        [{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
+    sorted left-to-right.
+    """
+    if not words:
+        return []
+
+    # Sort by X center
+    sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
+
+    # Collect word heights to compute adaptive threshold
+    heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
+    median_h = statistics.median(heights) if heights else 30
+
+    # Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
+    min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
+
+    # Find X-gap boundaries between consecutive words (sorted by X-center)
+    # For each word, compute right edge; for next word, compute left edge
+    boundaries: List[float] = []  # X positions where columns split
+    for i in range(len(sorted_w) - 1):
+        right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
+        left_edge = sorted_w[i + 1]['left']
+        gap = left_edge - right_edge
+        if gap > min_gap_px:
+            # Split point is midway through the gap
+            boundaries.append((right_edge + left_edge) / 2)
+
+    # Build column ranges from boundaries
+    # Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
+    col_edges = [0.0] + boundaries + [float(img_w)]
+    columns = []
+    for ci in range(len(col_edges) - 1):
+        columns.append({
+            'index': ci,
+            'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
+            'x_min': col_edges[ci],
+            'x_max': col_edges[ci + 1],
+        })
+
+    return columns
+
+
+# ---------------------------------------------------------------------------
+# 2. Row clustering
+# ---------------------------------------------------------------------------
+
+def _cluster_rows(
+    words: List[Dict],
+) -> List[Dict[str, Any]]:
+    """Cluster words into visual rows by Y-proximity.
+
+    Uses half the median word height as Y-tolerance.
+
+    Returns a list of row dicts:
+        [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
+    sorted top-to-bottom.
+    """
+    if not words:
+        return []
+
+    heights = [w['height'] for w in words if w.get('height', 0) > 0]
+    median_h = statistics.median(heights) if heights else 20
+    y_tol = max(median_h * 0.5, 5)
+
+    lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
+
+    rows = []
+    for ri, line_words in enumerate(lines):
+        y_min = min(w['top'] for w in line_words)
+        y_max = max(w['top'] + w['height'] for w in line_words)
+        rows.append({
+            'index': ri,
+            'y_min': y_min,
+            'y_max': y_max,
+            'y_center': (y_min + y_max) / 2,
+        })
+
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# 3. Build cells
+# ---------------------------------------------------------------------------
+
+def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
+    """Return column index for a word based on its X-center."""
+    x_center = word['left'] + word['width'] / 2
+    for col in columns:
+        if col['x_min'] <= x_center < col['x_max']:
+            return col['index']
+    # Fallback: nearest column
+    return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
+
+
+def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
+    """Return row index for a word based on its Y-center."""
+    y_center = word['top'] + word['height'] / 2
+    # Find the row whose y_range contains this word's center
+    for row in rows:
+        if row['y_min'] <= y_center <= row['y_max']:
+            return row['index']
+    # Fallback: nearest row by Y-center
+    return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
+
+
+def _build_cells(
+    words: List[Dict],
+    columns: List[Dict],
+    rows: List[Dict],
+    img_w: int,
+    img_h: int,
+) -> List[Dict[str, Any]]:
+    """Build cell dicts from word assignments to (column, row) pairs."""
+    if not columns or not rows:
+        return []
+
+    # Bucket words into (col_idx, row_idx)
+    buckets: Dict[Tuple[int, int], List[Dict]] = {}
+    for w in words:
+        ci = _assign_word_to_column(w, columns)
+        ri = _assign_word_to_row(w, rows)
+        buckets.setdefault((ci, ri), []).append(w)
+
+    cells = []
+    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
+        col = columns[ci]
+        row = rows[ri]
+
+        # Compute tight bbox from actual word positions
+        x_min = min(w['left'] for w in cell_words)
+        y_min = min(w['top'] for w in cell_words)
+        x_max = max(w['left'] + w['width'] for w in cell_words)
+        y_max = max(w['top'] + w['height'] for w in cell_words)
+        bw = x_max - x_min
+        bh = y_max - y_min
+
+        # Text from words in reading order
+        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
+
+        # Average confidence
+        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
+        avg_conf = sum(confs) / len(confs) if confs else 0.0
+
+        # Word boxes with percent coordinates
+        word_boxes = []
+        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
+            word_boxes.append({
+                'text': w.get('text', ''),
+                'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
+                'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
+                'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
+                'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
+                'conf': w.get('conf', 0),
+            })
+
+        cells.append({
+            'cell_id': f"R{ri:02d}_C{ci}",
+            'row_index': ri,
+            'col_index': ci,
+            'col_type': col['type'],
+            'text': text,
+            'confidence': round(avg_conf, 1),
+            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
+            'bbox_pct': {
+                'x': round(x_min / img_w * 100, 2) if img_w else 0,
+                'y': round(y_min / img_h * 100, 2) if img_h else 0,
+                'w': round(bw / img_w * 100, 2) if img_w else 0,
+                'h': round(bh / img_h * 100, 2) if img_h else 0,
+            },
+            'word_boxes': word_boxes,
+            'ocr_engine': 'words_first',
+            'is_bold': False,
+        })
+
+    return cells
+
+
+# ---------------------------------------------------------------------------
+# 4. Public API
+# ---------------------------------------------------------------------------
+
+def build_grid_from_words(
+    word_dicts: List[Dict],
+    img_w: int,
+    img_h: int,
+    min_confidence: int = 30,
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Build a cell grid bottom-up from Tesseract word boxes.
+
+    Args:
+        word_dicts: Flat list of word dicts with keys:
+            text, left, top, width, height, conf
+            (absolute pixel coordinates).
+        img_w: Image width in pixels.
+        img_h: Image height in pixels.
+        min_confidence: Minimum OCR confidence to keep a word.
+
+    Returns:
+        (cells, columns_meta) — same format as build_cell_grid_v2().
+        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
+        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
+    """
+    if not word_dicts:
+        logger.info("build_grid_from_words: no words — returning empty grid")
+        return [], []
+
+    # Filter by confidence
+    words = [
+        w for w in word_dicts
+        if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
+    ]
+    if not words:
+        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
+        return [], []
+
+    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
+
+    # Step 1: cluster columns
+    columns = _cluster_columns(words, img_w)
+    logger.info("build_grid_from_words: %d column(s) detected", len(columns))
+
+    # Step 2: cluster rows
+    rows = _cluster_rows(words)
+    logger.info("build_grid_from_words: %d row(s) detected", len(rows))
+
+    # Step 3: build cells
+    cells = _build_cells(words, columns, rows, img_w, img_h)
+    logger.info("build_grid_from_words: %d cells built", len(cells))
+
+    # Build columns_meta in same format as build_cell_grid_v2
+    columns_meta = []
+    for col in columns:
+        x = int(col['x_min'])
+        w = int(col['x_max'] - col['x_min'])
+        columns_meta.append({
+            'index': col['index'],
+            'type': col['type'],
+            'x': x,
+            'width': w,
+        })
+
+    return cells, columns_meta