feat(ocr-pipeline): add Step 5 word recognition (grid from columns × rows)
Backend: build_word_grid() intersects column regions with content rows, OCRs each cell with language-specific Tesseract, and returns vocabulary entries with percent-based bounding boxes. New endpoints: POST /words, GET /image/words-overlay, ground-truth save/retrieve for words. Frontend: StepWordRecognition with overview + step-through labeling modes, goToStep callback for row correction feedback loop. MkDocs: OCR Pipeline documentation added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2169,6 +2169,142 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
return regions


# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

    Each (content row × vocabulary column) intersection is OCRed with a
    column-appropriate Tesseract language, and the recognized text is slotted
    into the entry field matching the column's role (english/german/example).

    Args:
        ocr_img: Binarized full-page image.
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Fallback Tesseract language for unmapped column types.

    Returns:
        List of entry dicts with english/german/example text, a per-row
        average OCR confidence, and bounding boxes in percent of image size.
        Rows where every cell OCRed to empty text are dropped.
    """
    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_word_grid: no content rows found")
        return []

    # Only vocabulary columns contribute cells to the grid
    vocab_column_types = {'column_en', 'column_de', 'column_example'}
    relevant_cols = [c for c in column_regions if c.type in vocab_column_types]
    if not relevant_cols:
        logger.warning("build_word_grid: no relevant vocabulary columns found")
        return []

    # Sort columns left-to-right so cells are visited in reading order
    relevant_cols.sort(key=lambda c: c.x)

    # Choose OCR language per column type
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    def _pct_bbox(x: int, y: int, w: int, h: int) -> Dict[str, float]:
        # Convert a pixel rectangle to percent-of-image coordinates.
        return {
            'x': round(x / img_w * 100, 2),
            'y': round(y / img_h * 100, 2),
            'w': round(w / img_w * 100, 2),
            'h': round(h / img_h * 100, 2),
        }

    entries: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'confidence': 0.0,
            'bbox': _pct_bbox(row.x, row.y, row.width, row.height),
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
        }

        confidences: List[float] = []

        for col in relevant_cols:
            # Cell = column's horizontal extent × row's vertical extent,
            # clamped to the image. Fix: the previous clamp moved a negative
            # origin to 0 without shrinking the extent, which stretched the
            # cell past its true right/bottom edge; clamp both edges instead.
            cell_x = max(0, col.x)
            cell_y = max(0, row.y)
            cell_w = min(col.x + col.width, img_w) - cell_x
            cell_h = min(row.y + row.height, img_h) - cell_y

            if cell_w <= 0 or cell_h <= 0:
                continue

            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )

            # psm=7: treat the cell as a single line of text
            cell_lang = lang_map.get(col.type, lang)
            words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)

            # Left-to-right reading order; skip blank tokens so the join
            # doesn't produce stray double spaces
            words.sort(key=lambda w: w['left'])
            text = ' '.join(w['text'] for w in words if w['text'].strip())
            if words:
                confidences.append(sum(w['conf'] for w in words) / len(words))

            cell_bbox = _pct_bbox(cell_x, cell_y, cell_w, cell_h)

            if col.type == 'column_en':
                entry['english'] = text
                entry['bbox_en'] = cell_bbox
            elif col.type == 'column_de':
                entry['german'] = text
                entry['bbox_de'] = cell_bbox
            elif col.type == 'column_example':
                entry['example'] = text
                entry['bbox_ex'] = cell_bbox

        entry['confidence'] = round(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0

        # Only include if at least one field has text
        if entry['english'] or entry['german'] or entry['example']:
            entries.append(entry)

    # Lazy %-args: skip string building when INFO logging is disabled
    logger.info(
        "build_word_grid: %d entries from %d content rows × %d columns",
        len(entries), len(content_rows), len(relevant_cols),
    )

    return entries
|
||||
|
||||
|
||||
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================
Reference in New Issue
Block a user