feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions

View File

@@ -63,6 +63,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto') const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
const [usedEngine, setUsedEngine] = useState<string>('') const [usedEngine, setUsedEngine] = useState<string>('')
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british') const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
// Streaming progress state // Streaming progress state
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null) const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
@@ -112,7 +113,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
let res: Response | null = null let res: Response | null = null
for (let attempt = 0; attempt < 2; attempt++) { for (let attempt = 0; attempt < 2; attempt++) {
res = await fetch( res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}`, `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=${gridMethod === 'v2' ? 'true' : 'false'}&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}&grid_method=${gridMethod}`,
{ method: 'POST' }, { method: 'POST' },
) )
if (res.ok) break if (res.ok) break
@@ -128,6 +129,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
throw new Error(err.detail || 'Worterkennung fehlgeschlagen') throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
} }
// words_first returns plain JSON (no streaming)
if (gridMethod === 'words_first') {
const data = await res.json() as GridResult
applyGridResult(data)
return
}
const reader = res.body!.getReader() const reader = res.body!.getReader()
const decoder = new TextDecoder() const decoder = new TextDecoder()
let buffer = '' let buffer = ''
@@ -220,7 +228,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
setDetecting(false) setDetecting(false)
} }
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId, ocrEngine, pronunciation]) }, [sessionId, ocrEngine, pronunciation, gridMethod])
const handleGroundTruth = useCallback(async (isCorrect: boolean) => { const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
if (!sessionId) return if (!sessionId) return
@@ -789,6 +797,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
{gridResult && ( {gridResult && (
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3"> <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
<div className="flex items-center gap-3 flex-wrap"> <div className="flex items-center gap-3 flex-wrap">
{/* Grid method selector */}
<select
value={gridMethod}
onChange={(e) => setGridMethod(e.target.value as 'v2' | 'words_first')}
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
>
<option value="v2">Standard (v2)</option>
<option value="words_first">Words-First</option>
</select>
{/* OCR Engine selector */} {/* OCR Engine selector */}
<select <select
value={ocrEngine} value={ocrEngine}

View File

@@ -1,6 +1,6 @@
# OCR Pipeline - Schrittweise Seitenrekonstruktion # OCR Pipeline - Schrittweise Seitenrekonstruktion
**Version:** 4.1.0 **Version:** 4.3.0
**Status:** Produktiv (Schritte 110 implementiert) **Status:** Produktiv (Schritte 110 implementiert)
**URL:** https://macmini:3002/ai/ocr-pipeline **URL:** https://macmini:3002/ai/ocr-pipeline
@@ -22,7 +22,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert | | 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert | | 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert | | 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert | | 7 | Worterkennung | Hybrid-Grid (v2) oder Words-First (bottom-up) | Implementiert |
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert | | 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert | | 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert | | 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
@@ -82,28 +82,29 @@ flowchart TD
detect_document_type() detect_document_type()
┌───────────────────────────────────┐ ┌──────────────────┼──────────────────┐
FULL-TEXT PFAD CELL-FIRST PFAD FULL-TEXT PFAD WORDS-FIRST PFAD CELL-FIRST PFAD
(pipeline='full_page') (pipeline='cell_first') (pipeline= (grid_method= (grid_method=
│ │ 'full_page') 'words_first') 'v2', default)
Keine Spalten/Zeilen Spaltenerkennung │ │
analyze_layout_by_words() detect_column_geometry() Keine Spalten/ Tesseract Full-Page Spaltenerkennung
Lese-Reihenfolge _detect_sub_columns() Zeilen word_boxes detect_column_geometry()
│ expand_narrow_columns() analyze_layout_ _cluster_columns() _detect_sub_columns()
│ Zeilenerkennung by_words() _cluster_rows() expand_narrow_columns()
detect_row_geometry() _build_cells() Zeilenerkennung
detect_row_geometry()
build_cell_grid_v2() build_grid_from_ │
words() build_cell_grid_v2()
┌─────────┴──────────┐
▼ ▼ │ ┌─────────┴──────────┐
Breite Spalten Schmale Spalten ▼ ▼
(>= 15% Breite) (< 15% Breite) │ Breite Spalten Schmale Spalten
Full-Page Words Cell-Crop OCR (>= 15% Breite) (< 15% Breite)
word_lookup cell_crop_v2 Full-Page Words Cell-Crop OCR
│ │ │ word_lookup cell_crop_v2
└───────────────────────────┴────────────────────┘ │ │ │ │
└──────────────────┴────┴────────────────────┘
Post-Processing Pipeline Post-Processing Pipeline
(Lautschrift, Komma-Split, etc.) (Lautschrift, Komma-Split, etc.)
@@ -147,6 +148,8 @@ klausur-service/backend/
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen │ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10) ├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4) ├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
├── cv_words_first.py # Words-First Grid Builder (bottom-up)
├── page_crop.py # Content-basierter Crop-Algorithmus ├── page_crop.py # Content-basierter Crop-Algorithmus
├── ocr_pipeline_session_store.py # PostgreSQL Persistence ├── ocr_pipeline_session_store.py # PostgreSQL Persistence
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export ├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
@@ -169,7 +172,8 @@ admin-lehrer/
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung ├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
├── StepWordRecognition.tsx # Schritt 7: Worterkennung ├── StepWordRecognition.tsx # Schritt 7: Worterkennung
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream) ├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas) ├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay)
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
├── FabricReconstructionCanvas.tsx # Fabric.js Editor ├── FabricReconstructionCanvas.tsx # Fabric.js Editor
└── StepGroundTruth.tsx # Schritt 10: Validierung └── StepGroundTruth.tsx # Schritt 10: Validierung
``` ```
@@ -257,10 +261,20 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen | | `POST` | `/sessions/{id}/words` | Wort-Grid erstellen |
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern | | `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen | | `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
**Query-Parameter fuer `/sessions/{id}/words`:**
| Parameter | Default | Beschreibung |
|-----------|---------|--------------|
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
### Schritt 8: Korrektur ### Schritt 8: Korrektur
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
@@ -513,6 +527,12 @@ Horizontale Projektionsprofile finden Zeilen-Luecken; word-level Validierung ver
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
abgeschnitten wird. abgeschnitten wird.
3. **Box-Boundary-Schutz** (`box_ranges_inner`, neu in v4.2):
Bei Seiten mit Box-Zonen (Sub-Sessions) werden Zeilen am Box-Rand nicht faelschlich
ausgeschlossen. Das Problem: Die letzte Textzeile ueber einer Box ueberlappt haeufig
mit dem Box-Rahmen. Loesung: Die Exclusion-Zone wird um `max(border_thickness, 5px)`
geschrumpft, sodass nur Zeilen **innerhalb** der Box ausgeschlossen werden.
```python ```python
def _is_artifact_row(row: RowGeometry) -> bool: def _is_artifact_row(row: RowGeometry) -> bool:
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen.""" """Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
@@ -524,13 +544,128 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
... ...
``` ```
### Box-Zonen und Content-Strips (Detail)
Seiten mit Box-Bereichen (z.B. Grammatik-Tipps, Uebungsboxen) werden in Zonen aufgeteilt:
```
┌──────────────────────────┐
│ Content Zone 0 (Zeilen) │ ← Vokabeltabelle oben
├──────────────────────────┤
│ ███ Box Zone (border) ███│ ← Sub-Session mit eigener OCR
├──────────────────────────┤
│ Content Zone 2 (Zeilen) │ ← Vokabeltabelle unten
└──────────────────────────┘
```
**Content-Strip-Verfahren** (`detect_rows` in `ocr_pipeline_api.py`):
1. Box-Zonen identifizieren, `box_ranges_inner` berechnen (geschrumpft um Border-Dicke)
2. Content-Strips = Seitenbereiche **ohne** Box-Inneres, vertikal gestapelt
3. Zeilenerkennung auf gestapeltem Bild, Y-Koordinaten zurueckgemappt
4. Wort-Filterung: Woerter in Box-Innerem werden ausgeschlossen
**Wichtig:** `box_ranges_inner` (nicht `box_ranges`) wird verwendet, damit
Zeilen am Box-Rand nicht abgeschnitten werden. Minimum 5px Margin.
--- ---
## Schritt 7: Worterkennung — Hybrid-Grid (Detail) ## Schritt 7: Worterkennung (Detail)
### Algorithmus: `build_cell_grid_v2()` Schritt 7 bietet zwei Grid-Strategien, auswaehlbar per `grid_method`-Parameter:
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter, | Strategie | Parameter | Ansatz | Benoetigt Spalten/Zeilen? |
|-----------|-----------|--------|--------------------------|
| **Hybrid-Grid v2** | `grid_method=v2` (Default) | Top-down: Spalten → Zeilen → Zellen → OCR | Ja (Schritte 5+6) |
| **Words-First** | `grid_method=words_first` | Bottom-up: Woerter → Spalten clustern → Zeilen clustern → Zellen | Nein |
---
### Words-First Grid Builder: `build_grid_from_words()`
**Datei:** `cv_words_first.py`
Der Words-First Builder arbeitet bottom-up: Er nimmt die pixelgenauen `word_boxes` aus einem
Tesseract Full-Page-Lauf und clustert sie direkt zu Spalten und Zeilen — ohne die
vorherige Spalten-/Zeilenerkennung (Schritte 5+6) zu benoetigen.
#### Algorithmus
```
Eingabe: word_dicts (flat list), img_w, img_h
┌───────────┴───────────┐
│ 1. Confidence-Filter │
│ conf >= 30 │
│ Whitespace entf. │
└───────────┬───────────┘
┌───────────┴───────────┐
│ 2. _cluster_columns() │
│ X-Gap-Analyse │
│ Schwelle: median_h │
× 3 (min 3% Breite)│
└───────────┬───────────┘
┌───────────┴───────────┐
│ 3. _cluster_rows() │
│ Y-Proximity-Grupp. │
│ Toleranz: median_h │
│ / 2 │
└───────────┬───────────┘
┌───────────┴───────────┐
│ 4. _build_cells() │
│ Wort → (col, row) │
│ Text + bbox + conf │
│ word_boxes pro Zelle│
└───────────┬───────────┘
Ausgabe: cells[], columns_meta[]
(identisch zu build_cell_grid_v2)
```
#### Spalten-Clustering
1. Alle Woerter nach X-Mitte sortieren
2. Aufeinanderfolgende X-Gaps berechnen
3. Adaptiver Schwellwert: `median_word_height × 3` (min 3% Bildbreite)
4. Gaps > Schwellwert = Spaltengrenzen
5. Kein Gap gefunden → 1 Spalte (`column_text`)
6. Spaltentypen: `column_1`, `column_2`, ... (generisch, positionsbasiert)
#### Zeilen-Clustering
1. Woerter zu visuellen Zeilen gruppieren (Y-Toleranz: halbe Worthoehe)
2. Jede visuelle Zeile = eine Zeile im Grid
3. Sortiert von oben nach unten
#### Edge Cases
| Fall | Behandlung |
|------|------------|
| Einzelne Spalte (Fliesstext) | Kein X-Gap → 1 Spalte `column_text` |
| Keine Woerter erkannt | Leeres Ergebnis `([], [])` |
| Ueberschriften (grosse Schrift) | Eigene Zeile durch Y-Gap |
| Bilder/Grafiken | Keine Woerter → automatisch leerer Bereich |
| Schmale Spalten (Seitenzahlen) | Eigene Spalte durch X-Gap |
#### Vergleich v2 vs. Words-First
| Kriterium | v2 (Top-Down) | Words-First (Bottom-Up) |
|-----------|---------------|------------------------|
| **Abhaengigkeiten** | Spalten + Zeilen noetig | Nur Tesseract-Woerter |
| **Spaltentypen** | Semantisch (EN, DE, ...) | Positionsbasiert (1, 2, ...) |
| **OCR** | Hybrid (full-page + cell-crop) | Nur full-page Tesseract |
| **Robustheit** | Abhaengig von Spalten-/Zeilenerkennung | Direkt aus Wortpositionen |
| **Geschwindigkeit** | Langsamer (cell-crop pro Zelle) | Schneller (kein zusaetzlicher OCR-Lauf, nur full-page) |
| **Genauigkeit** | Besser bei schmalen Spalten | Besser bei ungewoehnlichen Layouts |
---
### Hybrid-Grid v2: `build_cell_grid_v2()`
Schritt 7 nutzt im Default eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet. schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
!!! success "Warum Hybrid?" !!! success "Warum Hybrid?"
@@ -692,7 +827,7 @@ Change-Format:
## Schritt 9: Rekonstruktion (Detail) ## Schritt 9: Rekonstruktion (Detail)
Zwei Modi verfuegbar: Drei Modi verfuegbar:
### Einfacher Modus ### Einfacher Modus
@@ -709,6 +844,73 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
- Zoom 50200 % - Zoom 50200 %
- Per-Zell-Reset-Button bei geaenderten Zellen - Per-Zell-Reset-Button bei geaenderten Zellen
### Overlay-Modus (neu in v4.2)
Ganzseitige Tabellenrekonstruktion mit **Pixel-basierter Wortpositionierung**.
Nur verfuegbar bei Parent-Sessions mit Sub-Sessions (Box-Bereiche).
**Funktionsweise:**
1. **Sub-Session-Merging:** Zellen aus Sub-Sessions werden koordinaten-konvertiert
und in die Parent-Session eingefuegt. Die Umrechnung laeuft ueber die Box-Zone:
```
parentCellX = boxXPct + (subCell.bbox_pct.x / 100) * boxWPct
parentCellY = boxYPct + (subCell.bbox_pct.y / 100) * boxHPct
```
2. **180°-Rotation:** Bei Parent-Sessions mit Boxen wird das Bild standardmaessig
180° gedreht, da der Scan haeufig kopfueber vorliegt. Die Pixel-Analyse
arbeitet auf dem rotierten Bild:
- Canvas: `ctx.translate(W, H); ctx.rotate(Math.PI)`
- Zell-Koordinaten: `(100 - x - w, 100 - y - h)` fuer rotiertes Space
- Cluster-Ruecktransformation: `start → cw-1-end`, danach `reverse()`
3. **Pixel-Wortpositionierung:** Der `usePixelWordPositions` Hook analysiert
dunkle Pixel per vertikaler Projektion, findet Wortgruppen-Cluster und
berechnet die exakte horizontale Position + Auto-Schriftgroesse.
**Layout:** 50/50 Grid (links Originalbild, rechts Rekonstruktion)
**Toolbar:**
- Schriftgroessen-Slider (30120%)
- Bold-Toggle
- 180°-Rotations-Toggle
- Speichern-Button
**Visuelle Elemente:**
- Spaltenlinien (aus `column_result.columns`)
- Zeilenlinien (aus `row_result.rows`)
- Box-Zonen-Markierung (blau, halbtransparent)
- Editierbare Inputs an Pixel-Positionen
### Shared Hook: `usePixelWordPositions`
Extrahierter Hook fuer die Pixel-basierte Wortpositionierung, genutzt in
StepLlmReview (Schritt 8) und StepReconstruction (Schritt 9).
```typescript
function usePixelWordPositions(
imageUrl: string,
cells: GridCell[],
active: boolean,
rotation: 0 | 180 = 0,
): Map<string, WordPosition[]>
```
**Algorithmus:**
1. Bild in offscreen Canvas laden (optional 180° gedreht)
2. Pro Zelle: `getImageData()` → vertikale Projektion (dunkle Pixel pro Spalte)
3. Cluster-Erkennung (Schwelle: 3% der Zellhoehe, Gap: 2% der Zellbreite)
4. Bei Rotation: Cluster zurueck ins Original-Koordinatensystem spiegeln
5. Text-Gruppen (split bei 3+ Leerzeichen) auf Cluster matchen
6. Auto-Schriftgroesse per `measureText()` + `fontRatio`
7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden
**Rueckgabe:** `Map<cell_id, WordPosition[]>` mit `xPct`, `wPct`, `text`, `fontRatio`
### Fabric.js Editor ### Fabric.js Editor
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`): Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
@@ -861,6 +1063,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
| Datum | Version | Aenderung | | Datum | Version | Aenderung |
|-------|---------|----------| |-------|---------|----------|
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung | | 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint | | 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade | | 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |

View File

@@ -34,3 +34,4 @@ from cv_ocr_engines import ( # noqa: F401
_fix_phonetic_brackets, _fix_phonetic_brackets,
) )
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401 from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
from cv_words_first import build_grid_from_words # noqa: F401

View File

@@ -0,0 +1,282 @@
"""
Words-First Grid Builder (Bottom-Up).
Builds a cell grid from Tesseract word_boxes directly, without requiring
pre-detected columns or rows. Algorithm:
1. Cluster words into columns by X-gap analysis
2. Cluster words into rows by Y-proximity
3. Build cells at (column, row) intersections
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import statistics
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 1. Column clustering
# ---------------------------------------------------------------------------
def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
"""
if not words:
return []
# Sort by X center
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
# Collect word heights to compute adaptive threshold
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 30
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
boundaries: List[float] = [] # X positions where columns split
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
# Split point is midway through the gap
boundaries.append((right_edge + left_edge) / 2)
# Build column ranges from boundaries
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
columns.append({
'index': ci,
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
'x_min': col_edges[ci],
'x_max': col_edges[ci + 1],
})
return columns
# ---------------------------------------------------------------------------
# 2. Row clustering
# ---------------------------------------------------------------------------
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Group words into visual rows via Y-proximity clustering.

    The Y-tolerance is half the median word height (minimum 5px), delegating
    the actual line grouping to _group_words_into_lines().

    Returns:
        Row dicts sorted top-to-bottom:
        [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
    """
    if not words:
        return []

    # Derive the clustering tolerance from the typical word height.
    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    typical_height = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_height * 0.5, 5)

    grouped = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    result = []
    for idx, members in enumerate(grouped):
        top = min(w['top'] for w in members)
        bottom = max(w['top'] + w['height'] for w in members)
        result.append({
            'index': idx,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })
    return result
# ---------------------------------------------------------------------------
# 3. Build cells
# ---------------------------------------------------------------------------
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
"""Return column index for a word based on its X-center."""
x_center = word['left'] + word['width'] / 2
for col in columns:
if col['x_min'] <= x_center < col['x_max']:
return col['index']
# Fallback: nearest column
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
"""Return row index for a word based on its Y-center."""
y_center = word['top'] + word['height'] / 2
# Find the row whose y_range contains this word's center
for row in rows:
if row['y_min'] <= y_center <= row['y_max']:
return row['index']
# Fallback: nearest row by Y-center
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
def _build_cells(
    words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build cell dicts by bucketing words into (column, row) intersections.

    Args:
        words: Word dicts with absolute pixel coordinates.
        columns: Column dicts from _cluster_columns(); their 'index' equals
            their list position.
        rows: Row dicts from _cluster_rows().
        img_w: Image width in pixels (for percent conversion).
        img_h: Image height in pixels (for percent conversion).

    Returns:
        Cell dicts in the same shape as build_cell_grid_v2() output, ordered
        row-major (row_index, col_index). Intersections with no words
        produce no cell.
    """
    if not columns or not rows:
        return []

    # Bucket words into (col_idx, row_idx) pairs.
    buckets: Dict[Tuple[int, int], List[Dict]] = {}
    for w in words:
        ci = _assign_word_to_column(w, columns)
        ri = _assign_word_to_row(w, rows)
        buckets.setdefault((ci, ri), []).append(w)

    cells = []
    # Iterate row-major so the cell list follows reading order.
    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
        # Safe positional lookup: column 'index' equals its list position.
        # (The original also fetched rows[ri] into an unused local; removed.)
        col = columns[ci]

        # Tight bbox from the actual word extents (not the full grid cell).
        x_min = min(w['left'] for w in cell_words)
        y_min = min(w['top'] for w in cell_words)
        x_max = max(w['left'] + w['width'] for w in cell_words)
        y_max = max(w['top'] + w['height'] for w in cell_words)
        bw = x_max - x_min
        bh = y_max - y_min

        # Cell text in reading order; Y-tolerance scales with bbox height so
        # multi-line cells keep their internal line grouping.
        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))

        # Average confidence over words with a positive confidence value.
        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
        avg_conf = sum(confs) / len(confs) if confs else 0.0

        # Per-word boxes in percent coordinates, sorted top-to-bottom then
        # left-to-right. Guard against zero image dimensions.
        word_boxes = []
        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
            word_boxes.append({
                'text': w.get('text', ''),
                'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
                'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
                'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
                'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
                'conf': w.get('conf', 0),
            })

        cells.append({
            'cell_id': f"R{ri:02d}_C{ci}",
            'row_index': ri,
            'col_index': ci,
            'col_type': col['type'],
            'text': text,
            'confidence': round(avg_conf, 1),
            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
            'bbox_pct': {
                'x': round(x_min / img_w * 100, 2) if img_w else 0,
                'y': round(y_min / img_h * 100, 2) if img_h else 0,
                'w': round(bw / img_w * 100, 2) if img_w else 0,
                'h': round(bh / img_h * 100, 2) if img_h else 0,
            },
            'word_boxes': word_boxes,
            'ocr_engine': 'words_first',
            'is_bold': False,
        })
    return cells
# ---------------------------------------------------------------------------
# 4. Public API
# ---------------------------------------------------------------------------
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Args:
        word_dicts: Flat list of word dicts with keys text, left, top,
            width, height, conf (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Drop low-confidence and whitespace-only words up front.
    def _keep(w: Dict) -> bool:
        return w.get('conf', 0) >= min_confidence and bool(w.get('text', '').strip())

    words = [w for w in word_dicts if _keep(w)]
    if not words:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []
    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

    # Step 1: cluster columns by X-gap analysis.
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: cluster rows by Y-proximity.
    rows = _cluster_rows(words)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells at the (column, row) intersections.
    cells = _build_cells(words, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Emit columns_meta in the same format as build_cell_grid_v2.
    columns_meta = [
        {
            'index': col['index'],
            'type': col['type'],
            'x': int(col['x_min']),
            'width': int(col['x_max'] - col['x_min']),
        }
        for col in columns
    ]
    return cells, columns_meta

View File

@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
render_image_high_res, render_image_high_res,
render_pdf_high_res, render_pdf_high_res,
) )
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import ( from ocr_pipeline_session_store import (
create_session_db, create_session_db,
delete_all_sessions_db, delete_all_sessions_db,
@@ -1859,6 +1860,7 @@ async def detect_words(
pronunciation: str = "british", pronunciation: str = "british",
stream: bool = False, stream: bool = False,
skip_heal_gaps: bool = False, skip_heal_gaps: bool = False,
grid_method: str = "v2",
): ):
"""Build word grid from columns × rows, OCR each cell. """Build word grid from columns × rows, OCR each cell.
@@ -1868,6 +1870,9 @@ async def detect_words(
stream: false (default) for JSON response, true for SSE streaming stream: false (default) for JSON response, true for SSE streaming
skip_heal_gaps: false (default). When true, cells keep exact row geometry skip_heal_gaps: false (default). When true, cells keep exact row geometry
positions without gap-healing expansion. Better for overlay rendering. positions without gap-healing expansion. Better for overlay rendering.
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
'v2' uses pre-detected columns/rows (top-down).
'words_first' clusters words bottom-up (no column/row detection needed).
""" """
if session_id not in _cache: if session_id not in _cache:
logger.info("detect_words: session %s not in cache, loading from DB", session_id) logger.info("detect_words: session %s not in cache, loading from DB", session_id)
@@ -1902,7 +1907,7 @@ async def detect_words(
"duration_seconds": 0, "duration_seconds": 0,
} }
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp) logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
if not row_result or not row_result.get("rows"): if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
raise HTTPException(status_code=400, detail="Row detection must be completed first") raise HTTPException(status_code=400, detail="Row detection must be completed first")
# Convert column dicts back to PageRegion objects # Convert column dicts back to PageRegion objects
@@ -1983,6 +1988,102 @@ async def detect_words(
if excluded: if excluded:
logger.info(f"detect_words: excluded {excluded} rows inside box zones") logger.info(f"detect_words: excluded {excluded} rows inside box zones")
# --- Words-First path: bottom-up grid from word boxes ---
if grid_method == "words_first":
t0 = time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if not wf_word_dicts:
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
# Convert word coordinates to absolute image coordinates if needed
# (detect_column_geometry returns words relative to content ROI)
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
duration = time.time() - t0
# Apply IPA phonetic fixes
fix_cell_phonetics(cells, pronunciation=pronunciation)
# Add zone_index for backward compat
for cell in cells:
cell.setdefault("zone_index", 0)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
n_cols = len(columns_meta)
used_engine = "words_first"
word_result = {
"cells": cells,
"grid_shape": {
"rows": n_rows,
"cols": n_cols,
"total_cells": len(cells),
},
"columns_used": columns_meta,
"layout": "vocab" if is_vocab else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"grid_method": "words_first",
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
},
}
if is_vocab or 'column_text' in col_types:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries
word_result["entries"] = entries
word_result["entry_count"] = len(entries)
word_result["summary"]["total_entries"] = len(entries)
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline: words-first session {session_id}: "
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
await _append_pipeline_log(session_id, "words", {
"grid_method": "words_first",
"total_cells": len(cells),
"non_empty_cells": word_result["summary"]["non_empty_cells"],
"ocr_engine": used_engine,
"layout": word_result["layout"],
}, duration_ms=int(duration * 1000))
return {"session_id": session_id, **word_result}
if stream: if stream:
# Cell-First OCR v2: use batch-then-stream approach instead of # Cell-First OCR v2: use batch-then-stream approach instead of
# per-cell streaming. The parallel ThreadPoolExecutor in # per-cell streaming. The parallel ThreadPoolExecutor in
@@ -2001,7 +2102,7 @@ async def detect_words(
}, },
) )
# --- Non-streaming path --- # --- Non-streaming path (grid_method=v2) ---
t0 = time.time() t0 = time.time()
# Create binarized OCR image (for Tesseract) # Create binarized OCR image (for Tesseract)

View File

@@ -0,0 +1,214 @@
"""Tests for cv_words_first.py — Words-First Grid Builder."""
import pytest
from cv_words_first import (
_assign_word_to_column,
_assign_word_to_row,
_build_cells,
_cluster_columns,
_cluster_rows,
build_grid_from_words,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
"""Create a synthetic word dict."""
return {
'text': text,
'left': left,
'top': top,
'width': width,
'height': height,
'conf': conf,
}
# ---------------------------------------------------------------------------
# _cluster_columns
# ---------------------------------------------------------------------------
class TestClusterColumns:
    """X-gap clustering of word boxes into columns."""

    def test_single_column_freetext(self):
        """Evenly spread words collapse into a single free-text column."""
        boxes = [
            _word("Hello", 50, 10), _word("world", 120, 10),
            _word("this", 50, 40), _word("is", 120, 40), _word("text", 190, 40),
        ]
        clustered = _cluster_columns(boxes, img_w=400)
        assert len(clustered) == 1
        assert clustered[0]['type'] == 'column_text'

    def test_two_columns(self):
        """A wide X-gap between two word groups splits them into two columns."""
        boxes = [
            _word("apple", 20, 10), _word("Apfel", 300, 10),
            _word("dog", 20, 40), _word("Hund", 300, 40),
        ]
        clustered = _cluster_columns(boxes, img_w=500)
        # Implies both the count (2) and the ordered type labels.
        assert [c['type'] for c in clustered] == ['column_1', 'column_2']

    def test_three_columns(self):
        """Three groups separated by wide gaps become three columns."""
        boxes = [
            _word("1", 10, 10, width=20), _word("apple", 100, 10), _word("Apfel", 400, 10),
            _word("2", 10, 40, width=20), _word("dog", 100, 40), _word("Hund", 400, 40),
        ]
        assert len(_cluster_columns(boxes, img_w=600)) == 3

    def test_empty_words(self):
        """An empty word list yields no columns."""
        assert _cluster_columns([], img_w=500) == []
# ---------------------------------------------------------------------------
# _cluster_rows
# ---------------------------------------------------------------------------
class TestClusterRows:
    """Y-proximity clustering of word boxes into rows."""

    def test_two_rows(self):
        """Two distinct Y-levels produce two rows, ordered top to bottom."""
        boxes = [
            _word("hello", 10, 20), _word("world", 100, 25),
            _word("foo", 10, 80), _word("bar", 100, 82),
        ]
        clustered = _cluster_rows(boxes)
        assert len(clustered) == 2
        # Rows come back sorted by vertical position.
        assert clustered[0]['y_min'] < clustered[1]['y_min']

    def test_single_row(self):
        """Nearly identical Y values merge into a single row."""
        boxes = [_word(t, x, y) for t, x, y in [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]]
        assert len(_cluster_rows(boxes)) == 1

    def test_empty(self):
        """An empty word list yields no rows."""
        assert _cluster_rows([]) == []
# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------
class TestBuildGridFromWords:
    """End-to-end checks for the bottom-up words-first grid builder."""

    def test_two_column_vocab(self):
        """A 2-column vocabulary page with 3 rows yields a full 3x2 grid."""
        page = [
            _word("apple", 50, 20), _word("Apfel", 400, 22),
            _word("dog", 50, 60), _word("Hund", 400, 62),
            _word("cat", 50, 100), _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(page, img_w=600, img_h=200)
        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows x 2 cols
        # cell_id encodes row and column indices.
        ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in ids and 'R00_C1' in ids

    def test_single_column_freetext(self):
        """Free text collapses to one column_text column with one cell per line."""
        page = [
            _word("Hello", 50, 20), _word("world", 120, 22),
            _word("Second", 50, 60), _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(page, img_w=300, img_h=150)
        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # two lines, one column

    def test_empty_input(self):
        """No words means an empty grid and no columns."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == [] and cols == []

    def test_low_confidence_filtered(self):
        """Words under min_confidence never reach the grid."""
        cells, cols = build_grid_from_words(
            [_word("good", 50, 20, conf=90), _word("bad", 200, 20, conf=10)],
            img_w=400, img_h=100, min_confidence=30,
        )
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """bbox_pct values are pixel coordinates converted to page percentages."""
        cells, _ = build_grid_from_words(
            [_word("test", 200, 100, width=100, height=30)], img_w=1000, img_h=500,
        )
        assert len(cells) == 1
        box = cells[0]['bbox_pct']
        # 200/1000, 100/500, 100/1000, 30/500 — each scaled to percent.
        assert (box['x'], box['y'], box['w'], box['h']) == (20.0, 20.0, 10.0, 6.0)

    def test_columns_meta_format(self):
        """columns_meta entries mirror the build_cell_grid_v2 schema."""
        _, cols_meta = build_grid_from_words(
            [_word("a", 50, 20), _word("b", 400, 20)], img_w=600, img_h=100,
        )
        for col in cols_meta:
            assert {'index', 'type', 'x', 'width'} <= col.keys()

    def test_word_boxes_included(self):
        """Each cell carries per-word boxes with position and text."""
        cells, _ = build_grid_from_words(
            [_word("hello", 50, 20), _word("world", 120, 22)], img_w=300, img_h=100,
        )
        assert len(cells) == 1  # single row, single column
        boxes = cells[0].get('word_boxes', [])
        assert len(boxes) == 2
        for entry in boxes:
            assert {'left', 'top', 'text'} <= entry.keys()

    def test_all_whitespace_filtered(self):
        """Whitespace-only words are dropped before grid construction."""
        cells, _ = build_grid_from_words(
            [_word(" ", 50, 20, conf=90), _word("hello", 200, 20, conf=90)],
            img_w=400, img_h=100,
        )
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'