feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
|
||||
|
||||
// Streaming progress state
|
||||
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
|
||||
@@ -112,7 +113,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
let res: Response | null = null
|
||||
for (let attempt = 0; attempt < 2; attempt++) {
|
||||
res = await fetch(
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}`,
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=${gridMethod === 'v2' ? 'true' : 'false'}&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}&grid_method=${gridMethod}`,
|
||||
{ method: 'POST' },
|
||||
)
|
||||
if (res.ok) break
|
||||
@@ -128,6 +129,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
||||
}
|
||||
|
||||
// words_first returns plain JSON (no streaming)
|
||||
if (gridMethod === 'words_first') {
|
||||
const data = await res.json() as GridResult
|
||||
applyGridResult(data)
|
||||
return
|
||||
}
|
||||
|
||||
const reader = res.body!.getReader()
|
||||
const decoder = new TextDecoder()
|
||||
let buffer = ''
|
||||
@@ -220,7 +228,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
setDetecting(false)
|
||||
}
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [sessionId, ocrEngine, pronunciation])
|
||||
}, [sessionId, ocrEngine, pronunciation, gridMethod])
|
||||
|
||||
const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
|
||||
if (!sessionId) return
|
||||
@@ -789,6 +797,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
{gridResult && (
|
||||
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
||||
<div className="flex items-center gap-3 flex-wrap">
|
||||
{/* Grid method selector */}
|
||||
<select
|
||||
value={gridMethod}
|
||||
onChange={(e) => setGridMethod(e.target.value as 'v2' | 'words_first')}
|
||||
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
||||
>
|
||||
<option value="v2">Standard (v2)</option>
|
||||
<option value="words_first">Words-First</option>
|
||||
</select>
|
||||
|
||||
{/* OCR Engine selector */}
|
||||
<select
|
||||
value={ocrEngine}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
||||
|
||||
**Version:** 4.1.0
|
||||
**Version:** 4.3.0
|
||||
**Status:** Produktiv (Schritte 1–10 implementiert)
|
||||
**URL:** https://macmini:3002/ai/ocr-pipeline
|
||||
|
||||
@@ -22,7 +22,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
|
||||
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
|
||||
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
||||
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
||||
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
|
||||
| 7 | Worterkennung | Hybrid-Grid (v2) oder Words-First (bottom-up) | Implementiert |
|
||||
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
||||
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
|
||||
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
||||
@@ -82,28 +82,29 @@ flowchart TD
|
||||
│
|
||||
detect_document_type()
|
||||
│
|
||||
┌─────────────────┴──────────────────┐
|
||||
▼ ▼
|
||||
FULL-TEXT PFAD CELL-FIRST PFAD
|
||||
(pipeline='full_page') (pipeline='cell_first')
|
||||
│ │
|
||||
Keine Spalten/Zeilen Spaltenerkennung
|
||||
analyze_layout_by_words() detect_column_geometry()
|
||||
Lese-Reihenfolge _detect_sub_columns()
|
||||
│ expand_narrow_columns()
|
||||
│ Zeilenerkennung
|
||||
│ detect_row_geometry()
|
||||
│ │
|
||||
│ build_cell_grid_v2()
|
||||
│ │
|
||||
│ ┌─────────┴──────────┐
|
||||
│ ▼ ▼
|
||||
│ Breite Spalten Schmale Spalten
|
||||
│ (>= 15% Breite) (< 15% Breite)
|
||||
│ Full-Page Words Cell-Crop OCR
|
||||
│ word_lookup cell_crop_v2
|
||||
│ │ │
|
||||
└───────────────────────────┴────────────────────┘
|
||||
┌──────────────────┼──────────────────┐
|
||||
▼ ▼ ▼
|
||||
FULL-TEXT PFAD WORDS-FIRST PFAD CELL-FIRST PFAD
|
||||
(pipeline= (grid_method= (grid_method=
|
||||
'full_page') 'words_first') 'v2', default)
|
||||
│ │ │
|
||||
Keine Spalten/ Tesseract Full-Page Spaltenerkennung
|
||||
Zeilen word_boxes detect_column_geometry()
|
||||
analyze_layout_ _cluster_columns() _detect_sub_columns()
|
||||
by_words() _cluster_rows() expand_narrow_columns()
|
||||
│ _build_cells() Zeilenerkennung
|
||||
│ │ detect_row_geometry()
|
||||
│ build_grid_from_ │
|
||||
│ words() build_cell_grid_v2()
|
||||
│ │ │
|
||||
│ │ ┌─────────┴──────────┐
|
||||
│ │ ▼ ▼
|
||||
│ │ Breite Spalten Schmale Spalten
|
||||
│ │ (>= 15% Breite) (< 15% Breite)
|
||||
│ │ Full-Page Words Cell-Crop OCR
|
||||
│ │ word_lookup cell_crop_v2
|
||||
│ │ │ │
|
||||
└──────────────────┴────┴────────────────────┘
|
||||
│
|
||||
Post-Processing Pipeline
|
||||
(Lautschrift, Komma-Split, etc.)
|
||||
@@ -147,6 +148,8 @@ klausur-service/backend/
|
||||
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
||||
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
|
||||
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
|
||||
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
|
||||
├── cv_words_first.py # Words-First Grid Builder (bottom-up)
|
||||
├── page_crop.py # Content-basierter Crop-Algorithmus
|
||||
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
||||
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
|
||||
@@ -169,7 +172,8 @@ admin-lehrer/
|
||||
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
|
||||
├── StepWordRecognition.tsx # Schritt 7: Worterkennung
|
||||
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
|
||||
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas)
|
||||
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay)
|
||||
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
|
||||
├── FabricReconstructionCanvas.tsx # Fabric.js Editor
|
||||
└── StepGroundTruth.tsx # Schritt 10: Validierung
|
||||
```
|
||||
@@ -257,10 +261,20 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
||||
|
||||
| Methode | Pfad | Beschreibung |
|
||||
|---------|------|--------------|
|
||||
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen |
|
||||
| `POST` | `/sessions/{id}/words` | Wort-Grid erstellen |
|
||||
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
|
||||
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
|
||||
|
||||
**Query-Parameter fuer `/sessions/{id}/words`:**
|
||||
|
||||
| Parameter | Default | Beschreibung |
|
||||
|-----------|---------|--------------|
|
||||
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
|
||||
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
|
||||
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
|
||||
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
|
||||
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
|
||||
|
||||
### Schritt 8: Korrektur
|
||||
|
||||
| Methode | Pfad | Beschreibung |
|
||||
@@ -513,6 +527,12 @@ Horizontale Projektionsprofile finden Zeilen-Luecken; word-level Validierung ver
|
||||
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
|
||||
abgeschnitten wird.
|
||||
|
||||
3. **Box-Boundary-Schutz** (`box_ranges_inner`, neu in v4.2):
|
||||
Bei Seiten mit Box-Zonen (Sub-Sessions) werden Zeilen am Box-Rand nicht faelschlich
|
||||
ausgeschlossen. Das Problem: Die letzte Textzeile ueber einer Box ueberlappt haeufig
|
||||
mit dem Box-Rahmen. Loesung: Die Exclusion-Zone wird um `max(border_thickness, 5px)`
|
||||
geschrumpft, sodass nur Zeilen **innerhalb** der Box ausgeschlossen werden.
|
||||
|
||||
```python
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
|
||||
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
|
||||
@@ -524,13 +544,128 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
|
||||
...
|
||||
```
|
||||
|
||||
### Box-Zonen und Content-Strips (Detail)
|
||||
|
||||
Seiten mit Box-Bereichen (z.B. Grammatik-Tipps, Uebungsboxen) werden in Zonen aufgeteilt:
|
||||
|
||||
```
|
||||
┌──────────────────────────┐
|
||||
│ Content Zone 0 (Zeilen) │ ← Vokabeltabelle oben
|
||||
├──────────────────────────┤
|
||||
│ ███ Box Zone (border) ███│ ← Sub-Session mit eigener OCR
|
||||
├──────────────────────────┤
|
||||
│ Content Zone 2 (Zeilen) │ ← Vokabeltabelle unten
|
||||
└──────────────────────────┘
|
||||
```
|
||||
|
||||
**Content-Strip-Verfahren** (`detect_rows` in `ocr_pipeline_api.py`):
|
||||
|
||||
1. Box-Zonen identifizieren, `box_ranges_inner` berechnen (geschrumpft um Border-Dicke)
|
||||
2. Content-Strips = Seitenbereiche **ohne** Box-Inneres, vertikal gestapelt
|
||||
3. Zeilenerkennung auf gestapeltem Bild, Y-Koordinaten zurueckgemappt
|
||||
4. Wort-Filterung: Woerter in Box-Innerem werden ausgeschlossen
|
||||
|
||||
**Wichtig:** `box_ranges_inner` (nicht `box_ranges`) wird verwendet, damit
|
||||
Zeilen am Box-Rand nicht abgeschnitten werden. Minimum 5px Margin.
|
||||
|
||||
---
|
||||
|
||||
## Schritt 7: Worterkennung — Hybrid-Grid (Detail)
|
||||
## Schritt 7: Worterkennung (Detail)
|
||||
|
||||
### Algorithmus: `build_cell_grid_v2()`
|
||||
Schritt 7 bietet zwei Grid-Strategien, auswaehlbar per `grid_method`-Parameter:
|
||||
|
||||
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||
| Strategie | Parameter | Ansatz | Benoetigt Spalten/Zeilen? |
|
||||
|-----------|-----------|--------|--------------------------|
|
||||
| **Hybrid-Grid v2** | `grid_method=v2` (Default) | Top-down: Spalten → Zeilen → Zellen → OCR | Ja (Schritte 5+6) |
|
||||
| **Words-First** | `grid_method=words_first` | Bottom-up: Woerter → Spalten clustern → Zeilen clustern → Zellen | Nein |
|
||||
|
||||
---
|
||||
|
||||
### Words-First Grid Builder: `build_grid_from_words()`
|
||||
|
||||
**Datei:** `cv_words_first.py`
|
||||
|
||||
Der Words-First Builder arbeitet bottom-up: Er nimmt die pixelgenauen `word_boxes` aus einem
|
||||
Tesseract Full-Page-Lauf und clustert sie direkt zu Spalten und Zeilen — ohne die
|
||||
vorherige Spalten-/Zeilenerkennung (Schritte 5+6) zu benoetigen.
|
||||
|
||||
#### Algorithmus
|
||||
|
||||
```
|
||||
Eingabe: word_dicts (flat list), img_w, img_h
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 1. Confidence-Filter │
|
||||
│ conf >= 30 │
|
||||
│ Whitespace entf. │
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 2. _cluster_columns() │
|
||||
│ X-Gap-Analyse │
|
||||
│ Schwelle: median_h │
|
||||
│ × 3 (min 3% Breite)│
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 3. _cluster_rows() │
|
||||
│ Y-Proximity-Grupp. │
|
||||
│ Toleranz: median_h │
|
||||
│ / 2 │
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 4. _build_cells() │
|
||||
│ Wort → (col, row) │
|
||||
│ Text + bbox + conf │
|
||||
│ word_boxes pro Zelle│
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
Ausgabe: cells[], columns_meta[]
|
||||
(identisch zu build_cell_grid_v2)
|
||||
```
|
||||
|
||||
#### Spalten-Clustering
|
||||
|
||||
1. Alle Woerter nach X-Mitte sortieren
|
||||
2. Aufeinanderfolgende X-Gaps berechnen
|
||||
3. Adaptiver Schwellwert: `median_word_height × 3` (min 3% Bildbreite)
|
||||
4. Gaps > Schwellwert = Spaltengrenzen
|
||||
5. Kein Gap gefunden → 1 Spalte (`column_text`)
|
||||
6. Spaltentypen: `column_1`, `column_2`, ... (generisch, positionsbasiert)
|
||||
|
||||
#### Zeilen-Clustering
|
||||
|
||||
1. Woerter zu visuellen Zeilen gruppieren (Y-Toleranz: halbe Worthoehe)
|
||||
2. Jede visuelle Zeile = eine Zeile im Grid
|
||||
3. Sortiert von oben nach unten
|
||||
|
||||
#### Edge Cases
|
||||
|
||||
| Fall | Behandlung |
|
||||
|------|------------|
|
||||
| Einzelne Spalte (Fliesstext) | Kein X-Gap → 1 Spalte `column_text` |
|
||||
| Keine Woerter erkannt | Leeres Ergebnis `([], [])` |
|
||||
| Ueberschriften (grosse Schrift) | Eigene Zeile durch Y-Gap |
|
||||
| Bilder/Grafiken | Keine Woerter → automatisch leerer Bereich |
|
||||
| Schmale Spalten (Seitenzahlen) | Eigene Spalte durch X-Gap |
|
||||
|
||||
#### Vergleich v2 vs. Words-First
|
||||
|
||||
| Kriterium | v2 (Top-Down) | Words-First (Bottom-Up) |
|
||||
|-----------|---------------|------------------------|
|
||||
| **Abhaengigkeiten** | Spalten + Zeilen noetig | Nur Tesseract-Woerter |
|
||||
| **Spaltentypen** | Semantisch (EN, DE, ...) | Positionsbasiert (1, 2, ...) |
|
||||
| **OCR** | Hybrid (full-page + cell-crop) | Nur full-page Tesseract |
|
||||
| **Robustheit** | Abhaengig von Spalten-/Zeilenerkennung | Direkt aus Wortpositionen |
|
||||
| **Geschwindigkeit** | Langsamer (cell-crop pro Zelle) | Schneller (kein zusaetzlicher OCR-Lauf) |
|
||||
| **Genauigkeit** | Besser bei schmalen Spalten | Besser bei ungewoehnlichen Layouts |
|
||||
|
||||
---
|
||||
|
||||
### Hybrid-Grid v2: `build_cell_grid_v2()`
|
||||
|
||||
Schritt 7 nutzt im Default eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
|
||||
|
||||
!!! success "Warum Hybrid?"
|
||||
@@ -692,7 +827,7 @@ Change-Format:
|
||||
|
||||
## Schritt 9: Rekonstruktion (Detail)
|
||||
|
||||
Zwei Modi verfuegbar:
|
||||
Drei Modi verfuegbar:
|
||||
|
||||
### Einfacher Modus
|
||||
|
||||
@@ -709,6 +844,73 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
|
||||
- Zoom 50–200 %
|
||||
- Per-Zell-Reset-Button bei geaenderten Zellen
|
||||
|
||||
### Overlay-Modus (neu in v4.2)
|
||||
|
||||
Ganzseitige Tabellenrekonstruktion mit **Pixel-basierter Wortpositionierung**.
|
||||
Nur verfuegbar bei Parent-Sessions mit Sub-Sessions (Box-Bereiche).
|
||||
|
||||
**Funktionsweise:**
|
||||
|
||||
1. **Sub-Session-Merging:** Zellen aus Sub-Sessions werden koordinaten-konvertiert
|
||||
und in die Parent-Session eingefuegt. Die Umrechnung laeuft ueber die Box-Zone:
|
||||
```
|
||||
parentCellX = boxXPct + (subCell.bbox_pct.x / 100) * boxWPct
|
||||
parentCellY = boxYPct + (subCell.bbox_pct.y / 100) * boxHPct
|
||||
```
|
||||
|
||||
2. **180°-Rotation:** Bei Parent-Sessions mit Boxen wird das Bild standardmaessig
|
||||
180° gedreht, da der Scan haeufig kopfueber vorliegt. Die Pixel-Analyse
|
||||
arbeitet auf dem rotierten Bild:
|
||||
- Canvas: `ctx.translate(W, H); ctx.rotate(Math.PI)`
|
||||
- Zell-Koordinaten: `(100 - x - w, 100 - y - h)` fuer rotiertes Space
|
||||
- Cluster-Ruecktransformation: `start → cw-1-end`, danach `reverse()`
|
||||
|
||||
3. **Pixel-Wortpositionierung:** Der `usePixelWordPositions` Hook analysiert
|
||||
dunkle Pixel per vertikaler Projektion, findet Wortgruppen-Cluster und
|
||||
berechnet die exakte horizontale Position + Auto-Schriftgroesse.
|
||||
|
||||
**Layout:** 50/50 Grid (links Originalbild, rechts Rekonstruktion)
|
||||
|
||||
**Toolbar:**
|
||||
|
||||
- Schriftgroessen-Slider (30–120%)
|
||||
- Bold-Toggle
|
||||
- 180°-Rotations-Toggle
|
||||
- Speichern-Button
|
||||
|
||||
**Visuelle Elemente:**
|
||||
|
||||
- Spaltenlinien (aus `column_result.columns`)
|
||||
- Zeilenlinien (aus `row_result.rows`)
|
||||
- Box-Zonen-Markierung (blau, halbtransparent)
|
||||
- Editierbare Inputs an Pixel-Positionen
|
||||
|
||||
### Shared Hook: `usePixelWordPositions`
|
||||
|
||||
Extrahierter Hook fuer die Pixel-basierte Wortpositionierung, genutzt in
|
||||
StepLlmReview (Schritt 8) und StepReconstruction (Schritt 9).
|
||||
|
||||
```typescript
|
||||
function usePixelWordPositions(
|
||||
imageUrl: string,
|
||||
cells: GridCell[],
|
||||
active: boolean,
|
||||
rotation: 0 | 180 = 0,
|
||||
): Map<string, WordPosition[]>
|
||||
```
|
||||
|
||||
**Algorithmus:**
|
||||
|
||||
1. Bild in offscreen Canvas laden (optional 180° gedreht)
|
||||
2. Pro Zelle: `getImageData()` → vertikale Projektion (dunkle Pixel pro Spalte)
|
||||
3. Cluster-Erkennung (Schwelle: 3% der Zellhoehe, Gap: 2% der Zellbreite)
|
||||
4. Bei Rotation: Cluster zurueck ins Original-Koordinatensystem spiegeln
|
||||
5. Text-Gruppen (split bei 3+ Leerzeichen) auf Cluster matchen
|
||||
6. Auto-Schriftgroesse per `measureText()` + `fontRatio`
|
||||
7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden
|
||||
|
||||
**Rueckgabe:** `Map<cell_id, WordPosition[]>` mit `xPct`, `wPct`, `text`, `fontRatio`
|
||||
|
||||
### Fabric.js Editor
|
||||
|
||||
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
|
||||
@@ -861,6 +1063,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
|
||||
|
||||
| Datum | Version | Aenderung |
|
||||
|-------|---------|----------|
|
||||
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
|
||||
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
|
||||
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
||||
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
|
||||
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
|
||||
|
||||
@@ -34,3 +34,4 @@ from cv_ocr_engines import ( # noqa: F401
|
||||
_fix_phonetic_brackets,
|
||||
)
|
||||
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
|
||||
from cv_words_first import build_grid_from_words # noqa: F401
|
||||
|
||||
282
klausur-service/backend/cv_words_first.py
Normal file
282
klausur-service/backend/cv_words_first.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""
|
||||
Words-First Grid Builder (Bottom-Up).
|
||||
|
||||
Builds a cell grid from Tesseract word_boxes directly, without requiring
|
||||
pre-detected columns or rows. Algorithm:
|
||||
|
||||
1. Cluster words into columns by X-gap analysis
|
||||
2. Cluster words into rows by Y-proximity
|
||||
3. Build cells at (column, row) intersections
|
||||
|
||||
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import statistics
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_text,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Column clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_columns(
|
||||
words: List[Dict],
|
||||
img_w: int,
|
||||
min_gap_pct: float = 3.0,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into columns by finding large horizontal gaps.
|
||||
|
||||
Returns a list of column dicts:
|
||||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||
sorted left-to-right.
|
||||
"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Sort by X center
|
||||
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
|
||||
|
||||
# Collect word heights to compute adaptive threshold
|
||||
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
|
||||
median_h = statistics.median(heights) if heights else 30
|
||||
|
||||
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
|
||||
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
|
||||
|
||||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||
# For each word, compute right edge; for next word, compute left edge
|
||||
boundaries: List[float] = [] # X positions where columns split
|
||||
for i in range(len(sorted_w) - 1):
|
||||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||
left_edge = sorted_w[i + 1]['left']
|
||||
gap = left_edge - right_edge
|
||||
if gap > min_gap_px:
|
||||
# Split point is midway through the gap
|
||||
boundaries.append((right_edge + left_edge) / 2)
|
||||
|
||||
# Build column ranges from boundaries
|
||||
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||
columns = []
|
||||
for ci in range(len(col_edges) - 1):
|
||||
columns.append({
|
||||
'index': ci,
|
||||
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
|
||||
'x_min': col_edges[ci],
|
||||
'x_max': col_edges[ci + 1],
|
||||
})
|
||||
|
||||
return columns
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Row clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Group words into visual rows via Y-proximity clustering.

    The Y-tolerance is half the median word height (minimum 5 px); the
    actual line grouping is delegated to _group_words_into_lines().

    Args:
        words: Word dicts with pixel keys 'top' and 'height'.

    Returns:
        Row dicts [{'index', 'y_min', 'y_max', 'y_center'}, ...]
        sorted top-to-bottom.
    """
    if not words:
        return []

    # Half the median word height sets the vertical grouping tolerance.
    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    median_h = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(median_h * 0.5, 5)

    grouped_lines = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    rows: List[Dict[str, Any]] = []
    for ri, line_words in enumerate(grouped_lines):
        top = min(w['top'] for w in line_words)
        bottom = max(w['top'] + w['height'] for w in line_words)
        rows.append({
            'index': ri,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })
    return rows
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Build cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||||
"""Return column index for a word based on its X-center."""
|
||||
x_center = word['left'] + word['width'] / 2
|
||||
for col in columns:
|
||||
if col['x_min'] <= x_center < col['x_max']:
|
||||
return col['index']
|
||||
# Fallback: nearest column
|
||||
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
|
||||
|
||||
|
||||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||
"""Return row index for a word based on its Y-center."""
|
||||
y_center = word['top'] + word['height'] / 2
|
||||
# Find the row whose y_range contains this word's center
|
||||
for row in rows:
|
||||
if row['y_min'] <= y_center <= row['y_max']:
|
||||
return row['index']
|
||||
# Fallback: nearest row by Y-center
|
||||
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||
|
||||
|
||||
def _build_cells(
    words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build cell dicts by bucketing words into (column, row) pairs.

    Args:
        words: Filtered word dicts with pixel keys left/top/width/height
            plus 'text' and 'conf'.
        columns: Column dicts from _cluster_columns(); their 'index'
            values equal their list positions.
        rows: Row dicts from _cluster_rows(); same index invariant.
        img_w: Image width in pixels (0 yields 0 for percent coords).
        img_h: Image height in pixels (0 yields 0 for percent coords).

    Returns:
        Cell dicts in row-major order (top-to-bottom, left-to-right),
        same format as build_cell_grid_v2() produces. Only intersections
        that actually contain words are emitted — empty cells are skipped.
    """
    if not columns or not rows:
        return []

    # Bucket words into (col_idx, row_idx)
    buckets: Dict[Tuple[int, int], List[Dict]] = {}
    for w in words:
        ci = _assign_word_to_column(w, columns)
        ri = _assign_word_to_row(w, rows)
        buckets.setdefault((ci, ri), []).append(w)

    cells = []
    # Row-major iteration: sort by row index first, then column index.
    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
        col = columns[ci]  # safe: column 'index' equals list position

        # Compute tight bbox from actual word positions (not column/row extents)
        x_min = min(w['left'] for w in cell_words)
        y_min = min(w['top'] for w in cell_words)
        x_max = max(w['left'] + w['width'] for w in cell_words)
        y_max = max(w['top'] + w['height'] for w in cell_words)
        bw = x_max - x_min
        bh = y_max - y_min

        # Text from words in reading order; Y-tolerance scales with cell height
        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))

        # Average confidence over words reporting a positive confidence
        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
        avg_conf = sum(confs) / len(confs) if confs else 0.0

        # Word boxes converted to percent coordinates, sorted top-left first
        word_boxes = []
        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
            word_boxes.append({
                'text': w.get('text', ''),
                'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
                'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
                'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
                'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
                'conf': w.get('conf', 0),
            })

        cells.append({
            'cell_id': f"R{ri:02d}_C{ci}",
            'row_index': ri,
            'col_index': ci,
            'col_type': col['type'],
            'text': text,
            'confidence': round(avg_conf, 1),
            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
            'bbox_pct': {
                'x': round(x_min / img_w * 100, 2) if img_w else 0,
                'y': round(y_min / img_h * 100, 2) if img_h else 0,
                'w': round(bw / img_w * 100, 2) if img_w else 0,
                'h': round(bh / img_h * 100, 2) if img_h else 0,
            },
            'word_boxes': word_boxes,
            'ocr_engine': 'words_first',
            'is_bold': False,
        })

    return cells
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Pipeline: confidence filter -> column clustering (X-gaps) ->
    row clustering (Y-proximity) -> cell construction at the
    (column, row) intersections.

    Args:
        word_dicts: Flat list of word dicts with keys:
            text, left, top, width, height, conf
            (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Drop low-confidence and whitespace-only words before clustering.
    usable = [
        w for w in word_dicts
        if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
    ]
    if not usable:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(usable), len(word_dicts))

    # Step 1: columns from X-gap analysis.
    columns = _cluster_columns(usable, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: rows from Y-proximity grouping.
    rows = _cluster_rows(usable)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: cells at the column/row intersections.
    cells = _build_cells(usable, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Mirror the columns_meta shape emitted by build_cell_grid_v2.
    columns_meta = [
        {
            'index': col['index'],
            'type': col['type'],
            'x': int(col['x_min']),
            'width': int(col['x_max'] - col['x_min']),
        }
        for col in columns
    ]

    return cells, columns_meta
|
||||
@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
|
||||
render_image_high_res,
|
||||
render_pdf_high_res,
|
||||
)
|
||||
from cv_words_first import build_grid_from_words
|
||||
from ocr_pipeline_session_store import (
|
||||
create_session_db,
|
||||
delete_all_sessions_db,
|
||||
@@ -1859,6 +1860,7 @@ async def detect_words(
|
||||
pronunciation: str = "british",
|
||||
stream: bool = False,
|
||||
skip_heal_gaps: bool = False,
|
||||
grid_method: str = "v2",
|
||||
):
|
||||
"""Build word grid from columns × rows, OCR each cell.
|
||||
|
||||
@@ -1868,6 +1870,9 @@ async def detect_words(
|
||||
stream: false (default) for JSON response, true for SSE streaming
|
||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
||||
positions without gap-healing expansion. Better for overlay rendering.
|
||||
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
|
||||
'v2' uses pre-detected columns/rows (top-down).
|
||||
'words_first' clusters words bottom-up (no column/row detection needed).
|
||||
"""
|
||||
if session_id not in _cache:
|
||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||
@@ -1902,7 +1907,7 @@ async def detect_words(
|
||||
"duration_seconds": 0,
|
||||
}
|
||||
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
||||
if not row_result or not row_result.get("rows"):
|
||||
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||
|
||||
# Convert column dicts back to PageRegion objects
|
||||
@@ -1983,6 +1988,102 @@ async def detect_words(
|
||||
if excluded:
|
||||
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
||||
|
||||
# --- Words-First path: bottom-up grid from word boxes ---
|
||||
if grid_method == "words_first":
|
||||
t0 = time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
# Get word_dicts from cache or run Tesseract full-page
|
||||
wf_word_dicts = cached.get("_word_dicts")
|
||||
if wf_word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = wf_word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if not wf_word_dicts:
|
||||
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
||||
|
||||
# Convert word coordinates to absolute image coordinates if needed
|
||||
# (detect_column_geometry returns words relative to content ROI)
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
lx, _rx, ty, _by = content_bounds
|
||||
abs_words = []
|
||||
for w in wf_word_dicts:
|
||||
abs_words.append({
|
||||
**w,
|
||||
'left': w['left'] + lx,
|
||||
'top': w['top'] + ty,
|
||||
})
|
||||
wf_word_dicts = abs_words
|
||||
|
||||
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
||||
duration = time.time() - t0
|
||||
|
||||
# Apply IPA phonetic fixes
|
||||
fix_cell_phonetics(cells, pronunciation=pronunciation)
|
||||
|
||||
# Add zone_index for backward compat
|
||||
for cell in cells:
|
||||
cell.setdefault("zone_index", 0)
|
||||
|
||||
col_types = {c['type'] for c in columns_meta}
|
||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
||||
n_cols = len(columns_meta)
|
||||
used_engine = "words_first"
|
||||
|
||||
word_result = {
|
||||
"cells": cells,
|
||||
"grid_shape": {
|
||||
"rows": n_rows,
|
||||
"cols": n_cols,
|
||||
"total_cells": len(cells),
|
||||
},
|
||||
"columns_used": columns_meta,
|
||||
"layout": "vocab" if is_vocab else "generic",
|
||||
"image_width": img_w,
|
||||
"image_height": img_h,
|
||||
"duration_seconds": round(duration, 2),
|
||||
"ocr_engine": used_engine,
|
||||
"grid_method": "words_first",
|
||||
"summary": {
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
||||
},
|
||||
}
|
||||
|
||||
if is_vocab or 'column_text' in col_types:
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||
word_result["vocab_entries"] = entries
|
||||
word_result["entries"] = entries
|
||||
word_result["entry_count"] = len(entries)
|
||||
word_result["summary"]["total_entries"] = len(entries)
|
||||
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
|
||||
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
|
||||
|
||||
await update_session_db(session_id, word_result=word_result, current_step=8)
|
||||
cached["word_result"] = word_result
|
||||
|
||||
logger.info(f"OCR Pipeline: words-first session {session_id}: "
|
||||
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
|
||||
|
||||
await _append_pipeline_log(session_id, "words", {
|
||||
"grid_method": "words_first",
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": word_result["summary"]["non_empty_cells"],
|
||||
"ocr_engine": used_engine,
|
||||
"layout": word_result["layout"],
|
||||
}, duration_ms=int(duration * 1000))
|
||||
|
||||
return {"session_id": session_id, **word_result}
|
||||
|
||||
if stream:
|
||||
# Cell-First OCR v2: use batch-then-stream approach instead of
|
||||
# per-cell streaming. The parallel ThreadPoolExecutor in
|
||||
@@ -2001,7 +2102,7 @@ async def detect_words(
|
||||
},
|
||||
)
|
||||
|
||||
# --- Non-streaming path ---
|
||||
# --- Non-streaming path (grid_method=v2) ---
|
||||
t0 = time.time()
|
||||
|
||||
# Create binarized OCR image (for Tesseract)
|
||||
|
||||
New file: klausur-service/backend/tests/test_cv_words_first.py (214 lines)
@@ -0,0 +1,214 @@
|
||||
"""Tests for cv_words_first.py — Words-First Grid Builder."""
|
||||
|
||||
import pytest
|
||||
from cv_words_first import (
|
||||
_assign_word_to_column,
|
||||
_assign_word_to_row,
|
||||
_build_cells,
|
||||
_cluster_columns,
|
||||
_cluster_rows,
|
||||
build_grid_from_words,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
|
||||
"""Create a synthetic word dict."""
|
||||
return {
|
||||
'text': text,
|
||||
'left': left,
|
||||
'top': top,
|
||||
'width': width,
|
||||
'height': height,
|
||||
'conf': conf,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_columns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterColumns:
    """Column clustering: words are grouped by horizontal (X-gap) proximity."""

    def test_single_column_freetext(self):
        """Free-flowing text with no large X-gaps yields a single 'column_text'."""
        coords = [("Hello", 50, 10), ("world", 120, 10),
                  ("this", 50, 40), ("is", 120, 40), ("text", 190, 40)]
        words = [_word(t, x, y) for t, x, y in coords]
        cols = _cluster_columns(words, img_w=400)
        assert len(cols) == 1
        assert cols[0]['type'] == 'column_text'

    def test_two_columns(self):
        """A wide horizontal gap between two word groups produces two columns."""
        words = [_word(t, x, y) for t, x, y in
                 [("apple", 20, 10), ("Apfel", 300, 10),
                  ("dog", 20, 40), ("Hund", 300, 40)]]
        cols = _cluster_columns(words, img_w=500)
        assert len(cols) == 2
        assert [c['type'] for c in cols] == ['column_1', 'column_2']

    def test_three_columns(self):
        """Three well-separated word groups are detected as three columns."""
        words = [
            _word("1", 10, 10, width=20),
            _word("apple", 100, 10),
            _word("Apfel", 400, 10),
            _word("2", 10, 40, width=20),
            _word("dog", 100, 40),
            _word("Hund", 400, 40),
        ]
        assert len(_cluster_columns(words, img_w=600)) == 3

    def test_empty_words(self):
        """An empty word list clusters to an empty column list."""
        assert _cluster_columns([], img_w=500) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_rows
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterRows:
    """Row clustering: words are grouped by vertical (Y) proximity."""

    def test_two_rows(self):
        """Words at two distinct Y-levels form two rows, ordered top to bottom."""
        words = [_word(t, x, y) for t, x, y in
                 [("hello", 10, 20), ("world", 100, 25),
                  ("foo", 10, 80), ("bar", 100, 82)]]
        rows = _cluster_rows(words)
        assert len(rows) == 2
        # Rows must come back sorted top-down.
        assert rows[0]['y_min'] < rows[1]['y_min']

    def test_single_row(self):
        """Near-identical baselines merge into a single row."""
        words = [_word("a", 10, 50), _word("b", 80, 52), _word("c", 150, 51)]
        assert len(_cluster_rows(words)) == 1

    def test_empty(self):
        """No words means no rows."""
        assert _cluster_rows([]) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_grid_from_words (integration)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBuildGridFromWords:
    """End-to-end tests for the words-first grid builder entry point."""

    def test_two_column_vocab(self):
        """A 2-column / 3-row vocabulary layout yields a full 3x2 cell grid."""
        pairs = [("apple", "Apfel", 20, 22),
                 ("dog", "Hund", 60, 62),
                 ("cat", "Katze", 100, 102)]
        words = []
        for en, de, y_en, y_de in pairs:
            words.append(_word(en, 50, y_en))
            words.append(_word(de, 400, y_de))
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)

        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows x 2 columns
        # Cell ids follow the RNN_CN naming convention.
        ids = {c['cell_id'] for c in cells}
        assert {'R00_C0', 'R00_C1'} <= ids

    def test_single_column_freetext(self):
        """Plain running text collapses to one 'column_text' with a cell per line."""
        words = [_word("Hello", 50, 20), _word("world", 120, 22),
                 _word("Second", 50, 60), _word("line", 120, 62)]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)

        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # one cell per text line

    def test_empty_input(self):
        """No input words produce an empty grid and no column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words under the min_confidence threshold never reach the grid."""
        words = [_word("good", 50, 20, conf=90), _word("bad", 200, 20, conf=10)]
        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
        # Only the high-confidence word survives as a cell.
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """bbox_pct converts pixel geometry into page-relative percentages."""
        cells, _ = build_grid_from_words([_word("test", 200, 100, width=100, height=30)],
                                         img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # 200/1000, 100/500, 100/1000, 30/500 — each scaled to percent.
        assert (bp['x'], bp['y'], bp['w'], bp['h']) == (20.0, 20.0, 10.0, 6.0)

    def test_columns_meta_format(self):
        """columns_meta entries carry the same keys as build_cell_grid_v2 output."""
        words = [_word("a", 50, 20), _word("b", 400, 20)]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        for col in cols_meta:
            assert {'index', 'type', 'x', 'width'} <= set(col)

    def test_word_boxes_included(self):
        """Every cell exposes its member words as percent-coordinate word_boxes."""
        words = [_word("hello", 50, 20), _word("world", 120, 22)]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        boxes = cells[0].get('word_boxes', [])
        assert len(boxes) == 2
        for box in boxes:
            assert 'left' in box
            assert 'top' in box
            assert 'text' in box

    def test_all_whitespace_filtered(self):
        """Whitespace-only OCR words are dropped before grid construction."""
        words = [_word("   ", 50, 20, conf=90), _word("hello", 200, 20, conf=90)]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'
|
||||
Reference in New Issue
Block a user