feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s

Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes
direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an
Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig.

- cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words
- ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint
- StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode
- OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus
- 15 Unit-Tests fuer cv_words_first

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-12 06:46:05 +01:00
parent 2fdf3ff868
commit ced5bb3dd3
6 changed files with 854 additions and 34 deletions

View File

@@ -63,6 +63,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto') const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
const [usedEngine, setUsedEngine] = useState<string>('') const [usedEngine, setUsedEngine] = useState<string>('')
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british') const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
// Streaming progress state // Streaming progress state
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null) const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
@@ -112,7 +113,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
let res: Response | null = null let res: Response | null = null
for (let attempt = 0; attempt < 2; attempt++) { for (let attempt = 0; attempt < 2; attempt++) {
res = await fetch( res = await fetch(
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}`, `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=${gridMethod === 'v2' ? 'true' : 'false'}&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}&grid_method=${gridMethod}`,
{ method: 'POST' }, { method: 'POST' },
) )
if (res.ok) break if (res.ok) break
@@ -128,6 +129,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
throw new Error(err.detail || 'Worterkennung fehlgeschlagen') throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
} }
// words_first returns plain JSON (no streaming)
if (gridMethod === 'words_first') {
const data = await res.json() as GridResult
applyGridResult(data)
return
}
const reader = res.body!.getReader() const reader = res.body!.getReader()
const decoder = new TextDecoder() const decoder = new TextDecoder()
let buffer = '' let buffer = ''
@@ -220,7 +228,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
setDetecting(false) setDetecting(false)
} }
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId, ocrEngine, pronunciation]) }, [sessionId, ocrEngine, pronunciation, gridMethod])
const handleGroundTruth = useCallback(async (isCorrect: boolean) => { const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
if (!sessionId) return if (!sessionId) return
@@ -789,6 +797,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
{gridResult && ( {gridResult && (
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3"> <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
<div className="flex items-center gap-3 flex-wrap"> <div className="flex items-center gap-3 flex-wrap">
{/* Grid method selector */}
<select
value={gridMethod}
onChange={(e) => setGridMethod(e.target.value as 'v2' | 'words_first')}
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
>
<option value="v2">Standard (v2)</option>
<option value="words_first">Words-First</option>
</select>
{/* OCR Engine selector */} {/* OCR Engine selector */}
<select <select
value={ocrEngine} value={ocrEngine}

View File

@@ -1,6 +1,6 @@
# OCR Pipeline - Schrittweise Seitenrekonstruktion # OCR Pipeline - Schrittweise Seitenrekonstruktion
**Version:** 4.1.0 **Version:** 4.3.0
**Status:** Produktiv (Schritte 110 implementiert) **Status:** Produktiv (Schritte 110 implementiert)
**URL:** https://macmini:3002/ai/ocr-pipeline **URL:** https://macmini:3002/ai/ocr-pipeline
@@ -22,7 +22,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert | | 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert | | 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert | | 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert | | 7 | Worterkennung | Hybrid-Grid (v2) oder Words-First (bottom-up) | Implementiert |
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert | | 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert | | 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert | | 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
@@ -82,28 +82,29 @@ flowchart TD
detect_document_type() detect_document_type()
┌───────────────────────────────────┐ ┌──────────────────┼──────────────────┐
FULL-TEXT PFAD CELL-FIRST PFAD FULL-TEXT PFAD WORDS-FIRST PFAD CELL-FIRST PFAD
(pipeline='full_page') (pipeline='cell_first') (pipeline= (grid_method= (grid_method=
│ │ 'full_page') 'words_first') 'v2', default)
Keine Spalten/Zeilen Spaltenerkennung │ │
analyze_layout_by_words() detect_column_geometry() Keine Spalten/ Tesseract Full-Page Spaltenerkennung
Lese-Reihenfolge _detect_sub_columns() Zeilen word_boxes detect_column_geometry()
│ expand_narrow_columns() analyze_layout_ _cluster_columns() _detect_sub_columns()
│ Zeilenerkennung by_words() _cluster_rows() expand_narrow_columns()
detect_row_geometry() _build_cells() Zeilenerkennung
detect_row_geometry()
build_cell_grid_v2() build_grid_from_ │
words() build_cell_grid_v2()
┌─────────┴──────────┐
▼ ▼ │ ┌─────────┴──────────┐
Breite Spalten Schmale Spalten ▼ ▼
(>= 15% Breite) (< 15% Breite) │ Breite Spalten Schmale Spalten
Full-Page Words Cell-Crop OCR (>= 15% Breite) (< 15% Breite)
word_lookup cell_crop_v2 Full-Page Words Cell-Crop OCR
│ │ │ word_lookup cell_crop_v2
└───────────────────────────┴────────────────────┘ │ │ │ │
└──────────────────┴────┴────────────────────┘
Post-Processing Pipeline Post-Processing Pipeline
(Lautschrift, Komma-Split, etc.) (Lautschrift, Komma-Split, etc.)
@@ -147,6 +148,8 @@ klausur-service/backend/
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen │ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10) ├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4) ├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
├── cv_words_first.py # Words-First Grid Builder (bottom-up)
├── page_crop.py # Content-basierter Crop-Algorithmus ├── page_crop.py # Content-basierter Crop-Algorithmus
├── ocr_pipeline_session_store.py # PostgreSQL Persistence ├── ocr_pipeline_session_store.py # PostgreSQL Persistence
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export ├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
@@ -169,7 +172,8 @@ admin-lehrer/
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung ├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
├── StepWordRecognition.tsx # Schritt 7: Worterkennung ├── StepWordRecognition.tsx # Schritt 7: Worterkennung
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream) ├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas) ├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay)
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
├── FabricReconstructionCanvas.tsx # Fabric.js Editor ├── FabricReconstructionCanvas.tsx # Fabric.js Editor
└── StepGroundTruth.tsx # Schritt 10: Validierung └── StepGroundTruth.tsx # Schritt 10: Validierung
``` ```
@@ -257,10 +261,20 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
|---------|------|--------------| |---------|------|--------------|
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen | | `POST` | `/sessions/{id}/words` | Wort-Grid erstellen |
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern | | `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen | | `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
**Query-Parameter fuer `/sessions/{id}/words`:**
| Parameter | Default | Beschreibung |
|-----------|---------|--------------|
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
### Schritt 8: Korrektur ### Schritt 8: Korrektur
| Methode | Pfad | Beschreibung | | Methode | Pfad | Beschreibung |
@@ -513,6 +527,12 @@ Horizontale Projektionsprofile finden Zeilen-Luecken; word-level Validierung ver
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
abgeschnitten wird. abgeschnitten wird.
3. **Box-Boundary-Schutz** (`box_ranges_inner`, neu in v4.2):
Bei Seiten mit Box-Zonen (Sub-Sessions) werden Zeilen am Box-Rand nicht faelschlich
ausgeschlossen. Das Problem: Die letzte Textzeile ueber einer Box ueberlappt haeufig
mit dem Box-Rahmen. Loesung: Die Exclusion-Zone wird um `max(border_thickness, 5px)`
geschrumpft, sodass nur Zeilen **innerhalb** der Box ausgeschlossen werden.
```python ```python
def _is_artifact_row(row: RowGeometry) -> bool: def _is_artifact_row(row: RowGeometry) -> bool:
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen.""" """Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
@@ -524,13 +544,128 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
... ...
``` ```
### Box-Zonen und Content-Strips (Detail)
Seiten mit Box-Bereichen (z.B. Grammatik-Tipps, Uebungsboxen) werden in Zonen aufgeteilt:
```
┌──────────────────────────┐
│ Content Zone 0 (Zeilen) │ ← Vokabeltabelle oben
├──────────────────────────┤
│ ███ Box Zone (border) ███│ ← Sub-Session mit eigener OCR
├──────────────────────────┤
│ Content Zone 2 (Zeilen) │ ← Vokabeltabelle unten
└──────────────────────────┘
```
**Content-Strip-Verfahren** (`detect_rows` in `ocr_pipeline_api.py`):
1. Box-Zonen identifizieren, `box_ranges_inner` berechnen (geschrumpft um Border-Dicke)
2. Content-Strips = Seitenbereiche **ohne** Box-Inneres, vertikal gestapelt
3. Zeilenerkennung auf gestapeltem Bild, Y-Koordinaten zurueckgemappt
4. Wort-Filterung: Woerter in Box-Innerem werden ausgeschlossen
**Wichtig:** `box_ranges_inner` (nicht `box_ranges`) wird verwendet, damit
Zeilen am Box-Rand nicht abgeschnitten werden. Minimum 5px Margin.
--- ---
## Schritt 7: Worterkennung — Hybrid-Grid (Detail) ## Schritt 7: Worterkennung (Detail)
### Algorithmus: `build_cell_grid_v2()` Schritt 7 bietet zwei Grid-Strategien, auswaehlbar per `grid_method`-Parameter:
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter, | Strategie | Parameter | Ansatz | Benoetigt Spalten/Zeilen? |
|-----------|-----------|--------|--------------------------|
| **Hybrid-Grid v2** | `grid_method=v2` (Default) | Top-down: Spalten → Zeilen → Zellen → OCR | Ja (Schritte 5+6) |
| **Words-First** | `grid_method=words_first` | Bottom-up: Woerter → Spalten clustern → Zeilen clustern → Zellen | Nein |
---
### Words-First Grid Builder: `build_grid_from_words()`
**Datei:** `cv_words_first.py`
Der Words-First Builder arbeitet bottom-up: Er nimmt die pixelgenauen `word_boxes` aus einem
Tesseract Full-Page-Lauf und clustert sie direkt zu Spalten und Zeilen — ohne die
vorherige Spalten-/Zeilenerkennung (Schritte 5+6) zu benoetigen.
#### Algorithmus
```
Eingabe: word_dicts (flat list), img_w, img_h
┌───────────┴───────────┐
│ 1. Confidence-Filter │
│ conf >= 30 │
│ Whitespace entf. │
└───────────┬───────────┘
┌───────────┴───────────┐
│ 2. _cluster_columns() │
│ X-Gap-Analyse │
│ Schwelle: median_h │
× 3 (min 3% Breite)│
└───────────┬───────────┘
┌───────────┴───────────┐
│ 3. _cluster_rows() │
│ Y-Proximity-Grupp. │
│ Toleranz: median_h │
│ / 2 │
└───────────┬───────────┘
┌───────────┴───────────┐
│ 4. _build_cells() │
│ Wort → (col, row) │
│ Text + bbox + conf │
│ word_boxes pro Zelle│
└───────────┬───────────┘
Ausgabe: cells[], columns_meta[]
(identisch zu build_cell_grid_v2)
```
#### Spalten-Clustering
1. Alle Woerter nach X-Mitte sortieren
2. Aufeinanderfolgende X-Gaps berechnen
3. Adaptiver Schwellwert: `median_word_height × 3` (min 3% Bildbreite)
4. Gaps > Schwellwert = Spaltengrenzen
5. Kein Gap gefunden → 1 Spalte (`column_text`)
6. Spaltentypen: `column_1`, `column_2`, ... (generisch, positionsbasiert)
#### Zeilen-Clustering
1. Woerter zu visuellen Zeilen gruppieren (Y-Toleranz: halbe Worthoehe)
2. Jede visuelle Zeile = eine Zeile im Grid
3. Sortiert von oben nach unten
#### Edge Cases
| Fall | Behandlung |
|------|------------|
| Einzelne Spalte (Fliesstext) | Kein X-Gap → 1 Spalte `column_text` |
| Keine Woerter erkannt | Leeres Ergebnis `([], [])` |
| Ueberschriften (grosse Schrift) | Eigene Zeile durch Y-Gap |
| Bilder/Grafiken | Keine Woerter → automatisch leerer Bereich |
| Schmale Spalten (Seitenzahlen) | Eigene Spalte durch X-Gap |
#### Vergleich v2 vs. Words-First
| Kriterium | v2 (Top-Down) | Words-First (Bottom-Up) |
|-----------|---------------|------------------------|
| **Abhaengigkeiten** | Spalten + Zeilen noetig | Nur Tesseract-Woerter |
| **Spaltentypen** | Semantisch (EN, DE, ...) | Positionsbasiert (1, 2, ...) |
| **OCR** | Hybrid (full-page + cell-crop) | Nur full-page Tesseract |
| **Robustheit** | Abhaengig von Spalten-/Zeilenerkennung | Direkt aus Wortpositionen |
| **Geschwindigkeit** | Langsamer (cell-crop pro Zelle) | Schneller (kein zusaetzlicher OCR-Lauf, nur full-page) |
| **Genauigkeit** | Besser bei schmalen Spalten | Besser bei ungewoehnlichen Layouts |
---
### Hybrid-Grid v2: `build_cell_grid_v2()`
Schritt 7 nutzt im Default eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet. schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
!!! success "Warum Hybrid?" !!! success "Warum Hybrid?"
@@ -692,7 +827,7 @@ Change-Format:
## Schritt 9: Rekonstruktion (Detail) ## Schritt 9: Rekonstruktion (Detail)
Zwei Modi verfuegbar: Drei Modi verfuegbar:
### Einfacher Modus ### Einfacher Modus
@@ -709,6 +844,73 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
- Zoom 50200 % - Zoom 50200 %
- Per-Zell-Reset-Button bei geaenderten Zellen - Per-Zell-Reset-Button bei geaenderten Zellen
### Overlay-Modus (neu in v4.2)
Ganzseitige Tabellenrekonstruktion mit **Pixel-basierter Wortpositionierung**.
Nur verfuegbar bei Parent-Sessions mit Sub-Sessions (Box-Bereiche).
**Funktionsweise:**
1. **Sub-Session-Merging:** Zellen aus Sub-Sessions werden koordinaten-konvertiert
und in die Parent-Session eingefuegt. Die Umrechnung laeuft ueber die Box-Zone:
```
parentCellX = boxXPct + (subCell.bbox_pct.x / 100) * boxWPct
parentCellY = boxYPct + (subCell.bbox_pct.y / 100) * boxHPct
```
2. **180°-Rotation:** Bei Parent-Sessions mit Boxen wird das Bild standardmaessig
180° gedreht, da der Scan haeufig kopfueber vorliegt. Die Pixel-Analyse
arbeitet auf dem rotierten Bild:
- Canvas: `ctx.translate(W, H); ctx.rotate(Math.PI)`
- Zell-Koordinaten: `(100 - x - w, 100 - y - h)` fuer rotiertes Space
- Cluster-Ruecktransformation: `start → cw-1-end`, danach `reverse()`
3. **Pixel-Wortpositionierung:** Der `usePixelWordPositions` Hook analysiert
dunkle Pixel per vertikaler Projektion, findet Wortgruppen-Cluster und
berechnet die exakte horizontale Position + Auto-Schriftgroesse.
**Layout:** 50/50 Grid (links Originalbild, rechts Rekonstruktion)
**Toolbar:**
- Schriftgroessen-Slider (30120%)
- Bold-Toggle
- 180°-Rotations-Toggle
- Speichern-Button
**Visuelle Elemente:**
- Spaltenlinien (aus `column_result.columns`)
- Zeilenlinien (aus `row_result.rows`)
- Box-Zonen-Markierung (blau, halbtransparent)
- Editierbare Inputs an Pixel-Positionen
### Shared Hook: `usePixelWordPositions`
Extrahierter Hook fuer die Pixel-basierte Wortpositionierung, genutzt in
StepLlmReview (Schritt 8) und StepReconstruction (Schritt 9).
```typescript
function usePixelWordPositions(
imageUrl: string,
cells: GridCell[],
active: boolean,
rotation: 0 | 180 = 0,
): Map<string, WordPosition[]>
```
**Algorithmus:**
1. Bild in offscreen Canvas laden (optional 180° gedreht)
2. Pro Zelle: `getImageData()` → vertikale Projektion (dunkle Pixel pro Spalte)
3. Cluster-Erkennung (Schwelle: 3% der Zellhoehe, Gap: 2% der Zellbreite)
4. Bei Rotation: Cluster zurueck ins Original-Koordinatensystem spiegeln
5. Text-Gruppen (split bei 3+ Leerzeichen) auf Cluster matchen
6. Auto-Schriftgroesse per `measureText()` + `fontRatio`
7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden
**Rueckgabe:** `Map<cell_id, WordPosition[]>` mit `xPct`, `wPct`, `text`, `fontRatio`
### Fabric.js Editor ### Fabric.js Editor
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`): Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
@@ -861,6 +1063,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
| Datum | Version | Aenderung | | Datum | Version | Aenderung |
|-------|---------|----------| |-------|---------|----------|
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung | | 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint | | 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade | | 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |

View File

@@ -34,3 +34,4 @@ from cv_ocr_engines import ( # noqa: F401
_fix_phonetic_brackets, _fix_phonetic_brackets,
) )
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401 from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
from cv_words_first import build_grid_from_words # noqa: F401

View File

@@ -0,0 +1,282 @@
"""
Words-First Grid Builder (Bottom-Up).
Builds a cell grid from Tesseract word_boxes directly, without requiring
pre-detected columns or rows. Algorithm:
1. Cluster words into columns by X-gap analysis
2. Cluster words into rows by Y-proximity
3. Build cells at (column, row) intersections
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import statistics
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# 1. Column clustering
# ---------------------------------------------------------------------------
def _cluster_columns(
words: List[Dict],
img_w: int,
min_gap_pct: float = 3.0,
) -> List[Dict[str, Any]]:
"""Cluster words into columns by finding large horizontal gaps.
Returns a list of column dicts:
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
sorted left-to-right.
"""
if not words:
return []
# Sort by X center
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
# Collect word heights to compute adaptive threshold
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
median_h = statistics.median(heights) if heights else 30
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
# Find X-gap boundaries between consecutive words (sorted by X-center)
# For each word, compute right edge; for next word, compute left edge
boundaries: List[float] = [] # X positions where columns split
for i in range(len(sorted_w) - 1):
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
left_edge = sorted_w[i + 1]['left']
gap = left_edge - right_edge
if gap > min_gap_px:
# Split point is midway through the gap
boundaries.append((right_edge + left_edge) / 2)
# Build column ranges from boundaries
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
col_edges = [0.0] + boundaries + [float(img_w)]
columns = []
for ci in range(len(col_edges) - 1):
columns.append({
'index': ci,
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
'x_min': col_edges[ci],
'x_max': col_edges[ci + 1],
})
return columns
# ---------------------------------------------------------------------------
# 2. Row clustering
# ---------------------------------------------------------------------------
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Group words into visual rows via Y-proximity clustering.

    The Y-tolerance is half the median word height (minimum 5px), delegating
    the actual line grouping to _group_words_into_lines().

    Returns:
        Row dicts sorted top-to-bottom:
        [{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
    """
    if not words:
        return []

    # Derive the clustering tolerance from the typical word height.
    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    typical_height = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(typical_height * 0.5, 5)

    grouped = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    result = []
    for idx, members in enumerate(grouped):
        top = min(w['top'] for w in members)
        bottom = max(w['top'] + w['height'] for w in members)
        result.append({
            'index': idx,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })
    return result
# ---------------------------------------------------------------------------
# 3. Build cells
# ---------------------------------------------------------------------------
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
"""Return column index for a word based on its X-center."""
x_center = word['left'] + word['width'] / 2
for col in columns:
if col['x_min'] <= x_center < col['x_max']:
return col['index']
# Fallback: nearest column
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
"""Return row index for a word based on its Y-center."""
y_center = word['top'] + word['height'] / 2
# Find the row whose y_range contains this word's center
for row in rows:
if row['y_min'] <= y_center <= row['y_max']:
return row['index']
# Fallback: nearest row by Y-center
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
def _build_cells(
    words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build cell dicts by bucketing words into (column, row) intersections.

    Args:
        words: Word dicts with absolute pixel coordinates.
        columns: Column dicts from _cluster_columns(); their 'index' equals
            their list position.
        rows: Row dicts from _cluster_rows().
        img_w: Image width in pixels (for percent conversion).
        img_h: Image height in pixels (for percent conversion).

    Returns:
        Cell dicts in the same shape as build_cell_grid_v2() output, ordered
        row-major (row_index, col_index). Intersections with no words
        produce no cell.
    """
    if not columns or not rows:
        return []

    # Bucket words into (col_idx, row_idx) pairs.
    buckets: Dict[Tuple[int, int], List[Dict]] = {}
    for w in words:
        ci = _assign_word_to_column(w, columns)
        ri = _assign_word_to_row(w, rows)
        buckets.setdefault((ci, ri), []).append(w)

    cells = []
    # Iterate row-major so the cell list follows reading order.
    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
        # Safe positional lookup: column 'index' equals its list position.
        # (The original also fetched rows[ri] into an unused local; removed.)
        col = columns[ci]

        # Tight bbox from the actual word extents (not the full grid cell).
        x_min = min(w['left'] for w in cell_words)
        y_min = min(w['top'] for w in cell_words)
        x_max = max(w['left'] + w['width'] for w in cell_words)
        y_max = max(w['top'] + w['height'] for w in cell_words)
        bw = x_max - x_min
        bh = y_max - y_min

        # Cell text in reading order; Y-tolerance scales with bbox height so
        # multi-line cells keep their internal line grouping.
        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))

        # Average confidence over words with a positive confidence value.
        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
        avg_conf = sum(confs) / len(confs) if confs else 0.0

        # Per-word boxes in percent coordinates, sorted top-to-bottom then
        # left-to-right. Guard against zero image dimensions.
        word_boxes = []
        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
            word_boxes.append({
                'text': w.get('text', ''),
                'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
                'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
                'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
                'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
                'conf': w.get('conf', 0),
            })

        cells.append({
            'cell_id': f"R{ri:02d}_C{ci}",
            'row_index': ri,
            'col_index': ci,
            'col_type': col['type'],
            'text': text,
            'confidence': round(avg_conf, 1),
            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
            'bbox_pct': {
                'x': round(x_min / img_w * 100, 2) if img_w else 0,
                'y': round(y_min / img_h * 100, 2) if img_h else 0,
                'w': round(bw / img_w * 100, 2) if img_w else 0,
                'h': round(bh / img_h * 100, 2) if img_h else 0,
            },
            'word_boxes': word_boxes,
            'ocr_engine': 'words_first',
            'is_bold': False,
        })
    return cells
# ---------------------------------------------------------------------------
# 4. Public API
# ---------------------------------------------------------------------------
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Args:
        word_dicts: Flat list of word dicts with keys text, left, top,
            width, height, conf (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Drop low-confidence and whitespace-only words up front.
    def _keep(w: Dict) -> bool:
        return w.get('conf', 0) >= min_confidence and bool(w.get('text', '').strip())

    words = [w for w in word_dicts if _keep(w)]
    if not words:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []
    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

    # Step 1: cluster columns by X-gap analysis.
    columns = _cluster_columns(words, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: cluster rows by Y-proximity.
    rows = _cluster_rows(words)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells at the (column, row) intersections.
    cells = _build_cells(words, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Emit columns_meta in the same format as build_cell_grid_v2.
    columns_meta = [
        {
            'index': col['index'],
            'type': col['type'],
            'x': int(col['x_min']),
            'width': int(col['x_max'] - col['x_min']),
        }
        for col in columns
    ]
    return cells, columns_meta

View File

@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
render_image_high_res, render_image_high_res,
render_pdf_high_res, render_pdf_high_res,
) )
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import ( from ocr_pipeline_session_store import (
create_session_db, create_session_db,
delete_all_sessions_db, delete_all_sessions_db,
@@ -1859,6 +1860,7 @@ async def detect_words(
pronunciation: str = "british", pronunciation: str = "british",
stream: bool = False, stream: bool = False,
skip_heal_gaps: bool = False, skip_heal_gaps: bool = False,
grid_method: str = "v2",
): ):
"""Build word grid from columns × rows, OCR each cell. """Build word grid from columns × rows, OCR each cell.
@@ -1868,6 +1870,9 @@ async def detect_words(
stream: false (default) for JSON response, true for SSE streaming stream: false (default) for JSON response, true for SSE streaming
skip_heal_gaps: false (default). When true, cells keep exact row geometry skip_heal_gaps: false (default). When true, cells keep exact row geometry
positions without gap-healing expansion. Better for overlay rendering. positions without gap-healing expansion. Better for overlay rendering.
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
'v2' uses pre-detected columns/rows (top-down).
'words_first' clusters words bottom-up (no column/row detection needed).
""" """
if session_id not in _cache: if session_id not in _cache:
logger.info("detect_words: session %s not in cache, loading from DB", session_id) logger.info("detect_words: session %s not in cache, loading from DB", session_id)
@@ -1902,7 +1907,7 @@ async def detect_words(
"duration_seconds": 0, "duration_seconds": 0,
} }
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp) logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
if not row_result or not row_result.get("rows"): if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
raise HTTPException(status_code=400, detail="Row detection must be completed first") raise HTTPException(status_code=400, detail="Row detection must be completed first")
# Convert column dicts back to PageRegion objects # Convert column dicts back to PageRegion objects
@@ -1983,6 +1988,102 @@ async def detect_words(
if excluded: if excluded:
logger.info(f"detect_words: excluded {excluded} rows inside box zones") logger.info(f"detect_words: excluded {excluded} rows inside box zones")
# --- Words-First path: bottom-up grid from word boxes ---
if grid_method == "words_first":
t0 = time.time()
img_h, img_w = dewarped_bgr.shape[:2]
# Get word_dicts from cache or run Tesseract full-page
wf_word_dicts = cached.get("_word_dicts")
if wf_word_dicts is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
if geo_result is not None:
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
cached["_word_dicts"] = wf_word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
if not wf_word_dicts:
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
# Convert word coordinates to absolute image coordinates if needed
# (detect_column_geometry returns words relative to content ROI)
content_bounds = cached.get("_content_bounds")
if content_bounds:
lx, _rx, ty, _by = content_bounds
abs_words = []
for w in wf_word_dicts:
abs_words.append({
**w,
'left': w['left'] + lx,
'top': w['top'] + ty,
})
wf_word_dicts = abs_words
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
duration = time.time() - t0
# Apply IPA phonetic fixes
fix_cell_phonetics(cells, pronunciation=pronunciation)
# Add zone_index for backward compat
for cell in cells:
cell.setdefault("zone_index", 0)
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
n_cols = len(columns_meta)
used_engine = "words_first"
word_result = {
"cells": cells,
"grid_shape": {
"rows": n_rows,
"cols": n_cols,
"total_cells": len(cells),
},
"columns_used": columns_meta,
"layout": "vocab" if is_vocab else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": round(duration, 2),
"ocr_engine": used_engine,
"grid_method": "words_first",
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
},
}
if is_vocab or 'column_text' in col_types:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
word_result["vocab_entries"] = entries
word_result["entries"] = entries
word_result["entry_count"] = len(entries)
word_result["summary"]["total_entries"] = len(entries)
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline: words-first session {session_id}: "
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
await _append_pipeline_log(session_id, "words", {
"grid_method": "words_first",
"total_cells": len(cells),
"non_empty_cells": word_result["summary"]["non_empty_cells"],
"ocr_engine": used_engine,
"layout": word_result["layout"],
}, duration_ms=int(duration * 1000))
return {"session_id": session_id, **word_result}
if stream: if stream:
# Cell-First OCR v2: use batch-then-stream approach instead of # Cell-First OCR v2: use batch-then-stream approach instead of
# per-cell streaming. The parallel ThreadPoolExecutor in # per-cell streaming. The parallel ThreadPoolExecutor in
@@ -2001,7 +2102,7 @@ async def detect_words(
}, },
) )
# --- Non-streaming path --- # --- Non-streaming path (grid_method=v2) ---
t0 = time.time() t0 = time.time()
# Create binarized OCR image (for Tesseract) # Create binarized OCR image (for Tesseract)

View File

@@ -0,0 +1,214 @@
"""Tests for cv_words_first.py — Words-First Grid Builder."""
import pytest
from cv_words_first import (
_assign_word_to_column,
_assign_word_to_row,
_build_cells,
_cluster_columns,
_cluster_rows,
build_grid_from_words,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
"""Create a synthetic word dict."""
return {
'text': text,
'left': left,
'top': top,
'width': width,
'height': height,
'conf': conf,
}
# ---------------------------------------------------------------------------
# _cluster_columns
# ---------------------------------------------------------------------------
class TestClusterColumns:
    """X-gap clustering of word boxes into columns."""

    def test_single_column_freetext(self):
        """Evenly spread words collapse into a single free-text column."""
        boxes = [
            _word("Hello", 50, 10), _word("world", 120, 10),
            _word("this", 50, 40), _word("is", 120, 40), _word("text", 190, 40),
        ]
        clustered = _cluster_columns(boxes, img_w=400)
        assert len(clustered) == 1
        assert clustered[0]['type'] == 'column_text'

    def test_two_columns(self):
        """A wide X-gap between two word groups splits them into two columns."""
        boxes = [
            _word("apple", 20, 10), _word("Apfel", 300, 10),
            _word("dog", 20, 40), _word("Hund", 300, 40),
        ]
        clustered = _cluster_columns(boxes, img_w=500)
        # Implies both the count (2) and the ordered type labels.
        assert [c['type'] for c in clustered] == ['column_1', 'column_2']

    def test_three_columns(self):
        """Three groups separated by wide gaps become three columns."""
        boxes = [
            _word("1", 10, 10, width=20), _word("apple", 100, 10), _word("Apfel", 400, 10),
            _word("2", 10, 40, width=20), _word("dog", 100, 40), _word("Hund", 400, 40),
        ]
        assert len(_cluster_columns(boxes, img_w=600)) == 3

    def test_empty_words(self):
        """An empty word list yields no columns."""
        assert _cluster_columns([], img_w=500) == []
# ---------------------------------------------------------------------------
# _cluster_rows
# ---------------------------------------------------------------------------
class TestClusterRows:
    """Y-proximity clustering of word boxes into rows."""

    def test_two_rows(self):
        """Two distinct Y-levels produce two rows, ordered top to bottom."""
        boxes = [
            _word("hello", 10, 20), _word("world", 100, 25),
            _word("foo", 10, 80), _word("bar", 100, 82),
        ]
        clustered = _cluster_rows(boxes)
        assert len(clustered) == 2
        # Rows come back sorted by vertical position.
        assert clustered[0]['y_min'] < clustered[1]['y_min']

    def test_single_row(self):
        """Nearly identical Y values merge into a single row."""
        boxes = [_word(t, x, y) for t, x, y in [("a", 10, 50), ("b", 80, 52), ("c", 150, 51)]]
        assert len(_cluster_rows(boxes)) == 1

    def test_empty(self):
        """An empty word list yields no rows."""
        assert _cluster_rows([]) == []
# ---------------------------------------------------------------------------
# build_grid_from_words (integration)
# ---------------------------------------------------------------------------
class TestBuildGridFromWords:
    """End-to-end checks for the bottom-up words-first grid builder."""

    def test_two_column_vocab(self):
        """A 2-column vocabulary page with 3 rows yields a full 3x2 grid."""
        page = [
            _word("apple", 50, 20), _word("Apfel", 400, 22),
            _word("dog", 50, 60), _word("Hund", 400, 62),
            _word("cat", 50, 100), _word("Katze", 400, 102),
        ]
        cells, cols_meta = build_grid_from_words(page, img_w=600, img_h=200)
        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows x 2 cols
        # cell_id encodes row and column indices.
        ids = {c['cell_id'] for c in cells}
        assert 'R00_C0' in ids and 'R00_C1' in ids

    def test_single_column_freetext(self):
        """Free text collapses to one column_text column with one cell per line."""
        page = [
            _word("Hello", 50, 20), _word("world", 120, 22),
            _word("Second", 50, 60), _word("line", 120, 62),
        ]
        cells, cols_meta = build_grid_from_words(page, img_w=300, img_h=150)
        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # two lines, one column

    def test_empty_input(self):
        """No words means an empty grid and no columns."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == [] and cols == []

    def test_low_confidence_filtered(self):
        """Words under min_confidence never reach the grid."""
        cells, cols = build_grid_from_words(
            [_word("good", 50, 20, conf=90), _word("bad", 200, 20, conf=10)],
            img_w=400, img_h=100, min_confidence=30,
        )
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """bbox_pct values are pixel coordinates converted to page percentages."""
        cells, _ = build_grid_from_words(
            [_word("test", 200, 100, width=100, height=30)], img_w=1000, img_h=500,
        )
        assert len(cells) == 1
        box = cells[0]['bbox_pct']
        # 200/1000, 100/500, 100/1000, 30/500 — each scaled to percent.
        assert (box['x'], box['y'], box['w'], box['h']) == (20.0, 20.0, 10.0, 6.0)

    def test_columns_meta_format(self):
        """columns_meta entries mirror the build_cell_grid_v2 schema."""
        _, cols_meta = build_grid_from_words(
            [_word("a", 50, 20), _word("b", 400, 20)], img_w=600, img_h=100,
        )
        for col in cols_meta:
            assert {'index', 'type', 'x', 'width'} <= col.keys()

    def test_word_boxes_included(self):
        """Each cell carries per-word boxes with position and text."""
        cells, _ = build_grid_from_words(
            [_word("hello", 50, 20), _word("world", 120, 22)], img_w=300, img_h=100,
        )
        assert len(cells) == 1  # single row, single column
        boxes = cells[0].get('word_boxes', [])
        assert len(boxes) == 2
        for entry in boxes:
            assert {'left', 'top', 'text'} <= entry.keys()

    def test_all_whitespace_filtered(self):
        """Whitespace-only words are dropped before grid construction."""
        cells, _ = build_grid_from_words(
            [_word(" ", 50, 20, conf=90), _word("hello", 200, 20, conf=90)],
            img_w=400, img_h=100,
        )
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'