feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
||||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||||
|
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
|
||||||
|
|
||||||
// Streaming progress state
|
// Streaming progress state
|
||||||
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
|
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
|
||||||
@@ -112,7 +113,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
let res: Response | null = null
|
let res: Response | null = null
|
||||||
for (let attempt = 0; attempt < 2; attempt++) {
|
for (let attempt = 0; attempt < 2; attempt++) {
|
||||||
res = await fetch(
|
res = await fetch(
|
||||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}`,
|
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=${gridMethod === 'v2' ? 'true' : 'false'}&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}&grid_method=${gridMethod}`,
|
||||||
{ method: 'POST' },
|
{ method: 'POST' },
|
||||||
)
|
)
|
||||||
if (res.ok) break
|
if (res.ok) break
|
||||||
@@ -128,6 +129,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// words_first returns plain JSON (no streaming)
|
||||||
|
if (gridMethod === 'words_first') {
|
||||||
|
const data = await res.json() as GridResult
|
||||||
|
applyGridResult(data)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
const reader = res.body!.getReader()
|
const reader = res.body!.getReader()
|
||||||
const decoder = new TextDecoder()
|
const decoder = new TextDecoder()
|
||||||
let buffer = ''
|
let buffer = ''
|
||||||
@@ -220,7 +228,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
setDetecting(false)
|
setDetecting(false)
|
||||||
}
|
}
|
||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, [sessionId, ocrEngine, pronunciation])
|
}, [sessionId, ocrEngine, pronunciation, gridMethod])
|
||||||
|
|
||||||
const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
|
const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
|
||||||
if (!sessionId) return
|
if (!sessionId) return
|
||||||
@@ -789,6 +797,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
|||||||
{gridResult && (
|
{gridResult && (
|
||||||
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
||||||
<div className="flex items-center gap-3 flex-wrap">
|
<div className="flex items-center gap-3 flex-wrap">
|
||||||
|
{/* Grid method selector */}
|
||||||
|
<select
|
||||||
|
value={gridMethod}
|
||||||
|
onChange={(e) => setGridMethod(e.target.value as 'v2' | 'words_first')}
|
||||||
|
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
||||||
|
>
|
||||||
|
<option value="v2">Standard (v2)</option>
|
||||||
|
<option value="words_first">Words-First</option>
|
||||||
|
</select>
|
||||||
|
|
||||||
{/* OCR Engine selector */}
|
{/* OCR Engine selector */}
|
||||||
<select
|
<select
|
||||||
value={ocrEngine}
|
value={ocrEngine}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
||||||
|
|
||||||
**Version:** 4.1.0
|
**Version:** 4.3.0
|
||||||
**Status:** Produktiv (Schritte 1–10 implementiert)
|
**Status:** Produktiv (Schritte 1–10 implementiert)
|
||||||
**URL:** https://macmini:3002/ai/ocr-pipeline
|
**URL:** https://macmini:3002/ai/ocr-pipeline
|
||||||
|
|
||||||
@@ -22,7 +22,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
|
|||||||
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
|
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
|
||||||
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
||||||
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
||||||
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
|
| 7 | Worterkennung | Hybrid-Grid (v2) oder Words-First (bottom-up) | Implementiert |
|
||||||
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
||||||
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
|
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
|
||||||
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
||||||
@@ -82,28 +82,29 @@ flowchart TD
|
|||||||
│
|
│
|
||||||
detect_document_type()
|
detect_document_type()
|
||||||
│
|
│
|
||||||
┌─────────────────┴──────────────────┐
|
┌──────────────────┼──────────────────┐
|
||||||
▼ ▼
|
▼ ▼ ▼
|
||||||
FULL-TEXT PFAD CELL-FIRST PFAD
|
FULL-TEXT PFAD WORDS-FIRST PFAD CELL-FIRST PFAD
|
||||||
(pipeline='full_page') (pipeline='cell_first')
|
(pipeline= (grid_method= (grid_method=
|
||||||
│ │
|
'full_page') 'words_first') 'v2', default)
|
||||||
Keine Spalten/Zeilen Spaltenerkennung
|
│ │ │
|
||||||
analyze_layout_by_words() detect_column_geometry()
|
Keine Spalten/ Tesseract Full-Page Spaltenerkennung
|
||||||
Lese-Reihenfolge _detect_sub_columns()
|
Zeilen word_boxes detect_column_geometry()
|
||||||
│ expand_narrow_columns()
|
analyze_layout_ _cluster_columns() _detect_sub_columns()
|
||||||
│ Zeilenerkennung
|
by_words() _cluster_rows() expand_narrow_columns()
|
||||||
│ detect_row_geometry()
|
│ _build_cells() Zeilenerkennung
|
||||||
│ │
|
│ │ detect_row_geometry()
|
||||||
│ build_cell_grid_v2()
|
│ build_grid_from_ │
|
||||||
│ │
|
│ words() build_cell_grid_v2()
|
||||||
│ ┌─────────┴──────────┐
|
│ │ │
|
||||||
│ ▼ ▼
|
│ │ ┌─────────┴──────────┐
|
||||||
│ Breite Spalten Schmale Spalten
|
│ │ ▼ ▼
|
||||||
│ (>= 15% Breite) (< 15% Breite)
|
│ │ Breite Spalten Schmale Spalten
|
||||||
│ Full-Page Words Cell-Crop OCR
|
│ │ (>= 15% Breite) (< 15% Breite)
|
||||||
│ word_lookup cell_crop_v2
|
│ │ Full-Page Words Cell-Crop OCR
|
||||||
│ │ │
|
│ │ word_lookup cell_crop_v2
|
||||||
└───────────────────────────┴────────────────────┘
|
│ │ │ │
|
||||||
|
└──────────────────┴────┴────────────────────┘
|
||||||
│
|
│
|
||||||
Post-Processing Pipeline
|
Post-Processing Pipeline
|
||||||
(Lautschrift, Komma-Split, etc.)
|
(Lautschrift, Komma-Split, etc.)
|
||||||
@@ -147,6 +148,8 @@ klausur-service/backend/
|
|||||||
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
||||||
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
|
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
|
||||||
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
|
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
|
||||||
|
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
|
||||||
|
├── cv_words_first.py # Words-First Grid Builder (bottom-up)
|
||||||
├── page_crop.py # Content-basierter Crop-Algorithmus
|
├── page_crop.py # Content-basierter Crop-Algorithmus
|
||||||
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
||||||
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
|
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
|
||||||
@@ -169,7 +172,8 @@ admin-lehrer/
|
|||||||
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
|
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
|
||||||
├── StepWordRecognition.tsx # Schritt 7: Worterkennung
|
├── StepWordRecognition.tsx # Schritt 7: Worterkennung
|
||||||
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
|
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
|
||||||
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas)
|
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay)
|
||||||
|
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
|
||||||
├── FabricReconstructionCanvas.tsx # Fabric.js Editor
|
├── FabricReconstructionCanvas.tsx # Fabric.js Editor
|
||||||
└── StepGroundTruth.tsx # Schritt 10: Validierung
|
└── StepGroundTruth.tsx # Schritt 10: Validierung
|
||||||
```
|
```
|
||||||
@@ -257,10 +261,20 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
|||||||
|
|
||||||
| Methode | Pfad | Beschreibung |
|
| Methode | Pfad | Beschreibung |
|
||||||
|---------|------|--------------|
|
|---------|------|--------------|
|
||||||
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen |
|
| `POST` | `/sessions/{id}/words` | Wort-Grid erstellen |
|
||||||
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
|
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
|
||||||
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
|
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
|
||||||
|
|
||||||
|
**Query-Parameter fuer `/sessions/{id}/words`:**
|
||||||
|
|
||||||
|
| Parameter | Default | Beschreibung |
|
||||||
|
|-----------|---------|--------------|
|
||||||
|
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
|
||||||
|
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
|
||||||
|
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
|
||||||
|
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
|
||||||
|
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
|
||||||
|
|
||||||
### Schritt 8: Korrektur
|
### Schritt 8: Korrektur
|
||||||
|
|
||||||
| Methode | Pfad | Beschreibung |
|
| Methode | Pfad | Beschreibung |
|
||||||
@@ -513,6 +527,12 @@ Horizontale Projektionsprofile finden Zeilen-Luecken; word-level Validierung ver
|
|||||||
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
|
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
|
||||||
abgeschnitten wird.
|
abgeschnitten wird.
|
||||||
|
|
||||||
|
3. **Box-Boundary-Schutz** (`box_ranges_inner`, neu in v4.2):
|
||||||
|
Bei Seiten mit Box-Zonen (Sub-Sessions) werden Zeilen am Box-Rand nicht faelschlich
|
||||||
|
ausgeschlossen. Das Problem: Die letzte Textzeile ueber einer Box ueberlappt haeufig
|
||||||
|
mit dem Box-Rahmen. Loesung: Die Exclusion-Zone wird um `max(border_thickness, 5px)`
|
||||||
|
geschrumpft, sodass nur Zeilen **innerhalb** der Box ausgeschlossen werden.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def _is_artifact_row(row: RowGeometry) -> bool:
|
def _is_artifact_row(row: RowGeometry) -> bool:
|
||||||
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
|
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
|
||||||
@@ -524,13 +544,128 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
|
|||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Box-Zonen und Content-Strips (Detail)
|
||||||
|
|
||||||
|
Seiten mit Box-Bereichen (z.B. Grammatik-Tipps, Uebungsboxen) werden in Zonen aufgeteilt:
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────┐
|
||||||
|
│ Content Zone 0 (Zeilen) │ ← Vokabeltabelle oben
|
||||||
|
├──────────────────────────┤
|
||||||
|
│ ███ Box Zone (border) ███│ ← Sub-Session mit eigener OCR
|
||||||
|
├──────────────────────────┤
|
||||||
|
│ Content Zone 2 (Zeilen) │ ← Vokabeltabelle unten
|
||||||
|
└──────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content-Strip-Verfahren** (`detect_rows` in `ocr_pipeline_api.py`):
|
||||||
|
|
||||||
|
1. Box-Zonen identifizieren, `box_ranges_inner` berechnen (geschrumpft um Border-Dicke)
|
||||||
|
2. Content-Strips = Seitenbereiche **ohne** Box-Inneres, vertikal gestapelt
|
||||||
|
3. Zeilenerkennung auf gestapeltem Bild, Y-Koordinaten zurueckgemappt
|
||||||
|
4. Wort-Filterung: Woerter in Box-Innerem werden ausgeschlossen
|
||||||
|
|
||||||
|
**Wichtig:** `box_ranges_inner` (nicht `box_ranges`) wird verwendet, damit
|
||||||
|
Zeilen am Box-Rand nicht abgeschnitten werden. Minimum 5px Margin.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Schritt 7: Worterkennung — Hybrid-Grid (Detail)
|
## Schritt 7: Worterkennung (Detail)
|
||||||
|
|
||||||
### Algorithmus: `build_cell_grid_v2()`
|
Schritt 7 bietet zwei Grid-Strategien, auswaehlbar per `grid_method`-Parameter:
|
||||||
|
|
||||||
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
| Strategie | Parameter | Ansatz | Benoetigt Spalten/Zeilen? |
|
||||||
|
|-----------|-----------|--------|--------------------------|
|
||||||
|
| **Hybrid-Grid v2** | `grid_method=v2` (Default) | Top-down: Spalten → Zeilen → Zellen → OCR | Ja (Schritte 5+6) |
|
||||||
|
| **Words-First** | `grid_method=words_first` | Bottom-up: Woerter → Spalten clustern → Zeilen clustern → Zellen | Nein |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Words-First Grid Builder: `build_grid_from_words()`
|
||||||
|
|
||||||
|
**Datei:** `cv_words_first.py`
|
||||||
|
|
||||||
|
Der Words-First Builder arbeitet bottom-up: Er nimmt die pixelgenauen `word_boxes` aus einem
|
||||||
|
Tesseract Full-Page-Lauf und clustert sie direkt zu Spalten und Zeilen — ohne die
|
||||||
|
vorherige Spalten-/Zeilenerkennung (Schritte 5+6) zu benoetigen.
|
||||||
|
|
||||||
|
#### Algorithmus
|
||||||
|
|
||||||
|
```
|
||||||
|
Eingabe: word_dicts (flat list), img_w, img_h
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ 1. Confidence-Filter │
|
||||||
|
│ conf >= 30 │
|
||||||
|
│ Whitespace entf. │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ 2. _cluster_columns() │
|
||||||
|
│ X-Gap-Analyse │
|
||||||
|
│ Schwelle: median_h │
|
||||||
|
│ × 3 (min 3% Breite)│
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ 3. _cluster_rows() │
|
||||||
|
│ Y-Proximity-Grupp. │
|
||||||
|
│ Toleranz: median_h │
|
||||||
|
│ / 2 │
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
┌───────────┴───────────┐
|
||||||
|
│ 4. _build_cells() │
|
||||||
|
│ Wort → (col, row) │
|
||||||
|
│ Text + bbox + conf │
|
||||||
|
│ word_boxes pro Zelle│
|
||||||
|
└───────────┬───────────┘
|
||||||
|
│
|
||||||
|
Ausgabe: cells[], columns_meta[]
|
||||||
|
(identisch zu build_cell_grid_v2)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Spalten-Clustering
|
||||||
|
|
||||||
|
1. Alle Woerter nach X-Mitte sortieren
|
||||||
|
2. Aufeinanderfolgende X-Gaps berechnen
|
||||||
|
3. Adaptiver Schwellwert: `median_word_height × 3` (min 3% Bildbreite)
|
||||||
|
4. Gaps > Schwellwert = Spaltengrenzen
|
||||||
|
5. Kein Gap gefunden → 1 Spalte (`column_text`)
|
||||||
|
6. Spaltentypen: `column_1`, `column_2`, ... (generisch, positionsbasiert)
|
||||||
|
|
||||||
|
#### Zeilen-Clustering
|
||||||
|
|
||||||
|
1. Woerter zu visuellen Zeilen gruppieren (Y-Toleranz: halbe Worthoehe)
|
||||||
|
2. Jede visuelle Zeile = eine Zeile im Grid
|
||||||
|
3. Sortiert von oben nach unten
|
||||||
|
|
||||||
|
#### Edge Cases
|
||||||
|
|
||||||
|
| Fall | Behandlung |
|
||||||
|
|------|------------|
|
||||||
|
| Einzelne Spalte (Fliesstext) | Kein X-Gap → 1 Spalte `column_text` |
|
||||||
|
| Keine Woerter erkannt | Leeres Ergebnis `([], [])` |
|
||||||
|
| Ueberschriften (grosse Schrift) | Eigene Zeile durch Y-Gap |
|
||||||
|
| Bilder/Grafiken | Keine Woerter → automatisch leerer Bereich |
|
||||||
|
| Schmale Spalten (Seitenzahlen) | Eigene Spalte durch X-Gap |
|
||||||
|
|
||||||
|
#### Vergleich v2 vs. Words-First
|
||||||
|
|
||||||
|
| Kriterium | v2 (Top-Down) | Words-First (Bottom-Up) |
|
||||||
|
|-----------|---------------|------------------------|
|
||||||
|
| **Abhaengigkeiten** | Spalten + Zeilen noetig | Nur Tesseract-Woerter |
|
||||||
|
| **Spaltentypen** | Semantisch (EN, DE, ...) | Positionsbasiert (1, 2, ...) |
|
||||||
|
| **OCR** | Hybrid (full-page + cell-crop) | Nur full-page Tesseract |
|
||||||
|
| **Robustheit** | Abhaengig von Spalten-/Zeilenerkennung | Direkt aus Wortpositionen |
|
||||||
|
| **Geschwindigkeit** | Langsamer (cell-crop pro Zelle) | Schneller (kein zusaetzlicher OCR-Lauf pro Zelle) |
|
||||||
|
| **Genauigkeit** | Besser bei schmalen Spalten | Besser bei ungewoehnlichen Layouts |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Hybrid-Grid v2: `build_cell_grid_v2()`
|
||||||
|
|
||||||
|
Schritt 7 nutzt im Default eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||||
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
|
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
|
||||||
|
|
||||||
!!! success "Warum Hybrid?"
|
!!! success "Warum Hybrid?"
|
||||||
@@ -692,7 +827,7 @@ Change-Format:
|
|||||||
|
|
||||||
## Schritt 9: Rekonstruktion (Detail)
|
## Schritt 9: Rekonstruktion (Detail)
|
||||||
|
|
||||||
Zwei Modi verfuegbar:
|
Drei Modi verfuegbar:
|
||||||
|
|
||||||
### Einfacher Modus
|
### Einfacher Modus
|
||||||
|
|
||||||
@@ -709,6 +844,73 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
|
|||||||
- Zoom 50–200 %
|
- Zoom 50–200 %
|
||||||
- Per-Zell-Reset-Button bei geaenderten Zellen
|
- Per-Zell-Reset-Button bei geaenderten Zellen
|
||||||
|
|
||||||
|
### Overlay-Modus (neu in v4.2)
|
||||||
|
|
||||||
|
Ganzseitige Tabellenrekonstruktion mit **Pixel-basierter Wortpositionierung**.
|
||||||
|
Nur verfuegbar bei Parent-Sessions mit Sub-Sessions (Box-Bereiche).
|
||||||
|
|
||||||
|
**Funktionsweise:**
|
||||||
|
|
||||||
|
1. **Sub-Session-Merging:** Zellen aus Sub-Sessions werden koordinaten-konvertiert
|
||||||
|
und in die Parent-Session eingefuegt. Die Umrechnung laeuft ueber die Box-Zone:
|
||||||
|
```
|
||||||
|
parentCellX = boxXPct + (subCell.bbox_pct.x / 100) * boxWPct
|
||||||
|
parentCellY = boxYPct + (subCell.bbox_pct.y / 100) * boxHPct
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **180°-Rotation:** Bei Parent-Sessions mit Boxen wird das Bild standardmaessig
|
||||||
|
180° gedreht, da der Scan haeufig kopfueber vorliegt. Die Pixel-Analyse
|
||||||
|
arbeitet auf dem rotierten Bild:
|
||||||
|
- Canvas: `ctx.translate(W, H); ctx.rotate(Math.PI)`
|
||||||
|
- Zell-Koordinaten: `(100 - x - w, 100 - y - h)` fuer rotiertes Space
|
||||||
|
- Cluster-Ruecktransformation: `start → cw-1-end`, danach `reverse()`
|
||||||
|
|
||||||
|
3. **Pixel-Wortpositionierung:** Der `usePixelWordPositions` Hook analysiert
|
||||||
|
dunkle Pixel per vertikaler Projektion, findet Wortgruppen-Cluster und
|
||||||
|
berechnet die exakte horizontale Position + Auto-Schriftgroesse.
|
||||||
|
|
||||||
|
**Layout:** 50/50 Grid (links Originalbild, rechts Rekonstruktion)
|
||||||
|
|
||||||
|
**Toolbar:**
|
||||||
|
|
||||||
|
- Schriftgroessen-Slider (30–120%)
|
||||||
|
- Bold-Toggle
|
||||||
|
- 180°-Rotations-Toggle
|
||||||
|
- Speichern-Button
|
||||||
|
|
||||||
|
**Visuelle Elemente:**
|
||||||
|
|
||||||
|
- Spaltenlinien (aus `column_result.columns`)
|
||||||
|
- Zeilenlinien (aus `row_result.rows`)
|
||||||
|
- Box-Zonen-Markierung (blau, halbtransparent)
|
||||||
|
- Editierbare Inputs an Pixel-Positionen
|
||||||
|
|
||||||
|
### Shared Hook: `usePixelWordPositions`
|
||||||
|
|
||||||
|
Extrahierter Hook fuer die Pixel-basierte Wortpositionierung, genutzt in
|
||||||
|
StepLlmReview (Schritt 8) und StepReconstruction (Schritt 9).
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
function usePixelWordPositions(
|
||||||
|
imageUrl: string,
|
||||||
|
cells: GridCell[],
|
||||||
|
active: boolean,
|
||||||
|
rotation: 0 | 180 = 0,
|
||||||
|
): Map<string, WordPosition[]>
|
||||||
|
```
|
||||||
|
|
||||||
|
**Algorithmus:**
|
||||||
|
|
||||||
|
1. Bild in offscreen Canvas laden (optional 180° gedreht)
|
||||||
|
2. Pro Zelle: `getImageData()` → vertikale Projektion (dunkle Pixel pro Spalte)
|
||||||
|
3. Cluster-Erkennung (Schwelle: 3% der Zellhoehe, Gap: 2% der Zellbreite)
|
||||||
|
4. Bei Rotation: Cluster zurueck ins Original-Koordinatensystem spiegeln
|
||||||
|
5. Text-Gruppen (split bei 3+ Leerzeichen) auf Cluster matchen
|
||||||
|
6. Auto-Schriftgroesse per `measureText()` + `fontRatio`
|
||||||
|
7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden
|
||||||
|
|
||||||
|
**Rueckgabe:** `Map<cell_id, WordPosition[]>` mit `xPct`, `wPct`, `text`, `fontRatio`
|
||||||
|
|
||||||
### Fabric.js Editor
|
### Fabric.js Editor
|
||||||
|
|
||||||
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
|
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
|
||||||
@@ -861,6 +1063,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
|
|||||||
|
|
||||||
| Datum | Version | Aenderung |
|
| Datum | Version | Aenderung |
|
||||||
|-------|---------|----------|
|
|-------|---------|----------|
|
||||||
|
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
|
||||||
|
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
|
||||||
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
||||||
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
|
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
|
||||||
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
|
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
|
||||||
|
|||||||
@@ -34,3 +34,4 @@ from cv_ocr_engines import ( # noqa: F401
|
|||||||
_fix_phonetic_brackets,
|
_fix_phonetic_brackets,
|
||||||
)
|
)
|
||||||
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
|
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
|
||||||
|
from cv_words_first import build_grid_from_words # noqa: F401
|
||||||
|
|||||||
282
klausur-service/backend/cv_words_first.py
Normal file
282
klausur-service/backend/cv_words_first.py
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
"""
|
||||||
|
Words-First Grid Builder (Bottom-Up).
|
||||||
|
|
||||||
|
Builds a cell grid from Tesseract word_boxes directly, without requiring
|
||||||
|
pre-detected columns or rows. Algorithm:
|
||||||
|
|
||||||
|
1. Cluster words into columns by X-gap analysis
|
||||||
|
2. Cluster words into rows by Y-proximity
|
||||||
|
3. Build cells at (column, row) intersections
|
||||||
|
|
||||||
|
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import statistics
|
||||||
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
|
from cv_ocr_engines import (
|
||||||
|
_group_words_into_lines,
|
||||||
|
_words_to_reading_order_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 1. Column clustering
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _cluster_columns(
|
||||||
|
words: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
min_gap_pct: float = 3.0,
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Cluster words into columns by finding large horizontal gaps.
|
||||||
|
|
||||||
|
Returns a list of column dicts:
|
||||||
|
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||||
|
sorted left-to-right.
|
||||||
|
"""
|
||||||
|
if not words:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Sort by X center
|
||||||
|
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
|
||||||
|
|
||||||
|
# Collect word heights to compute adaptive threshold
|
||||||
|
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
|
||||||
|
median_h = statistics.median(heights) if heights else 30
|
||||||
|
|
||||||
|
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
|
||||||
|
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
|
||||||
|
|
||||||
|
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||||
|
# For each word, compute right edge; for next word, compute left edge
|
||||||
|
boundaries: List[float] = [] # X positions where columns split
|
||||||
|
for i in range(len(sorted_w) - 1):
|
||||||
|
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||||
|
left_edge = sorted_w[i + 1]['left']
|
||||||
|
gap = left_edge - right_edge
|
||||||
|
if gap > min_gap_px:
|
||||||
|
# Split point is midway through the gap
|
||||||
|
boundaries.append((right_edge + left_edge) / 2)
|
||||||
|
|
||||||
|
# Build column ranges from boundaries
|
||||||
|
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||||||
|
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||||
|
columns = []
|
||||||
|
for ci in range(len(col_edges) - 1):
|
||||||
|
columns.append({
|
||||||
|
'index': ci,
|
||||||
|
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
|
||||||
|
'x_min': col_edges[ci],
|
||||||
|
'x_max': col_edges[ci + 1],
|
||||||
|
})
|
||||||
|
|
||||||
|
return columns
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 2. Row clustering
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Group word boxes into visual text rows by Y-proximity.

    Delegates the actual line grouping to _group_words_into_lines with a
    tolerance of half the median word height (at least 5 px), then derives
    a bounding Y-range for each grouped line.

    Returns:
        Row dicts [{'index', 'y_min', 'y_max', 'y_center'}, ...] in
        top-to-bottom order (the order produced by _group_words_into_lines).
    """
    if not words:
        return []

    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    typical_h = statistics.median(positive_heights) if positive_heights else 20
    # Half a word height keeps slightly offset words on the same visual line.
    tolerance = max(typical_h * 0.5, 5)

    grouped = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    result: List[Dict[str, Any]] = []
    for idx, members in enumerate(grouped):
        top = min(w['top'] for w in members)
        bottom = max(w['top'] + w['height'] for w in members)
        result.append({
            'index': idx,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })

    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 3. Build cells
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||||||
|
"""Return column index for a word based on its X-center."""
|
||||||
|
x_center = word['left'] + word['width'] / 2
|
||||||
|
for col in columns:
|
||||||
|
if col['x_min'] <= x_center < col['x_max']:
|
||||||
|
return col['index']
|
||||||
|
# Fallback: nearest column
|
||||||
|
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
|
||||||
|
|
||||||
|
|
||||||
|
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||||
|
"""Return row index for a word based on its Y-center."""
|
||||||
|
y_center = word['top'] + word['height'] / 2
|
||||||
|
# Find the row whose y_range contains this word's center
|
||||||
|
for row in rows:
|
||||||
|
if row['y_min'] <= y_center <= row['y_max']:
|
||||||
|
return row['index']
|
||||||
|
# Fallback: nearest row by Y-center
|
||||||
|
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||||
|
|
||||||
|
|
||||||
|
def _build_cells(
|
||||||
|
words: List[Dict],
|
||||||
|
columns: List[Dict],
|
||||||
|
rows: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Build cell dicts from word assignments to (column, row) pairs."""
|
||||||
|
if not columns or not rows:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Bucket words into (col_idx, row_idx)
|
||||||
|
buckets: Dict[Tuple[int, int], List[Dict]] = {}
|
||||||
|
for w in words:
|
||||||
|
ci = _assign_word_to_column(w, columns)
|
||||||
|
ri = _assign_word_to_row(w, rows)
|
||||||
|
buckets.setdefault((ci, ri), []).append(w)
|
||||||
|
|
||||||
|
cells = []
|
||||||
|
for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
|
||||||
|
col = columns[ci]
|
||||||
|
row = rows[ri]
|
||||||
|
|
||||||
|
# Compute tight bbox from actual word positions
|
||||||
|
x_min = min(w['left'] for w in cell_words)
|
||||||
|
y_min = min(w['top'] for w in cell_words)
|
||||||
|
x_max = max(w['left'] + w['width'] for w in cell_words)
|
||||||
|
y_max = max(w['top'] + w['height'] for w in cell_words)
|
||||||
|
bw = x_max - x_min
|
||||||
|
bh = y_max - y_min
|
||||||
|
|
||||||
|
# Text from words in reading order
|
||||||
|
text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
|
||||||
|
|
||||||
|
# Average confidence
|
||||||
|
confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
|
||||||
|
avg_conf = sum(confs) / len(confs) if confs else 0.0
|
||||||
|
|
||||||
|
# Word boxes with percent coordinates
|
||||||
|
word_boxes = []
|
||||||
|
for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
|
||||||
|
word_boxes.append({
|
||||||
|
'text': w.get('text', ''),
|
||||||
|
'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
|
||||||
|
'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
|
||||||
|
'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
|
||||||
|
'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
|
||||||
|
'conf': w.get('conf', 0),
|
||||||
|
})
|
||||||
|
|
||||||
|
cells.append({
|
||||||
|
'cell_id': f"R{ri:02d}_C{ci}",
|
||||||
|
'row_index': ri,
|
||||||
|
'col_index': ci,
|
||||||
|
'col_type': col['type'],
|
||||||
|
'text': text,
|
||||||
|
'confidence': round(avg_conf, 1),
|
||||||
|
'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
|
||||||
|
'bbox_pct': {
|
||||||
|
'x': round(x_min / img_w * 100, 2) if img_w else 0,
|
||||||
|
'y': round(y_min / img_h * 100, 2) if img_h else 0,
|
||||||
|
'w': round(bw / img_w * 100, 2) if img_w else 0,
|
||||||
|
'h': round(bh / img_h * 100, 2) if img_h else 0,
|
||||||
|
},
|
||||||
|
'word_boxes': word_boxes,
|
||||||
|
'ocr_engine': 'words_first',
|
||||||
|
'is_bold': False,
|
||||||
|
})
|
||||||
|
|
||||||
|
return cells
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# 4. Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def build_grid_from_words(
|
||||||
|
word_dicts: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
min_confidence: int = 30,
|
||||||
|
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||||
|
"""Build a cell grid bottom-up from Tesseract word boxes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word_dicts: Flat list of word dicts with keys:
|
||||||
|
text, left, top, width, height, conf
|
||||||
|
(absolute pixel coordinates).
|
||||||
|
img_w: Image width in pixels.
|
||||||
|
img_h: Image height in pixels.
|
||||||
|
min_confidence: Minimum OCR confidence to keep a word.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(cells, columns_meta) — same format as build_cell_grid_v2().
|
||||||
|
cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
|
||||||
|
columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
|
||||||
|
"""
|
||||||
|
if not word_dicts:
|
||||||
|
logger.info("build_grid_from_words: no words — returning empty grid")
|
||||||
|
return [], []
|
||||||
|
|
||||||
|
# Filter by confidence
|
||||||
|
words = [
|
||||||
|
w for w in word_dicts
|
||||||
|
if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
|
||||||
|
]
|
||||||
|
if not words:
|
||||||
|
logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
|
||||||
|
return [], []
|
||||||
|
|
||||||
|
logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))
|
||||||
|
|
||||||
|
# Step 1: cluster columns
|
||||||
|
columns = _cluster_columns(words, img_w)
|
||||||
|
logger.info("build_grid_from_words: %d column(s) detected", len(columns))
|
||||||
|
|
||||||
|
# Step 2: cluster rows
|
||||||
|
rows = _cluster_rows(words)
|
||||||
|
logger.info("build_grid_from_words: %d row(s) detected", len(rows))
|
||||||
|
|
||||||
|
# Step 3: build cells
|
||||||
|
cells = _build_cells(words, columns, rows, img_w, img_h)
|
||||||
|
logger.info("build_grid_from_words: %d cells built", len(cells))
|
||||||
|
|
||||||
|
# Build columns_meta in same format as build_cell_grid_v2
|
||||||
|
columns_meta = []
|
||||||
|
for col in columns:
|
||||||
|
x = int(col['x_min'])
|
||||||
|
w = int(col['x_max'] - col['x_min'])
|
||||||
|
columns_meta.append({
|
||||||
|
'index': col['index'],
|
||||||
|
'type': col['type'],
|
||||||
|
'x': x,
|
||||||
|
'width': w,
|
||||||
|
})
|
||||||
|
|
||||||
|
return cells, columns_meta
|
||||||
@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
|
|||||||
render_image_high_res,
|
render_image_high_res,
|
||||||
render_pdf_high_res,
|
render_pdf_high_res,
|
||||||
)
|
)
|
||||||
|
from cv_words_first import build_grid_from_words
|
||||||
from ocr_pipeline_session_store import (
|
from ocr_pipeline_session_store import (
|
||||||
create_session_db,
|
create_session_db,
|
||||||
delete_all_sessions_db,
|
delete_all_sessions_db,
|
||||||
@@ -1859,6 +1860,7 @@ async def detect_words(
|
|||||||
pronunciation: str = "british",
|
pronunciation: str = "british",
|
||||||
stream: bool = False,
|
stream: bool = False,
|
||||||
skip_heal_gaps: bool = False,
|
skip_heal_gaps: bool = False,
|
||||||
|
grid_method: str = "v2",
|
||||||
):
|
):
|
||||||
"""Build word grid from columns × rows, OCR each cell.
|
"""Build word grid from columns × rows, OCR each cell.
|
||||||
|
|
||||||
@@ -1868,6 +1870,9 @@ async def detect_words(
|
|||||||
stream: false (default) for JSON response, true for SSE streaming
|
stream: false (default) for JSON response, true for SSE streaming
|
||||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
||||||
positions without gap-healing expansion. Better for overlay rendering.
|
positions without gap-healing expansion. Better for overlay rendering.
|
||||||
|
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
|
||||||
|
'v2' uses pre-detected columns/rows (top-down).
|
||||||
|
'words_first' clusters words bottom-up (no column/row detection needed).
|
||||||
"""
|
"""
|
||||||
if session_id not in _cache:
|
if session_id not in _cache:
|
||||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||||
@@ -1902,7 +1907,7 @@ async def detect_words(
|
|||||||
"duration_seconds": 0,
|
"duration_seconds": 0,
|
||||||
}
|
}
|
||||||
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
||||||
if not row_result or not row_result.get("rows"):
|
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
||||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||||
|
|
||||||
# Convert column dicts back to PageRegion objects
|
# Convert column dicts back to PageRegion objects
|
||||||
@@ -1983,6 +1988,102 @@ async def detect_words(
|
|||||||
if excluded:
|
if excluded:
|
||||||
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
||||||
|
|
||||||
|
# --- Words-First path: bottom-up grid from word boxes ---
|
||||||
|
if grid_method == "words_first":
|
||||||
|
t0 = time.time()
|
||||||
|
img_h, img_w = dewarped_bgr.shape[:2]
|
||||||
|
|
||||||
|
# Get word_dicts from cache or run Tesseract full-page
|
||||||
|
wf_word_dicts = cached.get("_word_dicts")
|
||||||
|
if wf_word_dicts is None:
|
||||||
|
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||||
|
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||||
|
if geo_result is not None:
|
||||||
|
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||||
|
cached["_word_dicts"] = wf_word_dicts
|
||||||
|
cached["_inv"] = inv
|
||||||
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
|
if not wf_word_dicts:
|
||||||
|
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
||||||
|
|
||||||
|
# Convert word coordinates to absolute image coordinates if needed
|
||||||
|
# (detect_column_geometry returns words relative to content ROI)
|
||||||
|
content_bounds = cached.get("_content_bounds")
|
||||||
|
if content_bounds:
|
||||||
|
lx, _rx, ty, _by = content_bounds
|
||||||
|
abs_words = []
|
||||||
|
for w in wf_word_dicts:
|
||||||
|
abs_words.append({
|
||||||
|
**w,
|
||||||
|
'left': w['left'] + lx,
|
||||||
|
'top': w['top'] + ty,
|
||||||
|
})
|
||||||
|
wf_word_dicts = abs_words
|
||||||
|
|
||||||
|
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
||||||
|
duration = time.time() - t0
|
||||||
|
|
||||||
|
# Apply IPA phonetic fixes
|
||||||
|
fix_cell_phonetics(cells, pronunciation=pronunciation)
|
||||||
|
|
||||||
|
# Add zone_index for backward compat
|
||||||
|
for cell in cells:
|
||||||
|
cell.setdefault("zone_index", 0)
|
||||||
|
|
||||||
|
col_types = {c['type'] for c in columns_meta}
|
||||||
|
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||||
|
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
||||||
|
n_cols = len(columns_meta)
|
||||||
|
used_engine = "words_first"
|
||||||
|
|
||||||
|
word_result = {
|
||||||
|
"cells": cells,
|
||||||
|
"grid_shape": {
|
||||||
|
"rows": n_rows,
|
||||||
|
"cols": n_cols,
|
||||||
|
"total_cells": len(cells),
|
||||||
|
},
|
||||||
|
"columns_used": columns_meta,
|
||||||
|
"layout": "vocab" if is_vocab else "generic",
|
||||||
|
"image_width": img_w,
|
||||||
|
"image_height": img_h,
|
||||||
|
"duration_seconds": round(duration, 2),
|
||||||
|
"ocr_engine": used_engine,
|
||||||
|
"grid_method": "words_first",
|
||||||
|
"summary": {
|
||||||
|
"total_cells": len(cells),
|
||||||
|
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||||
|
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if is_vocab or 'column_text' in col_types:
|
||||||
|
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||||
|
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||||
|
word_result["vocab_entries"] = entries
|
||||||
|
word_result["entries"] = entries
|
||||||
|
word_result["entry_count"] = len(entries)
|
||||||
|
word_result["summary"]["total_entries"] = len(entries)
|
||||||
|
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
|
||||||
|
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
|
||||||
|
|
||||||
|
await update_session_db(session_id, word_result=word_result, current_step=8)
|
||||||
|
cached["word_result"] = word_result
|
||||||
|
|
||||||
|
logger.info(f"OCR Pipeline: words-first session {session_id}: "
|
||||||
|
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
|
||||||
|
|
||||||
|
await _append_pipeline_log(session_id, "words", {
|
||||||
|
"grid_method": "words_first",
|
||||||
|
"total_cells": len(cells),
|
||||||
|
"non_empty_cells": word_result["summary"]["non_empty_cells"],
|
||||||
|
"ocr_engine": used_engine,
|
||||||
|
"layout": word_result["layout"],
|
||||||
|
}, duration_ms=int(duration * 1000))
|
||||||
|
|
||||||
|
return {"session_id": session_id, **word_result}
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
# Cell-First OCR v2: use batch-then-stream approach instead of
|
# Cell-First OCR v2: use batch-then-stream approach instead of
|
||||||
# per-cell streaming. The parallel ThreadPoolExecutor in
|
# per-cell streaming. The parallel ThreadPoolExecutor in
|
||||||
@@ -2001,7 +2102,7 @@ async def detect_words(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Non-streaming path ---
|
# --- Non-streaming path (grid_method=v2) ---
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
# Create binarized OCR image (for Tesseract)
|
# Create binarized OCR image (for Tesseract)
|
||||||
|
|||||||
214
klausur-service/backend/tests/test_cv_words_first.py
Normal file
214
klausur-service/backend/tests/test_cv_words_first.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
"""Tests for cv_words_first.py — Words-First Grid Builder."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from cv_words_first import (
|
||||||
|
_assign_word_to_column,
|
||||||
|
_assign_word_to_row,
|
||||||
|
_build_cells,
|
||||||
|
_cluster_columns,
|
||||||
|
_cluster_rows,
|
||||||
|
build_grid_from_words,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
|
||||||
|
"""Create a synthetic word dict."""
|
||||||
|
return {
|
||||||
|
'text': text,
|
||||||
|
'left': left,
|
||||||
|
'top': top,
|
||||||
|
'width': width,
|
||||||
|
'height': height,
|
||||||
|
'conf': conf,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _cluster_columns
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestClusterColumns:
|
||||||
|
|
||||||
|
def test_single_column_freetext(self):
|
||||||
|
"""Words spread evenly across page → 1 column (column_text)."""
|
||||||
|
words = [
|
||||||
|
_word("Hello", 50, 10),
|
||||||
|
_word("world", 120, 10),
|
||||||
|
_word("this", 50, 40),
|
||||||
|
_word("is", 120, 40),
|
||||||
|
_word("text", 190, 40),
|
||||||
|
]
|
||||||
|
cols = _cluster_columns(words, img_w=400)
|
||||||
|
assert len(cols) == 1
|
||||||
|
assert cols[0]['type'] == 'column_text'
|
||||||
|
|
||||||
|
def test_two_columns(self):
|
||||||
|
"""Two word groups with large X-gap → 2 columns."""
|
||||||
|
words = [
|
||||||
|
_word("apple", 20, 10),
|
||||||
|
_word("Apfel", 300, 10),
|
||||||
|
_word("dog", 20, 40),
|
||||||
|
_word("Hund", 300, 40),
|
||||||
|
]
|
||||||
|
cols = _cluster_columns(words, img_w=500)
|
||||||
|
assert len(cols) == 2
|
||||||
|
assert cols[0]['type'] == 'column_1'
|
||||||
|
assert cols[1]['type'] == 'column_2'
|
||||||
|
|
||||||
|
def test_three_columns(self):
|
||||||
|
"""Three groups separated by wide gaps → 3 columns."""
|
||||||
|
words = [
|
||||||
|
_word("1", 10, 10, width=20),
|
||||||
|
_word("apple", 100, 10),
|
||||||
|
_word("Apfel", 400, 10),
|
||||||
|
_word("2", 10, 40, width=20),
|
||||||
|
_word("dog", 100, 40),
|
||||||
|
_word("Hund", 400, 40),
|
||||||
|
]
|
||||||
|
cols = _cluster_columns(words, img_w=600)
|
||||||
|
assert len(cols) == 3
|
||||||
|
|
||||||
|
def test_empty_words(self):
|
||||||
|
"""No words → empty result."""
|
||||||
|
assert _cluster_columns([], img_w=500) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _cluster_rows
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestClusterRows:
|
||||||
|
|
||||||
|
def test_two_rows(self):
|
||||||
|
"""Words at two Y-levels → 2 rows."""
|
||||||
|
words = [
|
||||||
|
_word("hello", 10, 20),
|
||||||
|
_word("world", 100, 25),
|
||||||
|
_word("foo", 10, 80),
|
||||||
|
_word("bar", 100, 82),
|
||||||
|
]
|
||||||
|
rows = _cluster_rows(words)
|
||||||
|
assert len(rows) == 2
|
||||||
|
assert rows[0]['y_min'] < rows[1]['y_min']
|
||||||
|
|
||||||
|
def test_single_row(self):
|
||||||
|
"""All words at same Y → 1 row."""
|
||||||
|
words = [
|
||||||
|
_word("a", 10, 50),
|
||||||
|
_word("b", 80, 52),
|
||||||
|
_word("c", 150, 51),
|
||||||
|
]
|
||||||
|
rows = _cluster_rows(words)
|
||||||
|
assert len(rows) == 1
|
||||||
|
|
||||||
|
def test_empty(self):
|
||||||
|
assert _cluster_rows([]) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# build_grid_from_words (integration)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildGridFromWords:
|
||||||
|
|
||||||
|
def test_two_column_vocab(self):
|
||||||
|
"""Simulate a 2-column vocabulary page with 3 rows."""
|
||||||
|
words = [
|
||||||
|
_word("apple", 50, 20),
|
||||||
|
_word("Apfel", 400, 22),
|
||||||
|
_word("dog", 50, 60),
|
||||||
|
_word("Hund", 400, 62),
|
||||||
|
_word("cat", 50, 100),
|
||||||
|
_word("Katze", 400, 102),
|
||||||
|
]
|
||||||
|
cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)
|
||||||
|
|
||||||
|
assert len(cols_meta) == 2
|
||||||
|
assert len(cells) == 6 # 3 rows × 2 cols
|
||||||
|
# Check cell_id format
|
||||||
|
cell_ids = {c['cell_id'] for c in cells}
|
||||||
|
assert 'R00_C0' in cell_ids
|
||||||
|
assert 'R00_C1' in cell_ids
|
||||||
|
|
||||||
|
def test_single_column_freetext(self):
|
||||||
|
"""Single-column text → 1 column, multiple rows."""
|
||||||
|
words = [
|
||||||
|
_word("Hello", 50, 20),
|
||||||
|
_word("world", 120, 22),
|
||||||
|
_word("Second", 50, 60),
|
||||||
|
_word("line", 120, 62),
|
||||||
|
]
|
||||||
|
cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)
|
||||||
|
|
||||||
|
assert len(cols_meta) == 1
|
||||||
|
assert cols_meta[0]['type'] == 'column_text'
|
||||||
|
assert len(cells) == 2 # 2 rows, 1 column each
|
||||||
|
|
||||||
|
def test_empty_input(self):
|
||||||
|
cells, cols = build_grid_from_words([], img_w=500, img_h=500)
|
||||||
|
assert cells == []
|
||||||
|
assert cols == []
|
||||||
|
|
||||||
|
def test_low_confidence_filtered(self):
|
||||||
|
"""Words below min_confidence are excluded."""
|
||||||
|
words = [
|
||||||
|
_word("good", 50, 20, conf=90),
|
||||||
|
_word("bad", 200, 20, conf=10),
|
||||||
|
]
|
||||||
|
cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
|
||||||
|
# Only the good word should produce a cell
|
||||||
|
assert len(cells) == 1
|
||||||
|
assert cells[0]['text'] == 'good'
|
||||||
|
|
||||||
|
def test_bbox_pct_correct(self):
|
||||||
|
"""Check that bbox_pct is correctly computed from pixel coords."""
|
||||||
|
words = [_word("test", 200, 100, width=100, height=30)]
|
||||||
|
cells, _ = build_grid_from_words(words, img_w=1000, img_h=500)
|
||||||
|
assert len(cells) == 1
|
||||||
|
bp = cells[0]['bbox_pct']
|
||||||
|
assert bp['x'] == 20.0 # 200/1000*100
|
||||||
|
assert bp['y'] == 20.0 # 100/500*100
|
||||||
|
assert bp['w'] == 10.0 # 100/1000*100
|
||||||
|
assert bp['h'] == 6.0 # 30/500*100
|
||||||
|
|
||||||
|
def test_columns_meta_format(self):
|
||||||
|
"""columns_meta has same keys as build_cell_grid_v2 output."""
|
||||||
|
words = [
|
||||||
|
_word("a", 50, 20),
|
||||||
|
_word("b", 400, 20),
|
||||||
|
]
|
||||||
|
_, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
|
||||||
|
for col in cols_meta:
|
||||||
|
assert 'index' in col
|
||||||
|
assert 'type' in col
|
||||||
|
assert 'x' in col
|
||||||
|
assert 'width' in col
|
||||||
|
|
||||||
|
def test_word_boxes_included(self):
|
||||||
|
"""Each cell should contain word_boxes with percent coords."""
|
||||||
|
words = [
|
||||||
|
_word("hello", 50, 20),
|
||||||
|
_word("world", 120, 22),
|
||||||
|
]
|
||||||
|
cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
|
||||||
|
assert len(cells) == 1 # single row, single column
|
||||||
|
wb = cells[0].get('word_boxes', [])
|
||||||
|
assert len(wb) == 2
|
||||||
|
for w in wb:
|
||||||
|
assert 'left' in w
|
||||||
|
assert 'top' in w
|
||||||
|
assert 'text' in w
|
||||||
|
|
||||||
|
def test_all_whitespace_filtered(self):
|
||||||
|
"""Words with only whitespace text are filtered out."""
|
||||||
|
words = [
|
||||||
|
_word(" ", 50, 20, conf=90),
|
||||||
|
_word("hello", 200, 20, conf=90),
|
||||||
|
]
|
||||||
|
cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
|
||||||
|
assert len(cells) == 1
|
||||||
|
assert cells[0]['text'] == 'hello'
|
||||||
Reference in New Issue
Block a user