feat: Words-First Grid Builder (bottom-up alternative zu cell_grid_v2)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 23s
CI / test-nodejs-website (push) Successful in 32s
Neuer Algorithmus in cv_words_first.py: Clustert Tesseract word_boxes direkt zu Spalten (X-Gap) und Zeilen (Y-Proximity), baut Zellen an Schnittpunkten. Keine Spalten-/Zeilenerkennung noetig. - cv_words_first.py: _cluster_columns, _cluster_rows, _build_cells, build_grid_from_words - ocr_pipeline_api.py: grid_method Parameter (v2|words_first) im /words Endpoint - StepWordRecognition.tsx: Dropdown Toggle fuer Grid-Methode - OCR-Pipeline.md: Doku v4.3.0 mit Words-First Algorithmus - 15 Unit-Tests fuer cv_words_first Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
const [ocrEngine, setOcrEngine] = useState<'auto' | 'tesseract' | 'rapid'>('auto')
|
||||
const [usedEngine, setUsedEngine] = useState<string>('')
|
||||
const [pronunciation, setPronunciation] = useState<'british' | 'american'>('british')
|
||||
const [gridMethod, setGridMethod] = useState<'v2' | 'words_first'>('v2')
|
||||
|
||||
// Streaming progress state
|
||||
const [streamProgress, setStreamProgress] = useState<{ current: number; total: number } | null>(null)
|
||||
@@ -112,7 +113,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
let res: Response | null = null
|
||||
for (let attempt = 0; attempt < 2; attempt++) {
|
||||
res = await fetch(
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=true&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}`,
|
||||
`${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/words?stream=${gridMethod === 'v2' ? 'true' : 'false'}&engine=${eng}&pronunciation=${pronunciation}${skipHealGaps ? '&skip_heal_gaps=true' : ''}&grid_method=${gridMethod}`,
|
||||
{ method: 'POST' },
|
||||
)
|
||||
if (res.ok) break
|
||||
@@ -128,6 +129,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
throw new Error(err.detail || 'Worterkennung fehlgeschlagen')
|
||||
}
|
||||
|
||||
// words_first returns plain JSON (no streaming)
|
||||
if (gridMethod === 'words_first') {
|
||||
const data = await res.json() as GridResult
|
||||
applyGridResult(data)
|
||||
return
|
||||
}
|
||||
|
||||
const reader = res.body!.getReader()
|
||||
const decoder = new TextDecoder()
|
||||
let buffer = ''
|
||||
@@ -220,7 +228,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
setDetecting(false)
|
||||
}
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [sessionId, ocrEngine, pronunciation])
|
||||
}, [sessionId, ocrEngine, pronunciation, gridMethod])
|
||||
|
||||
const handleGroundTruth = useCallback(async (isCorrect: boolean) => {
|
||||
if (!sessionId) return
|
||||
@@ -789,6 +797,16 @@ export function StepWordRecognition({ sessionId, onNext, goToStep, skipHealGaps
|
||||
{gridResult && (
|
||||
<div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-4 space-y-3">
|
||||
<div className="flex items-center gap-3 flex-wrap">
|
||||
{/* Grid method selector */}
|
||||
<select
|
||||
value={gridMethod}
|
||||
onChange={(e) => setGridMethod(e.target.value as 'v2' | 'words_first')}
|
||||
className="px-2 py-1.5 text-xs border rounded-lg dark:bg-gray-700 dark:border-gray-600"
|
||||
>
|
||||
<option value="v2">Standard (v2)</option>
|
||||
<option value="words_first">Words-First</option>
|
||||
</select>
|
||||
|
||||
{/* OCR Engine selector */}
|
||||
<select
|
||||
value={ocrEngine}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# OCR Pipeline - Schrittweise Seitenrekonstruktion
|
||||
|
||||
**Version:** 4.1.0
|
||||
**Version:** 4.3.0
|
||||
**Status:** Produktiv (Schritte 1–10 implementiert)
|
||||
**URL:** https://macmini:3002/ai/ocr-pipeline
|
||||
|
||||
@@ -22,7 +22,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v
|
||||
| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert |
|
||||
| 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert |
|
||||
| 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert |
|
||||
| 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert |
|
||||
| 7 | Worterkennung | Hybrid-Grid (v2) oder Words-First (bottom-up) | Implementiert |
|
||||
| 8 | Korrektur | Zeichenverwirrung + regel-basierte Rechtschreibkorrektur (SSE-Stream) | Implementiert |
|
||||
| 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert |
|
||||
| 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert |
|
||||
@@ -82,28 +82,29 @@ flowchart TD
|
||||
│
|
||||
detect_document_type()
|
||||
│
|
||||
┌─────────────────┴──────────────────┐
|
||||
▼ ▼
|
||||
FULL-TEXT PFAD CELL-FIRST PFAD
|
||||
(pipeline='full_page') (pipeline='cell_first')
|
||||
│ │
|
||||
Keine Spalten/Zeilen Spaltenerkennung
|
||||
analyze_layout_by_words() detect_column_geometry()
|
||||
Lese-Reihenfolge _detect_sub_columns()
|
||||
│ expand_narrow_columns()
|
||||
│ Zeilenerkennung
|
||||
│ detect_row_geometry()
|
||||
│ │
|
||||
│ build_cell_grid_v2()
|
||||
│ │
|
||||
│ ┌─────────┴──────────┐
|
||||
│ ▼ ▼
|
||||
│ Breite Spalten Schmale Spalten
|
||||
│ (>= 15% Breite) (< 15% Breite)
|
||||
│ Full-Page Words Cell-Crop OCR
|
||||
│ word_lookup cell_crop_v2
|
||||
│ │ │
|
||||
└───────────────────────────┴────────────────────┘
|
||||
┌──────────────────┼──────────────────┐
|
||||
▼ ▼ ▼
|
||||
FULL-TEXT PFAD WORDS-FIRST PFAD CELL-FIRST PFAD
|
||||
(pipeline= (grid_method= (grid_method=
|
||||
'full_page') 'words_first') 'v2', default)
|
||||
│ │ │
|
||||
Keine Spalten/ Tesseract Full-Page Spaltenerkennung
|
||||
Zeilen word_boxes detect_column_geometry()
|
||||
analyze_layout_ _cluster_columns() _detect_sub_columns()
|
||||
by_words() _cluster_rows() expand_narrow_columns()
|
||||
│ _build_cells() Zeilenerkennung
|
||||
│ │ detect_row_geometry()
|
||||
│ build_grid_from_ │
|
||||
│ words() build_cell_grid_v2()
|
||||
│ │ │
|
||||
│ │ ┌─────────┴──────────┐
|
||||
│ │ ▼ ▼
|
||||
│ │ Breite Spalten Schmale Spalten
|
||||
│ │ (>= 15% Breite) (< 15% Breite)
|
||||
│ │ Full-Page Words Cell-Crop OCR
|
||||
│ │ word_lookup cell_crop_v2
|
||||
│ │ │ │
|
||||
└──────────────────┴────┴────────────────────┘
|
||||
│
|
||||
Post-Processing Pipeline
|
||||
(Lautschrift, Komma-Split, etc.)
|
||||
@@ -147,6 +148,8 @@ klausur-service/backend/
|
||||
│ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen
|
||||
├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10)
|
||||
├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4)
|
||||
├── cv_box_detect.py # Box-Erkennung + Zonen-Aufteilung
|
||||
├── cv_words_first.py # Words-First Grid Builder (bottom-up)
|
||||
├── page_crop.py # Content-basierter Crop-Algorithmus
|
||||
├── ocr_pipeline_session_store.py # PostgreSQL Persistence
|
||||
├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export
|
||||
@@ -169,7 +172,8 @@ admin-lehrer/
|
||||
├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung
|
||||
├── StepWordRecognition.tsx # Schritt 7: Worterkennung
|
||||
├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream)
|
||||
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas)
|
||||
├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas + Overlay)
|
||||
├── usePixelWordPositions.ts # Shared Hook: Pixel-basierte Wortpositionierung
|
||||
├── FabricReconstructionCanvas.tsx # Fabric.js Editor
|
||||
└── StepGroundTruth.tsx # Schritt 10: Validierung
|
||||
```
|
||||
@@ -257,10 +261,20 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`.
|
||||
|
||||
| Methode | Pfad | Beschreibung |
|
||||
|---------|------|--------------|
|
||||
| `POST` | `/sessions/{id}/words` | Wort-Grid aus Spalten x Zeilen erstellen |
|
||||
| `POST` | `/sessions/{id}/words` | Wort-Grid erstellen |
|
||||
| `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern |
|
||||
| `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen |
|
||||
|
||||
**Query-Parameter fuer `/sessions/{id}/words`:**
|
||||
|
||||
| Parameter | Default | Beschreibung |
|
||||
|-----------|---------|--------------|
|
||||
| `engine` | `auto` | OCR-Engine: `auto`, `tesseract`, `rapid` |
|
||||
| `pronunciation` | `british` | IPA-Woerterbuch: `british` oder `american` |
|
||||
| `stream` | `false` | SSE-Streaming (nur bei `grid_method=v2`) |
|
||||
| `skip_heal_gaps` | `false` | Zeilen-Luecken nicht heilen (Overlay-Modus) |
|
||||
| `grid_method` | `v2` | Grid-Strategie: `v2` (top-down) oder `words_first` (bottom-up) |
|
||||
|
||||
### Schritt 8: Korrektur
|
||||
|
||||
| Methode | Pfad | Beschreibung |
|
||||
@@ -513,6 +527,12 @@ Horizontale Projektionsprofile finden Zeilen-Luecken; word-level Validierung ver
|
||||
der entstehenden Luecke ausgedehnt, damit kein Zeileninhalt durch schrumpfende Grenzen
|
||||
abgeschnitten wird.
|
||||
|
||||
3. **Box-Boundary-Schutz** (`box_ranges_inner`, neu in v4.2):
|
||||
Bei Seiten mit Box-Zonen (Sub-Sessions) werden Zeilen am Box-Rand nicht faelschlich
|
||||
ausgeschlossen. Das Problem: Die letzte Textzeile ueber einer Box ueberlappt haeufig
|
||||
mit dem Box-Rahmen. Loesung: Die Exclusion-Zone wird um `max(border_thickness, 5px)`
|
||||
geschrumpft, sodass nur Zeilen **innerhalb** der Box ausgeschlossen werden.
|
||||
|
||||
```python
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
|
||||
"""Zeile ist Artefakt wenn alle Tokens <= 1 Zeichen."""
|
||||
@@ -524,13 +544,128 @@ def _heal_row_gaps(rows, top_bound, bottom_bound):
|
||||
...
|
||||
```
|
||||
|
||||
### Box-Zonen und Content-Strips (Detail)
|
||||
|
||||
Seiten mit Box-Bereichen (z.B. Grammatik-Tipps, Uebungsboxen) werden in Zonen aufgeteilt:
|
||||
|
||||
```
|
||||
┌──────────────────────────┐
|
||||
│ Content Zone 0 (Zeilen) │ ← Vokabeltabelle oben
|
||||
├──────────────────────────┤
|
||||
│ ███ Box Zone (border) ███│ ← Sub-Session mit eigener OCR
|
||||
├──────────────────────────┤
|
||||
│ Content Zone 2 (Zeilen) │ ← Vokabeltabelle unten
|
||||
└──────────────────────────┘
|
||||
```
|
||||
|
||||
**Content-Strip-Verfahren** (`detect_rows` in `ocr_pipeline_api.py`):
|
||||
|
||||
1. Box-Zonen identifizieren, `box_ranges_inner` berechnen (geschrumpft um Border-Dicke)
|
||||
2. Content-Strips = Seitenbereiche **ohne** Box-Inneres, vertikal gestapelt
|
||||
3. Zeilenerkennung auf gestapeltem Bild, Y-Koordinaten zurueckgemappt
|
||||
4. Wort-Filterung: Woerter in Box-Innerem werden ausgeschlossen
|
||||
|
||||
**Wichtig:** `box_ranges_inner` (nicht `box_ranges`) wird verwendet, damit
|
||||
Zeilen am Box-Rand nicht abgeschnitten werden. Minimum 5px Margin.
|
||||
|
||||
---
|
||||
|
||||
## Schritt 7: Worterkennung — Hybrid-Grid (Detail)
|
||||
## Schritt 7: Worterkennung (Detail)
|
||||
|
||||
### Algorithmus: `build_cell_grid_v2()`
|
||||
Schritt 7 bietet zwei Grid-Strategien, auswaehlbar per `grid_method`-Parameter:
|
||||
|
||||
Schritt 5 nutzt eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||
| Strategie | Parameter | Ansatz | Benoetigt Spalten/Zeilen? |
|
||||
|-----------|-----------|--------|--------------------------|
|
||||
| **Hybrid-Grid v2** | `grid_method=v2` (Default) | Top-down: Spalten → Zeilen → Zellen → OCR | Ja (Schritte 5+6) |
|
||||
| **Words-First** | `grid_method=words_first` | Bottom-up: Woerter → Spalten clustern → Zeilen clustern → Zellen | Nein |
|
||||
|
||||
---
|
||||
|
||||
### Words-First Grid Builder: `build_grid_from_words()`
|
||||
|
||||
**Datei:** `cv_words_first.py`
|
||||
|
||||
Der Words-First Builder arbeitet bottom-up: Er nimmt die pixelgenauen `word_boxes` aus einem
|
||||
Tesseract Full-Page-Lauf und clustert sie direkt zu Spalten und Zeilen — ohne die
|
||||
vorherige Spalten-/Zeilenerkennung (Schritte 5+6) zu benoetigen.
|
||||
|
||||
#### Algorithmus
|
||||
|
||||
```
|
||||
Eingabe: word_dicts (flat list), img_w, img_h
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 1. Confidence-Filter │
|
||||
│ conf >= 30 │
|
||||
│ Whitespace entf. │
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 2. _cluster_columns() │
|
||||
│ X-Gap-Analyse │
|
||||
│ Schwelle: median_h │
|
||||
│ × 3 (min 3% Breite)│
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 3. _cluster_rows() │
|
||||
│ Y-Proximity-Grupp. │
|
||||
│ Toleranz: median_h │
|
||||
│ / 2 │
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
┌───────────┴───────────┐
|
||||
│ 4. _build_cells() │
|
||||
│ Wort → (col, row) │
|
||||
│ Text + bbox + conf │
|
||||
│ word_boxes pro Zelle│
|
||||
└───────────┬───────────┘
|
||||
│
|
||||
Ausgabe: cells[], columns_meta[]
|
||||
(identisch zu build_cell_grid_v2)
|
||||
```
|
||||
|
||||
#### Spalten-Clustering
|
||||
|
||||
1. Alle Woerter nach X-Mitte sortieren
|
||||
2. Aufeinanderfolgende X-Gaps berechnen
|
||||
3. Adaptiver Schwellwert: `median_word_height × 3` (min 3% Bildbreite)
|
||||
4. Gaps > Schwellwert = Spaltengrenzen
|
||||
5. Kein Gap gefunden → 1 Spalte (`column_text`)
|
||||
6. Spaltentypen: `column_1`, `column_2`, ... (generisch, positionsbasiert)
|
||||
|
||||
#### Zeilen-Clustering
|
||||
|
||||
1. Woerter zu visuellen Zeilen gruppieren (Y-Toleranz: halbe Worthoehe)
|
||||
2. Jede visuelle Zeile = eine Zeile im Grid
|
||||
3. Sortiert von oben nach unten
|
||||
|
||||
#### Edge Cases
|
||||
|
||||
| Fall | Behandlung |
|
||||
|------|------------|
|
||||
| Einzelne Spalte (Fliesstext) | Kein X-Gap → 1 Spalte `column_text` |
|
||||
| Keine Woerter erkannt | Leeres Ergebnis `([], [])` |
|
||||
| Ueberschriften (grosse Schrift) | Eigene Zeile durch Y-Gap |
|
||||
| Bilder/Grafiken | Keine Woerter → automatisch leerer Bereich |
|
||||
| Schmale Spalten (Seitenzahlen) | Eigene Spalte durch X-Gap |
|
||||
|
||||
#### Vergleich v2 vs. Words-First
|
||||
|
||||
| Kriterium | v2 (Top-Down) | Words-First (Bottom-Up) |
|
||||
|-----------|---------------|------------------------|
|
||||
| **Abhaengigkeiten** | Spalten + Zeilen noetig | Nur Tesseract-Woerter |
|
||||
| **Spaltentypen** | Semantisch (EN, DE, ...) | Positionsbasiert (1, 2, ...) |
|
||||
| **OCR** | Hybrid (full-page + cell-crop) | Nur full-page Tesseract |
|
||||
| **Robustheit** | Abhaengig von Spalten-/Zeilenerkennung | Direkt aus Wortpositionen |
|
||||
| **Geschwindigkeit** | Langsamer (cell-crop pro Zelle) | Schneller (kein zusaetzlicher OCR-Lauf) |
|
||||
| **Genauigkeit** | Besser bei schmalen Spalten | Besser bei ungewoehnlichen Layouts |
|
||||
|
||||
---
|
||||
|
||||
### Hybrid-Grid v2: `build_cell_grid_v2()`
|
||||
|
||||
Schritt 7 nutzt im Default eine **Hybrid-Strategie**: Breite Spalten verwenden die Full-Page-Tesseract-Woerter,
|
||||
schmale Spalten werden isoliert per Cell-Crop OCR verarbeitet.
|
||||
|
||||
!!! success "Warum Hybrid?"
|
||||
@@ -692,7 +827,7 @@ Change-Format:
|
||||
|
||||
## Schritt 9: Rekonstruktion (Detail)
|
||||
|
||||
Zwei Modi verfuegbar:
|
||||
Drei Modi verfuegbar:
|
||||
|
||||
### Einfacher Modus
|
||||
|
||||
@@ -709,6 +844,73 @@ angezeigt, alle Grid-Zellen (auch leere!) werden als editierbare Textfelder daru
|
||||
- Zoom 50–200 %
|
||||
- Per-Zell-Reset-Button bei geaenderten Zellen
|
||||
|
||||
### Overlay-Modus (neu in v4.2)
|
||||
|
||||
Ganzseitige Tabellenrekonstruktion mit **Pixel-basierter Wortpositionierung**.
|
||||
Nur verfuegbar bei Parent-Sessions mit Sub-Sessions (Box-Bereiche).
|
||||
|
||||
**Funktionsweise:**
|
||||
|
||||
1. **Sub-Session-Merging:** Zellen aus Sub-Sessions werden koordinaten-konvertiert
|
||||
und in die Parent-Session eingefuegt. Die Umrechnung laeuft ueber die Box-Zone:
|
||||
```
|
||||
parentCellX = boxXPct + (subCell.bbox_pct.x / 100) * boxWPct
|
||||
parentCellY = boxYPct + (subCell.bbox_pct.y / 100) * boxHPct
|
||||
```
|
||||
|
||||
2. **180°-Rotation:** Bei Parent-Sessions mit Boxen wird das Bild standardmaessig
|
||||
180° gedreht, da der Scan haeufig kopfueber vorliegt. Die Pixel-Analyse
|
||||
arbeitet auf dem rotierten Bild:
|
||||
- Canvas: `ctx.translate(W, H); ctx.rotate(Math.PI)`
|
||||
- Zell-Koordinaten: `(100 - x - w, 100 - y - h)` fuer rotiertes Space
|
||||
- Cluster-Ruecktransformation: `start → cw-1-end`, danach `reverse()`
|
||||
|
||||
3. **Pixel-Wortpositionierung:** Der `usePixelWordPositions` Hook analysiert
|
||||
dunkle Pixel per vertikaler Projektion, findet Wortgruppen-Cluster und
|
||||
berechnet die exakte horizontale Position + Auto-Schriftgroesse.
|
||||
|
||||
**Layout:** 50/50 Grid (links Originalbild, rechts Rekonstruktion)
|
||||
|
||||
**Toolbar:**
|
||||
|
||||
- Schriftgroessen-Slider (30–120%)
|
||||
- Bold-Toggle
|
||||
- 180°-Rotations-Toggle
|
||||
- Speichern-Button
|
||||
|
||||
**Visuelle Elemente:**
|
||||
|
||||
- Spaltenlinien (aus `column_result.columns`)
|
||||
- Zeilenlinien (aus `row_result.rows`)
|
||||
- Box-Zonen-Markierung (blau, halbtransparent)
|
||||
- Editierbare Inputs an Pixel-Positionen
|
||||
|
||||
### Shared Hook: `usePixelWordPositions`
|
||||
|
||||
Extrahierter Hook fuer die Pixel-basierte Wortpositionierung, genutzt in
|
||||
StepLlmReview (Schritt 8) und StepReconstruction (Schritt 9).
|
||||
|
||||
```typescript
|
||||
function usePixelWordPositions(
|
||||
imageUrl: string,
|
||||
cells: GridCell[],
|
||||
active: boolean,
|
||||
rotation: 0 | 180 = 0,
|
||||
): Map<string, WordPosition[]>
|
||||
```
|
||||
|
||||
**Algorithmus:**
|
||||
|
||||
1. Bild in offscreen Canvas laden (optional 180° gedreht)
|
||||
2. Pro Zelle: `getImageData()` → vertikale Projektion (dunkle Pixel pro Spalte)
|
||||
3. Cluster-Erkennung (Schwelle: 3% der Zellhoehe, Gap: 2% der Zellbreite)
|
||||
4. Bei Rotation: Cluster zurueck ins Original-Koordinatensystem spiegeln
|
||||
5. Text-Gruppen (split bei 3+ Leerzeichen) auf Cluster matchen
|
||||
6. Auto-Schriftgroesse per `measureText()` + `fontRatio`
|
||||
7. Mode-Normalisierung: Haeufigste `fontRatio` (gerundet auf 0.02) auf alle anwenden
|
||||
|
||||
**Rueckgabe:** `Map<cell_id, WordPosition[]>` mit `xPct`, `wPct`, `text`, `fontRatio`
|
||||
|
||||
### Fabric.js Editor
|
||||
|
||||
Erweiterter Canvas-Editor (`FabricReconstructionCanvas.tsx`):
|
||||
@@ -861,6 +1063,8 @@ ssh macmini "/usr/local/bin/docker compose -f /Users/benjaminadmin/Projekte/brea
|
||||
|
||||
| Datum | Version | Aenderung |
|
||||
|-------|---------|----------|
|
||||
| 2026-03-12 | 4.3.0 | Words-First Grid Builder (`cv_words_first.py`): Bottom-up-Algorithmus clustert Tesseract word_boxes direkt zu Spalten/Zeilen/Zellen. Neuer `grid_method` Parameter im `/words` Endpoint. Frontend-Toggle in StepWordRecognition. |
|
||||
| 2026-03-10 | 4.2.0 | Rekonstruktion: Overlay-Modus mit Pixel-Wortpositionierung, 180°-Rotation, Sub-Session-Merging, usePixelWordPositions Hook, Box-Boundary-Schutz (box_ranges_inner) |
|
||||
| 2026-03-05 | 3.1.0 | Spalten: Seiten-Segmentierung an Sub-Headern, Word-Coverage Fallback, Segment-gefilterte Validierung |
|
||||
| 2026-03-05 | 3.0.1 | Dewarp: Feinabstimmung mit 7 Schiebereglern (3 Rotation + 4 Shear), Combined-Adjust-Endpoint |
|
||||
| 2026-03-05 | 3.0.0 | Doku-Update: Dokumenttyp-Erkennung, Hybrid-Grid, Sub-Column-Detection, Pipeline-Pfade |
|
||||
|
||||
@@ -34,3 +34,4 @@ from cv_ocr_engines import ( # noqa: F401
|
||||
_fix_phonetic_brackets,
|
||||
)
|
||||
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
|
||||
from cv_words_first import build_grid_from_words # noqa: F401
|
||||
|
||||
282
klausur-service/backend/cv_words_first.py
Normal file
282
klausur-service/backend/cv_words_first.py
Normal file
@@ -0,0 +1,282 @@
|
||||
"""
|
||||
Words-First Grid Builder (Bottom-Up).
|
||||
|
||||
Builds a cell grid from Tesseract word_boxes directly, without requiring
|
||||
pre-detected columns or rows. Algorithm:
|
||||
|
||||
1. Cluster words into columns by X-gap analysis
|
||||
2. Cluster words into rows by Y-proximity
|
||||
3. Build cells at (column, row) intersections
|
||||
|
||||
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import statistics
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_text,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Column clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_columns(
|
||||
words: List[Dict],
|
||||
img_w: int,
|
||||
min_gap_pct: float = 3.0,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into columns by finding large horizontal gaps.
|
||||
|
||||
Returns a list of column dicts:
|
||||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||
sorted left-to-right.
|
||||
"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Sort by X center
|
||||
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
|
||||
|
||||
# Collect word heights to compute adaptive threshold
|
||||
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
|
||||
median_h = statistics.median(heights) if heights else 30
|
||||
|
||||
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
|
||||
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
|
||||
|
||||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||
# For each word, compute right edge; for next word, compute left edge
|
||||
boundaries: List[float] = [] # X positions where columns split
|
||||
for i in range(len(sorted_w) - 1):
|
||||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||
left_edge = sorted_w[i + 1]['left']
|
||||
gap = left_edge - right_edge
|
||||
if gap > min_gap_px:
|
||||
# Split point is midway through the gap
|
||||
boundaries.append((right_edge + left_edge) / 2)
|
||||
|
||||
# Build column ranges from boundaries
|
||||
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||
columns = []
|
||||
for ci in range(len(col_edges) - 1):
|
||||
columns.append({
|
||||
'index': ci,
|
||||
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
|
||||
'x_min': col_edges[ci],
|
||||
'x_max': col_edges[ci + 1],
|
||||
})
|
||||
|
||||
return columns
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Row clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_rows(
    words: List[Dict],
) -> List[Dict[str, Any]]:
    """Group words into visual rows via Y-proximity clustering.

    The Y-tolerance is half the median word height (minimum 5 px); the
    actual line grouping is delegated to _group_words_into_lines().

    Args:
        words: Word dicts with pixel keys 'top' and 'height'.

    Returns:
        Row dicts [{'index', 'y_min', 'y_max', 'y_center'}, ...]
        sorted top-to-bottom.
    """
    if not words:
        return []

    # Half the median word height sets the vertical grouping tolerance.
    positive_heights = [w['height'] for w in words if w.get('height', 0) > 0]
    median_h = statistics.median(positive_heights) if positive_heights else 20
    tolerance = max(median_h * 0.5, 5)

    grouped_lines = _group_words_into_lines(words, y_tolerance_px=int(tolerance))

    rows: List[Dict[str, Any]] = []
    for ri, line_words in enumerate(grouped_lines):
        top = min(w['top'] for w in line_words)
        bottom = max(w['top'] + w['height'] for w in line_words)
        rows.append({
            'index': ri,
            'y_min': top,
            'y_max': bottom,
            'y_center': (top + bottom) / 2,
        })
    return rows
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Build cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||||
"""Return column index for a word based on its X-center."""
|
||||
x_center = word['left'] + word['width'] / 2
|
||||
for col in columns:
|
||||
if col['x_min'] <= x_center < col['x_max']:
|
||||
return col['index']
|
||||
# Fallback: nearest column
|
||||
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - x_center))['index']
|
||||
|
||||
|
||||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||
"""Return row index for a word based on its Y-center."""
|
||||
y_center = word['top'] + word['height'] / 2
|
||||
# Find the row whose y_range contains this word's center
|
||||
for row in rows:
|
||||
if row['y_min'] <= y_center <= row['y_max']:
|
||||
return row['index']
|
||||
# Fallback: nearest row by Y-center
|
||||
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||
|
||||
|
||||
def _build_cells(
    words: List[Dict],
    columns: List[Dict],
    rows: List[Dict],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build cell dicts by bucketing words into (column, row) pairs.

    Args:
        words: Filtered word dicts with pixel keys left/top/width/height
            plus 'text' and 'conf'.
        columns: Column dicts from _cluster_columns(); their 'index'
            values equal their list positions.
        rows: Row dicts from _cluster_rows(); same index invariant.
        img_w: Image width in pixels (0 yields 0 for percent coords).
        img_h: Image height in pixels (0 yields 0 for percent coords).

    Returns:
        Cell dicts in row-major order (top-to-bottom, left-to-right),
        same format as build_cell_grid_v2() produces. Only intersections
        that actually contain words are emitted — empty cells are skipped.
    """
    if not columns or not rows:
        return []

    # Bucket words into (col_idx, row_idx)
    buckets: Dict[Tuple[int, int], List[Dict]] = {}
    for w in words:
        ci = _assign_word_to_column(w, columns)
        ri = _assign_word_to_row(w, rows)
        buckets.setdefault((ci, ri), []).append(w)

    cells = []
    # Row-major iteration: sort by row index first, then column index.
    for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
        col = columns[ci]  # safe: column 'index' equals list position

        # Compute tight bbox from actual word positions (not column/row extents)
        x_min = min(w['left'] for w in cell_words)
        y_min = min(w['top'] for w in cell_words)
        x_max = max(w['left'] + w['width'] for w in cell_words)
        y_max = max(w['top'] + w['height'] for w in cell_words)
        bw = x_max - x_min
        bh = y_max - y_min

        # Text from words in reading order; Y-tolerance scales with cell height
        text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))

        # Average confidence over words reporting a positive confidence
        confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
        avg_conf = sum(confs) / len(confs) if confs else 0.0

        # Word boxes converted to percent coordinates, sorted top-left first
        word_boxes = []
        for w in sorted(cell_words, key=lambda ww: (ww['top'], ww['left'])):
            word_boxes.append({
                'text': w.get('text', ''),
                'left': round(w['left'] / img_w * 100, 2) if img_w else 0,
                'top': round(w['top'] / img_h * 100, 2) if img_h else 0,
                'width': round(w['width'] / img_w * 100, 2) if img_w else 0,
                'height': round(w['height'] / img_h * 100, 2) if img_h else 0,
                'conf': w.get('conf', 0),
            })

        cells.append({
            'cell_id': f"R{ri:02d}_C{ci}",
            'row_index': ri,
            'col_index': ci,
            'col_type': col['type'],
            'text': text,
            'confidence': round(avg_conf, 1),
            'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
            'bbox_pct': {
                'x': round(x_min / img_w * 100, 2) if img_w else 0,
                'y': round(y_min / img_h * 100, 2) if img_h else 0,
                'w': round(bw / img_w * 100, 2) if img_w else 0,
                'h': round(bh / img_h * 100, 2) if img_h else 0,
            },
            'word_boxes': word_boxes,
            'ocr_engine': 'words_first',
            'is_bold': False,
        })

    return cells
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Pipeline: confidence filter -> column clustering (X-gaps) ->
    row clustering (Y-proximity) -> cell construction at the
    (column, row) intersections.

    Args:
        word_dicts: Flat list of word dicts with keys:
            text, left, top, width, height, conf
            (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Drop low-confidence and whitespace-only words before clustering.
    usable = [
        w for w in word_dicts
        if w.get('conf', 0) >= min_confidence and w.get('text', '').strip()
    ]
    if not usable:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(usable), len(word_dicts))

    # Step 1: columns from X-gap analysis.
    columns = _cluster_columns(usable, img_w)
    logger.info("build_grid_from_words: %d column(s) detected", len(columns))

    # Step 2: rows from Y-proximity grouping.
    rows = _cluster_rows(usable)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: cells at the column/row intersections.
    cells = _build_cells(usable, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # Mirror the columns_meta shape emitted by build_cell_grid_v2.
    columns_meta = [
        {
            'index': col['index'],
            'type': col['type'],
            'x': int(col['x_min']),
            'width': int(col['x_max'] - col['x_min']),
        }
        for col in columns
    ]

    return cells, columns_meta
|
||||
@@ -71,6 +71,7 @@ from cv_vocab_pipeline import (
|
||||
render_image_high_res,
|
||||
render_pdf_high_res,
|
||||
)
|
||||
from cv_words_first import build_grid_from_words
|
||||
from ocr_pipeline_session_store import (
|
||||
create_session_db,
|
||||
delete_all_sessions_db,
|
||||
@@ -1859,6 +1860,7 @@ async def detect_words(
|
||||
pronunciation: str = "british",
|
||||
stream: bool = False,
|
||||
skip_heal_gaps: bool = False,
|
||||
grid_method: str = "v2",
|
||||
):
|
||||
"""Build word grid from columns × rows, OCR each cell.
|
||||
|
||||
@@ -1868,6 +1870,9 @@ async def detect_words(
|
||||
stream: false (default) for JSON response, true for SSE streaming
|
||||
skip_heal_gaps: false (default). When true, cells keep exact row geometry
|
||||
positions without gap-healing expansion. Better for overlay rendering.
|
||||
grid_method: 'v2' (default) or 'words_first' — grid construction strategy.
|
||||
'v2' uses pre-detected columns/rows (top-down).
|
||||
'words_first' clusters words bottom-up (no column/row detection needed).
|
||||
"""
|
||||
if session_id not in _cache:
|
||||
logger.info("detect_words: session %s not in cache, loading from DB", session_id)
|
||||
@@ -1902,7 +1907,7 @@ async def detect_words(
|
||||
"duration_seconds": 0,
|
||||
}
|
||||
logger.info("detect_words: no column_result — using full-page pseudo-column %dx%d", img_w_tmp, img_h_tmp)
|
||||
if not row_result or not row_result.get("rows"):
|
||||
if grid_method != "words_first" and (not row_result or not row_result.get("rows")):
|
||||
raise HTTPException(status_code=400, detail="Row detection must be completed first")
|
||||
|
||||
# Convert column dicts back to PageRegion objects
|
||||
@@ -1983,6 +1988,102 @@ async def detect_words(
|
||||
if excluded:
|
||||
logger.info(f"detect_words: excluded {excluded} rows inside box zones")
|
||||
|
||||
# --- Words-First path: bottom-up grid from word boxes ---
|
||||
if grid_method == "words_first":
|
||||
t0 = time.time()
|
||||
img_h, img_w = dewarped_bgr.shape[:2]
|
||||
|
||||
# Get word_dicts from cache or run Tesseract full-page
|
||||
wf_word_dicts = cached.get("_word_dicts")
|
||||
if wf_word_dicts is None:
|
||||
ocr_img_tmp = create_ocr_image(dewarped_bgr)
|
||||
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
|
||||
if geo_result is not None:
|
||||
_geoms, left_x, right_x, top_y, bottom_y, wf_word_dicts, inv = geo_result
|
||||
cached["_word_dicts"] = wf_word_dicts
|
||||
cached["_inv"] = inv
|
||||
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||
|
||||
if not wf_word_dicts:
|
||||
raise HTTPException(status_code=400, detail="No words detected — cannot build words-first grid")
|
||||
|
||||
# Convert word coordinates to absolute image coordinates if needed
|
||||
# (detect_column_geometry returns words relative to content ROI)
|
||||
content_bounds = cached.get("_content_bounds")
|
||||
if content_bounds:
|
||||
lx, _rx, ty, _by = content_bounds
|
||||
abs_words = []
|
||||
for w in wf_word_dicts:
|
||||
abs_words.append({
|
||||
**w,
|
||||
'left': w['left'] + lx,
|
||||
'top': w['top'] + ty,
|
||||
})
|
||||
wf_word_dicts = abs_words
|
||||
|
||||
cells, columns_meta = build_grid_from_words(wf_word_dicts, img_w, img_h)
|
||||
duration = time.time() - t0
|
||||
|
||||
# Apply IPA phonetic fixes
|
||||
fix_cell_phonetics(cells, pronunciation=pronunciation)
|
||||
|
||||
# Add zone_index for backward compat
|
||||
for cell in cells:
|
||||
cell.setdefault("zone_index", 0)
|
||||
|
||||
col_types = {c['type'] for c in columns_meta}
|
||||
is_vocab = bool(col_types & {'column_en', 'column_de'})
|
||||
n_rows = len(set(c['row_index'] for c in cells)) if cells else 0
|
||||
n_cols = len(columns_meta)
|
||||
used_engine = "words_first"
|
||||
|
||||
word_result = {
|
||||
"cells": cells,
|
||||
"grid_shape": {
|
||||
"rows": n_rows,
|
||||
"cols": n_cols,
|
||||
"total_cells": len(cells),
|
||||
},
|
||||
"columns_used": columns_meta,
|
||||
"layout": "vocab" if is_vocab else "generic",
|
||||
"image_width": img_w,
|
||||
"image_height": img_h,
|
||||
"duration_seconds": round(duration, 2),
|
||||
"ocr_engine": used_engine,
|
||||
"grid_method": "words_first",
|
||||
"summary": {
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": sum(1 for c in cells if c.get("text")),
|
||||
"low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
|
||||
},
|
||||
}
|
||||
|
||||
if is_vocab or 'column_text' in col_types:
|
||||
entries = _cells_to_vocab_entries(cells, columns_meta)
|
||||
entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
|
||||
word_result["vocab_entries"] = entries
|
||||
word_result["entries"] = entries
|
||||
word_result["entry_count"] = len(entries)
|
||||
word_result["summary"]["total_entries"] = len(entries)
|
||||
word_result["summary"]["with_english"] = sum(1 for e in entries if e.get("english"))
|
||||
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
|
||||
|
||||
await update_session_db(session_id, word_result=word_result, current_step=8)
|
||||
cached["word_result"] = word_result
|
||||
|
||||
logger.info(f"OCR Pipeline: words-first session {session_id}: "
|
||||
f"{len(cells)} cells ({duration:.2f}s), {n_rows} rows, {n_cols} cols")
|
||||
|
||||
await _append_pipeline_log(session_id, "words", {
|
||||
"grid_method": "words_first",
|
||||
"total_cells": len(cells),
|
||||
"non_empty_cells": word_result["summary"]["non_empty_cells"],
|
||||
"ocr_engine": used_engine,
|
||||
"layout": word_result["layout"],
|
||||
}, duration_ms=int(duration * 1000))
|
||||
|
||||
return {"session_id": session_id, **word_result}
|
||||
|
||||
if stream:
|
||||
# Cell-First OCR v2: use batch-then-stream approach instead of
|
||||
# per-cell streaming. The parallel ThreadPoolExecutor in
|
||||
@@ -2001,7 +2102,7 @@ async def detect_words(
|
||||
},
|
||||
)
|
||||
|
||||
# --- Non-streaming path ---
|
||||
# --- Non-streaming path (grid_method=v2) ---
|
||||
t0 = time.time()
|
||||
|
||||
# Create binarized OCR image (for Tesseract)
|
||||
|
||||
New file: klausur-service/backend/tests/test_cv_words_first.py (214 lines)
@@ -0,0 +1,214 @@
|
||||
"""Tests for cv_words_first.py — Words-First Grid Builder."""
|
||||
|
||||
import pytest
|
||||
from cv_words_first import (
|
||||
_assign_word_to_column,
|
||||
_assign_word_to_row,
|
||||
_build_cells,
|
||||
_cluster_columns,
|
||||
_cluster_rows,
|
||||
build_grid_from_words,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _word(text: str, left: int, top: int, width: int = 60, height: int = 20, conf: int = 90):
|
||||
"""Create a synthetic word dict."""
|
||||
return {
|
||||
'text': text,
|
||||
'left': left,
|
||||
'top': top,
|
||||
'width': width,
|
||||
'height': height,
|
||||
'conf': conf,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_columns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterColumns:
    """Column clustering: words are grouped by horizontal (X-gap) proximity."""

    def test_single_column_freetext(self):
        """Free-flowing text with no large X-gaps yields a single 'column_text'."""
        coords = [("Hello", 50, 10), ("world", 120, 10),
                  ("this", 50, 40), ("is", 120, 40), ("text", 190, 40)]
        words = [_word(t, x, y) for t, x, y in coords]
        cols = _cluster_columns(words, img_w=400)
        assert len(cols) == 1
        assert cols[0]['type'] == 'column_text'

    def test_two_columns(self):
        """A wide horizontal gap between two word groups produces two columns."""
        words = [_word(t, x, y) for t, x, y in
                 [("apple", 20, 10), ("Apfel", 300, 10),
                  ("dog", 20, 40), ("Hund", 300, 40)]]
        cols = _cluster_columns(words, img_w=500)
        assert len(cols) == 2
        assert [c['type'] for c in cols] == ['column_1', 'column_2']

    def test_three_columns(self):
        """Three well-separated word groups are detected as three columns."""
        words = [
            _word("1", 10, 10, width=20),
            _word("apple", 100, 10),
            _word("Apfel", 400, 10),
            _word("2", 10, 40, width=20),
            _word("dog", 100, 40),
            _word("Hund", 400, 40),
        ]
        assert len(_cluster_columns(words, img_w=600)) == 3

    def test_empty_words(self):
        """An empty word list clusters to an empty column list."""
        assert _cluster_columns([], img_w=500) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _cluster_rows
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestClusterRows:
    """Row clustering: words are grouped by vertical (Y) proximity."""

    def test_two_rows(self):
        """Words at two distinct Y-levels form two rows, ordered top to bottom."""
        words = [_word(t, x, y) for t, x, y in
                 [("hello", 10, 20), ("world", 100, 25),
                  ("foo", 10, 80), ("bar", 100, 82)]]
        rows = _cluster_rows(words)
        assert len(rows) == 2
        # Rows must come back sorted top-down.
        assert rows[0]['y_min'] < rows[1]['y_min']

    def test_single_row(self):
        """Near-identical baselines merge into a single row."""
        words = [_word("a", 10, 50), _word("b", 80, 52), _word("c", 150, 51)]
        assert len(_cluster_rows(words)) == 1

    def test_empty(self):
        """No words means no rows."""
        assert _cluster_rows([]) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_grid_from_words (integration)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBuildGridFromWords:
    """End-to-end tests for the words-first grid builder entry point."""

    def test_two_column_vocab(self):
        """A 2-column / 3-row vocabulary layout yields a full 3x2 cell grid."""
        pairs = [("apple", "Apfel", 20, 22),
                 ("dog", "Hund", 60, 62),
                 ("cat", "Katze", 100, 102)]
        words = []
        for en, de, y_en, y_de in pairs:
            words.append(_word(en, 50, y_en))
            words.append(_word(de, 400, y_de))
        cells, cols_meta = build_grid_from_words(words, img_w=600, img_h=200)

        assert len(cols_meta) == 2
        assert len(cells) == 6  # 3 rows x 2 columns
        # Cell ids follow the RNN_CN naming convention.
        ids = {c['cell_id'] for c in cells}
        assert {'R00_C0', 'R00_C1'} <= ids

    def test_single_column_freetext(self):
        """Plain running text collapses to one 'column_text' with a cell per line."""
        words = [_word("Hello", 50, 20), _word("world", 120, 22),
                 _word("Second", 50, 60), _word("line", 120, 62)]
        cells, cols_meta = build_grid_from_words(words, img_w=300, img_h=150)

        assert len(cols_meta) == 1
        assert cols_meta[0]['type'] == 'column_text'
        assert len(cells) == 2  # one cell per text line

    def test_empty_input(self):
        """No input words produce an empty grid and no column metadata."""
        cells, cols = build_grid_from_words([], img_w=500, img_h=500)
        assert cells == []
        assert cols == []

    def test_low_confidence_filtered(self):
        """Words under the min_confidence threshold never reach the grid."""
        words = [_word("good", 50, 20, conf=90), _word("bad", 200, 20, conf=10)]
        cells, cols = build_grid_from_words(words, img_w=400, img_h=100, min_confidence=30)
        # Only the high-confidence word survives as a cell.
        assert len(cells) == 1
        assert cells[0]['text'] == 'good'

    def test_bbox_pct_correct(self):
        """bbox_pct converts pixel geometry into page-relative percentages."""
        cells, _ = build_grid_from_words([_word("test", 200, 100, width=100, height=30)],
                                         img_w=1000, img_h=500)
        assert len(cells) == 1
        bp = cells[0]['bbox_pct']
        # 200/1000, 100/500, 100/1000, 30/500 — each scaled to percent.
        assert (bp['x'], bp['y'], bp['w'], bp['h']) == (20.0, 20.0, 10.0, 6.0)

    def test_columns_meta_format(self):
        """columns_meta entries carry the same keys as build_cell_grid_v2 output."""
        words = [_word("a", 50, 20), _word("b", 400, 20)]
        _, cols_meta = build_grid_from_words(words, img_w=600, img_h=100)
        for col in cols_meta:
            assert {'index', 'type', 'x', 'width'} <= set(col)

    def test_word_boxes_included(self):
        """Every cell exposes its member words as percent-coordinate word_boxes."""
        words = [_word("hello", 50, 20), _word("world", 120, 22)]
        cells, _ = build_grid_from_words(words, img_w=300, img_h=100)
        assert len(cells) == 1  # single row, single column
        boxes = cells[0].get('word_boxes', [])
        assert len(boxes) == 2
        for box in boxes:
            assert 'left' in box
            assert 'top' in box
            assert 'text' in box

    def test_all_whitespace_filtered(self):
        """Whitespace-only OCR words are dropped before grid construction."""
        words = [_word("   ", 50, 20, conf=90), _word("hello", 200, 20, conf=90)]
        cells, _ = build_grid_from_words(words, img_w=400, img_h=100)
        assert len(cells) == 1
        assert cells[0]['text'] == 'hello'
|
||||
Reference in New Issue
Block a user