diff --git a/admin-lehrer/components/ocr-pipeline/FabricReconstructionCanvas.tsx b/admin-lehrer/components/ocr-pipeline/FabricReconstructionCanvas.tsx index f99e8b8..210046f 100644 --- a/admin-lehrer/components/ocr-pipeline/FabricReconstructionCanvas.tsx +++ b/admin-lehrer/components/ocr-pipeline/FabricReconstructionCanvas.tsx @@ -101,7 +101,7 @@ export function FabricReconstructionCanvas({ if (!canvasEl) return // Load background image first to get dimensions - const imgUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + const imgUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` const bgImg = await fabricModule.FabricImage.fromURL(imgUrl, { crossOrigin: 'anonymous' }) as FabricImage diff --git a/admin-lehrer/components/ocr-pipeline/StepColumnDetection.tsx b/admin-lehrer/components/ocr-pipeline/StepColumnDetection.tsx index 30bad55..b707348 100644 --- a/admin-lehrer/components/ocr-pipeline/StepColumnDetection.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepColumnDetection.tsx @@ -192,7 +192,7 @@ export function StepColumnDetection({ sessionId, onNext }: StepColumnDetectionPr ) } - const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/columns-overlay` // Pre-compute editor state from saved GT or auto columns for GT mode diff --git a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx index 18eedc2..013f8fa 100644 --- a/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepLlmReview.tsx @@ -320,7 +320,7 @@ export function StepLlmReview({ sessionId, onNext }: StepLlmReviewProps) { } const dewarpedUrl = sessionId - ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` : '' if (!sessionId) { diff --git a/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx b/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx index baf5717..9b8d6d2 100644 --- a/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepReconstruction.tsx @@ -276,7 +276,7 @@ export function StepReconstruction({ sessionId, onNext }: StepReconstructionProp }, []) const dewarpedUrl = sessionId - ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` : '' const colTypeColor = (colType: string): string => { diff --git a/admin-lehrer/components/ocr-pipeline/StepRowDetection.tsx b/admin-lehrer/components/ocr-pipeline/StepRowDetection.tsx index 5bb5ad6..6d971ad 100644 --- a/admin-lehrer/components/ocr-pipeline/StepRowDetection.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepRowDetection.tsx @@ -95,7 +95,7 @@ export function StepRowDetection({ sessionId, onNext }: StepRowDetectionProps) { } const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/rows-overlay` - const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` const rowTypeColors: Record = { header: 'bg-gray-200 dark:bg-gray-600 text-gray-700 dark:text-gray-300', diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx index 75768a5..ed98818 100644 --- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx @@ -334,7 +334,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec } const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/words-overlay` - const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped` + const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` const confColor = (conf: number) => { if (conf >= 70) return 'text-green-600 dark:text-green-400' diff --git a/docs-src/services/klausur-service/OCR-Pipeline.md b/docs-src/services/klausur-service/OCR-Pipeline.md index 7b75253..4e32672 100644 --- a/docs-src/services/klausur-service/OCR-Pipeline.md +++ b/docs-src/services/klausur-service/OCR-Pipeline.md @@ -1,6 +1,6 @@ # OCR Pipeline - Schrittweise Seitenrekonstruktion -**Version:** 4.0.0 +**Version:** 4.1.0 **Status:** Produktiv (Schritte 1–10 implementiert) **URL:** https://macmini:3002/ai/ocr-pipeline @@ -17,9 +17,9 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v | Schritt | Name | Beschreibung | Status | |---------|------|--------------|--------| | 1 | Orientierung | 90/180/270° Drehungen von Scannern korrigieren | Implementiert | -| 2 | Zuschneiden (Crop) | Scannerraender entfernen, Papierformat (A4) erkennen | Implementiert | -| 3 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert | -| 4 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert | +| 2 | Begradigung (Deskew) | Scan begradigen (Hough Lines + Word Alignment) | Implementiert | +| 3 | Entzerrung (Dewarp) | Buchwoelbung entzerren (Vertikalkanten-Analyse) | Implementiert | +| 4 | Zuschneiden (Crop) | Content-basierter Crop: Buchruecken-Schatten + Ink-Projektion | Implementiert | | 5 | Spaltenerkennung | Unsichtbare Spalten finden (Projektionsprofile + Wortvalidierung) | Implementiert | | 6 | Zeilenerkennung | Horizontale Zeilen + Kopf-/Fusszeilen-Klassifikation + Luecken-Heilung | Implementiert | | 7 | Worterkennung | Hybrid-Grid: Breite Spalten full-page, schmale cell-crop | Implementiert | @@ -27,6 +27,11 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v | 9 | Rekonstruktion | Interaktive Zellenbearbeitung auf Bildhintergrund (Fabric.js) | Implementiert | | 10 | Validierung | Ground-Truth-Vergleich und Qualitaetspruefung | Implementiert | +!!! note "Reihenfolge-Aenderung (v4.1)" + Crop wurde hinter Deskew/Dewarp verschoben. Das Bild ist dann bereits gerade, + was den Content-basierten Crop deutlich zuverlaessiger macht — insbesondere + bei Buchscans mit Ruecken-Schatten und weissem Scanner-Hintergrund. + --- ## Dokumenttyp-Erkennung und Pipeline-Pfade @@ -34,7 +39,7 @@ Jeder Schritt kann individuell geprueft, korrigiert und mit Ground-Truth-Daten v ### Automatische Weiche: `detect_document_type()` Nicht jedes Dokument durchlaeuft denselben Pfad. Nach den gemeinsamen Vorverarbeitungsschritten -(Deskew, Dewarp, Binarisierung) analysiert `detect_document_type()` die Seitenstruktur +(Orientierung, Deskew, Dewarp, Crop) analysiert `detect_document_type()` die Seitenstruktur **ohne OCR** — rein ueber Projektionsprofile und Textdichte-Analyse (< 2 Sekunden). ``` @@ -69,10 +74,10 @@ flowchart TD ┌─────────────────────────────────────────────────────────────────────┐ │ GEMEINSAME VORVERARBEITUNG (alle Dokumente) │ │ │ -│ Stage 1: Render (432 DPI, 3× Zoom) │ -│ Stage 2: Deskew (Hough Lines + Ensemble) │ -│ Stage 3: Dewarp (Vertikalkanten-Drift, Ensemble Shear) │ -│ Stage 4: Dual-Bild (ocr_img = binarisiert, layout_img = CLAHE) │ +│ Schritt 1: Orientierung (90/180/270° Drehung korrigieren) │ +│ Schritt 2: Deskew (Hough Lines + Iterative Projektion + Ensemble) │ +│ Schritt 3: Dewarp (Vertikalkanten-Drift, Ensemble Shear) │ +│ Schritt 4: Crop (Content-basiert: Schatten + Ink-Projektion) │ └─────────────────────────────────────┬───────────────────────────────┘ │ detect_document_type() @@ -103,9 +108,9 @@ flowchart TD Post-Processing Pipeline (Lautschrift, Komma-Split, etc.) │ - Schritt 6: Korrektur (Spell) - Schritt 7: Rekonstruktion - Schritt 8: Validierung + Schritt 8: Korrektur (Spell) + Schritt 9: Rekonstruktion + Schritt 10: Validierung ``` --- @@ -140,7 +145,9 @@ Admin-Lehrer (Next.js) klausur-service (FastAPI :8086) klausur-service/backend/ ├── services/ │ └── cv_vocab_pipeline.py # Computer Vision + NLP Algorithmen -├── ocr_pipeline_api.py # FastAPI Router (alle Endpoints) +├── ocr_pipeline_api.py # FastAPI Router (Schritte 2-10) +├── orientation_crop_api.py # FastAPI Router (Schritte 1 + 4) +├── page_crop.py # Content-basierter Crop-Algorithmus ├── ocr_pipeline_session_store.py # PostgreSQL Persistence ├── layout_reconstruction_service.py # Fabric.js JSON + PDF/DOCX Export └── migrations/ @@ -154,15 +161,17 @@ admin-lehrer/ │ └── types.ts # TypeScript Interfaces └── components/ocr-pipeline/ ├── PipelineStepper.tsx # Fortschritts-Stepper - ├── StepDeskew.tsx # Schritt 1: Begradigung - ├── StepDewarp.tsx # Schritt 2: Entzerrung - ├── StepColumnDetection.tsx # Schritt 3: Spaltenerkennung - ├── StepRowDetection.tsx # Schritt 4: Zeilenerkennung - ├── StepWordRecognition.tsx # Schritt 5: Worterkennung - ├── StepLlmReview.tsx # Schritt 6: Korrektur (SSE-Stream) - ├── StepReconstruction.tsx # Schritt 7: Rekonstruktion (Canvas) + ├── StepOrientation.tsx # Schritt 1: Orientierung + ├── StepDeskew.tsx # Schritt 2: Begradigung + ├── StepDewarp.tsx # Schritt 3: Entzerrung + ├── StepCrop.tsx # Schritt 4: Zuschneiden + ├── StepColumnDetection.tsx # Schritt 5: Spaltenerkennung + ├── StepRowDetection.tsx # Schritt 6: Zeilenerkennung + ├── StepWordRecognition.tsx # Schritt 7: Worterkennung + ├── StepLlmReview.tsx # Schritt 8: Korrektur (SSE-Stream) + ├── StepReconstruction.tsx # Schritt 9: Rekonstruktion (Canvas) ├── FabricReconstructionCanvas.tsx # Fabric.js Editor - └── StepGroundTruth.tsx # Schritt 8: Validierung + └── StepGroundTruth.tsx # Schritt 10: Validierung ``` --- @@ -187,14 +196,22 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | Methode | Pfad | Beschreibung | |---------|------|--------------| | `GET` | `/sessions/{id}/image/original` | Originalbild | +| `GET` | `/sessions/{id}/image/oriented` | Orientiertes Bild | | `GET` | `/sessions/{id}/image/deskewed` | Begradigtes Bild | | `GET` | `/sessions/{id}/image/dewarped` | Entzerrtes Bild | +| `GET` | `/sessions/{id}/image/cropped` | Zugeschnittenes Bild | | `GET` | `/sessions/{id}/image/binarized` | Binarisiertes Bild | | `GET` | `/sessions/{id}/image/columns-overlay` | Spalten-Overlay | | `GET` | `/sessions/{id}/image/rows-overlay` | Zeilen-Overlay | | `GET` | `/sessions/{id}/image/words-overlay` | Wort-Grid-Overlay | -### Schritt 1: Begradigung +### Schritt 1: Orientierung + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/orientation` | 90/180/270° Drehung erkennen und korrigieren | + +### Schritt 2: Begradigung | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -202,7 +219,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `POST` | `/sessions/{id}/deskew/manual` | Manuelle Winkelkorrektur | | `POST` | `/sessions/{id}/ground-truth/deskew` | Ground Truth speichern | -### Schritt 2: Entzerrung +### Schritt 3: Entzerrung | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -211,7 +228,15 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `POST` | `/sessions/{id}/adjust-combined` | Kombinierte Rotation + Shear Feinabstimmung | | `POST` | `/sessions/{id}/ground-truth/dewarp` | Ground Truth speichern | -### Schritt 3: Spalten +### Schritt 4: Zuschneiden + +| Methode | Pfad | Beschreibung | +|---------|------|--------------| +| `POST` | `/sessions/{id}/crop` | Automatischer Content-Crop | +| `POST` | `/sessions/{id}/crop/manual` | Manueller Crop (Prozent-Koordinaten) | +| `POST` | `/sessions/{id}/crop/skip` | Crop ueberspringen | + +### Schritt 5: Spalten | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -219,7 +244,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `POST` | `/sessions/{id}/columns/manual` | Manuelle Spalten-Definition | | `POST` | `/sessions/{id}/ground-truth/columns` | Ground Truth speichern | -### Schritt 4: Zeilen +### Schritt 6: Zeilen | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -228,7 +253,7 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `POST` | `/sessions/{id}/ground-truth/rows` | Ground Truth speichern | | `GET` | `/sessions/{id}/ground-truth/rows` | Ground Truth abrufen | -### Schritt 5: Worterkennung +### Schritt 7: Worterkennung | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -236,14 +261,14 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `POST` | `/sessions/{id}/ground-truth/words` | Ground Truth speichern | | `GET` | `/sessions/{id}/ground-truth/words` | Ground Truth abrufen | -### Schritt 6: Korrektur +### Schritt 8: Korrektur | Methode | Pfad | Beschreibung | |---------|------|--------------| | `POST` | `/sessions/{id}/llm-review?stream=true` | SSE-Stream Korrektur starten | | `POST` | `/sessions/{id}/llm-review/apply` | Ausgewaehlte Korrekturen speichern | -### Schritt 7: Rekonstruktion +### Schritt 9: Rekonstruktion | Methode | Pfad | Beschreibung | |---------|------|--------------| @@ -253,12 +278,66 @@ Alle Endpoints unter `/api/v1/ocr-pipeline/`. | `GET` | `/sessions/{id}/reconstruction/export/docx` | DOCX-Export (python-docx) | | `POST` | `/sessions/{id}/reconstruction/detect-images` | Bildbereiche per VLM erkennen | | `POST` | `/sessions/{id}/reconstruction/generate-image` | Bild per mflux generieren | -| `POST` | `/sessions/{id}/reconstruction/validate` | Validierung speichern (Step 8) | +| `POST` | `/sessions/{id}/reconstruction/validate` | Validierung speichern (Step 10) | | `GET` | `/sessions/{id}/reconstruction/validation` | Validierungsdaten abrufen | --- -## Schritt 2: Entzerrung/Dewarp (Detail) +## Schritt 4: Zuschneiden/Crop (Detail) + +### Warum Crop nach Deskew/Dewarp? + +In frueheren Versionen lief Crop als Schritt 2 (vor Deskew). Das fuehrte zu Problemen: + +- **Schiefes Bild**: `boundingRect` einer schiefen Seite schliesst viel Scanner-Hintergrund ein +- **Buchscans**: Otsu-Binarisierung versagt bei weiss-auf-weiss (Seite auf weissem Scanner) +- **Buchruecken**: Gradueller Schatten-Uebergang wird nicht als Kante erkannt + +**Loesung (v4.1):** Crop laeuft jetzt nach Dewarp — das Bild ist dann gerade. + +### Algorithmus: Content-basierte 4-Kanten-Erkennung + +Datei: `page_crop.py` + +``` +Input: Entzerrtes BGR-Bild + │ + ├─ Adaptive Threshold (Gauss, blockSize=51) + │ → binary (Text=255, Hintergrund=0) + │ + ├─ Linker Rand (Buchruecken-Schatten): + │ 1. Grauwert-Spaltenmittel in linken 25% + │ 2. Glaetten mit Boxcar-Kernel + │ 3. Transition hell→dunkel finden (> 60% des Helligkeitsbereichs) + │ 4. Fallback: Binaere Vertikal-Projektion + │ + ├─ Rechter Rand: Binaere Vertikal-Projektion (letzte Ink-Spalte) + │ + ├─ Oben/Unten: Binaere Horizontal-Projektion (erste/letzte Ink-Zeile) + │ + ├─ Rausch-Filter: Runs < 0.5% der Dimension ignorieren + │ + ├─ Sanity-Checks: + │ - Mindestens eine Kante > 2% Border + │ - Crop-Flaeche >= 40% des Originals + │ + └─ Crop + konfigurierbarer Rand (default 1%) +``` + +### Vergleich alt vs. neu + +| Eigenschaft | Alt (Otsu + Kontur) | Neu (Content-basiert) | +|-------------|--------------------|-----------------------| +| Binarisierung | Otsu (global) | Adaptive Threshold | +| Methode | Groesste Kontur → boundingRect | 4-Kanten Ink-Projektion | +| Buchruecken | Nicht erkannt | Schatten-Gradient-Erkennung | +| Weiss-auf-weiss | Versagt | Funktioniert (adaptive) | +| Format-Matching | A4/Letter erzwungen | Kein Format-Matching (Content-Bounds) | +| Position in Pipeline | Vor Deskew (Schritt 2) | Nach Dewarp (Schritt 4) | + +--- + +## Schritt 3: Entzerrung/Dewarp (Detail) ### Algorithmus: Vertikalkanten-Drift @@ -311,7 +390,7 @@ Response: {"method_used": "manual_combined", "shear_degrees": -0.45, "dewarped_i --- -## Schritt 3: Spaltenerkennung (Detail) +## Schritt 5: Spaltenerkennung (Detail) ### Algorithmus: `detect_column_geometry()` @@ -417,7 +496,7 @@ min_real_col_w = max(20, int(content_w * 0.03)) --- -## Schritt 4: Zeilenerkennung (Detail) +## Schritt 6: Zeilenerkennung (Detail) ### Algorithmus: `detect_row_geometry()` @@ -447,7 +526,7 @@ def _heal_row_gaps(rows, top_bound, bottom_bound): --- -## Schritt 5: Worterkennung — Hybrid-Grid (Detail) +## Schritt 7: Worterkennung — Hybrid-Grid (Detail) ### Algorithmus: `build_cell_grid_v2()` @@ -554,7 +633,7 @@ Eingabe: ocr_img, column_regions, row_geometries --- -## Schritt 6: Korrektur (Detail) +## Schritt 8: Korrektur (Detail) ### Korrektur-Engine @@ -611,7 +690,7 @@ Change-Format: --- -## Schritt 7: Rekonstruktion (Detail) +## Schritt 9: Rekonstruktion (Detail) Zwei Modi verfuegbar: diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py index 9a032d3..a65455c 100644 --- a/klausur-service/backend/ocr_pipeline_api.py +++ b/klausur-service/backend/ocr_pipeline_api.py @@ -1207,7 +1207,7 @@ async def get_column_ground_truth(session_id: str): async def _get_columns_overlay(session_id: str) -> Response: - """Generate dewarped image with column borders drawn on it.""" + """Generate cropped (or dewarped) image with column borders drawn on it.""" session = await get_session_db(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") @@ -1216,12 +1216,14 @@ async def _get_columns_overlay(session_id: str) -> Response: if not column_result or not column_result.get("columns"): raise HTTPException(status_code=404, detail="No column data available") - # Load dewarped image - dewarped_png = await get_session_image(session_id, "dewarped") - if not dewarped_png: - raise HTTPException(status_code=404, detail="Dewarped image not available") + # Load cropped image (preferred) or dewarped as fallback + base_png = await get_session_image(session_id, "cropped") + if not base_png: + base_png = await get_session_image(session_id, "dewarped") + if not base_png: + raise HTTPException(status_code=404, detail="No base image available (cropped/dewarped)") - arr = np.frombuffer(dewarped_png, dtype=np.uint8) + arr = np.frombuffer(base_png, dtype=np.uint8) img = cv2.imdecode(arr, cv2.IMREAD_COLOR) if img is None: raise HTTPException(status_code=500, detail="Failed to decode image") @@ -2692,7 +2694,7 @@ async def reprocess_session(session_id: str, request: Request): async def _get_rows_overlay(session_id: str) -> Response: - """Generate dewarped image with row bands drawn on it.""" + """Generate cropped (or dewarped) image with row bands drawn on it.""" session = await get_session_db(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") @@ -2701,12 +2703,14 @@ async def _get_rows_overlay(session_id: str) -> Response: if not row_result or not row_result.get("rows"): raise HTTPException(status_code=404, detail="No row data available") - # Load dewarped image - dewarped_png = await get_session_image(session_id, "dewarped") - if not dewarped_png: - raise HTTPException(status_code=404, detail="Dewarped image not available") + # Load cropped image (preferred) or dewarped as fallback + base_png = await get_session_image(session_id, "cropped") + if not base_png: + base_png = await get_session_image(session_id, "dewarped") + if not base_png: + raise HTTPException(status_code=404, detail="No base image available (cropped/dewarped)") - arr = np.frombuffer(dewarped_png, dtype=np.uint8) + arr = np.frombuffer(base_png, dtype=np.uint8) img = cv2.imdecode(arr, cv2.IMREAD_COLOR) if img is None: raise HTTPException(status_code=500, detail="Failed to decode image") @@ -2753,7 +2757,7 @@ async def _get_rows_overlay(session_id: str) -> Response: async def _get_words_overlay(session_id: str) -> Response: - """Generate dewarped image with cell grid drawn on it.""" + """Generate cropped (or dewarped) image with cell grid drawn on it.""" session = await get_session_db(session_id) if not session: raise HTTPException(status_code=404, detail=f"Session {session_id} not found") @@ -2767,12 +2771,14 @@ async def _get_words_overlay(session_id: str) -> Response: if not cells and not word_result.get("entries"): raise HTTPException(status_code=404, detail="No word data available") - # Load dewarped image - dewarped_png = await get_session_image(session_id, "dewarped") - if not dewarped_png: - raise HTTPException(status_code=404, detail="Dewarped image not available") + # Load cropped image (preferred) or dewarped as fallback + base_png = await get_session_image(session_id, "cropped") + if not base_png: + base_png = await get_session_image(session_id, "dewarped") + if not base_png: + raise HTTPException(status_code=404, detail="No base image available (cropped/dewarped)") - arr = np.frombuffer(dewarped_png, dtype=np.uint8) + arr = np.frombuffer(base_png, dtype=np.uint8) img = cv2.imdecode(arr, cv2.IMREAD_COLOR) if img is None: raise HTTPException(status_code=500, detail="Failed to decode image") diff --git a/klausur-service/backend/tests/test_page_crop.py b/klausur-service/backend/tests/test_page_crop.py new file mode 100644 index 0000000..d16d8b4 --- /dev/null +++ b/klausur-service/backend/tests/test_page_crop.py @@ -0,0 +1,327 @@ +""" +Tests for page_crop.py — content-based crop algorithm. + +Tests cover: +- Edge detection via ink projections +- Spine shadow detection for book scans +- Narrow run filtering +- Paper format detection +- Sanity checks (min area, min border) +- End-to-end crop on synthetic images +""" + +import numpy as np +import pytest + +from page_crop import ( + detect_and_crop_page, + _detect_format, + _detect_edge_projection, + _detect_left_edge_shadow, + _filter_narrow_runs, +) + + +# --------------------------------------------------------------------------- +# Helper: create synthetic images +# --------------------------------------------------------------------------- + +def _make_white_image(h: int, w: int) -> np.ndarray: + """Create a white BGR image.""" + return np.full((h, w, 3), 255, dtype=np.uint8) + + +def _make_image_with_content( + h: int, w: int, + content_rect: tuple, # (y1, y2, x1, x2) + bg_color: int = 255, + content_color: int = 0, +) -> np.ndarray: + """Create an image with a dark content rectangle on a light background.""" + img = np.full((h, w, 3), bg_color, dtype=np.uint8) + y1, y2, x1, x2 = content_rect + img[y1:y2, x1:x2] = content_color + return img + + +def _make_book_scan(h: int = 1000, w: int = 800) -> np.ndarray: + """Create a synthetic book scan with spine shadow on the left. + + Left 10%: gradient from dark (50) to white (255) + Top 5%: white (empty scanner border) + Bottom 5%: white (empty scanner border) + Center: text-like content (dark pixels scattered) + """ + img = np.full((h, w, 3), 255, dtype=np.uint8) + + # Spine shadow: left 10% has gradient from dark to bright + shadow_w = w // 10 + for x in range(shadow_w): + brightness = int(50 + (255 - 50) * x / shadow_w) + img[:, x] = brightness + + # Content area: scatter some dark pixels (simulate text) + content_top = h // 20 # 5% top margin + content_bottom = h - h // 20 # 5% bottom margin + content_left = shadow_w + w // 20 # past shadow + small margin + content_right = w - w // 20 # 5% right margin + + rng = np.random.RandomState(42) + for _ in range(500): + y = rng.randint(content_top, content_bottom) + x = rng.randint(content_left, content_right) + # Small text-like blob + y2 = min(y + 3, h) + x2 = min(x + 10, w) + img[y:y2, x:x2] = 20 + + return img + + +# --------------------------------------------------------------------------- +# Tests: _filter_narrow_runs +# --------------------------------------------------------------------------- + +class TestFilterNarrowRuns: + def test_removes_short_runs(self): + mask = np.array([False, True, True, False, False, True, False]) + result = _filter_narrow_runs(mask, min_run=3) + # The run [True, True] (length 2) and [True] (length 1) should be removed + assert not result.any() + + def test_keeps_long_runs(self): + mask = np.array([False, True, True, True, True, False]) + result = _filter_narrow_runs(mask, min_run=3) + expected = np.array([False, True, True, True, True, False]) + np.testing.assert_array_equal(result, expected) + + def test_min_run_1_keeps_all(self): + mask = np.array([True, False, True]) + result = _filter_narrow_runs(mask, min_run=1) + np.testing.assert_array_equal(result, mask) + + def test_empty_mask(self): + mask = np.array([], dtype=bool) + result = _filter_narrow_runs(mask, min_run=5) + assert len(result) == 0 + + def test_mixed_runs(self): + mask = np.array([True, False, True, True, True, True, True, False, True, True]) + result = _filter_narrow_runs(mask, min_run=3) + # Run of 1 at [0]: removed + # Run of 5 at [2:7]: kept + # Run of 2 at [8:10]: removed + expected = np.array([False, False, True, True, True, True, True, False, False, False]) + np.testing.assert_array_equal(result, expected) + + +# --------------------------------------------------------------------------- +# Tests: _detect_format +# --------------------------------------------------------------------------- + +class TestDetectFormat: + def test_a4_portrait(self): + fmt, conf = _detect_format(210, 297) + assert fmt == "A4" + assert conf > 0.8 + + def test_a4_landscape(self): + fmt, conf = _detect_format(297, 210) + assert fmt == "A4" + assert conf > 0.8 + + def test_letter(self): + fmt, conf = _detect_format(850, 1100) + assert fmt == "Letter" + assert conf > 0.5 + + def test_unknown_square(self): + fmt, conf = _detect_format(100, 100) + # Aspect ratio 1.0 doesn't match any paper format well + assert fmt == "unknown" or conf < 0.5 + + def test_zero_dimensions(self): + fmt, conf = _detect_format(0, 100) + assert fmt == "unknown" + assert conf == 0.0 + + +# --------------------------------------------------------------------------- +# Tests: _detect_edge_projection +# --------------------------------------------------------------------------- + +class TestDetectEdgeProjection: + def test_finds_first_ink_column(self): + """Binary image with ink starting at column 50.""" + binary = np.zeros((100, 200), dtype=np.uint8) + binary[10:90, 50:180] = 255 # Content from x=50 to x=180 + + edge = _detect_edge_projection(binary, axis=0, from_start=True, dim=200) + assert edge == 50 + + def test_finds_last_ink_column(self): + binary = np.zeros((100, 200), dtype=np.uint8) + binary[10:90, 50:180] = 255 + + edge = _detect_edge_projection(binary, axis=0, from_start=False, dim=200) + assert edge == 179 # last column with ink + + def test_finds_first_ink_row(self): + binary = np.zeros((200, 100), dtype=np.uint8) + binary[30:170, 10:90] = 255 + + edge = _detect_edge_projection(binary, axis=1, from_start=True, dim=200) + assert edge == 30 + + def test_finds_last_ink_row(self): + binary = np.zeros((200, 100), dtype=np.uint8) + binary[30:170, 10:90] = 255 + + edge = _detect_edge_projection(binary, axis=1, from_start=False, dim=200) + assert edge == 169 + + def test_empty_image_returns_boundary(self): + binary = np.zeros((100, 100), dtype=np.uint8) + assert _detect_edge_projection(binary, axis=0, from_start=True, dim=100) == 0 + assert _detect_edge_projection(binary, axis=0, from_start=False, dim=100) == 100 + + +# --------------------------------------------------------------------------- +# Tests: _detect_left_edge_shadow +# --------------------------------------------------------------------------- + +class TestDetectLeftEdgeShadow: + def test_detects_shadow_gradient(self): + """Synthetic image with left-side shadow gradient.""" + h, w = 500, 400 + gray = np.full((h, w), 255, dtype=np.uint8) + binary = np.zeros((h, w), dtype=np.uint8) + + # Shadow: left 15% gradually darkens + shadow_w = w * 15 // 100 + for x in range(shadow_w): + brightness = int(50 + (255 - 50) * x / shadow_w) + gray[:, x] = brightness + + # Content starts after shadow + binary[:, shadow_w + 10:w - 10] = 255 + + edge = _detect_left_edge_shadow(gray, binary, w, h) + # Edge should be within the shadow transition zone + # The 60% threshold fires before the actual shadow boundary + assert 0 < edge < shadow_w + 20 + + def test_no_shadow_uses_binary_fallback(self): + """When shadow range is small, falls back to binary projection.""" + h, w = 400, 400 + gray = np.full((h, w), 200, dtype=np.uint8) + binary = np.zeros((h, w), dtype=np.uint8) + # Content block from x=80 onward (large enough to survive noise filtering) + binary[50:350, 80:380] = 255 + + edge = _detect_left_edge_shadow(gray, binary, w, h) + # Should find content start via projection fallback (near x=80) + assert edge <= 85 + + +# --------------------------------------------------------------------------- +# Tests: detect_and_crop_page (end-to-end) +# --------------------------------------------------------------------------- + +class TestDetectAndCropPage: + def test_no_crop_needed_all_content(self): + """Image that is all content — no borders to crop.""" + img = np.full((100, 80, 3), 40, dtype=np.uint8) # Dark content everywhere + cropped, result = detect_and_crop_page(img) + # Should return original (all borders < 2%) + assert not result["crop_applied"] + assert result["cropped_size"] == {"width": 80, "height": 100} + + def test_crops_white_borders(self): + """Image with wide white borders around dark content.""" + h, w = 400, 300 + img = _make_image_with_content(h, w, (80, 320, 60, 240)) + + cropped, result = detect_and_crop_page(img) + assert result["crop_applied"] + # Cropped size should be close to the content area (with margin) + assert result["cropped_size"]["width"] < w + assert result["cropped_size"]["height"] < h + # Content should be roughly 180x240 + margins (adaptive threshold may widen slightly) + assert 160 <= result["cropped_size"]["width"] <= 260 + assert 220 <= result["cropped_size"]["height"] <= 300 + + def test_book_scan_detects_spine_shadow(self): + """Synthetic book scan with spine shadow on left.""" + img = _make_book_scan(1000, 800) + cropped, result = detect_and_crop_page(img) + + # Should crop the spine shadow area + left_border = result["border_fractions"]["left"] + # Spine shadow is ~10% of width, plus some margin + assert left_border > 0.05 # At least 5% left border detected + + def test_sanity_check_too_small_crop(self): + """If detected content area is too small, skip crop.""" + h, w = 500, 500 + # Tiny content area (5x5 pixels) — should fail sanity check + img = _make_white_image(h, w) + # Add tiny dark spot + img[248:253, 248:253] = 0 + + cropped, result = detect_and_crop_page(img) + # Should either not crop or crop is too small (< 40%) + if result["crop_applied"]: + crop_area = result["cropped_size"]["width"] * result["cropped_size"]["height"] + assert crop_area >= 0.4 * h * w + + def test_crop_preserves_content(self): + """Verify that content is preserved after cropping.""" + h, w = 300, 200 + img = _make_image_with_content(h, w, (50, 250, 40, 160)) + cropped, result = detect_and_crop_page(img) + + if result["crop_applied"]: + # Cropped image should contain dark pixels (content) + gray = np.mean(cropped, axis=2) + assert np.min(gray) < 50 # Content is dark + + def test_result_structure(self): + """Verify all expected keys are present in result dict.""" + img = _make_white_image(100, 100) + _, result = detect_and_crop_page(img) + + assert "crop_applied" in result + assert "original_size" in result + assert "cropped_size" in result + assert "border_fractions" in result + assert "detected_format" in result + assert "format_confidence" in result + assert "aspect_ratio" in result + + def test_margin_parameter(self): + """Custom margin_frac should affect crop bounds.""" + h, w = 400, 300 + img = _make_image_with_content(h, w, (80, 320, 60, 240)) + + _, result_small = detect_and_crop_page(img, margin_frac=0.005) + _, result_large = detect_and_crop_page(img, margin_frac=0.05) + + if result_small["crop_applied"] and result_large["crop_applied"]: + # Larger margin should produce a larger crop + small_area = result_small["cropped_size"]["width"] * result_small["cropped_size"]["height"] + large_area = result_large["cropped_size"]["width"] * result_large["cropped_size"]["height"] + assert large_area >= small_area + + def test_crop_rect_pct_values(self): + """crop_rect_pct values should be in 0-100 range.""" + h, w = 400, 300 + img = _make_image_with_content(h, w, (80, 320, 60, 240)) + _, result = detect_and_crop_page(img) + + if result["crop_applied"] and result["crop_rect_pct"]: + pct = result["crop_rect_pct"] + assert 0 <= pct["x"] <= 100 + assert 0 <= pct["y"] <= 100 + assert 0 < pct["width"] <= 100 + assert 0 < pct["height"] <= 100