feat: Orientierung + Zuschneiden als Schritte 1-2 in OCR-Pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s

Zwei neue Wizard-Schritte vor Begradigung:
- Step 1: Orientierungserkennung (0/90/180/270° via Tesseract OSD)
- Step 2: Seitenrand-Erkennung und Zuschnitt (Scannerraender entfernen)

Backend:
- orientation_crop_api.py: POST /orientation, POST /crop, POST /crop/skip
- page_crop.py: detect_and_crop_page() mit Format-Erkennung (A4/A5/Letter)
- Session-Store: orientation_result, crop_result Felder
- Pipeline nutzt zugeschnittenes Bild fuer Deskew/Dewarp

Frontend:
- StepOrientation.tsx: Upload + Auto-Orientierung + Vorher/Nachher
- StepCrop.tsx: Auto-Crop + Format-Badge + Ueberspringen-Option
- Pipeline-Stepper: 10 Schritte (war 8)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-08 23:55:23 +01:00
parent 9a5a35bff1
commit 2763631711
12 changed files with 1247 additions and 259 deletions

View File

@@ -42,7 +42,8 @@ try:
except ImportError:
trocr_router = None
from vocab_worksheet_api import router as vocab_router, set_db_pool as set_vocab_db_pool, _init_vocab_table, _load_all_sessions, DATABASE_URL as VOCAB_DATABASE_URL
from ocr_pipeline_api import router as ocr_pipeline_router
from ocr_pipeline_api import router as ocr_pipeline_router, _cache as ocr_pipeline_cache
from orientation_crop_api import router as orientation_crop_router, set_cache_ref as set_orientation_crop_cache
from ocr_pipeline_session_store import init_ocr_pipeline_tables
try:
from handwriting_htr_api import router as htr_router
@@ -177,6 +178,8 @@ if trocr_router:
app.include_router(trocr_router) # TrOCR Handwriting OCR
app.include_router(vocab_router) # Vocabulary Worksheet Generator
app.include_router(ocr_pipeline_router) # OCR Pipeline (step-by-step)
set_orientation_crop_cache(ocr_pipeline_cache)
app.include_router(orientation_crop_router) # OCR Pipeline: Orientation + Crop
if htr_router:
app.include_router(htr_router) # Handwriting HTR (Klausur)
if dsfa_rag_router:

View File

@@ -1,15 +1,17 @@
"""
OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 8 einzelne Schritte:
1. Deskewing - Scan begradigen
2. Dewarping - Buchwoelbung entzerren
3. Spaltenerkennung - Unsichtbare Spalten finden
4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
5. Worterkennung - OCR mit Bounding Boxes
6. LLM-Korrektur - OCR-Fehler per LLM korrigieren
7. Seitenrekonstruktion - Seite nachbauen
8. Ground Truth Validierung - Gesamtpruefung
Zerlegt den OCR-Prozess in 10 einzelne Schritte:
1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py)
2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py)
3. Deskewing - Scan begradigen
4. Dewarping - Buchwoelbung entzerren
5. Spaltenerkennung - Unsichtbare Spalten finden
6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
7. Worterkennung - OCR mit Bounding Boxes
8. LLM-Korrektur - OCR-Fehler per LLM korrigieren
9. Seitenrekonstruktion - Seite nachbauen
10. Ground Truth Validierung - Gesamtpruefung
Lizenz: Apache 2.0
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
@@ -54,7 +56,6 @@ from cv_vocab_pipeline import (
deskew_image_by_word_alignment,
deskew_image_iterative,
deskew_two_pass,
detect_and_fix_orientation,
detect_column_geometry,
detect_document_type,
detect_row_geometry,
@@ -103,6 +104,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]:
"id": session_id,
**session,
"original_bgr": None,
"oriented_bgr": None,
"cropped_bgr": None,
"deskewed_bgr": None,
"dewarped_bgr": None,
}
@@ -110,6 +113,8 @@ async def _load_session_to_cache(session_id: str) -> Dict[str, Any]:
# Decode images from DB into BGR numpy arrays
for img_type, bgr_key in [
("original", "original_bgr"),
("oriented", "oriented_bgr"),
("cropped", "cropped_bgr"),
("deskewed", "deskewed_bgr"),
("dewarped", "dewarped_bgr"),
]:
@@ -252,8 +257,12 @@ async def create_session(
"filename": filename,
"name": session_name,
"original_bgr": img_bgr,
"oriented_bgr": None,
"cropped_bgr": None,
"deskewed_bgr": None,
"dewarped_bgr": None,
"orientation_result": None,
"crop_result": None,
"deskew_result": None,
"dewarp_result": None,
"ground_truth": {},
@@ -301,6 +310,10 @@ async def get_session_info(session_id: str):
"doc_type": session.get("doc_type"),
}
if session.get("orientation_result"):
result["orientation_result"] = session["orientation_result"]
if session.get("crop_result"):
result["crop_result"] = session["crop_result"]
if session.get("deskew_result"):
result["deskew_result"] = session["deskew_result"]
if session.get("dewarp_result"):
@@ -427,7 +440,7 @@ async def _append_pipeline_log(
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
@@ -470,22 +483,13 @@ async def auto_deskew(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("original_bgr")
# Use cropped image as input (from step 2), fall back to oriented, then original
img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Original image not available")
raise HTTPException(status_code=400, detail="No image available for deskewing")
t0 = time.time()
# Orientation detection (fix 90/180/270° rotations from scanners)
img_bgr, orientation_deg = detect_and_fix_orientation(img_bgr)
if orientation_deg:
# Update original in cache + DB so all subsequent steps use corrected image
cached["original_bgr"] = img_bgr
success_ori, ori_buf = cv2.imencode(".png", img_bgr)
if success_ori:
await update_session_db(session_id, original_png=ori_buf.tobytes())
logger.info(f"OCR Pipeline: orientation corrected {orientation_deg}° for session {session_id}")
# Two-pass deskew: iterative (±5°) + word-alignment residual check
deskewed_bgr, angle_applied, two_pass_debug = deskew_two_pass(img_bgr.copy())
@@ -534,7 +538,6 @@ async def auto_deskew(session_id: str):
"angle_residual": round(angle_residual, 3),
"angle_textline": round(angle_textline, 3),
"angle_applied": round(angle_applied, 3),
"orientation_degrees": orientation_deg,
"method_used": method_used,
"confidence": round(confidence, 2),
"duration_seconds": round(duration, 2),
@@ -550,7 +553,7 @@ async def auto_deskew(session_id: str):
db_update = {
"deskewed_png": deskewed_png,
"deskew_result": deskew_result,
"current_step": 2,
"current_step": 4,
}
if binarized_png:
db_update["binarized_png"] = binarized_png
@@ -563,7 +566,6 @@ async def auto_deskew(session_id: str):
f"-> {method_used} total={angle_applied:.2f}")
await _append_pipeline_log(session_id, "deskew", {
"orientation": orientation_deg,
"angle_applied": round(angle_applied, 3),
"angle_iterative": round(angle_iterative, 3),
"angle_residual": round(angle_residual, 3),
@@ -582,14 +584,14 @@ async def auto_deskew(session_id: str):
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the original image."""
"""Apply a manual rotation angle to the cropped image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("original_bgr")
img_bgr = cached.get("cropped_bgr") or cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Original image not available")
raise HTTPException(status_code=400, detail="No image available for deskewing")
angle = max(-5.0, min(5.0, req.angle))
@@ -797,7 +799,7 @@ async def auto_dewarp(
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=3,
current_step=5,
)
logger.info(f"OCR Pipeline: dewarp session {session_id}: "
@@ -1109,7 +1111,7 @@ async def detect_columns(session_id: str):
column_result=column_result,
row_result=None,
word_result=None,
current_step=3,
current_step=5,
)
# Update cache
@@ -1335,7 +1337,7 @@ async def detect_rows(session_id: str):
session_id,
row_result=row_result,
word_result=None,
current_step=4,
current_step=6,
)
cached["row_result"] = row_result
@@ -1601,7 +1603,7 @@ async def detect_words(
await update_session_db(
session_id,
word_result=word_result,
current_step=5,
current_step=7,
)
cached["word_result"] = word_result
@@ -1745,7 +1747,7 @@ async def _word_batch_stream_generator(
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
vocab_entries = entries
await update_session_db(session_id, word_result=word_result, current_step=5)
await update_session_db(session_id, word_result=word_result, current_step=7)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
@@ -1892,7 +1894,7 @@ async def _word_stream_generator(
await update_session_db(
session_id,
word_result=word_result,
current_step=5,
current_step=7,
)
cached["word_result"] = word_result
@@ -2016,7 +2018,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False
"duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=6)
await update_session_db(session_id, word_result=word_result, current_step=8)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2065,7 +2067,7 @@ async def _llm_review_stream_generator(
"duration_ms": event["duration_ms"],
"entries_corrected": event["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=6)
await update_session_db(session_id, word_result=word_result, current_step=8)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2153,7 +2155,7 @@ async def save_reconstruction(session_id: str, request: Request):
cell_updates = body.get("cells", [])
if not cell_updates:
await update_session_db(session_id, current_step=7)
await update_session_db(session_id, current_step=9)
return {"session_id": session_id, "updated": 0}
# Build update map: cell_id -> new text
@@ -2189,7 +2191,7 @@ async def save_reconstruction(session_id: str, request: Request):
if "entries" in word_result:
word_result["entries"] = entries
await update_session_db(session_id, word_result=word_result, current_step=7)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2572,7 +2574,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
"""Save final validation results for step 8.
Stores notes, score, and preserves any detected/generated image regions.
Sets current_step = 8 to mark pipeline as complete.
Sets current_step = 10 to mark pipeline as complete.
"""
session = await get_session_db(session_id)
if not session:
@@ -2585,7 +2587,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
validation["score"] = req.score
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=8)
await update_session_db(session_id, ground_truth=ground_truth, current_step=10)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
@@ -2619,12 +2621,14 @@ async def reprocess_session(session_id: str, request: Request):
Body: {"from_step": 5} (1-indexed step number)
Clears downstream results:
- from_step <= 1: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: dewarp_result, column_result, row_result, word_result
- from_step <= 3: column_result, row_result, word_result
- from_step <= 4: row_result, word_result
- from_step <= 5: word_result (cells, vocab_entries)
- from_step <= 6: word_result.llm_review only
- from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 4: dewarp_result, column_result, row_result, word_result
- from_step <= 5: column_result, row_result, word_result
- from_step <= 6: row_result, word_result
- from_step <= 7: word_result (cells, vocab_entries)
- from_step <= 8: word_result.llm_review only
"""
session = await get_session_db(session_id)
if not session:
@@ -2632,15 +2636,15 @@ async def reprocess_session(session_id: str, request: Request):
body = await request.json()
from_step = body.get("from_step", 1)
if not isinstance(from_step, int) or from_step < 1 or from_step > 7:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 7")
if not isinstance(from_step, int) or from_step < 1 or from_step > 9:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 9")
update_kwargs: Dict[str, Any] = {"current_step": from_step}
# Clear downstream data based on from_step
if from_step <= 5:
if from_step <= 7:
update_kwargs["word_result"] = None
elif from_step == 6:
elif from_step == 8:
# Only clear LLM review from word_result
word_result = session.get("word_result")
if word_result:
@@ -2648,14 +2652,18 @@ async def reprocess_session(session_id: str, request: Request):
word_result.pop("llm_corrections", None)
update_kwargs["word_result"] = word_result
if from_step <= 4:
if from_step <= 6:
update_kwargs["row_result"] = None
if from_step <= 3:
if from_step <= 5:
update_kwargs["column_result"] = None
if from_step <= 2:
if from_step <= 4:
update_kwargs["dewarp_result"] = None
if from_step <= 1:
if from_step <= 3:
update_kwargs["deskew_result"] = None
if from_step <= 2:
update_kwargs["crop_result"] = None
if from_step <= 1:
update_kwargs["orientation_result"] = None
await update_session_db(session_id, **update_kwargs)
@@ -3074,7 +3082,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
deskewed_png=deskewed_png,
deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied),
current_step=2,
current_step=4,
)
session = await get_session_db(session_id)
@@ -3137,7 +3145,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=3,
current_step=5,
)
session = await get_session_db(session_id)
@@ -3196,7 +3204,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
cached["column_result"] = column_result
await update_session_db(session_id, column_result=column_result,
row_result=None, word_result=None, current_step=4)
row_result=None, word_result=None, current_step=6)
session = await get_session_db(session_id)
steps_run.append("columns")
@@ -3273,7 +3281,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
}
cached["row_result"] = row_result
await update_session_db(session_id, row_result=row_result, current_step=5)
await update_session_db(session_id, row_result=row_result, current_step=7)
session = await get_session_db(session_id)
steps_run.append("rows")
@@ -3381,7 +3389,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
word_result_data["entry_count"] = len(entries)
word_result_data["summary"]["total_entries"] = len(entries)
await update_session_db(session_id, word_result=word_result_data, current_step=6)
await update_session_db(session_id, word_result=word_result_data, current_step=8)
cached["word_result"] = word_result_data
session = await get_session_db(session_id)
@@ -3426,7 +3434,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
word_result_updated["llm_reviewed"] = True
word_result_updated["llm_model"] = OLLAMA_REVIEW_MODEL
await update_session_db(session_id, word_result=word_result_updated, current_step=7)
await update_session_db(session_id, word_result=word_result_updated, current_step=9)
cached["word_result"] = word_result_updated
steps_run.append("llm_review")

View File

@@ -68,7 +68,11 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS doc_type VARCHAR(50),
ADD COLUMN IF NOT EXISTS doc_type_result JSONB,
ADD COLUMN IF NOT EXISTS document_category VARCHAR(50),
ADD COLUMN IF NOT EXISTS pipeline_log JSONB
ADD COLUMN IF NOT EXISTS pipeline_log JSONB,
ADD COLUMN IF NOT EXISTS oriented_png BYTEA,
ADD COLUMN IF NOT EXISTS cropped_png BYTEA,
ADD COLUMN IF NOT EXISTS orientation_result JSONB,
ADD COLUMN IF NOT EXISTS crop_result JSONB
""")
@@ -90,6 +94,7 @@ async def create_session_db(
id, name, filename, original_png, status, current_step
) VALUES ($1, $2, $3, $4, 'active', 1)
RETURNING id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -106,6 +111,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
async with pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -123,6 +129,8 @@ async def get_session_image(session_id: str, image_type: str) -> Optional[bytes]
"""Load a single image (BYTEA) from the session."""
column_map = {
"original": "original_png",
"oriented": "oriented_png",
"cropped": "cropped_png",
"deskewed": "deskewed_png",
"binarized": "binarized_png",
"dewarped": "dewarped_png",
@@ -150,15 +158,17 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
allowed_fields = {
'name', 'filename', 'status', 'current_step',
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
'original_png', 'oriented_png', 'cropped_png',
'deskewed_png', 'binarized_png', 'dewarped_png',
'clean_png', 'handwriting_removal_meta',
'orientation_result', 'crop_result',
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
'word_result', 'ground_truth', 'auto_shear_degrees',
'doc_type', 'doc_type_result',
'document_category', 'pipeline_log',
}
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -182,6 +192,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
SET {', '.join(fields)}
WHERE id = ${param_idx}
RETURNING id, name, filename, status, current_step,
orientation_result, crop_result,
deskew_result, dewarp_result, column_result, row_result,
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
@@ -254,7 +265,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']:
for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])

View File

@@ -0,0 +1,330 @@
"""
Orientation & Crop API - Steps 1-2 of the OCR Pipeline.
Step 1: Orientation detection (fix 90/180/270 degree rotations)
Step 2: Page cropping (remove scanner borders, detect paper format)
These endpoints were extracted from the main pipeline to keep files manageable.
"""
import logging
import time
from typing import Any, Dict, Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from cv_vocab_pipeline import detect_and_fix_orientation
from page_crop import detect_and_crop_page
from ocr_pipeline_session_store import (
get_session_db,
get_session_image,
update_session_db,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["ocr-pipeline"])
# Reference to the shared cache from ocr_pipeline_api (set in main.py)
_cache: Dict[str, Dict[str, Any]] = {}
def set_cache_ref(cache: Dict[str, Dict[str, Any]]):
"""Set reference to the shared cache from ocr_pipeline_api."""
global _cache
_cache = cache
async def _ensure_cached(session_id: str) -> Dict[str, Any]:
"""Ensure session is in cache, loading from DB if needed."""
if session_id in _cache:
return _cache[session_id]
session = await get_session_db(session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
cache_entry: Dict[str, Any] = {
"id": session_id,
**session,
"original_bgr": None,
"oriented_bgr": None,
"cropped_bgr": None,
"deskewed_bgr": None,
"dewarped_bgr": None,
}
for img_type, bgr_key in [
("original", "original_bgr"),
("oriented", "oriented_bgr"),
("cropped", "cropped_bgr"),
("deskewed", "deskewed_bgr"),
("dewarped", "dewarped_bgr"),
]:
png_data = await get_session_image(session_id, img_type)
if png_data:
arr = np.frombuffer(png_data, dtype=np.uint8)
bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
cache_entry[bgr_key] = bgr
_cache[session_id] = cache_entry
return cache_entry
async def _append_pipeline_log(session_id: str, step: str, metrics: dict, duration_ms: int):
"""Append a step entry to the pipeline log."""
from datetime import datetime
session = await get_session_db(session_id)
if not session:
return
pipeline_log = session.get("pipeline_log") or {"steps": []}
pipeline_log["steps"].append({
"step": step,
"completed_at": datetime.utcnow().isoformat(),
"success": True,
"duration_ms": duration_ms,
"metrics": metrics,
})
await update_session_db(session_id, pipeline_log=pipeline_log)
# ---------------------------------------------------------------------------
# Step 1: Orientation
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/orientation")
async def detect_orientation(session_id: str):
"""Detect and fix 90/180/270 degree rotations from scanners.
Reads the original image, applies orientation correction,
stores the result as oriented_png.
"""
cached = await _ensure_cached(session_id)
img_bgr = cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Original image not available")
t0 = time.time()
# Detect and fix orientation
oriented_bgr, orientation_deg = detect_and_fix_orientation(img_bgr.copy())
duration = time.time() - t0
orientation_result = {
"orientation_degrees": orientation_deg,
"corrected": orientation_deg != 0,
"duration_seconds": round(duration, 2),
}
# Encode oriented image
success, png_buf = cv2.imencode(".png", oriented_bgr)
oriented_png = png_buf.tobytes() if success else b""
# Update cache
cached["oriented_bgr"] = oriented_bgr
cached["orientation_result"] = orientation_result
# Persist to DB
await update_session_db(
session_id,
oriented_png=oriented_png,
orientation_result=orientation_result,
current_step=2,
)
logger.info(
"OCR Pipeline: orientation session %s: %d° (%s) in %.2fs",
session_id, orientation_deg,
"corrected" if orientation_deg else "no change",
duration,
)
await _append_pipeline_log(session_id, "orientation", {
"orientation_degrees": orientation_deg,
"corrected": orientation_deg != 0,
}, duration_ms=int(duration * 1000))
h, w = oriented_bgr.shape[:2]
return {
"session_id": session_id,
**orientation_result,
"image_width": w,
"image_height": h,
"oriented_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/oriented",
}
# ---------------------------------------------------------------------------
# Step 2: Crop
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/crop")
async def auto_crop(session_id: str):
"""Auto-detect and crop scanner borders.
Reads the oriented image (or original if no orientation step),
detects the page boundary and crops.
"""
cached = await _ensure_cached(session_id)
# Use oriented image if available, else original
img_bgr = cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping")
t0 = time.time()
cropped_bgr, crop_info = detect_and_crop_page(img_bgr)
duration = time.time() - t0
crop_info["duration_seconds"] = round(duration, 2)
# Encode cropped image
success, png_buf = cv2.imencode(".png", cropped_bgr)
cropped_png = png_buf.tobytes() if success else b""
# Update cache
cached["cropped_bgr"] = cropped_bgr
cached["crop_result"] = crop_info
# Persist to DB
await update_session_db(
session_id,
cropped_png=cropped_png,
crop_result=crop_info,
current_step=3,
)
logger.info(
"OCR Pipeline: crop session %s: applied=%s format=%s in %.2fs",
session_id, crop_info["crop_applied"],
crop_info.get("detected_format", "?"),
duration,
)
await _append_pipeline_log(session_id, "crop", {
"crop_applied": crop_info["crop_applied"],
"detected_format": crop_info.get("detected_format"),
"format_confidence": crop_info.get("format_confidence"),
}, duration_ms=int(duration * 1000))
h, w = cropped_bgr.shape[:2]
return {
"session_id": session_id,
**crop_info,
"image_width": w,
"image_height": h,
"cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
}
class ManualCropRequest(BaseModel):
x: float # percentage 0-100
y: float # percentage 0-100
width: float # percentage 0-100
height: float # percentage 0-100
@router.post("/sessions/{session_id}/crop/manual")
async def manual_crop(session_id: str, req: ManualCropRequest):
"""Manually crop using percentage coordinates."""
cached = await _ensure_cached(session_id)
img_bgr = cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping")
h, w = img_bgr.shape[:2]
# Convert percentages to pixels
px_x = int(w * req.x / 100.0)
px_y = int(h * req.y / 100.0)
px_w = int(w * req.width / 100.0)
px_h = int(h * req.height / 100.0)
# Clamp
px_x = max(0, min(px_x, w - 1))
px_y = max(0, min(px_y, h - 1))
px_w = max(1, min(px_w, w - px_x))
px_h = max(1, min(px_h, h - px_y))
cropped_bgr = img_bgr[px_y:px_y + px_h, px_x:px_x + px_w].copy()
success, png_buf = cv2.imencode(".png", cropped_bgr)
cropped_png = png_buf.tobytes() if success else b""
crop_result = {
"crop_applied": True,
"crop_rect": {"x": px_x, "y": px_y, "width": px_w, "height": px_h},
"crop_rect_pct": {"x": round(req.x, 2), "y": round(req.y, 2),
"width": round(req.width, 2), "height": round(req.height, 2)},
"original_size": {"width": w, "height": h},
"cropped_size": {"width": px_w, "height": px_h},
"method": "manual",
}
cached["cropped_bgr"] = cropped_bgr
cached["crop_result"] = crop_result
await update_session_db(
session_id,
cropped_png=cropped_png,
crop_result=crop_result,
current_step=3,
)
ch, cw = cropped_bgr.shape[:2]
return {
"session_id": session_id,
**crop_result,
"image_width": cw,
"image_height": ch,
"cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
}
@router.post("/sessions/{session_id}/crop/skip")
async def skip_crop(session_id: str):
"""Skip cropping — use oriented (or original) image as-is."""
cached = await _ensure_cached(session_id)
img_bgr = cached.get("oriented_bgr") or cached.get("original_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available")
h, w = img_bgr.shape[:2]
# Store the oriented image as cropped (identity crop)
success, png_buf = cv2.imencode(".png", img_bgr)
cropped_png = png_buf.tobytes() if success else b""
crop_result = {
"crop_applied": False,
"skipped": True,
"original_size": {"width": w, "height": h},
"cropped_size": {"width": w, "height": h},
}
cached["cropped_bgr"] = img_bgr
cached["crop_result"] = crop_result
await update_session_db(
session_id,
cropped_png=cropped_png,
crop_result=crop_result,
current_step=3,
)
return {
"session_id": session_id,
**crop_result,
"image_width": w,
"image_height": h,
"cropped_image_url": f"/api/v1/ocr-pipeline/sessions/{session_id}/image/cropped",
}

View File

@@ -0,0 +1,187 @@
"""
Page Crop - Automatic scanner border removal and page format detection.
Detects the paper boundary in a scanned image and crops away scanner borders.
Also identifies the paper format (A4, Letter, etc.) from the aspect ratio.
License: Apache 2.0
"""
import logging
from typing import Dict, Any, Tuple
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# Known paper format aspect ratios (height / width, portrait orientation)
PAPER_FORMATS = {
"A4": 297.0 / 210.0, # 1.4143
"A5": 210.0 / 148.0, # 1.4189
"Letter": 11.0 / 8.5, # 1.2941
"Legal": 14.0 / 8.5, # 1.6471
"A3": 420.0 / 297.0, # 1.4141
}
def detect_and_crop_page(
img_bgr: np.ndarray,
min_border_fraction: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
"""Detect page boundary and crop scanner borders.
Algorithm:
1. Grayscale + GaussianBlur to smooth out text
2. Otsu threshold (page=bright, scanner border=dark)
3. Morphological close to fill gaps
4. Find largest contour = page
5. If contour covers >95% of image area -> no crop needed
6. Get bounding rect, add safety margin
7. Match aspect ratio to known paper formats
Args:
img_bgr: Input BGR image
min_border_fraction: Minimum border fraction to trigger crop (default 1%)
Returns:
Tuple of (cropped_image, result_dict)
"""
h, w = img_bgr.shape[:2]
total_area = h * w
result: Dict[str, Any] = {
"crop_applied": False,
"crop_rect": None,
"crop_rect_pct": None,
"original_size": {"width": w, "height": h},
"cropped_size": {"width": w, "height": h},
"detected_format": None,
"format_confidence": 0.0,
"aspect_ratio": round(max(h, w) / max(min(h, w), 1), 4),
"border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
}
# 1. Grayscale + blur
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (21, 21), 0)
# 2. Otsu threshold
_, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# 3. Morphological close to fill text gaps
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 50))
closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
# 4. Find contours
contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
if not contours:
logger.info("No contours found - returning original image")
return img_bgr, result
# Get the largest contour
largest = max(contours, key=cv2.contourArea)
contour_area = cv2.contourArea(largest)
# 5. If contour covers >95% of image, no crop needed
if contour_area > 0.95 * total_area:
logger.info("Page covers >95%% of image - no crop needed")
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 6. Get bounding rect
rx, ry, rw, rh = cv2.boundingRect(largest)
# Calculate border fractions
border_top = ry / h
border_bottom = (h - (ry + rh)) / h
border_left = rx / w
border_right = (w - (rx + rw)) / w
result["border_fractions"] = {
"top": round(border_top, 4),
"bottom": round(border_bottom, 4),
"left": round(border_left, 4),
"right": round(border_right, 4),
}
# 7. Check if borders are significant enough to crop
if all(f < min_border_fraction for f in [border_top, border_bottom, border_left, border_right]):
logger.info("All borders < %.1f%% - no crop needed", min_border_fraction * 100)
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 8. Add safety margin (0.5% of image dimensions)
margin_x = int(w * 0.005)
margin_y = int(h * 0.005)
crop_x = max(0, rx - margin_x)
crop_y = max(0, ry - margin_y)
crop_x2 = min(w, rx + rw + margin_x)
crop_y2 = min(h, ry + rh + margin_y)
crop_w = crop_x2 - crop_x
crop_h = crop_y2 - crop_y
# Sanity check: cropped area should be at least 50% of original
if crop_w * crop_h < 0.5 * total_area:
logger.warning("Cropped area too small (%.0f%%) - skipping crop",
100.0 * crop_w * crop_h / total_area)
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 9. Crop
cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()
# 10. Detect format from cropped dimensions
detected_format, format_confidence = _detect_format(crop_w, crop_h)
result["crop_applied"] = True
result["crop_rect"] = {"x": crop_x, "y": crop_y, "width": crop_w, "height": crop_h}
result["crop_rect_pct"] = {
"x": round(100.0 * crop_x / w, 2),
"y": round(100.0 * crop_y / h, 2),
"width": round(100.0 * crop_w / w, 2),
"height": round(100.0 * crop_h / h, 2),
}
result["cropped_size"] = {"width": crop_w, "height": crop_h}
result["detected_format"] = detected_format
result["format_confidence"] = format_confidence
result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)
logger.info("Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
w, h, crop_w, crop_h, detected_format, format_confidence * 100,
border_top * 100, border_bottom * 100, border_left * 100, border_right * 100)
return cropped, result
def _detect_format(width: int, height: int) -> Tuple[str, float]:
"""Detect paper format from dimensions by comparing aspect ratios.
Returns:
(format_name, confidence) where confidence is 0.0-1.0
"""
if width <= 0 or height <= 0:
return "unknown", 0.0
# Use portrait aspect ratio (taller / shorter)
aspect = max(width, height) / min(width, height)
best_format = "unknown"
best_diff = float("inf")
for fmt, expected_ratio in PAPER_FORMATS.items():
diff = abs(aspect - expected_ratio)
if diff < best_diff:
best_diff = diff
best_format = fmt
# Confidence: 1.0 if exact match, decreasing with deviation
# Threshold: if diff > 0.1, confidence drops below 0.5
confidence = max(0.0, 1.0 - best_diff * 5.0)
if confidence < 0.3:
return "unknown", 0.0
return best_format, round(confidence, 3)