refactor: Crop nach Deskew/Dewarp verschieben + content-basierter Buchscan-Crop
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

Pipeline-Reihenfolge neu: Orientierung → Begradigung → Entzerrung → Zuschneiden → Spalten...
Crop arbeitet jetzt auf dem bereits geraden Bild, was bessere Ergebnisse liefert.

page_crop.py komplett ersetzt: Adaptive Threshold + 4-Kanten-Erkennung
(Buchruecken-Schatten links, Ink-Projektion fuer alle Raender) statt
Otsu + groesste Kontur.

Backend: Step-Nummern, Input-Bilder, Reprocess-Kaskade angepasst.
Frontend: PIPELINE_STEPS umgeordnet, Switch-Cases, Vorher-Bilder aktualisiert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-09 08:52:11 +01:00
parent eb45bb4879
commit 156a818246
7 changed files with 295 additions and 173 deletions

View File

@@ -71,8 +71,8 @@ export default function OcrPipelinePage() {
// Determine which step to jump to based on current_step
const dbStep = data.current_step || 1
// Steps: 1=deskew, 2=dewarp, 3=columns, ...
// UI steps are 0-indexed: 0=deskew, 1=dewarp, 2=columns, ...
// DB steps: 1=start, 2=orientation, 3=deskew, 4=dewarp, 5=crop, 6=columns, ...
// UI steps are 0-indexed: 0=orientation, 1=deskew, 2=dewarp, 3=crop, 4=columns, ...
const uiStep = Math.max(0, dbStep - 1)
const skipSteps = savedDocType?.skip_steps || []
@@ -205,8 +205,8 @@ export default function OcrPipelinePage() {
handleNext()
}
const handleDewarpNext = async () => {
// Auto-detect document type after dewarp, then advance
const handleCropNext = async () => {
// Auto-detect document type after crop (last image-processing step), then advance
if (sessionId) {
try {
const res = await fetch(
@@ -273,9 +273,9 @@ export default function OcrPipelinePage() {
const stepNames: Record<number, string> = {
1: 'Orientierung',
2: 'Zuschneiden',
3: 'Begradigung',
4: 'Entzerrung',
2: 'Begradigung',
3: 'Entzerrung',
4: 'Zuschneiden',
5: 'Spalten',
6: 'Zeilen',
7: 'Woerter',
@@ -312,11 +312,11 @@ export default function OcrPipelinePage() {
case 0:
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
case 1:
return <StepCrop sessionId={sessionId} onNext={handleNext} />
case 2:
return <StepDeskew sessionId={sessionId} onNext={handleNext} />
case 2:
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
case 3:
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} />
return <StepCrop sessionId={sessionId} onNext={handleCropNext} />
case 4:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
case 5:

View File

@@ -310,9 +310,9 @@ export const IMAGE_STYLES: { value: ImageStyle; label: string }[] = [
export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },
{ id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' },
{ id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' },

View File

@@ -78,7 +78,7 @@ export function StepCrop({ sessionId, onNext }: StepCropProps) {
return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div>
}
const orientedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`
const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped`
const croppedUrl = cropResult
? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`
: null
@@ -95,12 +95,12 @@ export function StepCrop({ sessionId, onNext }: StepCropProps) {
{/* Image comparison */}
<ImageCompareView
originalUrl={orientedUrl}
originalUrl={dewarpedUrl}
deskewedUrl={croppedUrl}
showGrid={false}
showBinarized={false}
binarizedUrl={null}
leftLabel="Orientiert"
leftLabel="Entzerrt"
rightLabel="Zugeschnitten"
/>

View File

@@ -37,8 +37,8 @@ export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
filename: data.filename,
image_width: data.image_width,
image_height: data.image_height,
// Use cropped image as "before" view
original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`,
// Use oriented image as "before" view (deskew runs right after orientation)
original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`,
}
setSession(sessionInfo)
@@ -155,7 +155,7 @@ export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
showGrid={showGrid}
showBinarized={showBinarized}
binarizedUrl={deskewResult?.binarized_image_url ?? null}
leftLabel="Zugeschnitten"
leftLabel="Orientiert"
rightLabel="Begradigt"
/>
)}

View File

@@ -3,9 +3,9 @@ OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 10 einzelne Schritte:
1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py)
2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py)
3. Deskewing - Scan begradigen
4. Dewarping - Buchwoelbung entzerren
2. Begradigung (Deskew) - Scan begradigen
3. Entzerrung (Dewarp) - Buchwoelbung entzerren
4. Zuschneiden - Scannerraender/Buchruecken entfernen (orientation_crop_api.py)
5. Spaltenerkennung - Unsichtbare Spalten finden
6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
7. Worterkennung - OCR mit Bounding Boxes
@@ -483,8 +483,8 @@ async def auto_deskew(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
# Use cropped image as input (from step 2), fall back to oriented, then original
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr")
# Deskew runs right after orientation — use oriented image, fall back to original
img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -554,7 +554,7 @@ async def auto_deskew(session_id: str):
db_update = {
"deskewed_png": deskewed_png,
"deskew_result": deskew_result,
"current_step": 4,
"current_step": 3,
}
if binarized_png:
db_update["binarized_png"] = binarized_png
@@ -585,12 +585,12 @@ async def auto_deskew(session_id: str):
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the cropped image."""
"""Apply a manual rotation angle to the oriented image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr")
img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -801,7 +801,7 @@ async def auto_dewarp(
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5,
current_step=4,
)
logger.info(f"OCR Pipeline: dewarp session {session_id}: "
@@ -993,20 +993,21 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
async def detect_type(session_id: str):
"""Detect document type (vocab_table, full_text, generic_table).
Should be called after dewarp (clean image available).
Should be called after crop (clean image available).
Falls back to dewarped if crop was skipped.
Stores result in session for frontend to decide pipeline flow.
"""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed first")
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
t0 = time.time()
ocr_img = create_ocr_image(dewarped_bgr)
result = detect_document_type(ocr_img, dewarped_bgr)
ocr_img = create_ocr_image(img_bgr)
result = detect_document_type(ocr_img, img_bgr)
duration = time.time() - t0
result_dict = {
@@ -1046,27 +1047,27 @@ async def detect_type(session_id: str):
@router.post("/sessions/{session_id}/columns")
async def detect_columns(session_id: str):
"""Run column detection on the dewarped image."""
"""Run column detection on the cropped (or dewarped) image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before column detection")
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
t0 = time.time()
# Binarized image for layout analysis
ocr_img = create_ocr_image(dewarped_bgr)
ocr_img = create_ocr_image(img_bgr)
h, w = ocr_img.shape[:2]
# Phase A: Geometry detection (returns word_dicts + inv for reuse)
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
geo_result = detect_column_geometry(ocr_img, img_bgr)
if geo_result is None:
# Fallback to projection-based layout
layout_img = create_layout_image(dewarped_bgr)
layout_img = create_layout_image(img_bgr)
regions = analyze_layout(layout_img, ocr_img)
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
@@ -1113,7 +1114,7 @@ async def detect_columns(session_id: str):
column_result=column_result,
row_result=None,
word_result=None,
current_step=5,
current_step=6,
)
# Update cache
@@ -1125,7 +1126,7 @@ async def detect_columns(session_id: str):
logger.info(f"OCR Pipeline: columns session {session_id}: "
f"{col_count} columns detected ({duration:.2f}s)")
img_w = dewarped_bgr.shape[1]
img_w = img_bgr.shape[1]
await _append_pipeline_log(session_id, "columns", {
"total_columns": len(columns),
"column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
@@ -1276,14 +1277,14 @@ async def _get_columns_overlay(session_id: str) -> Response:
@router.post("/sessions/{session_id}/rows")
async def detect_rows(session_id: str):
"""Run row detection on the dewarped image using horizontal gap analysis."""
"""Run row detection on the cropped (or dewarped) image using horizontal gap analysis."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before row detection")
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before row detection")
t0 = time.time()
@@ -1339,7 +1340,7 @@ async def detect_rows(session_id: str):
session_id,
row_result=row_result,
word_result=None,
current_step=6,
current_step=7,
)
cached["row_result"] = row_result
@@ -1453,11 +1454,11 @@ async def detect_words(
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None:
logger.warning("detect_words: dewarped_bgr is None for session %s (cache keys: %s)",
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection")
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
session = await get_session_db(session_id)
if not session:
@@ -1605,7 +1606,7 @@ async def detect_words(
await update_session_db(
session_id,
word_result=word_result,
current_step=7,
current_step=8,
)
cached["word_result"] = word_result
@@ -1749,7 +1750,7 @@ async def _word_batch_stream_generator(
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
vocab_entries = entries
await update_session_db(session_id, word_result=word_result, current_step=7)
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
@@ -1896,7 +1897,7 @@ async def _word_stream_generator(
await update_session_db(
session_id,
word_result=word_result,
current_step=7,
current_step=8,
)
cached["word_result"] = word_result
@@ -2020,7 +2021,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False
"duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=8)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2069,7 +2070,7 @@ async def _llm_review_stream_generator(
"duration_ms": event["duration_ms"],
"entries_corrected": event["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=8)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2157,7 +2158,7 @@ async def save_reconstruction(session_id: str, request: Request):
cell_updates = body.get("cells", [])
if not cell_updates:
await update_session_db(session_id, current_step=9)
await update_session_db(session_id, current_step=10)
return {"session_id": session_id, "updated": 0}
# Build update map: cell_id -> new text
@@ -2193,7 +2194,7 @@ async def save_reconstruction(session_id: str, request: Request):
if "entries" in word_result:
word_result["entries"] = entries
await update_session_db(session_id, word_result=word_result, current_step=9)
await update_session_db(session_id, word_result=word_result, current_step=10)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2589,7 +2590,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
validation["score"] = req.score
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=10)
await update_session_db(session_id, ground_truth=ground_truth, current_step=11)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
@@ -2622,11 +2623,14 @@ async def reprocess_session(session_id: str, request: Request):
Body: {"from_step": 5} (1-indexed step number)
Pipeline order: Orientation(1) → Deskew(2) → Dewarp(3) → Crop(4) → Columns(5) →
Rows(6) → Words(7) → LLM-Review(8) → Reconstruction(9) → Validation(10)
Clears downstream results:
- from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 4: dewarp_result, column_result, row_result, word_result
- from_step <= 1: orientation_result + all downstream
- from_step <= 2: deskew_result + all downstream
- from_step <= 3: dewarp_result + all downstream
- from_step <= 4: crop_result + all downstream
- from_step <= 5: column_result, row_result, word_result
- from_step <= 6: row_result, word_result
- from_step <= 7: word_result (cells, vocab_entries)
@@ -2638,15 +2642,17 @@ async def reprocess_session(session_id: str, request: Request):
body = await request.json()
from_step = body.get("from_step", 1)
if not isinstance(from_step, int) or from_step < 1 or from_step > 9:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 9")
if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
update_kwargs: Dict[str, Any] = {"current_step": from_step}
# Clear downstream data based on from_step
if from_step <= 7:
# New pipeline order: Orient(2) → Deskew(3) → Dewarp(4) → Crop(5) →
# Columns(6) → Rows(7) → Words(8) → LLM(9) → Recon(10) → GT(11)
if from_step <= 8:
update_kwargs["word_result"] = None
elif from_step == 8:
elif from_step == 9:
# Only clear LLM review from word_result
word_result = session.get("word_result")
if word_result:
@@ -2654,16 +2660,16 @@ async def reprocess_session(session_id: str, request: Request):
word_result.pop("llm_corrections", None)
update_kwargs["word_result"] = word_result
if from_step <= 6:
if from_step <= 7:
update_kwargs["row_result"] = None
if from_step <= 5:
if from_step <= 6:
update_kwargs["column_result"] = None
if from_step <= 4:
update_kwargs["dewarp_result"] = None
if from_step <= 3:
update_kwargs["deskew_result"] = None
if from_step <= 2:
update_kwargs["crop_result"] = None
if from_step <= 3:
update_kwargs["dewarp_result"] = None
if from_step <= 2:
update_kwargs["deskew_result"] = None
if from_step <= 1:
update_kwargs["orientation_result"] = None
@@ -3084,7 +3090,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
deskewed_png=deskewed_png,
deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied),
current_step=4,
current_step=3,
)
session = await get_session_db(session_id)
@@ -3147,7 +3153,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5,
current_step=4,
)
session = await get_session_db(session_id)
@@ -3170,16 +3176,16 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("columns", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise ValueError("Dewarped image not available")
col_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if col_img is None:
raise ValueError("Cropped/dewarped image not available")
ocr_img = create_ocr_image(dewarped_bgr)
ocr_img = create_ocr_image(col_img)
h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
geo_result = detect_column_geometry(ocr_img, col_img)
if geo_result is None:
layout_img = create_layout_image(dewarped_bgr)
layout_img = create_layout_image(col_img)
regions = analyze_layout(layout_img, ocr_img)
cached["_word_dicts"] = None
cached["_inv"] = None
@@ -3231,7 +3237,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("rows", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
row_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
if not column_result or not column_result.get("columns"):
@@ -3252,8 +3258,8 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
content_bounds = cached.get("_content_bounds")
if word_dicts is None or inv is None or content_bounds is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
ocr_img_tmp = create_ocr_image(row_img)
geo_result = detect_column_geometry(ocr_img_tmp, row_img)
if geo_result is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_g, lx, rx, ty, by, word_dicts, inv = geo_result
@@ -3309,7 +3315,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
word_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
@@ -3348,12 +3354,12 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
]
row.word_count = len(row.words)
ocr_img = create_ocr_image(dewarped_bgr)
img_h, img_w = dewarped_bgr.shape[:2]
ocr_img = create_ocr_image(word_img)
img_h, img_w = word_img.shape[:2]
cells, columns_meta = build_cell_grid(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=req.ocr_engine, img_bgr=dewarped_bgr,
ocr_engine=req.ocr_engine, img_bgr=word_img,
)
duration = time.time() - t0

View File

@@ -1,8 +1,8 @@
"""
Orientation & Crop API - Steps 1-2 of the OCR Pipeline.
Orientation & Crop API - Steps 1 and 4 of the OCR Pipeline.
Step 1: Orientation detection (fix 90/180/270 degree rotations)
Step 2: Page cropping (remove scanner borders, detect paper format)
Step 4 (UI index 3): Page cropping (after deskew + dewarp, so the image is straight)
These endpoints were extracted from the main pipeline to keep files manageable.
"""
@@ -161,21 +161,24 @@ async def detect_orientation(session_id: str):
# ---------------------------------------------------------------------------
# Step 2: Crop
# Step 4 (UI index 3): Crop — runs after deskew + dewarp
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/crop")
async def auto_crop(session_id: str):
"""Auto-detect and crop scanner borders.
"""Auto-detect and crop scanner/book borders.
Reads the oriented image (or original if no orientation step),
detects the page boundary and crops.
Reads the dewarped image (post-deskew + dewarp, so the page is straight).
Falls back to oriented → original if earlier steps were skipped.
"""
cached = await _ensure_cached(session_id)
# Use oriented image if available, else original
oriented = cached.get("oriented_bgr")
img_bgr = oriented if oriented is not None else cached.get("original_bgr")
# Use dewarped (preferred), fall back to oriented, then original
img_bgr = next(
(v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping")
@@ -199,7 +202,7 @@ async def auto_crop(session_id: str):
session_id,
cropped_png=cropped_png,
crop_result=crop_info,
current_step=3,
current_step=5,
)
logger.info(
@@ -237,8 +240,11 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
"""Manually crop using percentage coordinates."""
cached = await _ensure_cached(session_id)
oriented = cached.get("oriented_bgr")
img_bgr = oriented if oriented is not None else cached.get("original_bgr")
img_bgr = next(
(v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping")
@@ -278,7 +284,7 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
session_id,
cropped_png=cropped_png,
crop_result=crop_result,
current_step=3,
current_step=5,
)
ch, cw = cropped_bgr.shape[:2]
@@ -293,17 +299,20 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
@router.post("/sessions/{session_id}/crop/skip")
async def skip_crop(session_id: str):
"""Skip cropping — use oriented (or original) image as-is."""
"""Skip cropping — use dewarped (or oriented/original) image as-is."""
cached = await _ensure_cached(session_id)
oriented = cached.get("oriented_bgr")
img_bgr = oriented if oriented is not None else cached.get("original_bgr")
img_bgr = next(
(v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available")
h, w = img_bgr.shape[:2]
# Store the oriented image as cropped (identity crop)
# Store the dewarped image as cropped (identity crop)
success, png_buf = cv2.imencode(".png", img_bgr)
cropped_png = png_buf.tobytes() if success else b""
@@ -321,7 +330,7 @@ async def skip_crop(session_id: str):
session_id,
cropped_png=cropped_png,
crop_result=crop_result,
current_step=3,
current_step=5,
)
return {

View File

@@ -1,14 +1,15 @@
"""
Page Crop - Automatic scanner border removal and page format detection.
Page Crop - Content-based crop for scanned pages and book scans.
Detects the paper boundary in a scanned image and crops away scanner borders.
Also identifies the paper format (A4, Letter, etc.) from the aspect ratio.
Detects the content boundary by analysing ink density projections and
(for book scans) the spine shadow gradient. Works with both loose A4
sheets on dark scanners AND book scans with white backgrounds.
License: Apache 2.0
"""
import logging
from typing import Dict, Any, Tuple
from typing import Dict, Any, Tuple, Optional
import cv2
import numpy as np
@@ -24,25 +25,30 @@ PAPER_FORMATS = {
"A3": 420.0 / 297.0, # 1.4141
}
# Minimum ink density (fraction of pixels) to count a row/column as "content"
_INK_THRESHOLD = 0.003 # 0.3%
# Minimum run length (fraction of dimension) to keep — shorter runs are noise
_MIN_RUN_FRAC = 0.005 # 0.5%
def detect_and_crop_page(
img_bgr: np.ndarray,
min_border_fraction: float = 0.01,
margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]:
"""Detect page boundary and crop scanner borders.
"""Detect content boundary and crop scanner/book borders.
Algorithm:
1. Grayscale + GaussianBlur to smooth out text
2. Otsu threshold (page=bright, scanner border=dark)
3. Morphological close to fill gaps
4. Find largest contour = page
5. If contour covers >95% of image area -> no crop needed
6. Get bounding rect, add safety margin
7. Match aspect ratio to known paper formats
Algorithm (4-edge detection):
1. Adaptive threshold → binary (text=255, bg=0)
2. Left edge: spine-shadow detection via grayscale column means,
fallback to binary vertical projection
3. Right edge: binary vertical projection (last ink column)
4. Top/bottom edges: binary horizontal projection
5. Sanity checks, then crop with configurable margin
Args:
img_bgr: Input BGR image
min_border_fraction: Minimum border fraction to trigger crop (default 1%)
img_bgr: Input BGR image (should already be deskewed/dewarped)
margin_frac: Extra margin around content (fraction of dimension, default 1%)
Returns:
Tuple of (cropped_image, result_dict)
@@ -62,41 +68,28 @@ def detect_and_crop_page(
"border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
}
# 1. Grayscale + blur
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (21, 21), 0)
# 2. Otsu threshold
_, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# --- Binarise with adaptive threshold (works for white-on-white) ---
binary = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV, blockSize=51, C=15,
)
# 3. Morphological close to fill text gaps
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 50))
closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
# --- Left edge: spine-shadow detection ---
left_edge = _detect_left_edge_shadow(gray, binary, w, h)
# 4. Find contours
contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
if not contours:
logger.info("No contours found - returning original image")
return img_bgr, result
# --- Right edge: binary vertical projection ---
right_edge = _detect_right_edge(binary, w, h)
# Get the largest contour
largest = max(contours, key=cv2.contourArea)
contour_area = cv2.contourArea(largest)
# --- Top / bottom edges: binary horizontal projection ---
top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)
# 5. If contour covers >95% of image, no crop needed
if contour_area > 0.95 * total_area:
logger.info("Page covers >95%% of image - no crop needed")
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 6. Get bounding rect
rx, ry, rw, rh = cv2.boundingRect(largest)
# Calculate border fractions
border_top = ry / h
border_bottom = (h - (ry + rh)) / h
border_left = rx / w
border_right = (w - (rx + rw)) / w
# Compute border fractions
border_top = top_edge / h
border_bottom = (h - bottom_edge) / h
border_left = left_edge / w
border_right = (w - right_edge) / w
result["border_fractions"] = {
"top": round(border_top, 4),
@@ -105,35 +98,34 @@ def detect_and_crop_page(
"right": round(border_right, 4),
}
# 7. Check if borders are significant enough to crop
if all(f < min_border_fraction for f in [border_top, border_bottom, border_left, border_right]):
logger.info("All borders < %.1f%% - no crop needed", min_border_fraction * 100)
# Sanity: only crop if at least one edge has > 2% border
min_border = 0.02
if all(f < min_border for f in [border_top, border_bottom, border_left, border_right]):
logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 8. Add safety margin (0.5% of image dimensions)
margin_x = int(w * 0.005)
margin_y = int(h * 0.005)
# Add margin
margin_x = int(w * margin_frac)
margin_y = int(h * margin_frac)
crop_x = max(0, rx - margin_x)
crop_y = max(0, ry - margin_y)
crop_x2 = min(w, rx + rw + margin_x)
crop_y2 = min(h, ry + rh + margin_y)
crop_x = max(0, left_edge - margin_x)
crop_y = max(0, top_edge - margin_y)
crop_x2 = min(w, right_edge + margin_x)
crop_y2 = min(h, bottom_edge + margin_y)
crop_w = crop_x2 - crop_x
crop_h = crop_y2 - crop_y
# Sanity check: cropped area should be at least 50% of original
if crop_w * crop_h < 0.5 * total_area:
logger.warning("Cropped area too small (%.0f%%) - skipping crop",
# Sanity: cropped area must be >= 40% of original
if crop_w * crop_h < 0.40 * total_area:
logger.warning("Cropped area too small (%.0f%%) skipping crop",
100.0 * crop_w * crop_h / total_area)
result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result
# 9. Crop
cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()
# 10. Detect format from cropped dimensions
detected_format, format_confidence = _detect_format(crop_w, crop_h)
result["crop_applied"] = True
@@ -149,23 +141,140 @@ def detect_and_crop_page(
result["format_confidence"] = format_confidence
result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)
logger.info("Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
w, h, crop_w, crop_h, detected_format, format_confidence * 100,
border_top * 100, border_bottom * 100, border_left * 100, border_right * 100)
logger.info(
"Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
"borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
w, h, crop_w, crop_h, detected_format, format_confidence * 100,
border_top * 100, border_bottom * 100,
border_left * 100, border_right * 100,
)
return cropped, result
def _detect_format(width: int, height: int) -> Tuple[str, float]:
"""Detect paper format from dimensions by comparing aspect ratios.
# ---------------------------------------------------------------------------
# Edge detection helpers
# ---------------------------------------------------------------------------
Returns:
(format_name, confidence) where confidence is 0.0-1.0
def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect left content edge, accounting for book-spine shadow.

    Strategy: look at the left 25% of the image.
    1. Compute column-mean brightness in grayscale.
    2. Smooth with a boxcar kernel.
    3. Find the transition from shadow (dark) to page (bright).
    4. Fallback: use binary vertical projection if no shadow detected.

    Args:
        gray: Grayscale image (H x W); used for the shadow-gradient scan.
        binary: Inverted binary image (ink = 255); only used by the
            projection fallback.
        w: Image width in pixels.
        h: Image height in pixels (unused here; kept for signature
            symmetry with the other edge detectors).

    Returns:
        0-based x coordinate of the detected left content edge.
    """
    # Restrict the scan to the left quarter — a book-spine shadow, if
    # present, appears there.
    search_w = max(1, w // 4)

    # Column-mean brightness in the left quarter
    col_means = np.mean(gray[:, :search_w], axis=0).astype(np.float64)

    # Smooth with boxcar kernel (width = 1% of image width, min 5)
    kernel_size = max(5, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the kernel symmetric around its centre
    kernel = np.ones(kernel_size) / kernel_size
    smoothed = np.convolve(col_means, kernel, mode="same")

    # Brightness range across the scanned strip; a large range suggests
    # a dark spine shadow next to a bright page.
    val_min = float(np.min(smoothed))
    val_max = float(np.max(smoothed))
    shadow_range = val_max - val_min

    # Only use shadow detection if there is a meaningful brightness gradient (> 20 levels)
    if shadow_range > 20:
        # Threshold sits 60% of the way up the range: biased towards the
        # bright (page) side so the edge lands past the shadow.
        threshold = val_min + shadow_range * 0.6
        # Find first column where brightness exceeds threshold
        above = np.where(smoothed >= threshold)[0]
        if len(above) > 0:
            shadow_edge = int(above[0])
            logger.debug("Left edge: shadow detected at x=%d (range=%.0f)", shadow_edge, shadow_range)
            return shadow_edge

    # Fallback: binary vertical projection
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
def _detect_right_edge(binary: np.ndarray, w: int, h: int) -> int:
    """Detect right content edge via binary vertical projection.

    Args:
        binary: Inverted binary image (ink = 255).
        w: Image width in pixels.
        h: Image height (unused; kept for signature symmetry with the
            other edge detectors).

    Returns:
        x coordinate of the last column whose ink density survives the
        noise-run filter (``w`` if no ink is found).
    """
    return _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Detect top and bottom content edges via binary horizontal projection.

    Returns a ``(top, bottom)`` pair of y coordinates: the first and the
    last row whose ink density survives the noise-run filter. ``w`` is
    unused and kept only for signature symmetry with the other edge
    detectors.
    """
    # Row densities (axis=1) yield y positions; scan from both ends.
    first_ink_row = _detect_edge_projection(binary, axis=1, from_start=True, dim=h)
    last_ink_row = _detect_edge_projection(binary, axis=1, from_start=False, dim=h)
    return first_ink_row, last_ink_row
def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the first/last row or column with ink density above threshold.

    axis=0 → project vertically (column densities) → returns x position
    axis=1 → project horizontally (row densities) → returns y position

    Runs of ink shorter than ``_MIN_RUN_FRAC`` of the dimension are
    discarded as noise. If nothing survives, the image boundary is
    returned (0 when scanning from the start, ``dim`` otherwise).
    """
    # Per-row/column ink density: mean of the 0/255 binary, scaled to 0..1.
    density = np.mean(binary, axis=axis) / 255.0

    # Flag positions carrying enough ink, then drop short noise runs.
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_mask = _filter_narrow_runs(density >= _INK_THRESHOLD, min_run)

    positions = np.flatnonzero(ink_mask)
    if positions.size == 0:
        # No ink detected — fall back to the image boundary.
        return 0 if from_start else dim
    return int(positions[0] if from_start else positions[-1])
def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
"""Remove True-runs shorter than min_run pixels."""
if min_run <= 1:
return mask
result = mask.copy()
n = len(result)
i = 0
while i < n:
if result[i]:
start = i
while i < n and result[i]:
i += 1
if i - start < min_run:
result[start:i] = False
else:
i += 1
return result
# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------
def _detect_format(width: int, height: int) -> Tuple[str, float]:
"""Detect paper format from dimensions by comparing aspect ratios."""
if width <= 0 or height <= 0:
return "unknown", 0.0
# Use portrait aspect ratio (taller / shorter)
aspect = max(width, height) / min(width, height)
best_format = "unknown"
@@ -177,8 +286,6 @@ def _detect_format(width: int, height: int) -> Tuple[str, float]:
best_diff = diff
best_format = fmt
# Confidence: 1.0 if exact match, decreasing with deviation
# Threshold: if diff > 0.1, confidence drops below 0.5
confidence = max(0.0, 1.0 - best_diff * 5.0)
if confidence < 0.3: