refactor: Crop nach Deskew/Dewarp verschieben + content-basierter Buchscan-Crop
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

Pipeline-Reihenfolge neu: Orientierung → Begradigung → Entzerrung → Zuschneiden → Spalten...
Crop arbeitet jetzt auf dem bereits geraden Bild, was bessere Ergebnisse liefert.

page_crop.py komplett ersetzt: Adaptive Threshold + 4-Kanten-Erkennung
(Buchruecken-Schatten links, Ink-Projektion fuer alle Raender) statt
Otsu + groesste Kontur.

Backend: Step-Nummern, Input-Bilder, Reprocess-Kaskade angepasst.
Frontend: PIPELINE_STEPS umgeordnet, Switch-Cases, Vorher-Bilder aktualisiert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-09 08:52:11 +01:00
parent eb45bb4879
commit 156a818246
7 changed files with 295 additions and 173 deletions

View File

@@ -71,8 +71,8 @@ export default function OcrPipelinePage() {
// Determine which step to jump to based on current_step // Determine which step to jump to based on current_step
const dbStep = data.current_step || 1 const dbStep = data.current_step || 1
// Steps: 1=deskew, 2=dewarp, 3=columns, ... // DB steps: 1=start, 2=orientation, 3=deskew, 4=dewarp, 5=crop, 6=columns, ...
// UI steps are 0-indexed: 0=deskew, 1=dewarp, 2=columns, ... // UI steps are 0-indexed: 0=orientation, 1=deskew, 2=dewarp, 3=crop, 4=columns, ...
const uiStep = Math.max(0, dbStep - 1) const uiStep = Math.max(0, dbStep - 1)
const skipSteps = savedDocType?.skip_steps || [] const skipSteps = savedDocType?.skip_steps || []
@@ -205,8 +205,8 @@ export default function OcrPipelinePage() {
handleNext() handleNext()
} }
const handleDewarpNext = async () => { const handleCropNext = async () => {
// Auto-detect document type after dewarp, then advance // Auto-detect document type after crop (last image-processing step), then advance
if (sessionId) { if (sessionId) {
try { try {
const res = await fetch( const res = await fetch(
@@ -273,9 +273,9 @@ export default function OcrPipelinePage() {
const stepNames: Record<number, string> = { const stepNames: Record<number, string> = {
1: 'Orientierung', 1: 'Orientierung',
2: 'Zuschneiden', 2: 'Begradigung',
3: 'Begradigung', 3: 'Entzerrung',
4: 'Entzerrung', 4: 'Zuschneiden',
5: 'Spalten', 5: 'Spalten',
6: 'Zeilen', 6: 'Zeilen',
7: 'Woerter', 7: 'Woerter',
@@ -312,11 +312,11 @@ export default function OcrPipelinePage() {
case 0: case 0:
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} /> return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
case 1: case 1:
return <StepCrop sessionId={sessionId} onNext={handleNext} />
case 2:
return <StepDeskew sessionId={sessionId} onNext={handleNext} /> return <StepDeskew sessionId={sessionId} onNext={handleNext} />
case 2:
return <StepDewarp sessionId={sessionId} onNext={handleNext} />
case 3: case 3:
return <StepDewarp sessionId={sessionId} onNext={handleDewarpNext} /> return <StepCrop sessionId={sessionId} onNext={handleCropNext} />
case 4: case 4:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} /> return <StepColumnDetection sessionId={sessionId} onNext={handleNext} />
case 5: case 5:

View File

@@ -310,9 +310,9 @@ export const IMAGE_STYLES: { value: ImageStyle; label: string }[] = [
export const PIPELINE_STEPS: PipelineStep[] = [ export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' }, { id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' }, { id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' }, { id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' }, { id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },
{ id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' }, { id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' },
{ id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' }, { id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' },

View File

@@ -78,7 +78,7 @@ export function StepCrop({ sessionId, onNext }: StepCropProps) {
return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div> return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div>
} }
const orientedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented` const dewarpedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/dewarped`
const croppedUrl = cropResult const croppedUrl = cropResult
? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped` ? `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`
: null : null
@@ -95,12 +95,12 @@ export function StepCrop({ sessionId, onNext }: StepCropProps) {
{/* Image comparison */} {/* Image comparison */}
<ImageCompareView <ImageCompareView
originalUrl={orientedUrl} originalUrl={dewarpedUrl}
deskewedUrl={croppedUrl} deskewedUrl={croppedUrl}
showGrid={false} showGrid={false}
showBinarized={false} showBinarized={false}
binarizedUrl={null} binarizedUrl={null}
leftLabel="Orientiert" leftLabel="Entzerrt"
rightLabel="Zugeschnitten" rightLabel="Zugeschnitten"
/> />

View File

@@ -37,8 +37,8 @@ export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
filename: data.filename, filename: data.filename,
image_width: data.image_width, image_width: data.image_width,
image_height: data.image_height, image_height: data.image_height,
// Use cropped image as "before" view // Use oriented image as "before" view (deskew runs right after orientation)
original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`, original_image_url: `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/oriented`,
} }
setSession(sessionInfo) setSession(sessionInfo)
@@ -155,7 +155,7 @@ export function StepDeskew({ sessionId, onNext }: StepDeskewProps) {
showGrid={showGrid} showGrid={showGrid}
showBinarized={showBinarized} showBinarized={showBinarized}
binarizedUrl={deskewResult?.binarized_image_url ?? null} binarizedUrl={deskewResult?.binarized_image_url ?? null}
leftLabel="Zugeschnitten" leftLabel="Orientiert"
rightLabel="Begradigt" rightLabel="Begradigt"
/> />
)} )}

View File

@@ -3,9 +3,9 @@ OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 10 einzelne Schritte: Zerlegt den OCR-Prozess in 10 einzelne Schritte:
1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py) 1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py)
2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py) 2. Begradigung (Deskew) - Scan begradigen
3. Deskewing - Scan begradigen 3. Entzerrung (Dewarp) - Buchwoelbung entzerren
4. Dewarping - Buchwoelbung entzerren 4. Zuschneiden - Scannerraender/Buchruecken entfernen (orientation_crop_api.py)
5. Spaltenerkennung - Unsichtbare Spalten finden 5. Spaltenerkennung - Unsichtbare Spalten finden
6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen 6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
7. Worterkennung - OCR mit Bounding Boxes 7. Worterkennung - OCR mit Bounding Boxes
@@ -483,8 +483,8 @@ async def auto_deskew(session_id: str):
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
# Use cropped image as input (from step 2), fall back to oriented, then original # Deskew runs right after orientation — use oriented image, fall back to original
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr") img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None) if (v := cached.get(k)) is not None), None)
if img_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing") raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -554,7 +554,7 @@ async def auto_deskew(session_id: str):
db_update = { db_update = {
"deskewed_png": deskewed_png, "deskewed_png": deskewed_png,
"deskew_result": deskew_result, "deskew_result": deskew_result,
"current_step": 4, "current_step": 3,
} }
if binarized_png: if binarized_png:
db_update["binarized_png"] = binarized_png db_update["binarized_png"] = binarized_png
@@ -585,12 +585,12 @@ async def auto_deskew(session_id: str):
@router.post("/sessions/{session_id}/deskew/manual") @router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest): async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the cropped image.""" """Apply a manual rotation angle to the oriented image."""
if session_id not in _cache: if session_id not in _cache:
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr") img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None) if (v := cached.get(k)) is not None), None)
if img_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing") raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -801,7 +801,7 @@ async def auto_dewarp(
dewarped_png=dewarped_png, dewarped_png=dewarped_png,
dewarp_result=dewarp_result, dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0), auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5, current_step=4,
) )
logger.info(f"OCR Pipeline: dewarp session {session_id}: " logger.info(f"OCR Pipeline: dewarp session {session_id}: "
@@ -993,20 +993,21 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
async def detect_type(session_id: str): async def detect_type(session_id: str):
"""Detect document type (vocab_table, full_text, generic_table). """Detect document type (vocab_table, full_text, generic_table).
Should be called after dewarp (clean image available). Should be called after crop (clean image available).
Falls back to dewarped if crop was skipped.
Stores result in session for frontend to decide pipeline flow. Stores result in session for frontend to decide pipeline flow.
""" """
if session_id not in _cache: if session_id not in _cache:
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr") img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed first") raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
t0 = time.time() t0 = time.time()
ocr_img = create_ocr_image(dewarped_bgr) ocr_img = create_ocr_image(img_bgr)
result = detect_document_type(ocr_img, dewarped_bgr) result = detect_document_type(ocr_img, img_bgr)
duration = time.time() - t0 duration = time.time() - t0
result_dict = { result_dict = {
@@ -1046,27 +1047,27 @@ async def detect_type(session_id: str):
@router.post("/sessions/{session_id}/columns") @router.post("/sessions/{session_id}/columns")
async def detect_columns(session_id: str): async def detect_columns(session_id: str):
"""Run column detection on the dewarped image.""" """Run column detection on the cropped (or dewarped) image."""
if session_id not in _cache: if session_id not in _cache:
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr") img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before column detection") raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
t0 = time.time() t0 = time.time()
# Binarized image for layout analysis # Binarized image for layout analysis
ocr_img = create_ocr_image(dewarped_bgr) ocr_img = create_ocr_image(img_bgr)
h, w = ocr_img.shape[:2] h, w = ocr_img.shape[:2]
# Phase A: Geometry detection (returns word_dicts + inv for reuse) # Phase A: Geometry detection (returns word_dicts + inv for reuse)
geo_result = detect_column_geometry(ocr_img, dewarped_bgr) geo_result = detect_column_geometry(ocr_img, img_bgr)
if geo_result is None: if geo_result is None:
# Fallback to projection-based layout # Fallback to projection-based layout
layout_img = create_layout_image(dewarped_bgr) layout_img = create_layout_image(img_bgr)
regions = analyze_layout(layout_img, ocr_img) regions = analyze_layout(layout_img, ocr_img)
else: else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
@@ -1113,7 +1114,7 @@ async def detect_columns(session_id: str):
column_result=column_result, column_result=column_result,
row_result=None, row_result=None,
word_result=None, word_result=None,
current_step=5, current_step=6,
) )
# Update cache # Update cache
@@ -1125,7 +1126,7 @@ async def detect_columns(session_id: str):
logger.info(f"OCR Pipeline: columns session {session_id}: " logger.info(f"OCR Pipeline: columns session {session_id}: "
f"{col_count} columns detected ({duration:.2f}s)") f"{col_count} columns detected ({duration:.2f}s)")
img_w = dewarped_bgr.shape[1] img_w = img_bgr.shape[1]
await _append_pipeline_log(session_id, "columns", { await _append_pipeline_log(session_id, "columns", {
"total_columns": len(columns), "total_columns": len(columns),
"column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns], "column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
@@ -1276,14 +1277,14 @@ async def _get_columns_overlay(session_id: str) -> Response:
@router.post("/sessions/{session_id}/rows") @router.post("/sessions/{session_id}/rows")
async def detect_rows(session_id: str): async def detect_rows(session_id: str):
"""Run row detection on the dewarped image using horizontal gap analysis.""" """Run row detection on the cropped (or dewarped) image using horizontal gap analysis."""
if session_id not in _cache: if session_id not in _cache:
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr") dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None: if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before row detection") raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before row detection")
t0 = time.time() t0 = time.time()
@@ -1339,7 +1340,7 @@ async def detect_rows(session_id: str):
session_id, session_id,
row_result=row_result, row_result=row_result,
word_result=None, word_result=None,
current_step=6, current_step=7,
) )
cached["row_result"] = row_result cached["row_result"] = row_result
@@ -1453,11 +1454,11 @@ async def detect_words(
await _load_session_to_cache(session_id) await _load_session_to_cache(session_id)
cached = _get_cached(session_id) cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr") dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None: if dewarped_bgr is None:
logger.warning("detect_words: dewarped_bgr is None for session %s (cache keys: %s)", logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
session_id, [k for k in cached.keys() if k.endswith('_bgr')]) session_id, [k for k in cached.keys() if k.endswith('_bgr')])
raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection") raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
session = await get_session_db(session_id) session = await get_session_db(session_id)
if not session: if not session:
@@ -1605,7 +1606,7 @@ async def detect_words(
await update_session_db( await update_session_db(
session_id, session_id,
word_result=word_result, word_result=word_result,
current_step=7, current_step=8,
) )
cached["word_result"] = word_result cached["word_result"] = word_result
@@ -1749,7 +1750,7 @@ async def _word_batch_stream_generator(
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german")) word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
vocab_entries = entries vocab_entries = entries
await update_session_db(session_id, word_result=word_result, current_step=7) await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result cached["word_result"] = word_result
logger.info(f"OCR Pipeline SSE batch: words session {session_id}: " logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
@@ -1896,7 +1897,7 @@ async def _word_stream_generator(
await update_session_db( await update_session_db(
session_id, session_id,
word_result=word_result, word_result=word_result,
current_step=7, current_step=8,
) )
cached["word_result"] = word_result cached["word_result"] = word_result
@@ -2020,7 +2021,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False
"duration_ms": result["duration_ms"], "duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"], "entries_corrected": result["entries_corrected"],
} }
await update_session_db(session_id, word_result=word_result, current_step=8) await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache: if session_id in _cache:
_cache[session_id]["word_result"] = word_result _cache[session_id]["word_result"] = word_result
@@ -2069,7 +2070,7 @@ async def _llm_review_stream_generator(
"duration_ms": event["duration_ms"], "duration_ms": event["duration_ms"],
"entries_corrected": event["entries_corrected"], "entries_corrected": event["entries_corrected"],
} }
await update_session_db(session_id, word_result=word_result, current_step=8) await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache: if session_id in _cache:
_cache[session_id]["word_result"] = word_result _cache[session_id]["word_result"] = word_result
@@ -2157,7 +2158,7 @@ async def save_reconstruction(session_id: str, request: Request):
cell_updates = body.get("cells", []) cell_updates = body.get("cells", [])
if not cell_updates: if not cell_updates:
await update_session_db(session_id, current_step=9) await update_session_db(session_id, current_step=10)
return {"session_id": session_id, "updated": 0} return {"session_id": session_id, "updated": 0}
# Build update map: cell_id -> new text # Build update map: cell_id -> new text
@@ -2193,7 +2194,7 @@ async def save_reconstruction(session_id: str, request: Request):
if "entries" in word_result: if "entries" in word_result:
word_result["entries"] = entries word_result["entries"] = entries
await update_session_db(session_id, word_result=word_result, current_step=9) await update_session_db(session_id, word_result=word_result, current_step=10)
if session_id in _cache: if session_id in _cache:
_cache[session_id]["word_result"] = word_result _cache[session_id]["word_result"] = word_result
@@ -2589,7 +2590,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
validation["score"] = req.score validation["score"] = req.score
ground_truth["validation"] = validation ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=10) await update_session_db(session_id, ground_truth=ground_truth, current_step=11)
if session_id in _cache: if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth _cache[session_id]["ground_truth"] = ground_truth
@@ -2622,11 +2623,14 @@ async def reprocess_session(session_id: str, request: Request):
Body: {"from_step": 5} (1-indexed step number) Body: {"from_step": 5} (1-indexed step number)
Pipeline order: Orientation(1) → Deskew(2) → Dewarp(3) → Crop(4) → Columns(5) →
Rows(6) → Words(7) → LLM-Review(8) → Reconstruction(9) → Validation(10)
Clears downstream results: Clears downstream results:
- from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result - from_step <= 1: orientation_result + all downstream
- from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result - from_step <= 2: deskew_result + all downstream
- from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result - from_step <= 3: dewarp_result + all downstream
- from_step <= 4: dewarp_result, column_result, row_result, word_result - from_step <= 4: crop_result + all downstream
- from_step <= 5: column_result, row_result, word_result - from_step <= 5: column_result, row_result, word_result
- from_step <= 6: row_result, word_result - from_step <= 6: row_result, word_result
- from_step <= 7: word_result (cells, vocab_entries) - from_step <= 7: word_result (cells, vocab_entries)
@@ -2638,15 +2642,17 @@ async def reprocess_session(session_id: str, request: Request):
body = await request.json() body = await request.json()
from_step = body.get("from_step", 1) from_step = body.get("from_step", 1)
if not isinstance(from_step, int) or from_step < 1 or from_step > 9: if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 9") raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
update_kwargs: Dict[str, Any] = {"current_step": from_step} update_kwargs: Dict[str, Any] = {"current_step": from_step}
# Clear downstream data based on from_step # Clear downstream data based on from_step
if from_step <= 7: # New pipeline order: Orient(2) → Deskew(3) → Dewarp(4) → Crop(5) →
# Columns(6) → Rows(7) → Words(8) → LLM(9) → Recon(10) → GT(11)
if from_step <= 8:
update_kwargs["word_result"] = None update_kwargs["word_result"] = None
elif from_step == 8: elif from_step == 9:
# Only clear LLM review from word_result # Only clear LLM review from word_result
word_result = session.get("word_result") word_result = session.get("word_result")
if word_result: if word_result:
@@ -2654,16 +2660,16 @@ async def reprocess_session(session_id: str, request: Request):
word_result.pop("llm_corrections", None) word_result.pop("llm_corrections", None)
update_kwargs["word_result"] = word_result update_kwargs["word_result"] = word_result
if from_step <= 6: if from_step <= 7:
update_kwargs["row_result"] = None update_kwargs["row_result"] = None
if from_step <= 5: if from_step <= 6:
update_kwargs["column_result"] = None update_kwargs["column_result"] = None
if from_step <= 4: if from_step <= 4:
update_kwargs["dewarp_result"] = None
if from_step <= 3:
update_kwargs["deskew_result"] = None
if from_step <= 2:
update_kwargs["crop_result"] = None update_kwargs["crop_result"] = None
if from_step <= 3:
update_kwargs["dewarp_result"] = None
if from_step <= 2:
update_kwargs["deskew_result"] = None
if from_step <= 1: if from_step <= 1:
update_kwargs["orientation_result"] = None update_kwargs["orientation_result"] = None
@@ -3084,7 +3090,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
deskewed_png=deskewed_png, deskewed_png=deskewed_png,
deskew_result=deskew_result, deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied), auto_rotation_degrees=float(angle_applied),
current_step=4, current_step=3,
) )
session = await get_session_db(session_id) session = await get_session_db(session_id)
@@ -3147,7 +3153,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
dewarped_png=dewarped_png, dewarped_png=dewarped_png,
dewarp_result=dewarp_result, dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0), auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5, current_step=4,
) )
session = await get_session_db(session_id) session = await get_session_db(session_id)
@@ -3170,16 +3176,16 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("columns", "start", {}) yield await _auto_sse_event("columns", "start", {})
try: try:
t0 = time.time() t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr") col_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None: if col_img is None:
raise ValueError("Dewarped image not available") raise ValueError("Cropped/dewarped image not available")
ocr_img = create_ocr_image(dewarped_bgr) ocr_img = create_ocr_image(col_img)
h, w = ocr_img.shape[:2] h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr) geo_result = detect_column_geometry(ocr_img, col_img)
if geo_result is None: if geo_result is None:
layout_img = create_layout_image(dewarped_bgr) layout_img = create_layout_image(col_img)
regions = analyze_layout(layout_img, ocr_img) regions = analyze_layout(layout_img, ocr_img)
cached["_word_dicts"] = None cached["_word_dicts"] = None
cached["_inv"] = None cached["_inv"] = None
@@ -3231,7 +3237,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("rows", "start", {}) yield await _auto_sse_event("rows", "start", {})
try: try:
t0 = time.time() t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr") row_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id) session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result") column_result = session.get("column_result") or cached.get("column_result")
if not column_result or not column_result.get("columns"): if not column_result or not column_result.get("columns"):
@@ -3252,8 +3258,8 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
content_bounds = cached.get("_content_bounds") content_bounds = cached.get("_content_bounds")
if word_dicts is None or inv is None or content_bounds is None: if word_dicts is None or inv is None or content_bounds is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr) ocr_img_tmp = create_ocr_image(row_img)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr) geo_result = detect_column_geometry(ocr_img_tmp, row_img)
if geo_result is None: if geo_result is None:
raise ValueError("Column geometry detection failed — cannot detect rows") raise ValueError("Column geometry detection failed — cannot detect rows")
_g, lx, rx, ty, by, word_dicts, inv = geo_result _g, lx, rx, ty, by, word_dicts, inv = geo_result
@@ -3309,7 +3315,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine}) yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
try: try:
t0 = time.time() t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr") word_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id) session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result") column_result = session.get("column_result") or cached.get("column_result")
@@ -3348,12 +3354,12 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
] ]
row.word_count = len(row.words) row.word_count = len(row.words)
ocr_img = create_ocr_image(dewarped_bgr) ocr_img = create_ocr_image(word_img)
img_h, img_w = dewarped_bgr.shape[:2] img_h, img_w = word_img.shape[:2]
cells, columns_meta = build_cell_grid( cells, columns_meta = build_cell_grid(
ocr_img, col_regions, row_geoms, img_w, img_h, ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=req.ocr_engine, img_bgr=dewarped_bgr, ocr_engine=req.ocr_engine, img_bgr=word_img,
) )
duration = time.time() - t0 duration = time.time() - t0

View File

@@ -1,8 +1,8 @@
""" """
Orientation & Crop API - Steps 1-2 of the OCR Pipeline. Orientation & Crop API - Steps 1 and 4 of the OCR Pipeline.
Step 1: Orientation detection (fix 90/180/270 degree rotations) Step 1: Orientation detection (fix 90/180/270 degree rotations)
Step 2: Page cropping (remove scanner borders, detect paper format) Step 4 (UI index 3): Page cropping (after deskew + dewarp, so the image is straight)
These endpoints were extracted from the main pipeline to keep files manageable. These endpoints were extracted from the main pipeline to keep files manageable.
""" """
@@ -161,21 +161,24 @@ async def detect_orientation(session_id: str):
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Step 2: Crop # Step 4 (UI index 3): Crop — runs after deskew + dewarp
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/crop") @router.post("/sessions/{session_id}/crop")
async def auto_crop(session_id: str): async def auto_crop(session_id: str):
"""Auto-detect and crop scanner borders. """Auto-detect and crop scanner/book borders.
Reads the oriented image (or original if no orientation step), Reads the dewarped image (post-deskew + dewarp, so the page is straight).
detects the page boundary and crops. Falls back to oriented → original if earlier steps were skipped.
""" """
cached = await _ensure_cached(session_id) cached = await _ensure_cached(session_id)
# Use oriented image if available, else original # Use dewarped (preferred), fall back to oriented, then original
oriented = cached.get("oriented_bgr") img_bgr = next(
img_bgr = oriented if oriented is not None else cached.get("original_bgr") (v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping") raise HTTPException(status_code=400, detail="No image available for cropping")
@@ -199,7 +202,7 @@ async def auto_crop(session_id: str):
session_id, session_id,
cropped_png=cropped_png, cropped_png=cropped_png,
crop_result=crop_info, crop_result=crop_info,
current_step=3, current_step=5,
) )
logger.info( logger.info(
@@ -237,8 +240,11 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
"""Manually crop using percentage coordinates.""" """Manually crop using percentage coordinates."""
cached = await _ensure_cached(session_id) cached = await _ensure_cached(session_id)
oriented = cached.get("oriented_bgr") img_bgr = next(
img_bgr = oriented if oriented is not None else cached.get("original_bgr") (v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for cropping") raise HTTPException(status_code=400, detail="No image available for cropping")
@@ -278,7 +284,7 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
session_id, session_id,
cropped_png=cropped_png, cropped_png=cropped_png,
crop_result=crop_result, crop_result=crop_result,
current_step=3, current_step=5,
) )
ch, cw = cropped_bgr.shape[:2] ch, cw = cropped_bgr.shape[:2]
@@ -293,17 +299,20 @@ async def manual_crop(session_id: str, req: ManualCropRequest):
@router.post("/sessions/{session_id}/crop/skip") @router.post("/sessions/{session_id}/crop/skip")
async def skip_crop(session_id: str): async def skip_crop(session_id: str):
"""Skip cropping — use oriented (or original) image as-is.""" """Skip cropping — use dewarped (or oriented/original) image as-is."""
cached = await _ensure_cached(session_id) cached = await _ensure_cached(session_id)
oriented = cached.get("oriented_bgr") img_bgr = next(
img_bgr = oriented if oriented is not None else cached.get("original_bgr") (v for k in ("dewarped_bgr", "oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None),
None,
)
if img_bgr is None: if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available") raise HTTPException(status_code=400, detail="No image available")
h, w = img_bgr.shape[:2] h, w = img_bgr.shape[:2]
# Store the oriented image as cropped (identity crop) # Store the dewarped image as cropped (identity crop)
success, png_buf = cv2.imencode(".png", img_bgr) success, png_buf = cv2.imencode(".png", img_bgr)
cropped_png = png_buf.tobytes() if success else b"" cropped_png = png_buf.tobytes() if success else b""
@@ -321,7 +330,7 @@ async def skip_crop(session_id: str):
session_id, session_id,
cropped_png=cropped_png, cropped_png=cropped_png,
crop_result=crop_result, crop_result=crop_result,
current_step=3, current_step=5,
) )
return { return {

View File

@@ -1,14 +1,15 @@
""" """
Page Crop - Automatic scanner border removal and page format detection. Page Crop - Content-based crop for scanned pages and book scans.
Detects the paper boundary in a scanned image and crops away scanner borders. Detects the content boundary by analysing ink density projections and
Also identifies the paper format (A4, Letter, etc.) from the aspect ratio. (for book scans) the spine shadow gradient. Works with both loose A4
sheets on dark scanners AND book scans with white backgrounds.
License: Apache 2.0 License: Apache 2.0
""" """
import logging import logging
from typing import Dict, Any, Tuple from typing import Dict, Any, Tuple, Optional
import cv2 import cv2
import numpy as np import numpy as np
@@ -24,25 +25,30 @@ PAPER_FORMATS = {
"A3": 420.0 / 297.0, # 1.4141 "A3": 420.0 / 297.0, # 1.4141
} }
# Minimum ink density (fraction of pixels) to count a row/column as "content"
_INK_THRESHOLD = 0.003 # 0.3%
# Minimum run length (fraction of dimension) to keep — shorter runs are noise
_MIN_RUN_FRAC = 0.005 # 0.5%
def detect_and_crop_page( def detect_and_crop_page(
img_bgr: np.ndarray, img_bgr: np.ndarray,
min_border_fraction: float = 0.01, margin_frac: float = 0.01,
) -> Tuple[np.ndarray, Dict[str, Any]]: ) -> Tuple[np.ndarray, Dict[str, Any]]:
"""Detect page boundary and crop scanner borders. """Detect content boundary and crop scanner/book borders.
Algorithm: Algorithm (4-edge detection):
1. Grayscale + GaussianBlur to smooth out text 1. Adaptive threshold → binary (text=255, bg=0)
2. Otsu threshold (page=bright, scanner border=dark) 2. Left edge: spine-shadow detection via grayscale column means,
3. Morphological close to fill gaps fallback to binary vertical projection
4. Find largest contour = page 3. Right edge: binary vertical projection (last ink column)
5. If contour covers >95% of image area -> no crop needed 4. Top/bottom edges: binary horizontal projection
6. Get bounding rect, add safety margin 5. Sanity checks, then crop with configurable margin
7. Match aspect ratio to known paper formats
Args: Args:
img_bgr: Input BGR image img_bgr: Input BGR image (should already be deskewed/dewarped)
min_border_fraction: Minimum border fraction to trigger crop (default 1%) margin_frac: Extra margin around content (fraction of dimension, default 1%)
Returns: Returns:
Tuple of (cropped_image, result_dict) Tuple of (cropped_image, result_dict)
@@ -62,41 +68,28 @@ def detect_and_crop_page(
"border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0}, "border_fractions": {"top": 0.0, "bottom": 0.0, "left": 0.0, "right": 0.0},
} }
# 1. Grayscale + blur
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (21, 21), 0)
# 2. Otsu threshold # --- Binarise with adaptive threshold (works for white-on-white) ---
_, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) binary = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV, blockSize=51, C=15,
)
# 3. Morphological close to fill text gaps # --- Left edge: spine-shadow detection ---
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 50)) left_edge = _detect_left_edge_shadow(gray, binary, w, h)
closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
# 4. Find contours # --- Right edge: binary vertical projection ---
contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) right_edge = _detect_right_edge(binary, w, h)
if not contours:
logger.info("No contours found - returning original image")
return img_bgr, result
# Get the largest contour # --- Top / bottom edges: binary horizontal projection ---
largest = max(contours, key=cv2.contourArea) top_edge, bottom_edge = _detect_top_bottom_edges(binary, w, h)
contour_area = cv2.contourArea(largest)
# 5. If contour covers >95% of image, no crop needed # Compute border fractions
if contour_area > 0.95 * total_area: border_top = top_edge / h
logger.info("Page covers >95%% of image - no crop needed") border_bottom = (h - bottom_edge) / h
result["detected_format"], result["format_confidence"] = _detect_format(w, h) border_left = left_edge / w
return img_bgr, result border_right = (w - right_edge) / w
# 6. Get bounding rect
rx, ry, rw, rh = cv2.boundingRect(largest)
# Calculate border fractions
border_top = ry / h
border_bottom = (h - (ry + rh)) / h
border_left = rx / w
border_right = (w - (rx + rw)) / w
result["border_fractions"] = { result["border_fractions"] = {
"top": round(border_top, 4), "top": round(border_top, 4),
@@ -105,35 +98,34 @@ def detect_and_crop_page(
"right": round(border_right, 4), "right": round(border_right, 4),
} }
# 7. Check if borders are significant enough to crop # Sanity: only crop if at least one edge has > 2% border
if all(f < min_border_fraction for f in [border_top, border_bottom, border_left, border_right]): min_border = 0.02
logger.info("All borders < %.1f%% - no crop needed", min_border_fraction * 100) if all(f < min_border for f in [border_top, border_bottom, border_left, border_right]):
logger.info("All borders < %.0f%% — no crop needed", min_border * 100)
result["detected_format"], result["format_confidence"] = _detect_format(w, h) result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result return img_bgr, result
# 8. Add safety margin (0.5% of image dimensions) # Add margin
margin_x = int(w * 0.005) margin_x = int(w * margin_frac)
margin_y = int(h * 0.005) margin_y = int(h * margin_frac)
crop_x = max(0, rx - margin_x) crop_x = max(0, left_edge - margin_x)
crop_y = max(0, ry - margin_y) crop_y = max(0, top_edge - margin_y)
crop_x2 = min(w, rx + rw + margin_x) crop_x2 = min(w, right_edge + margin_x)
crop_y2 = min(h, ry + rh + margin_y) crop_y2 = min(h, bottom_edge + margin_y)
crop_w = crop_x2 - crop_x crop_w = crop_x2 - crop_x
crop_h = crop_y2 - crop_y crop_h = crop_y2 - crop_y
# Sanity check: cropped area should be at least 50% of original # Sanity: cropped area must be >= 40% of original
if crop_w * crop_h < 0.5 * total_area: if crop_w * crop_h < 0.40 * total_area:
logger.warning("Cropped area too small (%.0f%%) - skipping crop", logger.warning("Cropped area too small (%.0f%%) skipping crop",
100.0 * crop_w * crop_h / total_area) 100.0 * crop_w * crop_h / total_area)
result["detected_format"], result["format_confidence"] = _detect_format(w, h) result["detected_format"], result["format_confidence"] = _detect_format(w, h)
return img_bgr, result return img_bgr, result
# 9. Crop
cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy() cropped = img_bgr[crop_y:crop_y2, crop_x:crop_x2].copy()
# 10. Detect format from cropped dimensions
detected_format, format_confidence = _detect_format(crop_w, crop_h) detected_format, format_confidence = _detect_format(crop_w, crop_h)
result["crop_applied"] = True result["crop_applied"] = True
@@ -149,23 +141,140 @@ def detect_and_crop_page(
result["format_confidence"] = format_confidence result["format_confidence"] = format_confidence
result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4) result["aspect_ratio"] = round(max(crop_w, crop_h) / max(min(crop_w, crop_h), 1), 4)
logger.info("Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%", logger.info(
w, h, crop_w, crop_h, detected_format, format_confidence * 100, "Page cropped: %dx%d -> %dx%d, format=%s (%.0f%%), "
border_top * 100, border_bottom * 100, border_left * 100, border_right * 100) "borders: T=%.1f%% B=%.1f%% L=%.1f%% R=%.1f%%",
w, h, crop_w, crop_h, detected_format, format_confidence * 100,
border_top * 100, border_bottom * 100,
border_left * 100, border_right * 100,
)
return cropped, result return cropped, result
def _detect_format(width: int, height: int) -> Tuple[str, float]: # ---------------------------------------------------------------------------
"""Detect paper format from dimensions by comparing aspect ratios. # Edge detection helpers
# ---------------------------------------------------------------------------
def _detect_left_edge_shadow(
    gray: np.ndarray,
    binary: np.ndarray,
    w: int,
    h: int,
) -> int:
    """Detect the left content edge, accounting for a book-spine shadow.

    Strategy (only the left quarter of the image is inspected):
      1. Compute the mean grayscale brightness of every column.
      2. Smooth that profile with a boxcar kernel.
      3. If the smoothed profile spans more than 20 grey levels, return the
         first column whose brightness reaches 60% of the way from the
         darkest to the brightest value (shadow -> page transition).
      4. Otherwise fall back to the binary vertical ink projection.

    Args:
        gray: Grayscale image.
        binary: Binarised image, forwarded to the projection fallback.
        w: Image width in pixels.
        h: Image height in pixels (unused; kept for a uniform signature).

    Returns:
        x position of the left content edge.
    """
    search_w = max(1, w // 4)

    # Column-mean brightness in the left quarter
    col_means = np.mean(gray[:, :search_w], axis=0).astype(np.float64)

    # Boxcar kernel: width = 1% of image width, at least 5, forced odd so it
    # is centred on the column being smoothed.
    kernel_size = max(5, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    kernel = np.ones(kernel_size) / kernel_size

    # Replicate the border values before smoothing. np.convolve(mode="same")
    # zero-pads instead, which darkens the first/last kernel_size // 2 columns
    # and fabricates a brightness gradient on evenly lit pages -- that made
    # the shadow branch fire almost always and the fallback unreachable.
    half = kernel_size // 2
    padded = np.pad(col_means, half, mode="edge")
    smoothed = np.convolve(padded, kernel, mode="valid")

    # Brightness span of the smoothed profile
    val_min = float(np.min(smoothed))
    val_max = float(np.max(smoothed))
    shadow_range = val_max - val_min

    # Only use shadow detection if there is a meaningful gradient (> 20 levels)
    if shadow_range > 20:
        threshold = val_min + shadow_range * 0.6
        # First column whose brightness reaches the threshold
        above = np.where(smoothed >= threshold)[0]
        if len(above) > 0:
            shadow_edge = int(above[0])
            logger.debug("Left edge: shadow detected at x=%d (range=%.0f)", shadow_edge, shadow_range)
            return shadow_edge

    # Fallback: binary vertical projection
    return _detect_edge_projection(binary, axis=0, from_start=True, dim=w)
def _detect_right_edge(binary: np.ndarray, w: int, h: int) -> int:
    """Locate the right-hand content boundary of a binarised page.

    Delegates to the shared projection helper, scanning the per-column ink
    densities from the far end of the image. ``h`` is not used here; it is
    accepted so all edge detectors share the same argument list.
    """
    right_x = _detect_edge_projection(binary, axis=0, from_start=False, dim=w)
    return right_x
def _detect_top_bottom_edges(binary: np.ndarray, w: int, h: int) -> Tuple[int, int]:
    """Locate the upper and lower content boundaries of a binarised page.

    Runs the shared projection helper twice over the per-row ink densities:
    first scanning from the top, then from the bottom. ``w`` is not used; it
    is accepted so all edge detectors share the same argument list.

    Returns:
        ``(top, bottom)`` y positions as reported by the projection helper.
    """
    first_row, last_row = (
        _detect_edge_projection(binary, axis=1, from_start=scan_from_top, dim=h)
        for scan_from_top in (True, False)
    )
    return first_row, last_row
def _detect_edge_projection(
    binary: np.ndarray,
    axis: int,
    from_start: bool,
    dim: int,
) -> int:
    """Find the leading or trailing content edge from an ink-density projection.

    axis=0 -> project vertically (column densities)   -> returns x position
    axis=1 -> project horizontally (row densities)    -> returns y position

    Rows/columns whose ink density is below ``_INK_THRESHOLD`` are ignored,
    and ink runs shorter than ``_MIN_RUN_FRAC`` of ``dim`` are discarded as
    noise before the edge is located.

    Args:
        binary: Binarised image (ink pixels = 255, background = 0).
        axis: Axis passed to ``np.mean`` (0 = per column, 1 = per row).
        from_start: True for the leading edge, False for the trailing edge.
        dim: Length of the projected dimension (width for axis=0, height
            for axis=1).

    Returns:
        Leading edge: index of the first ink position (inclusive start), or
        0 if there is no ink. Trailing edge: index one past the last ink
        position (exclusive end), or ``dim`` if there is no ink. Both forms
        can be used directly as slice bounds.
    """
    # Density per row/column: mean of binary pixels scaled to 0..1
    projection = np.mean(binary, axis=axis) / 255.0

    # Positions dense enough to count as ink
    ink_mask = projection >= _INK_THRESHOLD

    # Drop narrow runs (noise)
    min_run = max(1, int(dim * _MIN_RUN_FRAC))
    ink_mask = _filter_narrow_runs(ink_mask, min_run)

    ink_positions = np.flatnonzero(ink_mask)
    if ink_positions.size == 0:
        return 0 if from_start else dim

    if from_start:
        return int(ink_positions[0])

    # Exclusive end, consistent with the `dim` returned when no ink exists;
    # returning the inclusive index would cut off the last ink row/column
    # when the caller slices with this value.
    return int(ink_positions[-1]) + 1
def _filter_narrow_runs(mask: np.ndarray, min_run: int) -> np.ndarray:
"""Remove True-runs shorter than min_run pixels."""
if min_run <= 1:
return mask
result = mask.copy()
n = len(result)
i = 0
while i < n:
if result[i]:
start = i
while i < n and result[i]:
i += 1
if i - start < min_run:
result[start:i] = False
else:
i += 1
return result
# ---------------------------------------------------------------------------
# Format detection (kept as optional metadata)
# ---------------------------------------------------------------------------
def _detect_format(width: int, height: int) -> Tuple[str, float]:
"""Detect paper format from dimensions by comparing aspect ratios."""
if width <= 0 or height <= 0: if width <= 0 or height <= 0:
return "unknown", 0.0 return "unknown", 0.0
# Use portrait aspect ratio (taller / shorter)
aspect = max(width, height) / min(width, height) aspect = max(width, height) / min(width, height)
best_format = "unknown" best_format = "unknown"
@@ -177,8 +286,6 @@ def _detect_format(width: int, height: int) -> Tuple[str, float]:
best_diff = diff best_diff = diff
best_format = fmt best_format = fmt
# Confidence: 1.0 if exact match, decreasing with deviation
# Threshold: if diff > 0.1, confidence drops below 0.5
confidence = max(0.0, 1.0 - best_diff * 5.0) confidence = max(0.0, 1.0 - best_diff * 5.0)
if confidence < 0.3: if confidence < 0.3: