refactor: Crop nach Deskew/Dewarp verschieben + content-basierter Buchscan-Crop
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

Pipeline-Reihenfolge neu: Orientierung → Begradigung → Entzerrung → Zuschneiden → Spalten...
Crop arbeitet jetzt auf dem bereits geraden Bild, was bessere Ergebnisse liefert.

page_crop.py komplett ersetzt: Adaptive Threshold + 4-Kanten-Erkennung
(Buchruecken-Schatten links, Ink-Projektion fuer alle Raender) statt
Otsu + groesste Kontur.

Backend: Step-Nummern, Input-Bilder, Reprocess-Kaskade angepasst.
Frontend: PIPELINE_STEPS umgeordnet, Switch-Cases, Vorher-Bilder aktualisiert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-09 08:52:11 +01:00
parent eb45bb4879
commit 156a818246
7 changed files with 295 additions and 173 deletions

View File

@@ -3,9 +3,9 @@ OCR Pipeline API - Schrittweise Seitenrekonstruktion.
Zerlegt den OCR-Prozess in 10 einzelne Schritte:
1. Orientierung - 90/180/270° Drehungen korrigieren (orientation_crop_api.py)
2. Zuschneiden - Scannerraender entfernen (orientation_crop_api.py)
3. Deskewing - Scan begradigen
4. Dewarping - Buchwoelbung entzerren
2. Begradigung (Deskew) - Scan begradigen
3. Entzerrung (Dewarp) - Buchwoelbung entzerren
4. Zuschneiden - Scannerraender/Buchruecken entfernen (orientation_crop_api.py)
5. Spaltenerkennung - Unsichtbare Spalten finden
6. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
7. Worterkennung - OCR mit Bounding Boxes
@@ -483,8 +483,8 @@ async def auto_deskew(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
# Use cropped image as input (from step 2), fall back to oriented, then original
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr")
# Deskew runs right after orientation — use oriented image, fall back to original
img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -554,7 +554,7 @@ async def auto_deskew(session_id: str):
db_update = {
"deskewed_png": deskewed_png,
"deskew_result": deskew_result,
"current_step": 4,
"current_step": 3,
}
if binarized_png:
db_update["binarized_png"] = binarized_png
@@ -585,12 +585,12 @@ async def auto_deskew(session_id: str):
@router.post("/sessions/{session_id}/deskew/manual")
async def manual_deskew(session_id: str, req: ManualDeskewRequest):
"""Apply a manual rotation angle to the cropped image."""
"""Apply a manual rotation angle to the oriented image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = next((v for k in ("cropped_bgr", "oriented_bgr", "original_bgr")
img_bgr = next((v for k in ("oriented_bgr", "original_bgr")
if (v := cached.get(k)) is not None), None)
if img_bgr is None:
raise HTTPException(status_code=400, detail="No image available for deskewing")
@@ -801,7 +801,7 @@ async def auto_dewarp(
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5,
current_step=4,
)
logger.info(f"OCR Pipeline: dewarp session {session_id}: "
@@ -993,20 +993,21 @@ async def save_dewarp_ground_truth(session_id: str, req: DewarpGroundTruthReques
async def detect_type(session_id: str):
"""Detect document type (vocab_table, full_text, generic_table).
Should be called after dewarp (clean image available).
Should be called after crop (clean image available).
Falls back to dewarped if crop was skipped.
Stores result in session for frontend to decide pipeline flow.
"""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed first")
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
t0 = time.time()
ocr_img = create_ocr_image(dewarped_bgr)
result = detect_document_type(ocr_img, dewarped_bgr)
ocr_img = create_ocr_image(img_bgr)
result = detect_document_type(ocr_img, img_bgr)
duration = time.time() - t0
result_dict = {
@@ -1046,27 +1047,27 @@ async def detect_type(session_id: str):
@router.post("/sessions/{session_id}/columns")
async def detect_columns(session_id: str):
"""Run column detection on the dewarped image."""
"""Run column detection on the cropped (or dewarped) image."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before column detection")
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
t0 = time.time()
# Binarized image for layout analysis
ocr_img = create_ocr_image(dewarped_bgr)
ocr_img = create_ocr_image(img_bgr)
h, w = ocr_img.shape[:2]
# Phase A: Geometry detection (returns word_dicts + inv for reuse)
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
geo_result = detect_column_geometry(ocr_img, img_bgr)
if geo_result is None:
# Fallback to projection-based layout
layout_img = create_layout_image(dewarped_bgr)
layout_img = create_layout_image(img_bgr)
regions = analyze_layout(layout_img, ocr_img)
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
@@ -1113,7 +1114,7 @@ async def detect_columns(session_id: str):
column_result=column_result,
row_result=None,
word_result=None,
current_step=5,
current_step=6,
)
# Update cache
@@ -1125,7 +1126,7 @@ async def detect_columns(session_id: str):
logger.info(f"OCR Pipeline: columns session {session_id}: "
f"{col_count} columns detected ({duration:.2f}s)")
img_w = dewarped_bgr.shape[1]
img_w = img_bgr.shape[1]
await _append_pipeline_log(session_id, "columns", {
"total_columns": len(columns),
"column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
@@ -1276,14 +1277,14 @@ async def _get_columns_overlay(session_id: str) -> Response:
@router.post("/sessions/{session_id}/rows")
async def detect_rows(session_id: str):
"""Run row detection on the dewarped image using horizontal gap analysis."""
"""Run row detection on the cropped (or dewarped) image using horizontal gap analysis."""
if session_id not in _cache:
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Dewarp must be completed before row detection")
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before row detection")
t0 = time.time()
@@ -1339,7 +1340,7 @@ async def detect_rows(session_id: str):
session_id,
row_result=row_result,
word_result=None,
current_step=6,
current_step=7,
)
cached["row_result"] = row_result
@@ -1453,11 +1454,11 @@ async def detect_words(
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if dewarped_bgr is None:
logger.warning("detect_words: dewarped_bgr is None for session %s (cache keys: %s)",
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
raise HTTPException(status_code=400, detail="Dewarp must be completed before word detection")
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before word detection")
session = await get_session_db(session_id)
if not session:
@@ -1605,7 +1606,7 @@ async def detect_words(
await update_session_db(
session_id,
word_result=word_result,
current_step=7,
current_step=8,
)
cached["word_result"] = word_result
@@ -1749,7 +1750,7 @@ async def _word_batch_stream_generator(
word_result["summary"]["with_german"] = sum(1 for e in entries if e.get("german"))
vocab_entries = entries
await update_session_db(session_id, word_result=word_result, current_step=7)
await update_session_db(session_id, word_result=word_result, current_step=8)
cached["word_result"] = word_result
logger.info(f"OCR Pipeline SSE batch: words session {session_id}: "
@@ -1896,7 +1897,7 @@ async def _word_stream_generator(
await update_session_db(
session_id,
word_result=word_result,
current_step=7,
current_step=8,
)
cached["word_result"] = word_result
@@ -2020,7 +2021,7 @@ async def run_llm_review(session_id: str, request: Request, stream: bool = False
"duration_ms": result["duration_ms"],
"entries_corrected": result["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=8)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2069,7 +2070,7 @@ async def _llm_review_stream_generator(
"duration_ms": event["duration_ms"],
"entries_corrected": event["entries_corrected"],
}
await update_session_db(session_id, word_result=word_result, current_step=8)
await update_session_db(session_id, word_result=word_result, current_step=9)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2157,7 +2158,7 @@ async def save_reconstruction(session_id: str, request: Request):
cell_updates = body.get("cells", [])
if not cell_updates:
await update_session_db(session_id, current_step=9)
await update_session_db(session_id, current_step=10)
return {"session_id": session_id, "updated": 0}
# Build update map: cell_id -> new text
@@ -2193,7 +2194,7 @@ async def save_reconstruction(session_id: str, request: Request):
if "entries" in word_result:
word_result["entries"] = entries
await update_session_db(session_id, word_result=word_result, current_step=9)
await update_session_db(session_id, word_result=word_result, current_step=10)
if session_id in _cache:
_cache[session_id]["word_result"] = word_result
@@ -2589,7 +2590,7 @@ async def save_validation(session_id: str, req: ValidationRequest):
validation["score"] = req.score
ground_truth["validation"] = validation
await update_session_db(session_id, ground_truth=ground_truth, current_step=10)
await update_session_db(session_id, ground_truth=ground_truth, current_step=11)
if session_id in _cache:
_cache[session_id]["ground_truth"] = ground_truth
@@ -2622,11 +2623,14 @@ async def reprocess_session(session_id: str, request: Request):
Body: {"from_step": 5} (1-indexed step number)
Pipeline order: Orientation(1) → Deskew(2) → Dewarp(3) → Crop(4) → Columns(5) →
Rows(6) → Words(7) → LLM-Review(8) → Reconstruction(9) → Validation(10)
Clears downstream results:
- from_step <= 1: orientation_result, crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 2: crop_result, deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 3: deskew_result, dewarp_result, column_result, row_result, word_result
- from_step <= 4: dewarp_result, column_result, row_result, word_result
- from_step <= 1: orientation_result + all downstream
- from_step <= 2: deskew_result + all downstream
- from_step <= 3: dewarp_result + all downstream
- from_step <= 4: crop_result + all downstream
- from_step <= 5: column_result, row_result, word_result
- from_step <= 6: row_result, word_result
- from_step <= 7: word_result (cells, vocab_entries)
@@ -2638,15 +2642,17 @@ async def reprocess_session(session_id: str, request: Request):
body = await request.json()
from_step = body.get("from_step", 1)
if not isinstance(from_step, int) or from_step < 1 or from_step > 9:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 9")
if not isinstance(from_step, int) or from_step < 1 or from_step > 10:
raise HTTPException(status_code=400, detail="from_step must be between 1 and 10")
update_kwargs: Dict[str, Any] = {"current_step": from_step}
# Clear downstream data based on from_step
if from_step <= 7:
# New pipeline order: Orient(2) → Deskew(3) → Dewarp(4) → Crop(5) →
# Columns(6) → Rows(7) → Words(8) → LLM(9) → Recon(10) → GT(11)
if from_step <= 8:
update_kwargs["word_result"] = None
elif from_step == 8:
elif from_step == 9:
# Only clear LLM review from word_result
word_result = session.get("word_result")
if word_result:
@@ -2654,16 +2660,16 @@ async def reprocess_session(session_id: str, request: Request):
word_result.pop("llm_corrections", None)
update_kwargs["word_result"] = word_result
if from_step <= 6:
if from_step <= 7:
update_kwargs["row_result"] = None
if from_step <= 5:
if from_step <= 6:
update_kwargs["column_result"] = None
if from_step <= 4:
update_kwargs["dewarp_result"] = None
if from_step <= 3:
update_kwargs["deskew_result"] = None
if from_step <= 2:
update_kwargs["crop_result"] = None
if from_step <= 3:
update_kwargs["dewarp_result"] = None
if from_step <= 2:
update_kwargs["deskew_result"] = None
if from_step <= 1:
update_kwargs["orientation_result"] = None
@@ -3084,7 +3090,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
deskewed_png=deskewed_png,
deskew_result=deskew_result,
auto_rotation_degrees=float(angle_applied),
current_step=4,
current_step=3,
)
session = await get_session_db(session_id)
@@ -3147,7 +3153,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
dewarped_png=dewarped_png,
dewarp_result=dewarp_result,
auto_shear_degrees=dewarp_info.get("shear_degrees", 0.0),
current_step=5,
current_step=4,
)
session = await get_session_db(session_id)
@@ -3170,16 +3176,16 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("columns", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise ValueError("Dewarped image not available")
col_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
if col_img is None:
raise ValueError("Cropped/dewarped image not available")
ocr_img = create_ocr_image(dewarped_bgr)
ocr_img = create_ocr_image(col_img)
h, w = ocr_img.shape[:2]
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
geo_result = detect_column_geometry(ocr_img, col_img)
if geo_result is None:
layout_img = create_layout_image(dewarped_bgr)
layout_img = create_layout_image(col_img)
regions = analyze_layout(layout_img, ocr_img)
cached["_word_dicts"] = None
cached["_inv"] = None
@@ -3231,7 +3237,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("rows", "start", {})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
row_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
if not column_result or not column_result.get("columns"):
@@ -3252,8 +3258,8 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
content_bounds = cached.get("_content_bounds")
if word_dicts is None or inv is None or content_bounds is None:
ocr_img_tmp = create_ocr_image(dewarped_bgr)
geo_result = detect_column_geometry(ocr_img_tmp, dewarped_bgr)
ocr_img_tmp = create_ocr_image(row_img)
geo_result = detect_column_geometry(ocr_img_tmp, row_img)
if geo_result is None:
raise ValueError("Column geometry detection failed — cannot detect rows")
_g, lx, rx, ty, by, word_dicts, inv = geo_result
@@ -3309,7 +3315,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
try:
t0 = time.time()
dewarped_bgr = cached.get("dewarped_bgr")
word_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
@@ -3348,12 +3354,12 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
]
row.word_count = len(row.words)
ocr_img = create_ocr_image(dewarped_bgr)
img_h, img_w = dewarped_bgr.shape[:2]
ocr_img = create_ocr_image(word_img)
img_h, img_w = word_img.shape[:2]
cells, columns_meta = build_cell_grid(
ocr_img, col_regions, row_geoms, img_w, img_h,
ocr_engine=req.ocr_engine, img_bgr=dewarped_bgr,
ocr_engine=req.ocr_engine, img_bgr=word_img,
)
duration = time.time() - t0