feat: generische Box-Erkennung fuer zonenbasierte Spaltenerkennung
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s

- Neue Datei cv_box_detect.py: 2-Stufen-Algorithmus (Linien + Farbe)
- DetectedBox/PageZone Dataclasses in cv_vocab_types.py
- detect_column_geometry_zoned() in cv_layout.py
- API-Endpoints erweitert: zones/boxes_detected im column_result
- Overlay-Funktionen zeichnen Box-Grenzen gestrichelt: Rechtecke im Spalten-Overlay, horizontale Trennlinien im Zeilen-Overlay
- Fix: `a or b` auf numpy-Arrays (mehrdeutiger Wahrheitswert, ValueError) an 7 Stellen in ocr_pipeline_api.py durch explizite `is not None`-Pruefung ersetzt
- 12 Unit-Tests fuer Box-Erkennung und Zone-Splitting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-09 15:06:23 +01:00
parent e60254bc75
commit 7005b18561
6 changed files with 821 additions and 14 deletions

View File

@@ -57,6 +57,7 @@ from cv_vocab_pipeline import (
deskew_image_iterative,
deskew_two_pass,
detect_column_geometry,
detect_column_geometry_zoned,
detect_document_type,
detect_row_geometry,
expand_narrow_columns,
@@ -1001,7 +1002,7 @@ async def detect_type(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
img_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")
@@ -1052,7 +1053,7 @@ async def detect_columns(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
img_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
img_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
if img_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before column detection")
@@ -1062,21 +1063,26 @@ async def detect_columns(session_id: str):
ocr_img = create_ocr_image(img_bgr)
h, w = ocr_img.shape[:2]
# Phase A: Geometry detection (returns word_dicts + inv for reuse)
geo_result = detect_column_geometry(ocr_img, img_bgr)
# Phase A: Zone-aware geometry detection
zoned_result = detect_column_geometry_zoned(ocr_img, img_bgr)
if geo_result is None:
if zoned_result is None:
# Fallback to projection-based layout
layout_img = create_layout_image(img_bgr)
regions = analyze_layout(layout_img, ocr_img)
zones_data = None
boxes_detected = 0
else:
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv, zones_data, boxes = zoned_result
content_w = right_x - left_x
boxes_detected = len(boxes)
# Cache intermediates for row detection (avoids second Tesseract run)
cached["_word_dicts"] = word_dicts
cached["_inv"] = inv
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
cached["_zones_data"] = zones_data
cached["_boxes_detected"] = boxes_detected
# Detect header/footer early so sub-column clustering ignores them
header_y, footer_y = _detect_header_footer_gaps(inv, w, h) if inv is not None else (None, None)
@@ -1106,8 +1112,13 @@ async def detect_columns(session_id: str):
"columns": columns,
"classification_methods": methods,
"duration_seconds": round(duration, 2),
"boxes_detected": boxes_detected,
}
# Add zone data when boxes are present
if zones_data and boxes_detected > 0:
column_result["zones"] = zones_data
# Persist to DB — also invalidate downstream results (rows, words)
await update_session_db(
session_id,
@@ -1124,13 +1135,14 @@ async def detect_columns(session_id: str):
col_count = len([c for c in columns if c["type"].startswith("column")])
logger.info(f"OCR Pipeline: columns session {session_id}: "
f"{col_count} columns detected ({duration:.2f}s)")
f"{col_count} columns detected, {boxes_detected} box(es) ({duration:.2f}s)")
img_w = img_bgr.shape[1]
await _append_pipeline_log(session_id, "columns", {
"total_columns": len(columns),
"column_widths_pct": [round(c["width"] / img_w * 100, 1) for c in columns],
"column_types": [c["type"] for c in columns],
"boxes_detected": boxes_detected,
}, duration_ms=int(duration * 1000))
return {
@@ -1266,6 +1278,27 @@ async def _get_columns_overlay(session_id: str) -> Response:
# Blend overlay at 20% opacity
cv2.addWeighted(overlay, 0.2, img, 0.8, 0, img)
# Draw detected box boundaries as dashed rectangles
zones = column_result.get("zones", [])
for zone in zones:
if zone.get("zone_type") == "box" and zone.get("box"):
box = zone["box"]
bx, by = box["x"], box["y"]
bw, bh = box["width"], box["height"]
box_color = (0, 200, 255) # Yellow (BGR)
# Draw dashed rectangle by drawing short line segments
dash_len = 15
for edge_x in range(bx, bx + bw, dash_len * 2):
end_x = min(edge_x + dash_len, bx + bw)
cv2.line(img, (edge_x, by), (end_x, by), box_color, 2)
cv2.line(img, (edge_x, by + bh), (end_x, by + bh), box_color, 2)
for edge_y in range(by, by + bh, dash_len * 2):
end_y = min(edge_y + dash_len, by + bh)
cv2.line(img, (bx, edge_y), (bx, end_y), box_color, 2)
cv2.line(img, (bx + bw, edge_y), (bx + bw, end_y), box_color, 2)
cv2.putText(img, "BOX", (bx + 10, by + bh - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, box_color, 2)
success, result_png = cv2.imencode(".png", img)
if not success:
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
@@ -1284,7 +1317,7 @@ async def detect_rows(session_id: str):
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
if dewarped_bgr is None:
raise HTTPException(status_code=400, detail="Crop or dewarp must be completed before row detection")
@@ -1315,7 +1348,7 @@ async def detect_rows(session_id: str):
# Build serializable result (exclude words to keep payload small)
rows_data = []
for r in rows:
rows_data.append({
rd = {
"index": r.index,
"x": r.x,
"y": r.y,
@@ -1324,7 +1357,9 @@ async def detect_rows(session_id: str):
"word_count": r.word_count,
"row_type": r.row_type,
"gap_before": r.gap_before,
})
"zone_index": 0,
}
rows_data.append(rd)
type_counts = {}
for r in rows:
@@ -1456,7 +1491,7 @@ async def detect_words(
await _load_session_to_cache(session_id)
cached = _get_cached(session_id)
dewarped_bgr = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
if dewarped_bgr is None:
logger.warning("detect_words: no cropped/dewarped image for session %s (cache keys: %s)",
session_id, [k for k in cached.keys() if k.endswith('_bgr')])
@@ -1560,6 +1595,10 @@ async def detect_words(
)
duration = time.time() - t0
# Add zone_index to each cell (default 0 for backward compatibility)
for cell in cells:
cell.setdefault("zone_index", 0)
# Layout detection
col_types = {c['type'] for c in columns_meta}
is_vocab = bool(col_types & {'column_en', 'column_de'})
@@ -2749,6 +2788,22 @@ async def _get_rows_overlay(session_id: str) -> Response:
# Blend overlay at 15% opacity
cv2.addWeighted(overlay, 0.15, img, 0.85, 0, img)
# Draw zone separator lines if zones exist
column_result = session.get("column_result") or {}
zones = column_result.get("zones", [])
if zones:
img_w_px = img.shape[1]
zone_color = (0, 200, 255) # Yellow (BGR)
dash_len = 20
for zone in zones:
if zone.get("zone_type") == "box":
zy = zone["y"]
zh = zone["height"]
for line_y in [zy, zy + zh]:
for sx in range(0, img_w_px, dash_len * 2):
ex = min(sx + dash_len, img_w_px)
cv2.line(img, (sx, line_y), (ex, line_y), zone_color, 2)
success, result_png = cv2.imencode(".png", img)
if not success:
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
@@ -3182,7 +3237,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("columns", "start", {})
try:
t0 = time.time()
col_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
col_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
if col_img is None:
raise ValueError("Cropped/dewarped image not available")
@@ -3243,7 +3298,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("rows", "start", {})
try:
t0 = time.time()
row_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
row_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")
if not column_result or not column_result.get("columns"):
@@ -3321,7 +3376,7 @@ async def run_auto(session_id: str, req: RunAutoRequest, request: Request):
yield await _auto_sse_event("words", "start", {"engine": req.ocr_engine})
try:
t0 = time.time()
word_img = cached.get("cropped_bgr") or cached.get("dewarped_bgr")
word_img = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr")
session = await get_session_db(session_id)
column_result = session.get("column_result") or cached.get("column_result")