Keine Session ausgewaehlt.
+ }
+
+ const croppedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`
+ const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/structure-overlay${overlayTs ? `?t=${overlayTs}` : ''}`
+
+ return (
+
+ {/* Loading indicator */}
+ {detecting && (
+
+
+ Dokumentstruktur wird analysiert...
+
+ )}
+
+ {/* Two-column image comparison */}
+
+ {/* Left: Original document */}
+
+
+ Original
+
+
+ {/* eslint-disable-next-line @next/next/no-img-element */}
+

{
+ (e.target as HTMLImageElement).style.display = 'none'
+ }}
+ />
+
+
+
+ {/* Right: Structure overlay */}
+
+
+ Erkannte Struktur
+
+
+ {/* eslint-disable-next-line @next/next/no-img-element */}
+

{
+ (e.target as HTMLImageElement).style.display = 'none'
+ }}
+ />
+
+
+
+
+ {/* Result info */}
+ {result && (
+
+ {/* Summary badges */}
+
+
+ {result.zones.length} Zone(n)
+
+
+ {result.boxes.length} Box(en)
+
+ {result.has_words && (
+
+ {result.word_count} Woerter
+
+ )}
+
+ {result.image_width}x{result.image_height}px | {result.duration_seconds}s
+
+
+
+ {/* Boxes detail */}
+ {result.boxes.length > 0 && (
+
+
Erkannte Boxen
+
+ {result.boxes.map((box, i) => (
+
+
+
+ Box {i + 1}:
+
+
+ {box.w}x{box.h}px @ ({box.x}, {box.y})
+
+ {box.bg_color_name && box.bg_color_name !== 'unknown' && box.bg_color_name !== 'white' && (
+
+ {box.bg_color_name}
+
+ )}
+ {box.border_thickness > 0 && (
+
+ Rahmen: {box.border_thickness}px
+
+ )}
+
+ {Math.round(box.confidence * 100)}%
+
+
+ ))}
+
+
+ )}
+
+ {/* Zones detail */}
+
+
Seitenzonen
+
+ {result.zones.map((zone) => (
+
+ {zone.zone_type === 'box' ? 'Box' : 'Inhalt'} {zone.index}
+
+ ({zone.w}x{zone.h})
+
+
+ ))}
+
+
+
+ {/* Color regions */}
+ {Object.keys(result.color_pixel_counts).length > 0 && (
+
+
Erkannte Farben
+
+ {Object.entries(result.color_pixel_counts)
+ .sort(([, a], [, b]) => b - a)
+ .map(([name, count]) => (
+
+
+ {name}
+ {count.toLocaleString()}px
+
+ ))}
+
+
+ )}
+
+ )}
+
+ {/* Action buttons */}
+ {result && (
+
+
+
+
+ )}
+
+ {error && (
+
+ {error}
+
+ )}
+
+ )
+}
diff --git a/klausur-service/backend/ocr_pipeline_api.py b/klausur-service/backend/ocr_pipeline_api.py
index a677326..e20b84e 100644
--- a/klausur-service/backend/ocr_pipeline_api.py
+++ b/klausur-service/backend/ocr_pipeline_api.py
@@ -71,6 +71,8 @@ from cv_vocab_pipeline import (
render_image_high_res,
render_pdf_high_res,
)
+from cv_box_detect import detect_boxes, split_page_into_zones
+from cv_color_detect import detect_word_colors, recover_colored_text, _COLOR_RANGES, _COLOR_HEX
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
create_session_db,
@@ -591,11 +593,14 @@ async def _append_pipeline_log(
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
- """Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
- valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
+ """Serve session images: original, deskewed, dewarped, binarized, structure-overlay, columns-overlay, or rows-overlay."""
+ valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "structure-overlay", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
+ if image_type == "structure-overlay":
+ return await _get_structure_overlay(session_id)
+
if image_type == "columns-overlay":
return await _get_columns_overlay(session_id)
@@ -1196,6 +1201,153 @@ async def detect_type(session_id: str):
return {"session_id": session_id, **result_dict}
+# ---------------------------------------------------------------------------
+# Structure Detection Endpoint
+# ---------------------------------------------------------------------------
+
def _structure_content_bounds(words, img_w, img_h):
    """Return the content rectangle ``(x, y, w, h)`` used for box detection.

    When OCR word boxes are available, the rectangle is their bounding box
    clamped to the image. If there are no words, or the clamped rectangle is
    empty (e.g. all word boxes lie outside the image), the full image minus a
    3% margin is used instead.
    """
    if words:
        left = max(0, min(int(wb["left"]) for wb in words))
        top = max(0, min(int(wb["top"]) for wb in words))
        right = min(img_w, max(int(wb["left"] + wb["width"]) for wb in words))
        bottom = min(img_h, max(int(wb["top"] + wb["height"]) for wb in words))
        # Guard against a degenerate rectangle; detection needs a positive area.
        if right > left and bottom > top:
            return left, top, right - left, bottom - top

    margin = int(min(img_w, img_h) * 0.03)
    return margin, margin, img_w - 2 * margin, img_h - 2 * margin


def _structure_box_background(hsv, box, img_w, img_h, hue_to_name):
    """Classify the background color of one detected box.

    Samples the central half of the box (inner 50% in each direction) in the
    HSV image and returns ``{"color_name": ..., "color_hex": ...}``.
    ``hue_to_name`` maps a median hue value to a color name.
    """
    y1 = max(0, min(box.y + box.height // 4, img_h - 1))
    y2 = max(0, min(box.y + 3 * box.height // 4, img_h - 1))
    x1 = max(0, min(box.x + box.width // 4, img_w - 1))
    x2 = max(0, min(box.x + 3 * box.width // 4, img_w - 1))

    if y2 <= y1 or x2 <= x1:
        # Box too small (or fully clamped away) to sample.
        return {"color_name": "unknown", "color_hex": "#6b7280"}

    roi_hsv = hsv[y1:y2, x1:x2]
    med_h = float(np.median(roi_hsv[:, :, 0]))
    med_s = float(np.median(roi_hsv[:, :, 1]))
    med_v = float(np.median(roi_hsv[:, :, 2]))

    if med_s > 15:
        # Saturated enough to count as a real color.
        name = hue_to_name(med_h)
        return {"color_name": name, "color_hex": _COLOR_HEX.get(name, "#6b7280")}

    # Unsaturated: distinguish gray shading from plain white paper by value.
    if med_v < 220:
        return {"color_name": "gray", "color_hex": "#6b7280"}
    return {"color_name": "white", "color_hex": "#ffffff"}


@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, and color regions.

    Runs box detection and color analysis on the cropped image (falling back
    to the dewarped image) and returns structured JSON with all detected
    elements for the structure visualization step. The result is also
    persisted on the session as ``structure_result`` and stored in the
    in-memory cache entry.

    Args:
        session_id: ID of the OCR pipeline session to analyse.

    Returns:
        A dict with ``session_id`` plus image size, content bounds, boxes
        (geometry, confidence, border thickness, background color), zones,
        per-color pixel counts, word statistics and the processing duration.

    Raises:
        HTTPException: 400 if neither a cropped nor a dewarped image is
            available for the session.
    """
    # Imported once here instead of on every loop iteration.
    from cv_color_detect import _hue_to_color_name

    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    img_bgr = cached.get("cropped_bgr")
    if img_bgr is None:
        img_bgr = cached.get("dewarped_bgr")
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    t0 = time.time()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words: List[Dict] = [
        wb
        for cell in ((word_result or {}).get("cells") or [])
        for wb in (cell.get("word_boxes") or [])
    ]
    content_x, content_y, content_w_px, content_h_px = _structure_content_bounds(words, w, h)

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    zones = split_page_into_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color region sampling: background shading of each detected box ---
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    box_colors = [
        _structure_box_background(hsv, box, w, h, _hue_to_color_name)
        for box in boxes
    ]

    # --- Color text detection overview: pixel count per color range ---
    color_summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(cv2.countNonZero(mask))
        if pixel_count > 50:  # minimum threshold
            color_summary[color_name] = pixel_count

    duration = time.time() - t0

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": colors["color_name"],
                "bg_color_hex": colors["color_hex"],
            }
            for b, colors in zip(boxes, box_colors)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %.2fs",
                session_id, len(boxes), len(zones), duration)

    return {"session_id": session_id, **result_dict}
+
+
# ---------------------------------------------------------------------------
# Column Detection Endpoints (Step 3)
# ---------------------------------------------------------------------------
@@ -1485,6 +1637,151 @@ def _draw_box_exclusion_overlay(
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
async def _get_structure_overlay(session_id: str) -> Response:
    """Generate overlay image showing detected boxes, zones, and color regions.

    Uses the ``structure_result`` stored on the session when present;
    otherwise runs box detection and zone splitting on the fly with a 3%
    image margin as content bounds (no background colors in that case).

    Args:
        session_id: ID of the OCR pipeline session.

    Returns:
        A PNG ``Response`` with the annotated image.

    Raises:
        HTTPException: 404 if no base image is available, 500 if the base
            image cannot be decoded or the overlay cannot be encoded.
    """
    base_png = await _get_base_image_png(session_id)
    if not base_png:
        raise HTTPException(status_code=404, detail="No base image available")

    arr = np.frombuffer(base_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")

    h, w = img.shape[:2]

    # Color masks must come from the untouched pixels, so convert to HSV
    # before anything is drawn (avoids decoding the PNG a second time).
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Get structure result (run detection if not stored)
    session = await get_session_db(session_id)
    structure = (session or {}).get("structure_result")

    if not structure:
        # Run detection on-the-fly
        margin = int(min(w, h) * 0.03)
        content_x, content_y = margin, margin
        content_w_px = w - 2 * margin
        content_h_px = h - 2 * margin
        boxes = detect_boxes(img, content_x, content_w_px, content_y, content_h_px)
        zones = split_page_into_zones(content_x, content_y, content_w_px, content_h_px, boxes)
        structure = {
            "boxes": [
                {"x": b.x, "y": b.y, "w": b.width, "h": b.height,
                 "confidence": b.confidence, "border_thickness": b.border_thickness}
                for b in boxes
            ],
            "zones": [
                {"index": z.index, "zone_type": z.zone_type,
                 "y": z.y, "h": z.height, "x": z.x, "w": z.width}
                for z in zones
            ],
        }

    # Separate layer for the filled box rectangles; blended into img below.
    overlay = img.copy()

    # --- Draw zone boundaries ---
    zone_colors = {
        "content": (200, 200, 200),  # light gray
        "box": (255, 180, 0),        # blue-ish (BGR)
    }
    dash_len = 12
    for zone in structure.get("zones", []):
        zx = zone["x"]
        zy = zone["y"]
        zw = zone["w"]
        zh = zone["h"]
        color = zone_colors.get(zone["zone_type"], (200, 200, 200))

        # Dashed lines along the top and bottom edge of the zone
        for edge_x in range(zx, zx + zw, dash_len * 2):
            end_x = min(edge_x + dash_len, zx + zw)
            cv2.line(img, (edge_x, zy), (end_x, zy), color, 1)
            cv2.line(img, (edge_x, zy + zh), (end_x, zy + zh), color, 1)

        # Zone label
        zone_label = f"Zone {zone['index']} ({zone['zone_type']})"
        cv2.putText(img, zone_label, (zx + 5, zy + 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 1)

    # --- Draw detected boxes ---
    # Color map for box backgrounds (hex -> BGR)
    bg_hex_to_bgr = {
        "#dc2626": (38, 38, 220),    # red
        "#2563eb": (235, 99, 37),    # blue
        "#16a34a": (74, 163, 22),    # green
        "#ea580c": (12, 88, 234),    # orange
        "#9333ea": (234, 51, 147),   # purple
        "#ca8a04": (4, 138, 202),    # yellow
        "#6b7280": (128, 114, 107),  # gray
    }

    for box_data in structure.get("boxes", []):
        bx = box_data["x"]
        by = box_data["y"]
        bw = box_data["w"]
        bh = box_data["h"]
        conf = box_data.get("confidence", 0)
        thickness = box_data.get("border_thickness", 0)
        bg_hex = box_data.get("bg_color_hex", "#6b7280")
        bg_name = box_data.get("bg_color_name", "")

        # Unknown hex values (including white) fall back to gray.
        fill_bgr = bg_hex_to_bgr.get(bg_hex, (128, 114, 107))

        # Fill goes on the overlay layer so it ends up semi-transparent
        cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), fill_bgr, -1)

        # Solid border
        border_color = fill_bgr
        cv2.rectangle(img, (bx, by), (bx + bw, by + bh), border_color, 3)

        # Label
        label = "BOX"
        if bg_name and bg_name not in ("unknown", "white"):
            label += f" ({bg_name})"
        if thickness > 0:
            label += f" border={thickness}px"
        label += f" {int(conf * 100)}%"
        # White outline first, then colored text on top for readability
        cv2.putText(img, label, (bx + 8, by + 22),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2)
        cv2.putText(img, label, (bx + 8, by + 22),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, border_color, 1)

    # Blend overlay at 15% opacity (result written back into img)
    cv2.addWeighted(overlay, 0.15, img, 0.85, 0, img)

    # --- Draw color regions (HSV masks) ---
    color_bgr_map = {
        "red": (0, 0, 255),
        "orange": (0, 140, 255),
        "yellow": (0, 200, 255),
        "green": (0, 200, 0),
        "blue": (255, 150, 0),
        "purple": (200, 0, 200),
    }
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        # Only draw if there are significant colored pixels
        if cv2.countNonZero(mask) < 100:
            continue
        # Draw colored contours, skipping tiny specks
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        draw_color = color_bgr_map.get(color_name, (200, 200, 200))
        for cnt in contours:
            if cv2.contourArea(cnt) < 20:
                continue
            cv2.drawContours(img, [cnt], -1, draw_color, 2)

    # Encode result
    ok, png_buf = cv2.imencode(".png", img)
    if not ok:
        raise HTTPException(status_code=500, detail="Failed to encode overlay image")
    return Response(content=png_buf.tobytes(), media_type="image/png")
+
+
async def _get_columns_overlay(session_id: str) -> Response:
"""Generate cropped (or dewarped) image with column borders drawn on it."""
session = await get_session_db(session_id)
diff --git a/klausur-service/backend/ocr_pipeline_session_store.py b/klausur-service/backend/ocr_pipeline_session_store.py
index bf1d3ed..8c34cc7 100644
--- a/klausur-service/backend/ocr_pipeline_session_store.py
+++ b/klausur-service/backend/ocr_pipeline_session_store.py
@@ -75,7 +75,8 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS crop_result JSONB,
ADD COLUMN IF NOT EXISTS parent_session_id UUID REFERENCES ocr_pipeline_sessions(id) ON DELETE CASCADE,
ADD COLUMN IF NOT EXISTS box_index INT,
- ADD COLUMN IF NOT EXISTS grid_editor_result JSONB
+ ADD COLUMN IF NOT EXISTS grid_editor_result JSONB,
+ ADD COLUMN IF NOT EXISTS structure_result JSONB
""")
@@ -111,7 +112,7 @@ async def create_session_db(
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
- grid_editor_result,
+ grid_editor_result, structure_result,
parent_session_id, box_index,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png,
@@ -131,7 +132,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
- grid_editor_result,
+ grid_editor_result, structure_result,
parent_session_id, box_index,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
@@ -183,11 +184,11 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'word_result', 'ground_truth', 'auto_shear_degrees',
'doc_type', 'doc_type_result',
'document_category', 'pipeline_log',
- 'grid_editor_result',
+ 'grid_editor_result', 'structure_result',
'parent_session_id', 'box_index',
}
- jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log', 'grid_editor_result'}
+ jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log', 'grid_editor_result', 'structure_result'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -313,7 +314,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
- for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log', 'grid_editor_result']:
+ for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log', 'grid_editor_result', 'structure_result']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])