feat: add Structure Detection step to OCR pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s

New pipeline step between Crop and Columns that visualizes detected
document structure: boxes (line-based + shading), page zones, and
color regions. Shows original image on the left, annotated overlay
on the right.

Backend: POST /detect-structure endpoint + /image/structure-overlay
Frontend: StepStructureDetection component with zone/box/color details

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 12:31:09 +01:00
parent fbbec6cf5e
commit 5b5213c2b9
5 changed files with 633 additions and 23 deletions

View File

@@ -7,6 +7,7 @@ import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation'
import { StepCrop } from '@/components/ocr-pipeline/StepCrop'
import { StepDeskew } from '@/components/ocr-pipeline/StepDeskew'
import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp'
import { StepStructureDetection } from '@/components/ocr-pipeline/StepStructureDetection'
import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection'
import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
@@ -91,15 +92,15 @@ export default function OcrPipelinePage() {
let uiStep = Math.max(0, dbStep - 1)
const skipSteps = [...(savedDocType?.skip_steps || [])]
// Sub-sessions: image is already cropped, skip pre-processing steps
// Jump directly to columns (UI step 4) unless already further ahead
// Sub-sessions: image is already cropped, skip pre-processing + structure steps
// Jump directly to columns (UI step 5) unless already further ahead
const isSubSession = !!data.parent_session_id
const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop', 'structure']
if (isSubSession) {
for (const s of SUB_SESSION_SKIP) {
if (!skipSteps.includes(s)) skipSteps.push(s)
}
if (uiStep < 4) uiStep = 4 // columns step
if (uiStep < 5) uiStep = 5 // columns step (now index 5)
}
setSteps(
@@ -329,12 +330,13 @@ export default function OcrPipelinePage() {
2: 'Begradigung',
3: 'Entzerrung',
4: 'Zuschneiden',
5: 'Spalten',
6: 'Zeilen',
7: 'Woerter',
8: 'Korrektur',
9: 'Rekonstruktion',
10: 'Validierung',
5: 'Struktur',
6: 'Spalten',
7: 'Zeilen',
8: 'Woerter',
9: 'Korrektur',
10: 'Rekonstruktion',
11: 'Validierung',
}
const reprocessFromStep = useCallback(async (uiStep: number) => {
@@ -371,16 +373,18 @@ export default function OcrPipelinePage() {
case 3:
return <StepCrop sessionId={sessionId} onNext={handleCropNext} />
case 4:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
return <StepStructureDetection sessionId={sessionId} onNext={handleNext} />
case 5:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
case 6:
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
case 7:
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 8:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
case 9:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
case 10:
return <StepGroundTruth sessionId={sessionId} onNext={handleNext} />
default:
return null

View File

@@ -213,6 +213,38 @@ export interface RowGroundTruth {
notes?: string
}
/**
 * Result of the structure detection step for one OCR pipeline session.
 *
 * This is the payload the frontend reads from a session's `structure_result`
 * field and from the response of the `detect-structure` endpoint.
 */
export interface StructureResult {
/** Width of the analysed image in pixels. */
image_width: number
/** Height of the analysed image in pixels. */
image_height: number
/**
 * Rectangle (pixels) that was searched for boxes. The backend derives it from
 * the bounding box of existing word boxes, or from the image inset by a small
 * margin when no words are available yet.
 */
content_bounds: { x: number; y: number; w: number; h: number }
/** Boxes detected inside the content bounds. */
boxes: StructureBox[]
/** Zones the page was split into, based on the detected boxes. */
zones: StructureZone[]
/**
 * Number of matching pixels per detected colour name. Colours below the
 * backend's minimum pixel threshold are omitted, so this may be empty.
 */
color_pixel_counts: Record<string, number>
/** True when word boxes from an earlier word-recognition result were available. */
has_words: boolean
/** Number of word boxes that were used to derive `content_bounds`. */
word_count: number
/** Wall-clock duration of the detection run in seconds (rounded by the backend). */
duration_seconds: number
}
/** A single box detected on the page, in image pixel coordinates. */
export interface StructureBox {
/** Left edge of the box in pixels. */
x: number
/** Top edge of the box in pixels. */
y: number
/** Box width in pixels. */
w: number
/** Box height in pixels. */
h: number
/** Detection confidence; the UI renders it as `confidence * 100` percent (presumably 0..1 — confirm against the detector). */
confidence: number
/** Border thickness in pixels; the UI only shows it when it is greater than 0. */
border_thickness: number
/**
 * Name of the sampled background colour. The backend emits a colour name,
 * `'gray'`, `'white'`, or `'unknown'` when the box was too small to sample.
 */
bg_color_name?: string
/** Hex colour (`#rrggbb`) matching `bg_color_name`, used for the swatch in the UI. */
bg_color_hex?: string
}
/** One zone of the page produced by splitting the content area around detected boxes. */
export interface StructureZone {
/** Zone index as assigned by the backend; used as the React key and display number. */
index: number
/** `'box'` for a zone covering a detected box, `'content'` for regular page content. */
zone_type: 'content' | 'box'
/** Left edge of the zone in pixels. */
x: number
/** Top edge of the zone in pixels. */
y: number
/** Zone width in pixels. */
w: number
/** Zone height in pixels. */
h: number
}
export interface WordBbox {
x: number
y: number
@@ -347,6 +379,7 @@ export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'structure', name: 'Struktur', icon: '🔍', status: 'pending' },
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },
{ id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' },
{ id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' },

View File

@@ -0,0 +1,275 @@
'use client'
import { useEffect, useState } from 'react'
import type { StructureResult } from '@/app/(admin)/ai/ocr-pipeline/types'
/** Path prefix that every backend request issued by this component starts with. */
const KLAUSUR_API = '/klausur-api'
/** Props of the structure detection pipeline step. */
interface StepStructureDetectionProps {
/** Session to analyse; when `null` the step only renders a "no session" hint. */
sessionId: string | null
/** Invoked when the user clicks the "Weiter" button to advance to the next step. */
onNext: () => void
}
/**
 * Swatch colours for the colour names listed under "Erkannte Farben".
 * Names that are missing here fall back to gray (`#6b7280`) at the use site.
 */
const COLOR_HEX: Record<string, string> = {
red: '#dc2626',
orange: '#ea580c',
yellow: '#ca8a04',
green: '#16a34a',
blue: '#2563eb',
purple: '#9333ea',
}
/**
 * Triggers a fresh structure detection run on the backend.
 *
 * @param sessionId - Session whose image should be analysed.
 * @param failureMessage - Message of the `Error` thrown on a non-2xx response.
 * @param signal - Optional abort signal used to cancel the request.
 * @returns The parsed detection result.
 * @throws Error with `failureMessage` when the backend answers with a non-OK status.
 */
async function requestStructureDetection(
  sessionId: string,
  failureMessage: string,
  signal?: AbortSignal,
): Promise<StructureResult> {
  const res = await fetch(
    `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/detect-structure`,
    { method: 'POST', signal },
  )
  if (!res.ok) {
    throw new Error(failureMessage)
  }
  const data: StructureResult = await res.json()
  return data
}

/**
 * Pipeline step that visualises the detected document structure.
 *
 * Shows the cropped image next to the annotated structure overlay, plus
 * details about detected boxes, page zones and colour regions.
 *
 * Behaviour:
 * - On mount, and whenever `sessionId` changes, a stored `structure_result`
 *   is reused if the session has one; otherwise detection is triggered.
 * - In-flight requests are aborted when the session changes or the component
 *   unmounts, so a stale response can never overwrite the current state.
 * - The overlay image is only requested once a result exists. This avoids
 *   the backend running an extra on-the-fly detection while the POST is
 *   still in progress.
 * - Images are keyed so that an element hidden by a load error is replaced
 *   by a fresh element on the next run instead of staying hidden.
 * - "Erneut erkennen" is also offered after a failure so the user can retry.
 */
export function StepStructureDetection({ sessionId, onNext }: StepStructureDetectionProps) {
  const [result, setResult] = useState<StructureResult | null>(null)
  const [detecting, setDetecting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  // Cache-busting timestamp for the overlay image; 0 means "no overlay yet".
  const [overlayTs, setOverlayTs] = useState(0)

  // Load an existing result or run detection whenever the session changes.
  useEffect(() => {
    if (!sessionId) return

    const controller = new AbortController()
    const { signal } = controller

    // Drop everything that belongs to a previously shown session.
    setResult(null)
    setOverlayTs(0)
    setError(null)
    setDetecting(true)

    const loadOrDetect = async () => {
      try {
        let structure: StructureResult | null = null

        // Reuse the stored result if the session already has one.
        const sessionRes = await fetch(
          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}`,
          { signal },
        )
        if (sessionRes.ok) {
          const sessionData = await sessionRes.json()
          if (sessionData.structure_result) {
            structure = sessionData.structure_result
          }
        }

        if (!structure) {
          structure = await requestStructureDetection(
            sessionId,
            'Strukturerkennung fehlgeschlagen',
            signal,
          )
        }

        if (signal.aborted) return
        setResult(structure)
        setOverlayTs(Date.now())
      } catch (e) {
        // An aborted request is not an error the user needs to see.
        if (signal.aborted) return
        setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
      } finally {
        if (!signal.aborted) setDetecting(false)
      }
    }

    void loadOrDetect()
    return () => controller.abort()
  }, [sessionId])

  // Manual re-run: always triggers a fresh detection, ignoring any stored result.
  const handleRerun = async () => {
    if (!sessionId) return
    setDetecting(true)
    setError(null)
    try {
      const data = await requestStructureDetection(sessionId, 'Erneute Erkennung fehlgeschlagen')
      setResult(data)
      setOverlayTs(Date.now())
    } catch (e) {
      setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
    } finally {
      setDetecting(false)
    }
  }

  if (!sessionId) {
    return <div className="text-sm text-gray-400">Keine Session ausgewaehlt.</div>
  }

  const croppedUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/cropped`
  const overlayUrl = `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/image/structure-overlay?t=${overlayTs}`

  return (
    <div className="space-y-4">
      {/* Loading indicator */}
      {detecting && (
        <div className="flex items-center gap-2 text-teal-600 dark:text-teal-400 text-sm">
          <div className="animate-spin w-4 h-4 border-2 border-teal-500 border-t-transparent rounded-full" />
          Dokumentstruktur wird analysiert...
        </div>
      )}

      {/* Two-column image comparison */}
      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
        {/* Left: Original document */}
        <div className="space-y-2">
          <div className="text-xs font-medium text-gray-500 dark:text-gray-400 uppercase tracking-wider">
            Original
          </div>
          <div className="relative bg-gray-100 dark:bg-gray-800 rounded-lg overflow-hidden" style={{ aspectRatio: '210/297' }}>
            {/* eslint-disable-next-line @next/next/no-img-element */}
            <img
              key={sessionId}
              src={croppedUrl}
              alt="Originaldokument"
              className="w-full h-full object-contain"
              onError={(e) => {
                e.currentTarget.style.display = 'none'
              }}
            />
          </div>
        </div>

        {/* Right: Structure overlay (only requested once a result exists) */}
        <div className="space-y-2">
          <div className="text-xs font-medium text-gray-500 dark:text-gray-400 uppercase tracking-wider">
            Erkannte Struktur
          </div>
          <div className="relative bg-gray-100 dark:bg-gray-800 rounded-lg overflow-hidden" style={{ aspectRatio: '210/297' }}>
            {overlayTs > 0 && (
              // eslint-disable-next-line @next/next/no-img-element
              <img
                key={overlayTs}
                src={overlayUrl}
                alt="Strukturerkennung"
                className="w-full h-full object-contain"
                onError={(e) => {
                  e.currentTarget.style.display = 'none'
                }}
              />
            )}
          </div>
        </div>
      </div>

      {/* Result info */}
      {result && (
        <div className="bg-white dark:bg-gray-800 rounded-lg border border-gray-200 dark:border-gray-700 p-4 space-y-3">
          {/* Summary badges */}
          <div className="flex flex-wrap items-center gap-3 text-sm">
            <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-teal-50 dark:bg-teal-900/20 text-teal-700 dark:text-teal-400 text-xs font-medium">
              {result.zones.length} Zone(n)
            </span>
            <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-amber-50 dark:bg-amber-900/20 text-amber-700 dark:text-amber-400 text-xs font-medium">
              {result.boxes.length} Box(en)
            </span>
            {result.has_words && (
              <span className="inline-flex items-center gap-1.5 px-3 py-1 rounded-full bg-blue-50 dark:bg-blue-900/20 text-blue-700 dark:text-blue-400 text-xs font-medium">
                {result.word_count} Woerter
              </span>
            )}
            <span className="text-gray-400 text-xs ml-auto">
              {result.image_width}x{result.image_height}px | {result.duration_seconds}s
            </span>
          </div>

          {/* Boxes detail */}
          {result.boxes.length > 0 && (
            <div>
              <h4 className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-2">Erkannte Boxen</h4>
              <div className="space-y-1.5">
                {result.boxes.map((box, i) => (
                  <div key={i} className="flex items-center gap-3 text-xs">
                    <span
                      className="w-3 h-3 rounded-sm flex-shrink-0 border border-gray-300 dark:border-gray-600"
                      style={{ backgroundColor: box.bg_color_hex || '#6b7280' }}
                    />
                    <span className="text-gray-600 dark:text-gray-400">
                      Box {i + 1}:
                    </span>
                    <span className="font-mono text-gray-500">
                      {box.w}x{box.h}px @ ({box.x}, {box.y})
                    </span>
                    {box.bg_color_name && box.bg_color_name !== 'unknown' && box.bg_color_name !== 'white' && (
                      <span className="px-1.5 py-0.5 rounded bg-gray-100 dark:bg-gray-700 text-gray-500">
                        {box.bg_color_name}
                      </span>
                    )}
                    {box.border_thickness > 0 && (
                      <span className="text-gray-400">
                        Rahmen: {box.border_thickness}px
                      </span>
                    )}
                    <span className="text-gray-400">
                      {Math.round(box.confidence * 100)}%
                    </span>
                  </div>
                ))}
              </div>
            </div>
          )}

          {/* Zones detail */}
          <div>
            <h4 className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-2">Seitenzonen</h4>
            <div className="flex flex-wrap gap-2">
              {result.zones.map((zone) => (
                <span
                  key={zone.index}
                  className={`inline-flex items-center gap-1 px-2 py-1 rounded text-[11px] font-medium ${
                    zone.zone_type === 'box'
                      ? 'bg-amber-50 dark:bg-amber-900/20 text-amber-700 dark:text-amber-300 border border-amber-200 dark:border-amber-800'
                      : 'bg-gray-50 dark:bg-gray-800 text-gray-500 dark:text-gray-400 border border-gray-200 dark:border-gray-700'
                  }`}
                >
                  {zone.zone_type === 'box' ? 'Box' : 'Inhalt'} {zone.index}
                  <span className="text-[10px] font-normal opacity-70">
                    ({zone.w}x{zone.h})
                  </span>
                </span>
              ))}
            </div>
          </div>

          {/* Color regions */}
          {Object.keys(result.color_pixel_counts).length > 0 && (
            <div>
              <h4 className="text-xs font-medium text-gray-500 dark:text-gray-400 mb-2">Erkannte Farben</h4>
              <div className="flex flex-wrap gap-2">
                {Object.entries(result.color_pixel_counts)
                  .sort(([, a], [, b]) => b - a)
                  .map(([name, count]) => (
                    <span key={name} className="inline-flex items-center gap-1.5 px-2 py-1 rounded text-[11px] bg-gray-50 dark:bg-gray-800 border border-gray-200 dark:border-gray-700">
                      <span
                        className="w-2.5 h-2.5 rounded-full"
                        style={{ backgroundColor: COLOR_HEX[name] || '#6b7280' }}
                      />
                      <span className="text-gray-600 dark:text-gray-400">{name}</span>
                      <span className="text-gray-400 text-[10px]">{count.toLocaleString()}px</span>
                    </span>
                  ))}
              </div>
            </div>
          )}
        </div>
      )}

      {/* Action buttons: retry is available after a failure, "Weiter" only with a result */}
      {(result !== null || error !== null) && (
        <div className="flex justify-between">
          <button
            onClick={handleRerun}
            disabled={detecting}
            className="px-4 py-2 text-sm text-gray-500 hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200 transition-colors disabled:opacity-50"
          >
            Erneut erkennen
          </button>
          {result && (
            <button
              onClick={onNext}
              className="px-6 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 font-medium transition-colors"
            >
              Weiter &rarr;
            </button>
          )}
        </div>
      )}

      {error && (
        <div className="p-3 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-lg text-sm">
          {error}
        </div>
      )}
    </div>
  )
}

View File

@@ -71,6 +71,8 @@ from cv_vocab_pipeline import (
render_image_high_res,
render_pdf_high_res,
)
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_color_detect import detect_word_colors, recover_colored_text, _COLOR_RANGES, _COLOR_HEX
from cv_words_first import build_grid_from_words
from ocr_pipeline_session_store import (
create_session_db,
@@ -591,11 +593,14 @@ async def _append_pipeline_log(
@router.get("/sessions/{session_id}/image/{image_type}")
async def get_image(session_id: str, image_type: str):
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
"""Serve session images: original, deskewed, dewarped, binarized, structure-overlay, columns-overlay, or rows-overlay."""
valid_types = {"original", "oriented", "cropped", "deskewed", "dewarped", "binarized", "structure-overlay", "columns-overlay", "rows-overlay", "words-overlay", "clean"}
if image_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
if image_type == "structure-overlay":
return await _get_structure_overlay(session_id)
if image_type == "columns-overlay":
return await _get_columns_overlay(session_id)
@@ -1196,6 +1201,153 @@ async def detect_type(session_id: str):
return {"session_id": session_id, **result_dict}
# ---------------------------------------------------------------------------
# Structure Detection Endpoint
# ---------------------------------------------------------------------------
@router.post("/sessions/{session_id}/detect-structure")
async def detect_structure(session_id: str):
    """Detect document structure: boxes, zones, and color regions.

    Runs box detection and color analysis on the cropped image (falling back
    to the dewarped image when no cropped image is cached), persists the
    result as ``structure_result`` on the session and mirrors it into the
    in-memory cache.

    Args:
        session_id: ID of the OCR pipeline session to analyse.

    Returns:
        A dict with ``session_id`` plus image size, content bounds, detected
        boxes (each with a sampled background color), zones, per-color pixel
        counts, word statistics and the detection duration in seconds.

    Raises:
        HTTPException: 400 if neither a cropped nor a dewarped image is
            available for the session.
    """
    # Not part of the module-level cv_color_detect import; imported once here
    # instead of on every loop iteration.
    from cv_color_detect import _hue_to_color_name

    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    img_bgr = (
        cached.get("cropped_bgr")
        if cached.get("cropped_bgr") is not None
        else cached.get("dewarped_bgr")
    )
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Crop or dewarp must be completed first")

    t0 = time.time()
    h, w = img_bgr.shape[:2]

    # --- Content bounds from word result (if available) or full image ---
    word_result = cached.get("word_result")
    words: List[Dict] = []
    if word_result and word_result.get("cells"):
        words = [
            wb
            for cell in word_result["cells"]
            for wb in (cell.get("word_boxes") or [])
        ]

    if words:
        # Bounding box of all word boxes, clamped to the image.
        content_x = max(0, min(int(wb["left"]) for wb in words))
        content_y = max(0, min(int(wb["top"]) for wb in words))
        content_r = min(w, max(int(wb["left"] + wb["width"]) for wb in words))
        content_b = min(h, max(int(wb["top"] + wb["height"]) for wb in words))
        content_w_px = content_r - content_x
        content_h_px = content_b - content_y
    else:
        # No words yet: use the image dimensions with a small (3%) margin.
        margin = int(min(w, h) * 0.03)
        content_x, content_y = margin, margin
        content_w_px = w - 2 * margin
        content_h_px = h - 2 * margin

    # --- Box detection ---
    boxes = detect_boxes(
        img_bgr,
        content_x=content_x,
        content_w=content_w_px,
        content_y=content_y,
        content_h=content_h_px,
    )

    # --- Zone splitting ---
    zones = split_page_into_zones(content_x, content_y, content_w_px, content_h_px, boxes)

    # --- Color region sampling ---
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    def _sample_box_background(box) -> Dict[str, str]:
        """Classify the background color from the central region of a box."""
        # Central half of the box in each direction, clamped to the image.
        cy1 = max(0, min(box.y + box.height // 4, h - 1))
        cy2 = max(0, min(box.y + 3 * box.height // 4, h - 1))
        cx1 = max(0, min(box.x + box.width // 4, w - 1))
        cx2 = max(0, min(box.x + 3 * box.width // 4, w - 1))
        if not (cy2 > cy1 and cx2 > cx1):
            # Degenerate region: nothing to sample.
            return {"color_name": "unknown", "color_hex": "#6b7280"}

        roi_hsv = hsv[cy1:cy2, cx1:cx2]
        med_h = float(np.median(roi_hsv[:, :, 0]))
        med_s = float(np.median(roi_hsv[:, :, 1]))
        med_v = float(np.median(roi_hsv[:, :, 2]))
        if med_s > 15:
            # Saturated enough to name the hue.
            bg_name = _hue_to_color_name(med_h)
            bg_hex = _COLOR_HEX.get(bg_name, "#6b7280")
        else:
            # Unsaturated: distinguish gray shading from plain white paper.
            bg_name = "gray" if med_v < 220 else "white"
            bg_hex = "#6b7280" if bg_name == "gray" else "#ffffff"
        return {"color_name": bg_name, "color_hex": bg_hex}

    box_colors = [_sample_box_background(box) for box in boxes]

    # --- Color text detection overview ---
    # Quick scan for colored regions across the whole page.
    color_summary: Dict[str, int] = {}
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        pixel_count = int(np.sum(mask > 0))
        if pixel_count > 50:  # minimum threshold
            color_summary[color_name] = pixel_count

    duration = time.time() - t0

    result_dict = {
        "image_width": w,
        "image_height": h,
        "content_bounds": {
            "x": content_x, "y": content_y,
            "w": content_w_px, "h": content_h_px,
        },
        "boxes": [
            {
                "x": b.x, "y": b.y, "w": b.width, "h": b.height,
                "confidence": b.confidence,
                "border_thickness": b.border_thickness,
                "bg_color_name": bg["color_name"],
                "bg_color_hex": bg["color_hex"],
            }
            for b, bg in zip(boxes, box_colors)
        ],
        "zones": [
            {
                "index": z.index,
                "zone_type": z.zone_type,
                "y": z.y, "h": z.height,
                "x": z.x, "w": z.width,
            }
            for z in zones
        ],
        "color_pixel_counts": color_summary,
        "has_words": len(words) > 0,
        "word_count": len(words),
        "duration_seconds": round(duration, 2),
    }

    # Persist to session
    await update_session_db(session_id, structure_result=result_dict)
    cached["structure_result"] = result_dict

    logger.info("detect-structure session %s: %d boxes, %d zones, %.2fs",
                session_id, len(boxes), len(zones), duration)

    return {"session_id": session_id, **result_dict}
# ---------------------------------------------------------------------------
# Column Detection Endpoints (Step 3)
# ---------------------------------------------------------------------------
@@ -1485,6 +1637,151 @@ def _draw_box_exclusion_overlay(
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
async def _get_structure_overlay(session_id: str) -> Response:
    """Generate overlay image showing detected boxes, zones, and color regions.

    Uses the session's stored ``structure_result`` when present; otherwise
    boxes and zones are detected on the fly from the base image with a 3%
    margin (without background color information).

    Args:
        session_id: ID of the OCR pipeline session.

    Returns:
        A PNG ``Response`` containing the annotated image.

    Raises:
        HTTPException: 404 if no base image is available, 500 if the base
            image cannot be decoded.
    """
    base_png = await _get_base_image_png(session_id)
    if not base_png:
        raise HTTPException(status_code=404, detail="No base image available")

    arr = np.frombuffer(base_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")

    h, w = img.shape[:2]

    # Take the HSV view of the unannotated image now, before anything is drawn
    # on `img`, so the color masks below are not polluted by the annotations
    # and the PNG does not have to be decoded a second time.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Get structure result (run detection if not cached)
    session = await get_session_db(session_id)
    structure = (session or {}).get("structure_result")

    if not structure:
        # Run detection on-the-fly
        margin = int(min(w, h) * 0.03)
        content_x, content_y = margin, margin
        content_w_px = w - 2 * margin
        content_h_px = h - 2 * margin
        boxes = detect_boxes(img, content_x, content_w_px, content_y, content_h_px)
        zones = split_page_into_zones(content_x, content_y, content_w_px, content_h_px, boxes)
        structure = {
            "boxes": [
                {"x": b.x, "y": b.y, "w": b.width, "h": b.height,
                 "confidence": b.confidence, "border_thickness": b.border_thickness}
                for b in boxes
            ],
            "zones": [
                {"index": z.index, "zone_type": z.zone_type,
                 "y": z.y, "h": z.height, "x": z.x, "w": z.width}
                for z in zones
            ],
        }

    # Copy of the clean image that receives the filled rectangles; it is
    # blended back into `img` further down.
    overlay = img.copy()

    # --- Draw zone boundaries ---
    zone_colors = {
        "content": (200, 200, 200),  # light gray
        "box": (255, 180, 0),        # blue-ish (BGR)
    }
    dash_len = 12
    for zone in structure.get("zones", []):
        zx = zone["x"]
        zy = zone["y"]
        zw = zone["w"]
        zh = zone["h"]
        color = zone_colors.get(zone["zone_type"], (200, 200, 200))

        # Dashed lines along the top and bottom edge of the zone.
        for edge_x in range(zx, zx + zw, dash_len * 2):
            end_x = min(edge_x + dash_len, zx + zw)
            cv2.line(img, (edge_x, zy), (end_x, zy), color, 1)
            cv2.line(img, (edge_x, zy + zh), (end_x, zy + zh), color, 1)

        # Zone label
        zone_label = f"Zone {zone['index']} ({zone['zone_type']})"
        cv2.putText(img, zone_label, (zx + 5, zy + 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.45, color, 1)

    # --- Draw detected boxes ---
    # Color map for box backgrounds (hex -> BGR)
    bg_hex_to_bgr = {
        "#dc2626": (38, 38, 220),    # red
        "#2563eb": (235, 99, 37),    # blue
        "#16a34a": (74, 163, 22),    # green
        "#ea580c": (12, 88, 234),    # orange
        "#9333ea": (234, 51, 147),   # purple
        "#ca8a04": (4, 138, 202),    # yellow
        "#6b7280": (128, 114, 107),  # gray
    }

    for box_data in structure.get("boxes", []):
        bx = box_data["x"]
        by = box_data["y"]
        bw = box_data["w"]
        bh = box_data["h"]
        conf = box_data.get("confidence", 0)
        thickness = box_data.get("border_thickness", 0)
        bg_hex = box_data.get("bg_color_hex", "#6b7280")
        bg_name = box_data.get("bg_color_name", "")

        # Unknown hex values (including white) fall back to gray.
        fill_bgr = bg_hex_to_bgr.get(bg_hex, (128, 114, 107))

        # Fill goes onto the overlay copy so it can be blended in later.
        cv2.rectangle(overlay, (bx, by), (bx + bw, by + bh), fill_bgr, -1)

        # Border goes directly onto the output image.
        border_color = fill_bgr
        cv2.rectangle(img, (bx, by), (bx + bw, by + bh), border_color, 3)

        # Label
        label = "BOX"
        if bg_name and bg_name not in ("unknown", "white"):
            label += f" ({bg_name})"
        if thickness > 0:
            label += f" border={thickness}px"
        label += f" {int(conf * 100)}%"
        # White outline first, then the colored text on top for legibility.
        cv2.putText(img, label, (bx + 8, by + 22),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2)
        cv2.putText(img, label, (bx + 8, by + 22),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, border_color, 1)

    # Blend the filled overlay at 15% opacity into the annotated image.
    # NOTE: zone lines, borders and labels were drawn on `img` before this
    # blend, so they end up weighted at 85% rather than fully opaque.
    cv2.addWeighted(overlay, 0.15, img, 0.85, 0, img)

    # --- Draw color regions (HSV masks) ---
    color_bgr_map = {
        "red": (0, 0, 255),
        "orange": (0, 140, 255),
        "yellow": (0, 200, 255),
        "green": (0, 200, 0),
        "blue": (255, 150, 0),
        "purple": (200, 0, 200),
    }
    for color_name, ranges in _COLOR_RANGES.items():
        mask = np.zeros((h, w), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))

        # Only draw if there are significant colored pixels
        if np.sum(mask > 0) < 100:
            continue

        # Outline each sufficiently large colored region.
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        draw_color = color_bgr_map.get(color_name, (200, 200, 200))
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area < 20:
                continue
            cv2.drawContours(img, [cnt], -1, draw_color, 2)

    # Encode result
    _, png_buf = cv2.imencode(".png", img)
    return Response(content=png_buf.tobytes(), media_type="image/png")
async def _get_columns_overlay(session_id: str) -> Response:
"""Generate cropped (or dewarped) image with column borders drawn on it."""
session = await get_session_db(session_id)

View File

@@ -75,7 +75,8 @@ async def init_ocr_pipeline_tables():
ADD COLUMN IF NOT EXISTS crop_result JSONB,
ADD COLUMN IF NOT EXISTS parent_session_id UUID REFERENCES ocr_pipeline_sessions(id) ON DELETE CASCADE,
ADD COLUMN IF NOT EXISTS box_index INT,
ADD COLUMN IF NOT EXISTS grid_editor_result JSONB
ADD COLUMN IF NOT EXISTS grid_editor_result JSONB,
ADD COLUMN IF NOT EXISTS structure_result JSONB
""")
@@ -111,7 +112,7 @@ async def create_session_db(
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
grid_editor_result,
grid_editor_result, structure_result,
parent_session_id, box_index,
created_at, updated_at
""", uuid.UUID(session_id), name, filename, original_png,
@@ -131,7 +132,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
word_result, ground_truth, auto_shear_degrees,
doc_type, doc_type_result,
document_category, pipeline_log,
grid_editor_result,
grid_editor_result, structure_result,
parent_session_id, box_index,
created_at, updated_at
FROM ocr_pipeline_sessions WHERE id = $1
@@ -183,11 +184,11 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
'word_result', 'ground_truth', 'auto_shear_degrees',
'doc_type', 'doc_type_result',
'document_category', 'pipeline_log',
'grid_editor_result',
'grid_editor_result', 'structure_result',
'parent_session_id', 'box_index',
}
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log', 'grid_editor_result'}
jsonb_fields = {'orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'handwriting_removal_meta', 'doc_type_result', 'pipeline_log', 'grid_editor_result', 'structure_result'}
for key, value in kwargs.items():
if key in allowed_fields:
@@ -313,7 +314,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
result[key] = result[key].isoformat()
# JSONB → parsed (asyncpg returns str for JSONB)
for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log', 'grid_editor_result']:
for key in ['orientation_result', 'crop_result', 'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'word_result', 'ground_truth', 'doc_type_result', 'pipeline_log', 'grid_editor_result', 'structure_result']:
if key in result and result[key] is not None:
if isinstance(result[key], str):
result[key] = json.loads(result[key])