feat: add Structure Detection step to OCR pipeline
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m58s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 16s

New pipeline step between Crop and Columns that visualizes detected
document structure: boxes (line-based + shading), page zones, and
color regions. Shows original image on the left, annotated overlay
on the right.

Backend: POST /detect-structure endpoint + /image/structure-overlay
Frontend: StepStructureDetection component with zone/box/color details

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-16 12:31:09 +01:00
parent fbbec6cf5e
commit 5b5213c2b9
5 changed files with 633 additions and 23 deletions

View File

@@ -7,6 +7,7 @@ import { StepOrientation } from '@/components/ocr-pipeline/StepOrientation'
import { StepCrop } from '@/components/ocr-pipeline/StepCrop'
import { StepDeskew } from '@/components/ocr-pipeline/StepDeskew'
import { StepDewarp } from '@/components/ocr-pipeline/StepDewarp'
import { StepStructureDetection } from '@/components/ocr-pipeline/StepStructureDetection'
import { StepColumnDetection } from '@/components/ocr-pipeline/StepColumnDetection'
import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
@@ -91,15 +92,15 @@ export default function OcrPipelinePage() {
let uiStep = Math.max(0, dbStep - 1)
const skipSteps = [...(savedDocType?.skip_steps || [])]
// Sub-sessions: image is already cropped, skip pre-processing steps
// Jump directly to columns (UI step 4) unless already further ahead
// Sub-sessions: image is already cropped, skip pre-processing + structure steps
// Jump directly to columns (UI step 5) unless already further ahead
const isSubSession = !!data.parent_session_id
const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop']
const SUB_SESSION_SKIP = ['orientation', 'deskew', 'dewarp', 'crop', 'structure']
if (isSubSession) {
for (const s of SUB_SESSION_SKIP) {
if (!skipSteps.includes(s)) skipSteps.push(s)
}
if (uiStep < 4) uiStep = 4 // columns step
if (uiStep < 5) uiStep = 5 // columns step (now index 5)
}
setSteps(
@@ -329,12 +330,13 @@ export default function OcrPipelinePage() {
2: 'Begradigung',
3: 'Entzerrung',
4: 'Zuschneiden',
5: 'Spalten',
6: 'Zeilen',
7: 'Woerter',
8: 'Korrektur',
9: 'Rekonstruktion',
10: 'Validierung',
5: 'Struktur',
6: 'Spalten',
7: 'Zeilen',
8: 'Woerter',
9: 'Korrektur',
10: 'Rekonstruktion',
11: 'Validierung',
}
const reprocessFromStep = useCallback(async (uiStep: number) => {
@@ -371,16 +373,18 @@ export default function OcrPipelinePage() {
case 3:
return <StepCrop sessionId={sessionId} onNext={handleCropNext} />
case 4:
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
return <StepStructureDetection sessionId={sessionId} onNext={handleNext} />
case 5:
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
return <StepColumnDetection sessionId={sessionId} onNext={handleNext} onBoxSessionsCreated={handleBoxSessionsCreated} />
case 6:
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
return <StepRowDetection sessionId={sessionId} onNext={handleNext} />
case 7:
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
return <StepWordRecognition sessionId={sessionId} onNext={handleNext} goToStep={goToStep} />
case 8:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
return <StepLlmReview sessionId={sessionId} onNext={handleNext} />
case 9:
return <StepReconstruction sessionId={sessionId} onNext={handleNext} />
case 10:
return <StepGroundTruth sessionId={sessionId} onNext={handleNext} />
default:
return null

View File

@@ -213,6 +213,38 @@ export interface RowGroundTruth {
notes?: string
}
export interface StructureResult {
image_width: number
image_height: number
content_bounds: { x: number; y: number; w: number; h: number }
boxes: StructureBox[]
zones: StructureZone[]
color_pixel_counts: Record<string, number>
has_words: boolean
word_count: number
duration_seconds: number
}
export interface StructureBox {
x: number
y: number
w: number
h: number
confidence: number
border_thickness: number
bg_color_name?: string
bg_color_hex?: string
}
export interface StructureZone {
index: number
zone_type: 'content' | 'box'
x: number
y: number
w: number
h: number
}
export interface WordBbox {
x: number
y: number
@@ -347,6 +379,7 @@ export const PIPELINE_STEPS: PipelineStep[] = [
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'structure', name: 'Struktur', icon: '🔍', status: 'pending' },
{ id: 'columns', name: 'Spalten', icon: '📊', status: 'pending' },
{ id: 'rows', name: 'Zeilen', icon: '📏', status: 'pending' },
{ id: 'words', name: 'Woerter', icon: '🔤', status: 'pending' },