feat: add Kombi-Vergleich mode for side-by-side Paddle vs RapidOCR comparison
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 21s

Add /rapid-kombi backend endpoint using local RapidOCR + Tesseract merge,
KombiCompareStep component for parallel execution and side-by-side overlay,
and wordResultOverride prop on OverlayReconstruction for direct data injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-14 07:59:06 +01:00
parent c2c082d4b4
commit a994ddee83
6 changed files with 504 additions and 35 deletions

View File

@@ -11,12 +11,13 @@ import { StepRowDetection } from '@/components/ocr-pipeline/StepRowDetection'
import { StepWordRecognition } from '@/components/ocr-pipeline/StepWordRecognition'
import { OverlayReconstruction } from '@/components/ocr-overlay/OverlayReconstruction'
import { PaddleDirectStep } from '@/components/ocr-overlay/PaddleDirectStep'
import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, KOMBI_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'
import { KombiCompareStep } from '@/components/ocr-overlay/KombiCompareStep'
import { OVERLAY_PIPELINE_STEPS, PADDLE_DIRECT_STEPS, KOMBI_STEPS, KOMBI_COMPARE_STEPS, DOCUMENT_CATEGORIES, dbStepToOverlayUi, type PipelineStep, type SessionListItem, type DocumentCategory } from './types'
const KLAUSUR_API = '/klausur-api'
export default function OcrOverlayPage() {
const [mode, setMode] = useState<'pipeline' | 'paddle-direct' | 'kombi'>('pipeline')
const [mode, setMode] = useState<'pipeline' | 'paddle-direct' | 'kombi' | 'kombi-compare'>('pipeline')
const [currentStep, setCurrentStep] = useState(0)
const [sessionId, setSessionId] = useState<string | null>(null)
const [sessionName, setSessionName] = useState<string>('')
@@ -63,14 +64,15 @@ export default function OcrOverlayPage() {
setSessionName(data.name || data.filename || '')
setActiveCategory(data.document_category || undefined)
// Check if this session was processed with paddle_direct or kombi
// Check if this session was processed with paddle_direct, kombi, or rapid_kombi
const ocrEngine = data.word_result?.ocr_engine
const isPaddleDirect = ocrEngine === 'paddle_direct'
const isKombi = ocrEngine === 'kombi'
const isRapidKombi = ocrEngine === 'rapid_kombi'
if (isPaddleDirect || isKombi) {
const m = isKombi ? 'kombi' : 'paddle-direct'
const baseSteps = isKombi ? KOMBI_STEPS : PADDLE_DIRECT_STEPS
if (isPaddleDirect || isKombi || isRapidKombi) {
const m = isKombi ? 'kombi' : isPaddleDirect ? 'paddle-direct' : 'kombi-compare'
const baseSteps = isKombi ? KOMBI_STEPS : isRapidKombi ? KOMBI_COMPARE_STEPS : PADDLE_DIRECT_STEPS
setMode(m)
setSteps(
baseSteps.map((s, i) => ({
@@ -105,7 +107,7 @@ export default function OcrOverlayPage() {
if (sessionId === sid) {
setSessionId(null)
setCurrentStep(0)
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'kombi-compare' ? KOMBI_COMPARE_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
} catch (e) {
@@ -162,7 +164,7 @@ export default function OcrOverlayPage() {
const handleNext = () => {
if (currentStep >= steps.length - 1) {
// Last step completed — return to session list
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'kombi-compare' ? KOMBI_COMPARE_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
setCurrentStep(0)
setSessionId(null)
@@ -191,7 +193,7 @@ export default function OcrOverlayPage() {
setSessionId(null)
setSessionName('')
setCurrentStep(0)
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
const baseSteps = mode === 'kombi' ? KOMBI_STEPS : mode === 'kombi-compare' ? KOMBI_COMPARE_STEPS : mode === 'paddle-direct' ? PADDLE_DIRECT_STEPS : OVERLAY_PIPELINE_STEPS
setSteps(baseSteps.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}
@@ -230,7 +232,7 @@ export default function OcrOverlayPage() {
}, [sessionId, goToStep])
const renderStep = () => {
if (mode === 'paddle-direct' || mode === 'kombi') {
if (mode === 'paddle-direct' || mode === 'kombi' || mode === 'kombi-compare') {
switch (currentStep) {
case 0:
return <StepOrientation sessionId={sessionId} onNext={handleOrientationComplete} />
@@ -241,6 +243,9 @@ export default function OcrOverlayPage() {
case 3:
return <StepCrop sessionId={sessionId} onNext={handleNext} />
case 4:
if (mode === 'kombi-compare') {
return <KombiCompareStep sessionId={sessionId} onNext={handleNext} />
}
return mode === 'kombi' ? (
<PaddleDirectStep
sessionId={sessionId}
@@ -514,6 +519,22 @@ export default function OcrOverlayPage() {
>
Kombi (5 Schritte)
</button>
<button
onClick={() => {
if (mode === 'kombi-compare') return
setMode('kombi-compare')
setCurrentStep(0)
setSessionId(null)
setSteps(KOMBI_COMPARE_STEPS.map((s, i) => ({ ...s, status: i === 0 ? 'active' : 'pending' })))
}}
className={`px-3 py-1.5 text-xs font-medium rounded-md transition-colors ${
mode === 'kombi-compare'
? 'bg-white dark:bg-gray-700 text-gray-700 dark:text-gray-200 shadow-sm'
: 'text-gray-500 dark:text-gray-400 hover:text-gray-700 dark:hover:text-gray-300'
}`}
>
Vergleich (5 Schritte)
</button>
</div>
<PipelineStepper

View File

@@ -72,6 +72,18 @@ export const KOMBI_STEPS: PipelineStep[] = [
{ id: 'kombi', name: 'Paddle + Tesseract', icon: '🔀', status: 'pending' },
]
/**
 * Step list for the Kombi-Vergleich mode (Paddle-Kombi vs. Rapid-Kombi side by side).
 *
 * Five entries: four preprocessing steps (orientation, deskew, dewarp, crop)
 * followed by a single 'kombi-compare' step in which both kombi engines run
 * in parallel and their results are shown next to each other.
 * Every entry starts as 'pending'; callers copy the list and mark the first
 * entry 'active' when a run begins.
 */
export const KOMBI_COMPARE_STEPS: PipelineStep[] = [
{ id: 'orientation', name: 'Orientierung', icon: '🔄', status: 'pending' },
{ id: 'deskew', name: 'Begradigung', icon: '📐', status: 'pending' },
{ id: 'dewarp', name: 'Entzerrung', icon: '🔧', status: 'pending' },
{ id: 'crop', name: 'Zuschneiden', icon: '✂️', status: 'pending' },
{ id: 'kombi-compare', name: 'Kombi-Vergleich', icon: '⚖️', status: 'pending' },
]
/** Map from DB step to overlay UI step index */
export function dbStepToOverlayUi(dbStep: number): number {
// DB: 1=start, 2=orient, 3=deskew, 4=dewarp, 5=crop, 6=columns, 7=rows, 8=words, 9=recon, 10=gt

View File

@@ -0,0 +1,231 @@
'use client'
import { useState } from 'react'
import { OverlayReconstruction } from './OverlayReconstruction'
import type { GridCell } from '@/app/(admin)/ai/ocr-overlay/types'
const KLAUSUR_API = '/klausur-api'
/** UI phase of the comparison step: 'idle' before a run is started, 'running' while the engine requests are in flight, 'compare' once the side-by-side view is shown. */
type Phase = 'idle' | 'running' | 'compare'
/**
 * Counters reported alongside a kombi OCR run.
 * Only the three fields read by this file are named; any further keys the
 * backend sends are accepted through the index signature.
 */
interface KombiSummary {
  total_cells: number
  non_empty_cells: number
  merged_words: number
  [key: string]: unknown
}

/**
 * Payload returned by a kombi endpoint as consumed by this component:
 * the grid cells, the pixel size of the processed image, the run time in
 * seconds and a summary block. Additional keys are tolerated.
 */
interface KombiResult {
  cells: GridCell[]
  image_width: number
  image_height: number
  duration_seconds: number
  summary: KombiSummary
  [key: string]: unknown
}
/** Props for KombiCompareStep. */
interface KombiCompareStepProps {
/** Session to run both engines on; while null the start button is disabled and no request is sent. */
sessionId: string | null
/** Called when the user clicks "Fertig" in the comparison view. */
onNext: () => void
}
/**
 * Pipeline step that runs the Paddle-Kombi and Rapid-Kombi endpoints for one
 * session in parallel and renders both results next to each other.
 *
 * Flow:
 *  - 'idle': explanation plus a start button (disabled without a session).
 *  - 'running': status cards until the first result arrives; as soon as one
 *    engine has delivered, the comparison grid is shown with the other pane
 *    still marked as running.
 *  - 'compare': both requests have settled; failures are listed in a banner,
 *    one entry per failed engine, each prefixed with the engine name.
 *
 * @param sessionId Session whose image is processed; nothing runs while null.
 * @param onNext Invoked when the user clicks "Fertig".
 */
export function KombiCompareStep({ sessionId, onNext }: KombiCompareStepProps) {
  const [phase, setPhase] = useState<Phase>('idle')
  const [error, setError] = useState('')
  const [paddleResult, setPaddleResult] = useState<KombiResult | null>(null)
  const [rapidResult, setRapidResult] = useState<KombiResult | null>(null)
  const [paddleStatus, setPaddleStatus] = useState<'pending' | 'running' | 'done' | 'error'>('pending')
  const [rapidStatus, setRapidStatus] = useState<'pending' | 'running' | 'done' | 'error'>('pending')

  const runBothEngines = async () => {
    if (!sessionId) return
    setPhase('running')
    setError('')
    setPaddleStatus('running')
    setRapidStatus('running')
    setPaddleResult(null)
    setRapidResult(null)

    // Runs one engine. Never rejects: resolves to null on success or to a
    // labelled error message on failure, so that a failure of one engine
    // cannot hide the outcome (or the error) of the other.
    const fetchEngine = async (
      label: string,
      endpoint: string,
      setResult: (r: KombiResult) => void,
      setStatus: (s: 'pending' | 'running' | 'done' | 'error') => void,
    ): Promise<string | null> => {
      try {
        const res = await fetch(
          `${KLAUSUR_API}/api/v1/ocr-pipeline/sessions/${sessionId}/${endpoint}`,
          { method: 'POST' },
        )
        if (!res.ok) {
          const body: unknown = await res.json().catch(() => ({}))
          const detail =
            typeof body === 'object' && body !== null
              ? (body as { detail?: unknown }).detail
              : undefined
          // `detail` is not guaranteed to be a string (it can be a structured
          // error list); stringify it instead of printing "[object Object]".
          let message = `HTTP ${res.status}`
          if (typeof detail === 'string' && detail) {
            message = detail
          } else if (detail !== undefined && detail !== null && typeof detail !== 'string') {
            message = JSON.stringify(detail)
          }
          throw new Error(message)
        }
        const data: KombiResult = await res.json()
        setResult(data)
        setStatus('done')
        return null
      } catch (e: unknown) {
        setStatus('error')
        return `${label}: ${e instanceof Error ? e.message : String(e)}`
      }
    }

    const outcomes = await Promise.all([
      fetchEngine('Paddle + Tesseract', 'paddle-kombi', setPaddleResult, setPaddleStatus),
      fetchEngine('RapidOCR + Tesseract', 'rapid-kombi', setRapidResult, setRapidStatus),
    ])
    const failures = outcomes.filter((msg): msg is string => msg !== null)
    // Show the comparison even if an engine failed; the failed pane renders
    // its own placeholder and the banner lists every failure.
    setError(failures.join(' | '))
    setPhase('compare')
  }

  // Back to the start screen with all per-run state cleared.
  const handleRestart = () => {
    setPhase('idle')
    setError('')
    setPaddleResult(null)
    setRapidResult(null)
    setPaddleStatus('pending')
    setRapidStatus('pending')
  }

  if (phase === 'idle') {
    return (
      <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-8 text-center">
        <div className="text-4xl mb-3"></div>
        <h3 className="text-lg font-semibold text-gray-800 dark:text-gray-200 mb-2">
          Kombi-Vergleich
        </h3>
        <p className="text-sm text-gray-500 dark:text-gray-400 mb-6 max-w-lg mx-auto">
          Beide Kombi-Modi (Paddle + Tesseract vs. RapidOCR + Tesseract) laufen parallel.
          Die Ergebnisse werden nebeneinander angezeigt, damit die Qualitaet direkt verglichen werden kann.
        </p>
        <button
          onClick={runBothEngines}
          disabled={!sessionId}
          className="px-5 py-2.5 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors disabled:opacity-50 disabled:cursor-not-allowed font-medium"
        >
          Beide Kombi-Modi starten
        </button>
      </div>
    )
  }

  // Status cards are only shown until the first result is available.
  if (phase === 'running' && !paddleResult && !rapidResult) {
    return (
      <div className="bg-white dark:bg-gray-800 rounded-xl border border-gray-200 dark:border-gray-700 p-8">
        <div className="flex items-center justify-center gap-8">
          <EngineStatusCard label="Paddle + Tesseract" status={paddleStatus} />
          <EngineStatusCard label="RapidOCR + Tesseract" status={rapidStatus} />
        </div>
      </div>
    )
  }

  // compare phase (also reached while one engine is still running)
  return (
    <div className="space-y-4">
      {error && (
        <div className="bg-red-50 dark:bg-red-900/20 border border-red-200 dark:border-red-800 rounded-lg p-3 text-sm text-red-700 dark:text-red-300">
          {error}
        </div>
      )}
      <div className="flex items-center justify-between">
        <h3 className="text-sm font-medium text-gray-700 dark:text-gray-300">
          Side-by-Side Vergleich
        </h3>
        <button
          onClick={handleRestart}
          className="text-xs px-3 py-1.5 border border-gray-300 dark:border-gray-600 rounded-lg hover:bg-gray-50 dark:hover:bg-gray-700 transition-colors"
        >
          Neu starten
        </button>
      </div>
      <div className="grid grid-cols-2 gap-4">
        {/* Left: Paddle-Kombi */}
        <div className="space-y-2">
          <div className="flex items-center gap-2">
            <span className="text-sm font-medium text-gray-700 dark:text-gray-300">
              🔀 Paddle + Tesseract
            </span>
            {paddleStatus === 'error' && (
              <span className="text-xs text-red-500">Fehler</span>
            )}
          </div>
          {paddleResult ? (
            <>
              <OverlayReconstruction
                sessionId={sessionId}
                onNext={() => {}}
                wordResultOverride={paddleResult}
              />
              <StatsBar result={paddleResult} engine="Paddle-Kombi" />
            </>
          ) : (
            <div className="bg-gray-50 dark:bg-gray-900 rounded-lg p-12 text-center text-sm text-gray-400">
              {paddleStatus === 'running' ? 'Laeuft...' : 'Fehlgeschlagen'}
            </div>
          )}
        </div>
        {/* Right: Rapid-Kombi */}
        <div className="space-y-2">
          <div className="flex items-center gap-2">
            <span className="text-sm font-medium text-gray-700 dark:text-gray-300">
              RapidOCR + Tesseract
            </span>
            {rapidStatus === 'error' && (
              <span className="text-xs text-red-500">Fehler</span>
            )}
          </div>
          {rapidResult ? (
            <>
              <OverlayReconstruction
                sessionId={sessionId}
                onNext={() => {}}
                wordResultOverride={rapidResult}
              />
              <StatsBar result={rapidResult} engine="Rapid-Kombi" />
            </>
          ) : (
            <div className="bg-gray-50 dark:bg-gray-900 rounded-lg p-12 text-center text-sm text-gray-400">
              {rapidStatus === 'running' ? 'Laeuft...' : 'Fehlgeschlagen'}
            </div>
          )}
        </div>
      </div>
      <div className="flex justify-end">
        <button
          onClick={onNext}
          className="px-4 py-2 bg-teal-600 text-white rounded-lg hover:bg-teal-700 transition-colors text-sm font-medium"
        >
          Fertig
        </button>
      </div>
    </div>
  )
}
/**
 * Small card showing one engine's label next to an indicator for its state.
 * 'running' gets a spinner; 'done', 'error' and 'pending' each get a coloured
 * span; any other status renders the label only.
 */
function EngineStatusCard({ label, status }: { label: string; status: string }) {
  const renderIndicator = () => {
    switch (status) {
      case 'running':
        return (
          <div className="w-5 h-5 border-2 border-teal-400 border-t-transparent rounded-full animate-spin" />
        )
      case 'done':
        return <span className="text-green-500 text-lg"></span>
      case 'error':
        return <span className="text-red-500 text-lg"></span>
      case 'pending':
        return <span className="text-gray-400 text-lg"></span>
      default:
        return null
    }
  }

  return (
    <div className="flex items-center gap-3 bg-gray-50 dark:bg-gray-900 rounded-lg px-5 py-4">
      {renderIndicator()}
      <span className="text-sm text-gray-700 dark:text-gray-300">{label}</span>
    </div>
  )
}
/**
 * One-line summary under a result pane: engine name, merged word count,
 * non-empty/total cell ratio and run time with two decimals.
 * Missing summary fields or a missing duration are displayed as 0.
 */
function StatsBar({ result, engine }: { result: KombiResult; engine: string }) {
  const summary = result.summary
  const mergedWords = summary?.merged_words ?? 0
  const filledCells = summary?.non_empty_cells ?? 0
  const cellCount = summary?.total_cells ?? 0
  const seconds = (result.duration_seconds ?? 0).toFixed(2)

  return (
    <div className="flex items-center gap-3 text-[11px] text-gray-500 dark:text-gray-400 bg-gray-50 dark:bg-gray-900 rounded-lg px-3 py-2">
      <span className="font-medium text-gray-600 dark:text-gray-300">{engine}</span>
      <span>{mergedWords} Woerter</span>
      <span>{filledCells}/{cellCount} Zellen</span>
      <span>{seconds}s</span>
    </div>
  )
}

View File

@@ -10,6 +10,8 @@ const KLAUSUR_API = '/klausur-api'
/** Props for OverlayReconstruction. */
interface OverlayReconstructionProps {
/** Session whose stored data is loaded when no wordResultOverride is given; nothing is loaded while this is null. */
sessionId: string | null
/** NOTE(review): presumably called when the user finishes this step — the call site is not visible here, confirm. */
onNext: () => void
/** When set, use this data directly instead of fetching from the session API. */
wordResultOverride?: { cells: GridCell[]; image_width: number; image_height: number; [key: string]: unknown }
}
interface EditableCell {
@@ -24,7 +26,7 @@ interface EditableCell {
type UndoAction = { cellId: string; oldText: string; newText: string }
export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructionProps) {
export function OverlayReconstruction({ sessionId, onNext, wordResultOverride }: OverlayReconstructionProps) {
const [status, setStatus] = useState<'loading' | 'ready' | 'saving' | 'saved' | 'error'>('loading')
const [error, setError] = useState('')
const [cells, setCells] = useState<EditableCell[]>([])
@@ -78,10 +80,39 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
// Load session data
useEffect(() => {
if (wordResultOverride) {
applyWordResult(wordResultOverride)
return
}
if (!sessionId) return
loadSessionData()
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [sessionId])
}, [sessionId, wordResultOverride])
// Push a word result into component state: raw grid cells, their editable
// copies (text doubles as originalText), cleared edit/undo/redo history,
// the image size when both dimensions are present, and finally status 'ready'.
const applyWordResult = (wordResult: { cells: GridCell[]; image_width: number; image_height: number; [key: string]: unknown }) => {
  const sourceCells: GridCell[] = wordResult.cells || []
  setGridCells(sourceCells)

  const toEditable = (cell: GridCell): EditableCell => ({
    cellId: cell.cell_id,
    text: cell.text,
    originalText: cell.text,
    bboxPct: cell.bbox_pct,
    colType: cell.col_type,
    rowIndex: cell.row_index,
    colIndex: cell.col_index,
  })
  setCells(sourceCells.map(cell => toEditable(cell)))

  // A fresh result invalidates every pending edit and the history stacks.
  setEditedTexts(new Map())
  setUndoStack([])
  setRedoStack([])

  const { image_width: naturalW, image_height: naturalH } = wordResult
  if (naturalW && naturalH) {
    setImageNaturalSize({ w: naturalW, h: naturalH })
  }

  setStatus('ready')
}
const loadSessionData = async () => {
if (!sessionId) return
@@ -98,33 +129,11 @@ export function OverlayReconstruction({ sessionId, onNext }: OverlayReconstructi
return
}
const rawGridCells: GridCell[] = wordResult.cells || []
setGridCells(rawGridCells)
const editableCells: EditableCell[] = rawGridCells.map(c => ({
cellId: c.cell_id,
text: c.text,
originalText: c.text,
bboxPct: c.bbox_pct,
colType: c.col_type,
rowIndex: c.row_index,
colIndex: c.col_index,
}))
setCells(editableCells)
setEditedTexts(new Map())
setUndoStack([])
setRedoStack([])
applyWordResult(wordResult as unknown as { cells: GridCell[]; image_width: number; image_height: number })
// Load rows
const rowResult: RowResult | undefined = data.row_result
if (rowResult?.rows) setRows(rowResult.rows)
// Store image dimensions
if (wordResult.image_width && wordResult.image_height) {
setImageNaturalSize({ w: wordResult.image_width, h: wordResult.image_height })
}
setStatus('ready')
} catch (e: unknown) {
setError(e instanceof Error ? e.message : String(e))
setStatus('error')

View File

@@ -2976,6 +2976,141 @@ async def paddle_kombi(session_id: str):
return {"session_id": session_id, **word_result}
@router.post("/sessions/{session_id}/rapid-kombi")
async def rapid_kombi(session_id: str):
    """Run RapidOCR + Tesseract on the preprocessed image and merge results.

    Same merge logic as paddle-kombi, but uses local RapidOCR (ONNX Runtime)
    instead of remote PaddleOCR service.

    The session image is looked up as "cropped", then "dewarped", then
    "original"; the first one found is used. The merged words are turned
    into a grid, stored on the session as ``word_result`` (together with the
    image that was used and ``current_step=8``) and a pipeline log entry is
    appended.

    Returns:
        The stored word result plus ``session_id``.

    Raises:
        HTTPException: 404 when the session has no image; 400 when the image
            cannot be decoded or both engines return no words.
    """
    img_png = None
    for image_kind in ("cropped", "dewarped", "original"):
        img_png = await get_session_image(session_id, image_kind)
        if img_png:
            break
    if not img_png:
        raise HTTPException(status_code=404, detail="No image found for this session")

    img_arr = np.frombuffer(img_png, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise HTTPException(status_code=400, detail="Failed to decode image")
    img_h, img_w = img_bgr.shape[:2]

    from cv_ocr_engines import ocr_region_rapid
    from cv_vocab_types import PageRegion

    t0 = time.time()

    # --- RapidOCR (local, synchronous) ---
    # NOTE(review): this call and the Tesseract call below are synchronous
    # inside a coroutine, so other requests on the same event loop wait
    # until they finish. Consider moving them to a worker thread.
    full_region = PageRegion(
        type="full_page", x=0, y=0, width=img_w, height=img_h,
    )
    rapid_words = ocr_region_rapid(img_bgr, full_region) or []

    # --- Tesseract ---
    from PIL import Image
    import pytesseract

    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu",
        config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        # Confidence may arrive as an int, a float or a float-formatted
        # string (e.g. "96.000000"). Parse via float so such values are not
        # mistaken for "no confidence" and dropped; truncate to an int.
        try:
            conf = int(float(data["conf"][i]))
        except (TypeError, ValueError, OverflowError):
            conf = -1
        if not text or conf < 20:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i],
            "top": data["top"][i],
            "width": data["width"][i],
            "height": data["height"][i],
            "conf": conf,
        })

    # --- Split multi-word RapidOCR boxes into individual words ---
    rapid_words_split = _split_paddle_multi_words(rapid_words)
    logger.info(
        "rapid_kombi: split %d rapid boxes → %d individual words",
        len(rapid_words), len(rapid_words_split),
    )

    # --- Merge ---
    if not rapid_words_split and not tess_words:
        raise HTTPException(status_code=400, detail="Both OCR engines returned no words")

    merged_words = _merge_paddle_tesseract(rapid_words_split, tess_words)
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h)
    duration = time.time() - t0

    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    col_types = {c.get("type") for c in columns_meta}
    is_vocab = bool(col_types & {"column_en", "column_de"})

    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if is_vocab else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": round(duration, 2),
        "ocr_engine": "rapid_kombi",
        "grid_method": "rapid_kombi",
        "raw_rapid_words": rapid_words,
        "raw_rapid_words_split": rapid_words_split,
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
            "low_confidence": sum(1 for c in cells if 0 < c.get("confidence", 0) < 50),
            "rapid_words": len(rapid_words),
            "rapid_words_split": len(rapid_words_split),
            "tesseract_words": len(tess_words),
            "merged_words": len(merged_words),
        },
    }

    await update_session_db(
        session_id,
        word_result=word_result,
        cropped_png=img_png,
        current_step=8,
    )

    logger.info(
        "rapid_kombi session %s: %d cells (%d rows, %d cols) in %.2fs "
        "[rapid=%d, tess=%d, merged=%d]",
        session_id, len(cells), n_rows, n_cols, duration,
        len(rapid_words), len(tess_words), len(merged_words),
    )

    await _append_pipeline_log(session_id, "rapid_kombi", {
        "total_cells": len(cells),
        "non_empty_cells": word_result["summary"]["non_empty_cells"],
        "rapid_words": len(rapid_words),
        "tesseract_words": len(tess_words),
        "merged_words": len(merged_words),
        "ocr_engine": "rapid_kombi",
    }, duration_ms=int(duration * 1000))

    return {"session_id": session_id, **word_result}
class WordGroundTruthRequest(BaseModel):
is_correct: bool
corrected_entries: Optional[List[Dict[str, Any]]] = None

View File

@@ -449,6 +449,67 @@ class TestSpatialOverlapDedup:
assert len(merged) == 2
class TestRapidOcrMergeCompatibility:
"""Check that _merge_paddle_tesseract accepts words in RapidOCR format.
The word dicts built here carry an extra 'region_type' key on top of the
text/left/top/width/height/conf keys. The tests verify that the merge and
split helpers still produce the expected words when that key is present.
"""
def _rapid_word(self, text, left, top, width=60, height=20, conf=80, region_type="full_page"):
"""Build a word dict in RapidOCR format, i.e. including 'region_type'."""
return {
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
"region_type": region_type,
}
def test_rapid_words_merge_with_tesseract(self):
"""Two RapidOCR words overlapping two Tesseract words merge into exactly two words."""
rapid = [
self._rapid_word("apple", 50, 10, 70, 20, conf=90),
self._rapid_word("Apfel", 300, 10, 60, 20, conf=85),
]
tess = [
_word("apple", 52, 11, 68, 19, conf=75),
_word("Apfel", 298, 12, 62, 18, conf=70),
]
merged = _merge_paddle_tesseract(rapid, tess)
assert len(merged) == 2
texts = sorted(w["text"] for w in merged)
assert texts == ["Apfel", "apple"]
def test_rapid_words_split_then_merge(self):
"""A three-word RapidOCR box is split into three words and each appears once after merging."""
rapid_raw = [
self._rapid_word("More than 200", 944, 287, 160, 29, conf=96),
]
tess = [
_word("More", 948, 292, 60, 20, conf=90),
_word("than", 1017, 291, 49, 21, conf=96),
_word("200", 1076, 292, 43, 20, conf=93),
]
rapid_split = _split_paddle_multi_words(rapid_raw)
assert len(rapid_split) == 3
merged = _merge_paddle_tesseract(rapid_split, tess)
texts = [w["text"] for w in merged]
assert texts.count("More") == 1
assert texts.count("than") == 1
assert texts.count("200") == 1
def test_region_type_preserved_in_unmatched(self):
"""A RapidOCR word without any Tesseract counterpart survives the merge.
NOTE(review): despite the test name, only the count and the text are
asserted; nothing here checks that 'region_type' is still present on
the merged word. Add that assertion once the intended behaviour of
_merge_paddle_tesseract is confirmed.
"""
rapid = [self._rapid_word("unique", 500, 10, 80, 20, conf=90)]
tess = [] # No Tesseract words
merged = _merge_paddle_tesseract(rapid, tess)
assert len(merged) == 1
assert merged[0]["text"] == "unique"
class TestSplitThenMerge:
"""Test the full pipeline: split multi-word Paddle boxes, then merge."""