feat(ocr-pipeline): line breaks, hyphen rejoin & oversized row splitting

- Preserve \n between visual lines within cells (instead of joining them with a space)
- Rejoin hyphenated words split across line breaks (e.g. Fuß-\nboden → Fußboden)
- Split oversized rows (>1.5× median height) into sub-entries when the EN and DE
  line counts match — a deterministic fix for row boundaries missed in Step 4
- Frontend: render \n as <br/>, use textarea for multiline editing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-28 18:49:28 +01:00
parent e7fb9d59f1
commit f7e0f2bb4f
2 changed files with 195 additions and 24 deletions

View File

@@ -5,6 +5,16 @@ import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/oc
const KLAUSUR_API = '/klausur-api'
/**
 * Renders `text`, emitting a `<br />` wherever the string contains `\n`.
 *
 * - Falsy `text` yields an empty, muted placeholder `<span>`.
 * - Text without any `\n` is returned as-is inside a fragment.
 * - Otherwise each line is wrapped in its own `<span>`, and every line
 *   except the last is followed by a `<br />`.
 */
function MultilineText({ text }: { text: string }) {
  if (!text) {
    return <span className="text-gray-300 dark:text-gray-600"></span>
  }

  const segments = text.split('\n')
  const lastIndex = segments.length - 1

  // Single-line text needs no wrapper spans or breaks.
  if (lastIndex === 0) {
    return <>{text}</>
  }

  return (
    <>
      {segments.map((segment, index) => (
        <span key={index}>
          {segment}
          {index < lastIndex && <br />}
        </span>
      ))}
    </>
  )
}
interface StepWordRecognitionProps {
sessionId: string | null
onNext: () => void
@@ -318,13 +328,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
>
<td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
{entry.english || <span className="text-gray-300 dark:text-gray-600"></span>}
<MultilineText text={entry.english} />
</td>
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
{entry.german || <span className="text-gray-300 dark:text-gray-600"></span>}
<MultilineText text={entry.german} />
</td>
<td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px] truncate">
{entry.example || <span className="text-gray-300 dark:text-gray-600"></span>}
<td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px]">
<MultilineText text={entry.example} />
</td>
<td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
{entry.confidence}%
@@ -428,30 +438,30 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
<div className="space-y-2">
<div>
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
<input
ref={enRef}
type="text"
<textarea
ref={enRef as any}
rows={Math.max(1, (editedEntries[activeIndex]?.english || '').split('\n').length)}
value={editedEntries[activeIndex]?.english || ''}
onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
/>
</div>
<div>
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
<input
type="text"
<textarea
rows={Math.max(1, (editedEntries[activeIndex]?.german || '').split('\n').length)}
value={editedEntries[activeIndex]?.german || ''}
onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
/>
</div>
<div>
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
<input
type="text"
<textarea
rows={Math.max(1, (editedEntries[activeIndex]?.example || '').split('\n').length)}
value={editedEntries[activeIndex]?.example || ''}
onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
/>
</div>
</div>
@@ -503,7 +513,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
'bg-gray-300 dark:bg-gray-600'
}`} />
<span className="truncate text-gray-600 dark:text-gray-400 font-mono">
{entry.english || '—'} {entry.german || '—'}
{(entry.english || '—').replace(/\n/g, ' ')} {(entry.german || '—').replace(/\n/g, ' ')}
</span>
</div>
))}