feat(ocr-pipeline): line breaks, hyphen rejoin & oversized row splitting
- Preserve \n between visual lines within cells (instead of joining with space)
- Rejoin hyphenated words split across line breaks (e.g. Fuß-\nboden → Fußboden)
- Split oversized rows (>1.5× median height) into sub-entries when EN/DE line counts match — deterministic fix for missed Step 4 row boundaries
- Frontend: render \n as <br/>, use textarea for multiline editing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,16 @@ import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/oc
|
|||||||
|
|
||||||
const KLAUSUR_API = '/klausur-api'
|
const KLAUSUR_API = '/klausur-api'
|
||||||
|
|
||||||
|
/** Render text with \n as line breaks */
|
||||||
|
function MultilineText({ text }: { text: string }) {
|
||||||
|
if (!text) return <span className="text-gray-300 dark:text-gray-600">—</span>
|
||||||
|
const lines = text.split('\n')
|
||||||
|
if (lines.length === 1) return <>{text}</>
|
||||||
|
return <>{lines.map((line, i) => (
|
||||||
|
<span key={i}>{line}{i < lines.length - 1 && <br />}</span>
|
||||||
|
))}</>
|
||||||
|
}
|
||||||
|
|
||||||
interface StepWordRecognitionProps {
|
interface StepWordRecognitionProps {
|
||||||
sessionId: string | null
|
sessionId: string | null
|
||||||
onNext: () => void
|
onNext: () => void
|
||||||
@@ -318,13 +328,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
>
|
>
|
||||||
<td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
|
<td className="py-1 pr-2 text-gray-400">{idx + 1}</td>
|
||||||
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
|
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
|
||||||
{entry.english || <span className="text-gray-300 dark:text-gray-600">—</span>}
|
<MultilineText text={entry.english} />
|
||||||
</td>
|
</td>
|
||||||
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
|
<td className="py-1 pr-2 font-mono text-gray-700 dark:text-gray-300 cursor-pointer">
|
||||||
{entry.german || <span className="text-gray-300 dark:text-gray-600">—</span>}
|
<MultilineText text={entry.german} />
|
||||||
</td>
|
</td>
|
||||||
<td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px] truncate">
|
<td className="py-1 pr-2 font-mono text-gray-500 dark:text-gray-400 cursor-pointer max-w-[200px]">
|
||||||
{entry.example || <span className="text-gray-300 dark:text-gray-600">—</span>}
|
<MultilineText text={entry.example} />
|
||||||
</td>
|
</td>
|
||||||
<td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
|
<td className={`py-1 text-right font-mono ${confColor(entry.confidence)}`}>
|
||||||
{entry.confidence}%
|
{entry.confidence}%
|
||||||
@@ -428,30 +438,30 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
<div>
|
<div>
|
||||||
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
|
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">English</label>
|
||||||
<input
|
<textarea
|
||||||
ref={enRef}
|
ref={enRef as any}
|
||||||
type="text"
|
rows={Math.max(1, (editedEntries[activeIndex]?.english || '').split('\n').length)}
|
||||||
value={editedEntries[activeIndex]?.english || ''}
|
value={editedEntries[activeIndex]?.english || ''}
|
||||||
onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
|
onChange={(e) => updateEntry(activeIndex, 'english', e.target.value)}
|
||||||
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
|
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
|
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Deutsch</label>
|
||||||
<input
|
<textarea
|
||||||
type="text"
|
rows={Math.max(1, (editedEntries[activeIndex]?.german || '').split('\n').length)}
|
||||||
value={editedEntries[activeIndex]?.german || ''}
|
value={editedEntries[activeIndex]?.german || ''}
|
||||||
onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
|
onChange={(e) => updateEntry(activeIndex, 'german', e.target.value)}
|
||||||
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
|
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
|
<label className="text-[10px] font-medium text-gray-500 dark:text-gray-400">Example</label>
|
||||||
<input
|
<textarea
|
||||||
type="text"
|
rows={Math.max(1, (editedEntries[activeIndex]?.example || '').split('\n').length)}
|
||||||
value={editedEntries[activeIndex]?.example || ''}
|
value={editedEntries[activeIndex]?.example || ''}
|
||||||
onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
|
onChange={(e) => updateEntry(activeIndex, 'example', e.target.value)}
|
||||||
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono"
|
className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none"
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -503,7 +513,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
|
|||||||
'bg-gray-300 dark:bg-gray-600'
|
'bg-gray-300 dark:bg-gray-600'
|
||||||
}`} />
|
}`} />
|
||||||
<span className="truncate text-gray-600 dark:text-gray-400 font-mono">
|
<span className="truncate text-gray-600 dark:text-gray-400 font-mono">
|
||||||
{entry.english || '—'} → {entry.german || '—'}
|
{(entry.english || '—').replace(/\n/g, ' ')} → {(entry.german || '—').replace(/\n/g, ' ')}
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
))}
|
))}
|
||||||
|
|||||||
@@ -2173,20 +2173,65 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
# Pipeline Step 5: Word Grid from Columns × Rows
|
# Pipeline Step 5: Word Grid from Columns × Rows
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
|
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
||||||
"""Join OCR words into text in correct reading order.
|
"""Group OCR words into visual lines in reading order.
|
||||||
|
|
||||||
Groups words into visual lines by Y-tolerance, sorts each line by X,
|
Returns a list of line strings (one per visual line in the cell).
|
||||||
then joins lines with spaces. This fixes multi-line cell reading order.
|
|
||||||
"""
|
"""
|
||||||
if not words:
|
if not words:
|
||||||
return ''
|
return []
|
||||||
|
|
||||||
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
|
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
|
||||||
line_texts = []
|
return [' '.join(w['text'] for w in line) for line in lines]
|
||||||
for line in lines:
|
|
||||||
line_texts.append(' '.join(w['text'] for w in line))
|
|
||||||
return ' '.join(line_texts)
|
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||||||
|
"""Rejoin words split by line-break hyphenation.
|
||||||
|
|
||||||
|
E.g. ['Fuß-', 'boden'] → ['Fußboden']
|
||||||
|
['some text-', 'thing here'] → ['something here']
|
||||||
|
"""
|
||||||
|
if len(lines) <= 1:
|
||||||
|
return lines
|
||||||
|
|
||||||
|
result = []
|
||||||
|
i = 0
|
||||||
|
while i < len(lines):
|
||||||
|
line = lines[i]
|
||||||
|
# If line ends with '-' and there's a next line, rejoin
|
||||||
|
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||||||
|
stripped = line.rstrip()
|
||||||
|
# Get the word fragment before hyphen (last word)
|
||||||
|
prefix = stripped[:-1] # remove trailing hyphen
|
||||||
|
next_line = lines[i + 1]
|
||||||
|
# Join: last word of this line + first word of next line
|
||||||
|
prefix_words = prefix.rsplit(' ', 1)
|
||||||
|
next_words = next_line.split(' ', 1)
|
||||||
|
if len(prefix_words) > 1:
|
||||||
|
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||||||
|
else:
|
||||||
|
joined = prefix_words[0] + next_words[0]
|
||||||
|
remainder = next_words[1] if len(next_words) > 1 else ''
|
||||||
|
if remainder:
|
||||||
|
result.append(joined + ' ' + remainder)
|
||||||
|
else:
|
||||||
|
result.append(joined)
|
||||||
|
i += 2
|
||||||
|
else:
|
||||||
|
result.append(line)
|
||||||
|
i += 1
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
|
||||||
|
"""Join OCR words into text in correct reading order, preserving line breaks.
|
||||||
|
|
||||||
|
Groups words into visual lines by Y-tolerance, sorts each line by X,
|
||||||
|
rejoins hyphenated words, then joins lines with newlines.
|
||||||
|
"""
|
||||||
|
lines = _words_to_reading_order_lines(words, y_tolerance_px)
|
||||||
|
lines = _rejoin_hyphenated(lines)
|
||||||
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
|
||||||
@@ -2279,6 +2324,119 @@ def ocr_region_rapid(
|
|||||||
return words
|
return words
|
||||||
|
|
||||||
|
|
||||||
|
def _split_oversized_entries(
|
||||||
|
entries: List[Dict[str, Any]],
|
||||||
|
content_rows: List[RowGeometry],
|
||||||
|
img_w: int,
|
||||||
|
img_h: int,
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Split entries from oversized rows into multiple entries.
|
||||||
|
|
||||||
|
If a row is >1.5× the median height, it likely contains multiple vocabulary
|
||||||
|
entries that Step 4 failed to separate. We split based on line count:
|
||||||
|
if EN and DE have the same number of newline-separated lines, each line
|
||||||
|
becomes its own entry.
|
||||||
|
|
||||||
|
This is a deterministic plausibility check — no LLM needed.
|
||||||
|
"""
|
||||||
|
if len(entries) < 3:
|
||||||
|
return entries
|
||||||
|
|
||||||
|
# Calculate median row height from pixel heights
|
||||||
|
row_heights = [r.height for r in content_rows]
|
||||||
|
row_heights_sorted = sorted(row_heights)
|
||||||
|
median_h = row_heights_sorted[len(row_heights_sorted) // 2]
|
||||||
|
|
||||||
|
if median_h <= 0:
|
||||||
|
return entries
|
||||||
|
|
||||||
|
height_threshold = median_h * 1.5
|
||||||
|
result: List[Dict[str, Any]] = []
|
||||||
|
split_count = 0
|
||||||
|
|
||||||
|
for entry in entries:
|
||||||
|
# Get pixel height from bbox percent
|
||||||
|
entry_h_px = entry['bbox']['h'] / 100.0 * img_h
|
||||||
|
|
||||||
|
if entry_h_px <= height_threshold:
|
||||||
|
result.append(entry)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# This row is oversized — check if we can split
|
||||||
|
en_lines = entry['english'].split('\n') if entry['english'] else ['']
|
||||||
|
de_lines = entry['german'].split('\n') if entry['german'] else ['']
|
||||||
|
ex_lines = entry['example'].split('\n') if entry['example'] else ['']
|
||||||
|
|
||||||
|
# Filter empty lines
|
||||||
|
en_lines = [l for l in en_lines if l.strip()] or ['']
|
||||||
|
de_lines = [l for l in de_lines if l.strip()] or ['']
|
||||||
|
ex_lines = [l for l in ex_lines if l.strip()] or ['']
|
||||||
|
|
||||||
|
# Determine split count: EN and DE must agree (or one is empty)
|
||||||
|
n_en = len(en_lines)
|
||||||
|
n_de = len(de_lines)
|
||||||
|
n_ex = len(ex_lines)
|
||||||
|
|
||||||
|
can_split = False
|
||||||
|
n_split = 1
|
||||||
|
|
||||||
|
if n_en > 1 and n_de > 1 and n_en == n_de:
|
||||||
|
n_split = n_en
|
||||||
|
can_split = True
|
||||||
|
elif n_en > 1 and n_de <= 1:
|
||||||
|
# Only EN has multiple lines — still split, DE goes to first
|
||||||
|
n_split = n_en
|
||||||
|
can_split = True
|
||||||
|
elif n_de > 1 and n_en <= 1:
|
||||||
|
# Only DE has multiple lines
|
||||||
|
n_split = n_de
|
||||||
|
can_split = True
|
||||||
|
|
||||||
|
if not can_split or n_split <= 1:
|
||||||
|
result.append(entry)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Split into n_split sub-entries
|
||||||
|
orig_y = entry['bbox']['y']
|
||||||
|
orig_h = entry['bbox']['h']
|
||||||
|
sub_h = orig_h / n_split
|
||||||
|
|
||||||
|
for k in range(n_split):
|
||||||
|
sub_entry = {
|
||||||
|
'row_index': entry['row_index'],
|
||||||
|
'english': en_lines[k] if k < len(en_lines) else '',
|
||||||
|
'german': de_lines[k] if k < len(de_lines) else '',
|
||||||
|
'example': ex_lines[k] if k < len(ex_lines) else '',
|
||||||
|
'confidence': entry['confidence'],
|
||||||
|
'bbox': {
|
||||||
|
'x': entry['bbox']['x'],
|
||||||
|
'y': round(orig_y + k * sub_h, 2),
|
||||||
|
'w': entry['bbox']['w'],
|
||||||
|
'h': round(sub_h, 2),
|
||||||
|
},
|
||||||
|
'bbox_en': entry['bbox_en'],
|
||||||
|
'bbox_de': entry['bbox_de'],
|
||||||
|
'bbox_ex': entry['bbox_ex'],
|
||||||
|
'ocr_engine': entry.get('ocr_engine', ''),
|
||||||
|
'split_from_row': entry['row_index'],
|
||||||
|
}
|
||||||
|
result.append(sub_entry)
|
||||||
|
|
||||||
|
split_count += 1
|
||||||
|
logger.info(f"split_oversized: row {entry['row_index']} "
|
||||||
|
f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) "
|
||||||
|
f"→ {n_split} sub-entries")
|
||||||
|
|
||||||
|
if split_count > 0:
|
||||||
|
# Re-number row indices
|
||||||
|
for i, e in enumerate(result):
|
||||||
|
e['row_index'] = i
|
||||||
|
logger.info(f"split_oversized: {split_count} rows split, "
|
||||||
|
f"{len(entries)} → {len(result)} entries")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def build_word_grid(
|
def build_word_grid(
|
||||||
ocr_img: np.ndarray,
|
ocr_img: np.ndarray,
|
||||||
column_regions: List[PageRegion],
|
column_regions: List[PageRegion],
|
||||||
@@ -2433,6 +2591,9 @@ def build_word_grid(
|
|||||||
if entry['english'] or entry['german'] or entry['example']:
|
if entry['english'] or entry['german'] or entry['example']:
|
||||||
entries.append(entry)
|
entries.append(entry)
|
||||||
|
|
||||||
|
# --- Post-processing: split oversized rows ---
|
||||||
|
entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
|
||||||
|
|
||||||
logger.info(f"build_word_grid: {len(entries)} entries from "
|
logger.info(f"build_word_grid: {len(entries)} entries from "
|
||||||
f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
|
f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
|
||||||
f"(engine={engine_name})")
|
f"(engine={engine_name})")
|
||||||
|
|||||||
Reference in New Issue
Block a user