diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx
index 5c2cb59..1d49d76 100644
--- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx
+++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx
@@ -5,6 +5,16 @@ import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/oc
const KLAUSUR_API = '/klausur-api'
+/** Render text with \n as line breaks */
+function MultilineText({ text }: { text: string }) {
+  if (!text) return <>—</>
+  const lines = text.split('\n')
+  if (lines.length === 1) return <>{text}</>
+  return <>{lines.map((line, i) => (
+    <span key={i}>{line}{i < lines.length - 1 && <br />}</span>
+  ))}</>
+}
+
interface StepWordRecognitionProps {
sessionId: string | null
onNext: () => void
@@ -318,13 +328,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
>
{idx + 1} |
- {entry.english || —}
+
|
- {entry.german || —}
+
|
-
- {entry.example || —}
+ |
+
|
{entry.confidence}%
@@ -428,30 +438,30 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
@@ -503,7 +513,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
'bg-gray-300 dark:bg-gray-600'
}`} />
- {entry.english || '—'} → {entry.german || '—'}
+ {(entry.english || '—').replace(/\n/g, ' ')} → {(entry.german || '—').replace(/\n/g, ' ')}
))}
diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py
index 8ed0b09..cc2c6e8 100644
--- a/klausur-service/backend/cv_vocab_pipeline.py
+++ b/klausur-service/backend/cv_vocab_pipeline.py
@@ -2173,20 +2173,65 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
-def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
- """Join OCR words into text in correct reading order.
+def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
+ """Group OCR words into visual lines in reading order.
- Groups words into visual lines by Y-tolerance, sorts each line by X,
- then joins lines with spaces. This fixes multi-line cell reading order.
+ Returns a list of line strings (one per visual line in the cell).
"""
if not words:
- return ''
+ return []
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
- line_texts = []
- for line in lines:
- line_texts.append(' '.join(w['text'] for w in line))
- return ' '.join(line_texts)
+ return [' '.join(w['text'] for w in line) for line in lines]
+
+
+def _rejoin_hyphenated(lines: List[str]) -> List[str]:
+ """Rejoin words split by line-break hyphenation.
+
+ E.g. ['Fuß-', 'boden'] → ['Fußboden']
+         ['some text-', 'thing here'] → ['some textthing here']
+ """
+ if len(lines) <= 1:
+ return lines
+
+ result = []
+ i = 0
+ while i < len(lines):
+ line = lines[i]
+ # If line ends with '-' and there's a next line, rejoin
+ if i + 1 < len(lines) and line.rstrip().endswith('-'):
+ stripped = line.rstrip()
+ # Get the word fragment before hyphen (last word)
+ prefix = stripped[:-1] # remove trailing hyphen
+ next_line = lines[i + 1]
+ # Join: last word of this line + first word of next line
+ prefix_words = prefix.rsplit(' ', 1)
+ next_words = next_line.split(' ', 1)
+ if len(prefix_words) > 1:
+ joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
+ else:
+ joined = prefix_words[0] + next_words[0]
+ remainder = next_words[1] if len(next_words) > 1 else ''
+ if remainder:
+ result.append(joined + ' ' + remainder)
+ else:
+ result.append(joined)
+ i += 2
+ else:
+ result.append(line)
+ i += 1
+ return result
+
+
+def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
+ """Join OCR words into text in correct reading order, preserving line breaks.
+
+ Groups words into visual lines by Y-tolerance, sorts each line by X,
+ rejoins hyphenated words, then joins lines with newlines.
+ """
+ lines = _words_to_reading_order_lines(words, y_tolerance_px)
+ lines = _rejoin_hyphenated(lines)
+ return '\n'.join(lines)
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
@@ -2279,6 +2324,119 @@ def ocr_region_rapid(
return words
+def _split_oversized_entries(
+ entries: List[Dict[str, Any]],
+ content_rows: List[RowGeometry],
+ img_w: int,
+ img_h: int,
+) -> List[Dict[str, Any]]:
+ """Split entries from oversized rows into multiple entries.
+
+ If a row is >1.5× the median height, it likely contains multiple vocabulary
+ entries that Step 4 failed to separate. We split based on line count:
+ if EN and DE have the same number of newline-separated lines, each line
+ becomes its own entry.
+
+ This is a deterministic plausibility check — no LLM needed.
+ """
+ if len(entries) < 3:
+ return entries
+
+ # Calculate median row height from pixel heights
+ row_heights = [r.height for r in content_rows]
+ row_heights_sorted = sorted(row_heights)
+ median_h = row_heights_sorted[len(row_heights_sorted) // 2]
+
+ if median_h <= 0:
+ return entries
+
+ height_threshold = median_h * 1.5
+ result: List[Dict[str, Any]] = []
+ split_count = 0
+
+ for entry in entries:
+ # Get pixel height from bbox percent
+ entry_h_px = entry['bbox']['h'] / 100.0 * img_h
+
+ if entry_h_px <= height_threshold:
+ result.append(entry)
+ continue
+
+ # This row is oversized — check if we can split
+ en_lines = entry['english'].split('\n') if entry['english'] else ['']
+ de_lines = entry['german'].split('\n') if entry['german'] else ['']
+ ex_lines = entry['example'].split('\n') if entry['example'] else ['']
+
+ # Filter empty lines
+ en_lines = [l for l in en_lines if l.strip()] or ['']
+ de_lines = [l for l in de_lines if l.strip()] or ['']
+ ex_lines = [l for l in ex_lines if l.strip()] or ['']
+
+ # Determine split count: EN and DE must agree (or one is empty)
+ n_en = len(en_lines)
+ n_de = len(de_lines)
+ n_ex = len(ex_lines)
+
+ can_split = False
+ n_split = 1
+
+ if n_en > 1 and n_de > 1 and n_en == n_de:
+ n_split = n_en
+ can_split = True
+ elif n_en > 1 and n_de <= 1:
+ # Only EN has multiple lines — still split, DE goes to first
+ n_split = n_en
+ can_split = True
+ elif n_de > 1 and n_en <= 1:
+ # Only DE has multiple lines
+ n_split = n_de
+ can_split = True
+
+ if not can_split or n_split <= 1:
+ result.append(entry)
+ continue
+
+ # Split into n_split sub-entries
+ orig_y = entry['bbox']['y']
+ orig_h = entry['bbox']['h']
+ sub_h = orig_h / n_split
+
+ for k in range(n_split):
+ sub_entry = {
+ 'row_index': entry['row_index'],
+ 'english': en_lines[k] if k < len(en_lines) else '',
+ 'german': de_lines[k] if k < len(de_lines) else '',
+ 'example': ex_lines[k] if k < len(ex_lines) else '',
+ 'confidence': entry['confidence'],
+ 'bbox': {
+ 'x': entry['bbox']['x'],
+ 'y': round(orig_y + k * sub_h, 2),
+ 'w': entry['bbox']['w'],
+ 'h': round(sub_h, 2),
+ },
+ 'bbox_en': entry['bbox_en'],
+ 'bbox_de': entry['bbox_de'],
+ 'bbox_ex': entry['bbox_ex'],
+ 'ocr_engine': entry.get('ocr_engine', ''),
+ 'split_from_row': entry['row_index'],
+ }
+ result.append(sub_entry)
+
+ split_count += 1
+ logger.info(f"split_oversized: row {entry['row_index']} "
+ f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) "
+ f"→ {n_split} sub-entries")
+
+ if split_count > 0:
+ # Re-number row indices
+ for i, e in enumerate(result):
+ e['row_index'] = i
+ logger.info(f"split_oversized: {split_count} rows split, "
+ f"{len(entries)} → {len(result)} entries")
+
+ return result
+
+
def build_word_grid(
ocr_img: np.ndarray,
column_regions: List[PageRegion],
@@ -2433,6 +2591,9 @@ def build_word_grid(
if entry['english'] or entry['german'] or entry['example']:
entries.append(entry)
+ # --- Post-processing: split oversized rows ---
+ entries = _split_oversized_entries(entries, content_rows, img_w, img_h)
+
logger.info(f"build_word_grid: {len(entries)} entries from "
f"{len(content_rows)} content rows × {len(relevant_cols)} columns "
f"(engine={engine_name})")
|