diff --git a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx index 5c2cb59..1d49d76 100644 --- a/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx +++ b/admin-lehrer/components/ocr-pipeline/StepWordRecognition.tsx @@ -5,6 +5,16 @@ import type { WordResult, WordEntry, WordGroundTruth } from '@/app/(admin)/ai/oc const KLAUSUR_API = '/klausur-api' +/** Render text with \n as line breaks */ +function MultilineText({ text }: { text: string }) { + if (!text) return + const lines = text.split('\n') + if (lines.length === 1) return <>{text} + return <>{lines.map((line, i) => ( + {line}{i < lines.length - 1 &&
}
+ ))} +} + interface StepWordRecognitionProps { sessionId: string | null onNext: () => void @@ -318,13 +328,13 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec > {idx + 1} - {entry.english || } + - {entry.german || } + - - {entry.example || } + + {entry.confidence}% @@ -428,30 +438,30 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec
- updateEntry(activeIndex, 'english', e.target.value)} - className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none" />
- updateEntry(activeIndex, 'german', e.target.value)} - className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none" />
- updateEntry(activeIndex, 'example', e.target.value)} - className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono" + className="w-full px-2 py-1.5 text-sm border rounded dark:bg-gray-700 dark:border-gray-600 font-mono resize-none" />
@@ -503,7 +513,7 @@ export function StepWordRecognition({ sessionId, onNext, goToStep }: StepWordRec 'bg-gray-300 dark:bg-gray-600' }`} /> - {entry.english || '—'} → {entry.german || '—'} + {(entry.english || '—').replace(/\n/g, ' ')} → {(entry.german || '—').replace(/\n/g, ' ')} ))} diff --git a/klausur-service/backend/cv_vocab_pipeline.py b/klausur-service/backend/cv_vocab_pipeline.py index 8ed0b09..cc2c6e8 100644 --- a/klausur-service/backend/cv_vocab_pipeline.py +++ b/klausur-service/backend/cv_vocab_pipeline.py @@ -2173,20 +2173,65 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li # Pipeline Step 5: Word Grid from Columns × Rows # ============================================================================= -def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: - """Join OCR words into text in correct reading order. +def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]: + """Group OCR words into visual lines in reading order. - Groups words into visual lines by Y-tolerance, sorts each line by X, - then joins lines with spaces. This fixes multi-line cell reading order. + Returns a list of line strings (one per visual line in the cell). """ if not words: - return '' + return [] lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px) - line_texts = [] - for line in lines: - line_texts.append(' '.join(w['text'] for w in line)) - return ' '.join(line_texts) + return [' '.join(w['text'] for w in line) for line in lines] + + +def _rejoin_hyphenated(lines: List[str]) -> List[str]: + """Rejoin words split by line-break hyphenation. + + E.g. 
['Fuß-', 'boden'] → ['Fußboden'] + ['some-', 'thing here'] → ['something here'] + """ + if len(lines) <= 1: + return lines + + result = [] + i = 0 + while i < len(lines): + line = lines[i] + # If line ends with '-' and there's a next line, rejoin + if i + 1 < len(lines) and line.rstrip().endswith('-'): + stripped = line.rstrip() + # Get the word fragment before hyphen (last word) + prefix = stripped[:-1] # remove trailing hyphen + next_line = lines[i + 1] + # Join: last word of this line + first word of next line + prefix_words = prefix.rsplit(' ', 1) + next_words = next_line.split(' ', 1) + if len(prefix_words) > 1: + joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] + else: + joined = prefix_words[0] + next_words[0] + remainder = next_words[1] if len(next_words) > 1 else '' + if remainder: + result.append(joined + ' ' + remainder) + else: + result.append(joined) + i += 2 + else: + result.append(line) + i += 1 + return result + + +def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: + """Join OCR words into text in correct reading order, preserving line breaks. + + Groups words into visual lines by Y-tolerance, sorts each line by X, + rejoins hyphenated words, then joins lines with newlines. + """ + lines = _words_to_reading_order_lines(words, y_tolerance_px) + lines = _rejoin_hyphenated(lines) + return '\n'.join(lines) # --- RapidOCR integration (PaddleOCR models on ONNX Runtime) --- @@ -2279,6 +2324,119 @@ def ocr_region_rapid( return words + +def _split_oversized_entries( + entries: List[Dict[str, Any]], + content_rows: List[RowGeometry], + img_w: int, + img_h: int, +) -> List[Dict[str, Any]]: + """Split entries from oversized rows into multiple entries. + + If a row is >1.5× the median height, it likely contains multiple vocabulary + entries that Step 4 failed to separate. We split based on line count: + if EN and DE have the same number of newline-separated lines, each line + becomes its own entry. 
+ + This is a deterministic plausibility check — no LLM needed. + """ + if len(entries) < 3: + return entries + + # Calculate median row height from pixel heights + row_heights = [r.height for r in content_rows] + row_heights_sorted = sorted(row_heights) + median_h = row_heights_sorted[len(row_heights_sorted) // 2] + + if median_h <= 0: + return entries + + height_threshold = median_h * 1.5 + result: List[Dict[str, Any]] = [] + split_count = 0 + + for entry in entries: + # Get pixel height from bbox percent + entry_h_px = entry['bbox']['h'] / 100.0 * img_h + + if entry_h_px <= height_threshold: + result.append(entry) + continue + + # This row is oversized — check if we can split + en_lines = entry['english'].split('\n') if entry['english'] else [''] + de_lines = entry['german'].split('\n') if entry['german'] else [''] + ex_lines = entry['example'].split('\n') if entry['example'] else [''] + + # Filter empty lines + en_lines = [l for l in en_lines if l.strip()] or [''] + de_lines = [l for l in de_lines if l.strip()] or [''] + ex_lines = [l for l in ex_lines if l.strip()] or [''] + + # Determine split count: EN and DE must agree (or one is empty) + n_en = len(en_lines) + n_de = len(de_lines) + n_ex = len(ex_lines) + + can_split = False + n_split = 1 + + if n_en > 1 and n_de > 1 and n_en == n_de: + n_split = n_en + can_split = True + elif n_en > 1 and n_de <= 1: + # Only EN has multiple lines — still split, DE goes to first + n_split = n_en + can_split = True + elif n_de > 1 and n_en <= 1: + # Only DE has multiple lines + n_split = n_de + can_split = True + + if not can_split or n_split <= 1: + result.append(entry) + continue + + # Split into n_split sub-entries + orig_y = entry['bbox']['y'] + orig_h = entry['bbox']['h'] + sub_h = orig_h / n_split + + for k in range(n_split): + sub_entry = { + 'row_index': entry['row_index'], + 'english': en_lines[k] if k < len(en_lines) else '', + 'german': de_lines[k] if k < len(de_lines) else '', + 'example': ex_lines[k] if k < 
len(ex_lines) else '', + 'confidence': entry['confidence'], + 'bbox': { + 'x': entry['bbox']['x'], + 'y': round(orig_y + k * sub_h, 2), + 'w': entry['bbox']['w'], + 'h': round(sub_h, 2), + }, + 'bbox_en': entry['bbox_en'], + 'bbox_de': entry['bbox_de'], + 'bbox_ex': entry['bbox_ex'], + 'ocr_engine': entry.get('ocr_engine', ''), + 'split_from_row': entry['row_index'], + } + result.append(sub_entry) + + split_count += 1 + logger.info(f"split_oversized: row {entry['row_index']} " + f"(h={entry_h_px:.0f}px > {height_threshold:.0f}px) " + f"→ {n_split} sub-entries") + + if split_count > 0: + # Re-number row indices + for i, e in enumerate(result): + e['row_index'] = i + logger.info(f"split_oversized: {split_count} rows split, " + f"{len(entries)} → {len(result)} entries") + + return result + + def build_word_grid( ocr_img: np.ndarray, column_regions: List[PageRegion], @@ -2433,6 +2591,9 @@ def build_word_grid( if entry['english'] or entry['german'] or entry['example']: entries.append(entry) + # --- Post-processing: split oversized rows --- + entries = _split_oversized_entries(entries, content_rows, img_w, img_h) + logger.info(f"build_word_grid: {len(entries)} entries from " f"{len(content_rows)} content rows × {len(relevant_cols)} columns " f"(engine={engine_name})")