Files
breakpilot-lehrer/klausur-service/backend/ocr/engines/word_assembly.py
Benjamin Admin 45287b3541 Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable.
Added /parent (Eltern-Portal) link with people icon.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 20:49:44 +02:00

135 lines
4.7 KiB
Python

"""
Word assembly helpers for OCR output.
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text. All functions are pure standard-library; no NumPy or project
imports required.
"""
import logging
from typing import Dict, List
logger = logging.getLogger(__name__)
def _group_words_into_lines(
    words: List[Dict], y_tolerance_px: int = 20
) -> List[List[Dict]]:
    """Cluster OCR word dicts into visual lines.

    Words are visited in ascending ``(top, left)`` order. Each line is
    anchored on the ``top`` of the first word placed in it; a word joins the
    current line while its ``top`` is within ``y_tolerance_px`` of that
    anchor (inclusive), otherwise it opens a new line and becomes the new
    anchor.

    Args:
        words: Word dicts; each must provide ``'top'`` and ``'left'``.
        y_tolerance_px: Maximum absolute difference between a word's ``top``
            and the line anchor for the word to stay on that line.

    Returns:
        Lines in top-to-bottom order, each a list of word dicts sorted by
        ``'left'``. An empty input yields an empty list.
    """
    if not words:
        return []

    def left_edge(entry: Dict):
        return entry['left']

    buckets: List[List[Dict]] = []
    anchor_top = None
    for entry in sorted(words, key=lambda e: (e['top'], e['left'])):
        # The anchor stays fixed on the first word of the line; it does not
        # drift as further words are appended.
        if buckets and abs(entry['top'] - anchor_top) <= y_tolerance_px:
            buckets[-1].append(entry)
        else:
            buckets.append([entry])
            anchor_top = entry['top']

    # Within a line the reading order is left-to-right, regardless of the
    # small vertical offsets that decided the visiting order above.
    for bucket in buckets:
        bucket.sort(key=left_edge)
    return buckets
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Turn OCR words into one text string per visual line.

    The words are clustered with ``_group_words_into_lines`` using
    ``y_tolerance_px``; the ``'text'`` values of each cluster are then joined
    with single spaces.

    Args:
        words: Word dicts providing ``'top'``, ``'left'`` and ``'text'``.
        y_tolerance_px: Vertical tolerance forwarded to the line grouping.

    Returns:
        One string per visual line, in the order the grouping returns them.
        An empty input yields an empty list.
    """
    if not words:
        return []

    rendered: List[str] = []
    for visual_line in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        texts = [entry['text'] for entry in visual_line]
        rendered.append(' '.join(texts))
    return rendered
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
    """Rejoin words that were split by line-break hyphenation.

    A line whose right-stripped text ends with ``'-'`` is merged with the
    line that follows it: the hyphen is dropped and the first space-delimited
    token of the next line is glued directly onto the end of the current
    line; the rest of the next line follows after a single space. The merged
    line is checked again, so a word hyphenated across three or more lines is
    reassembled completely.

    Examples::

        ['Fuß-', 'boden']               -> ['Fußboden']
        ['some-', 'thing here']         -> ['something here']
        ['some text-', 'thing here']    -> ['some textthing here']
        ['Don-', 'au-', 'dampf schiff'] -> ['Donaudampf schiff']

    Note that every trailing ``'-'`` is treated as a hyphenation mark,
    including a free-standing dash at the end of a line. A hyphen on the very
    last line is left untouched because there is nothing to join it with.

    Args:
        lines: Text lines in reading order.

    Returns:
        The lines with hyphenated line breaks merged. When ``lines`` has at
        most one element it is returned unchanged (same object).
    """
    if len(lines) <= 1:
        return lines

    result: List[str] = []
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i]
        i += 1
        # Keep absorbing following lines for as long as the accumulated line
        # still ends in a hyphen; this is what handles chained hyphenation.
        while i < total and line.rstrip().endswith('-'):
            prefix = line.rstrip()[:-1]  # drop the trailing hyphen
            head, _, remainder = lines[i].partition(' ')
            line = prefix + head
            if remainder:
                line += ' ' + remainder
            i += 1
        result.append(line)
    return result
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as multi-line text in reading order.

    The words are converted to one string per visual line via
    ``_words_to_reading_order_lines`` (grouping by vertical position with
    ``y_tolerance_px``), line-break hyphenation is undone with
    ``_rejoin_hyphenated``, and the remaining lines are joined with newline
    characters.

    Args:
        words: OCR word dicts as accepted by ``_words_to_reading_order_lines``.
        y_tolerance_px: Vertical tolerance used when grouping words into lines.

    Returns:
        The assembled text, one visual line per text line.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words while preserving proportional horizontal spacing.

    Words are grouped into visual lines with ``_group_words_into_lines``.
    Within each line, the gap between two neighbouring words is converted
    into a run of spaces: the gap (next word's ``left`` minus this word's
    right edge) is divided by the line's average character width and
    rounded, with a minimum of one space. Overlapping words (negative gap)
    therefore still get a single space.

    The average character width of a line is the summed ``width`` of its
    words that have text divided by their summed text length. When that
    cannot be computed as a positive number (no text on the line, or a total
    width of zero or less) a fallback of 10 is used, which also avoids a
    division by zero.

    Args:
        words: Word dicts providing ``'top'``, ``'left'``, ``'width'`` and
            ``'text'``. A missing or ``None`` text is rendered as an empty
            string.
        y_tolerance_px: Vertical tolerance forwarded to the line grouping.

    Returns:
        The visual lines joined with newline characters; an empty string for
        empty input.
    """
    if not words:
        return ''

    # Used whenever a line gives no usable width/character ratio.
    fallback_char_width = 10

    result_lines = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Only words that actually carry text contribute to the average.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        if total_chars > 0 and total_width > 0:
            avg_char_width = total_width / total_chars
        else:
            # Guards the division below against a zero (or negative) width.
            avg_char_width = fallback_char_width

        parts = []
        last_index = len(sorted_words) - 1
        for i, word in enumerate(sorted_words):
            # `or ''` also covers a 'text' key that is present but None.
            parts.append(word.get('text') or '')
            if i < last_index:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)
        result_lines.append(''.join(parts))
    return '\n'.join(result_lines)