Files
breakpilot-lehrer/klausur-service/backend/cv_ocr_word_assembly.py
Benjamin Admin 9ba420fa91
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Fix: Remove broken getKlausurApiUrl and clean up empty lines
sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 16:02:04 +02:00

135 lines
4.7 KiB
Python

"""
Word assembly helpers for OCR output.
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text. All functions are pure standard-library; no NumPy or project
imports required.
"""
import logging
from typing import Dict, List
logger = logging.getLogger(__name__)
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
"""Group words by Y position into lines, sorted by X within each line."""
if not words:
return []
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[Dict]] = []
current_line: List[Dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Turn OCR word dicts into one text string per visual line.

    The words are grouped with ``_group_words_into_lines`` and the 'text'
    values of each group are joined with single spaces.

    Args:
        words: Word dicts; each must provide 'top', 'left' and 'text'.
        y_tolerance_px: Vertical grouping tolerance forwarded to
            ``_group_words_into_lines``.

    Returns:
        One string per visual line, in the order the grouping produced them.
        An empty input yields an empty list.
    """
    if not words:
        return []

    text_lines: List[str] = []
    for row in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        text_lines.append(' '.join(entry['text'] for entry in row))
    return text_lines
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
"""Rejoin words split by line-break hyphenation.
E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
['some text-', 'thing here'] \u2192 ['something here']
"""
if len(lines) <= 1:
return lines
result = []
i = 0
while i < len(lines):
line = lines[i]
# If line ends with '-' and there's a next line, rejoin
if i + 1 < len(lines) and line.rstrip().endswith('-'):
stripped = line.rstrip()
# Get the word fragment before hyphen (last word)
prefix = stripped[:-1] # remove trailing hyphen
next_line = lines[i + 1]
# Join: last word of this line + first word of next line
prefix_words = prefix.rsplit(' ', 1)
next_words = next_line.split(' ', 1)
if len(prefix_words) > 1:
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
else:
joined = prefix_words[0] + next_words[0]
remainder = next_words[1] if len(next_words) > 1 else ''
if remainder:
result.append(joined + ' ' + remainder)
else:
result.append(joined)
i += 2
else:
result.append(line)
i += 1
return result
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR word dicts as newline-separated text in reading order.

    Builds the per-line strings with ``_words_to_reading_order_lines``,
    passes them through ``_rejoin_hyphenated``, and joins the resulting
    lines with a newline character.

    Args:
        words: Word dicts as accepted by ``_words_to_reading_order_lines``.
        y_tolerance_px: Vertical grouping tolerance forwarded to
            ``_words_to_reading_order_lines``.

    Returns:
        The assembled text; an empty string when there are no lines.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based on
    the pixel gap between neighbouring words relative to the average
    character width of the line. Useful where spatial layout matters.

    The average character width of a line is the summed 'width' of its
    words that have non-empty 'text', divided by the summed length of those
    texts. When that average cannot be computed or is not positive (no
    text, or all widths zero), a fallback of 10 is used so the gap division
    never divides by zero.

    Args:
        words: Word dicts; each must provide 'top', 'left' and 'width'.
            A missing 'text' is rendered as an empty string.
        y_tolerance_px: Vertical grouping tolerance forwarded to
            ``_group_words_into_lines``.

    Returns:
        One output line per visual line, joined with newlines. Between two
        neighbouring words at least one space is always emitted, also when
        the words overlap or touch.
    """
    # Width assumed per character when a line offers nothing to measure.
    fallback_char_width = 10

    result_lines = []
    for line_words in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Only words that actually carry text contribute to the average.
        with_text = [w for w in sorted_words if w.get('text')]
        total_chars = sum(len(w['text']) for w in with_text)
        total_width = sum(w['width'] for w in with_text)
        avg_char_width = total_width / total_chars if total_chars > 0 else fallback_char_width
        if avg_char_width <= 0:
            # Zero-width boxes would otherwise make the division below fail.
            avg_char_width = fallback_char_width

        parts = []
        for word, next_word in zip(sorted_words, sorted_words[1:]):
            parts.append(word.get('text', ''))
            gap_px = next_word['left'] - (word['left'] + word['width'])
            # Overlapping or touching boxes still get one separating space.
            num_spaces = max(1, round(gap_px / avg_char_width))
            parts.append(' ' * num_spaces)
        parts.append(sorted_words[-1].get('text', ''))
        result_lines.append(''.join(parts))
    return '\n'.join(result_lines)