# breakpilot-lehrer/klausur-service/backend/cv_ocr_word_assembly.py

"""
Word assembly helpers for OCR output.

Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text.  All functions are pure standard-library; no NumPy or project
imports required.
"""

import logging
from typing import Dict, List

logger = logging.getLogger(__name__)


def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
    """Group words by Y position into lines, sorted by X within each line."""
    if not words:
        return []

    sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
    lines: List[List[Dict]] = []
    current_line: List[Dict] = [sorted_words[0]]
    current_y = sorted_words[0]['top']

    for word in sorted_words[1:]:
        if abs(word['top'] - current_y) <= y_tolerance_px:
            current_line.append(word)
        else:
            current_line.sort(key=lambda w: w['left'])
            lines.append(current_line)
            current_line = [word]
            current_y = word['top']

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        lines.append(current_line)

    return lines


def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Render OCR words as one string per visual line.

    The words are grouped into lines by ``_group_words_into_lines`` using
    ``y_tolerance_px``; the 'text' values of each line are then joined
    with single spaces.

    Returns:
        The line strings in the order the grouping helper produced them,
        or an empty list when ``words`` is empty.
    """
    if not words:
        return []

    rendered: List[str] = []
    for visual_line in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px):
        rendered.append(' '.join(entry['text'] for entry in visual_line))
    return rendered


def _rejoin_hyphenated(lines: List[str]) -> List[str]:
    """Rejoin words split by line-break hyphenation.

    E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
         ['some text-', 'thing here'] \u2192 ['something here']
    """
    if len(lines) <= 1:
        return lines

    result = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # If line ends with '-' and there's a next line, rejoin
        if i + 1 < len(lines) and line.rstrip().endswith('-'):
            stripped = line.rstrip()
            # Get the word fragment before hyphen (last word)
            prefix = stripped[:-1]  # remove trailing hyphen
            next_line = lines[i + 1]
            # Join: last word of this line + first word of next line
            prefix_words = prefix.rsplit(' ', 1)
            next_words = next_line.split(' ', 1)
            if len(prefix_words) > 1:
                joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
            else:
                joined = prefix_words[0] + next_words[0]
            remainder = next_words[1] if len(next_words) > 1 else ''
            if remainder:
                result.append(joined + ' ' + remainder)
            else:
                result.append(joined)
            i += 2
        else:
            result.append(line)
            i += 1
    return result


def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Produce newline-separated text from OCR words in reading order.

    Builds the per-line strings with ``_words_to_reading_order_lines``
    (passing ``y_tolerance_px`` through), merges hyphen-split lines via
    ``_rejoin_hyphenated`` and joins the resulting lines with '\\n'.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))


def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based on
    the pixel gap between words relative to the line's average character
    width.  Useful for box sub-sessions where spatial layout matters.

    Args:
        words: Word dicts with 'top', 'left' and 'width'; 'text' may be
            missing, empty or None and is then rendered as an empty string.
        y_tolerance_px: Vertical tolerance handed to
            ``_group_words_into_lines``.

    Returns:
        The lines joined with '\\n'.  Adjacent words are always separated
        by at least one space, even when their boxes touch or overlap.
    """
    # Used when a line offers no usable width information: no characters
    # at all, or a non-positive total width that would otherwise cause a
    # division by zero below.
    fallback_char_width = 10

    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []

    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Average character width over the words that actually carry text.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = total_width / total_chars if total_chars > 0 else fallback_char_width
        if avg_char_width <= 0:
            avg_char_width = fallback_char_width

        parts = []
        for i, word in enumerate(sorted_words):
            # `or ''` also covers an explicit None value, which would
            # otherwise break the final join.
            parts.append(word.get('text') or '')
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)