""" Word assembly helpers for OCR output. Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys) into visual lines, rejoins hyphenated words, and produces reading-order text. All functions are pure standard-library; no NumPy or project imports required. """ import logging from typing import Dict, List logger = logging.getLogger(__name__) def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: """Group words by Y position into lines, sorted by X within each line.""" if not words: return [] sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) lines: List[List[Dict]] = [] current_line: List[Dict] = [sorted_words[0]] current_y = sorted_words[0]['top'] for word in sorted_words[1:]: if abs(word['top'] - current_y) <= y_tolerance_px: current_line.append(word) else: current_line.sort(key=lambda w: w['left']) lines.append(current_line) current_line = [word] current_y = word['top'] if current_line: current_line.sort(key=lambda w: w['left']) lines.append(current_line) return lines def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]: """Group OCR words into visual lines in reading order. Returns a list of line strings (one per visual line in the cell). """ if not words: return [] lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px) return [' '.join(w['text'] for w in line) for line in lines] def _rejoin_hyphenated(lines: List[str]) -> List[str]: """Rejoin words split by line-break hyphenation. E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden'] ['some text-', 'thing here'] \u2192 ['something here'] """ if len(lines) <= 1: return lines result = [] i = 0 while i < len(lines): line = lines[i] # If line ends with '-' and there's a next line, rejoin if i + 1 < len(lines) and line.rstrip().endswith('-'): stripped = line.rstrip() # Get the word fragment before hyphen (last word) prefix = stripped[:-1] # remove trailing hyphen next_line = lines[i + 1] # Join: last word of this line + first word of next line prefix_words = prefix.rsplit(' ', 1) next_words = next_line.split(' ', 1) if len(prefix_words) > 1: joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0] else: joined = prefix_words[0] + next_words[0] remainder = next_words[1] if len(next_words) > 1 else '' if remainder: result.append(joined + ' ' + remainder) else: result.append(joined) i += 2 else: result.append(line) i += 1 return result def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str: """Join OCR words into text in correct reading order, preserving line breaks. Groups words into visual lines by Y-tolerance, sorts each line by X, rejoins hyphenated words, then joins lines with newlines. """ lines = _words_to_reading_order_lines(words, y_tolerance_px) lines = _rejoin_hyphenated(lines) return '\n'.join(lines) def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str: """Join OCR words preserving proportional horizontal spacing. Instead of single spaces between words, inserts multiple spaces based on the pixel gap between words relative to average character width. Useful for box sub-sessions where spatial layout matters. """ lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px) result_lines = [] for line_words in lines: if not line_words: continue sorted_words = sorted(line_words, key=lambda w: w['left']) # Calculate average character width from all words in line total_chars = sum(len(w['text']) for w in sorted_words if w.get('text')) total_width = sum(w['width'] for w in sorted_words if w.get('text')) avg_char_width = total_width / total_chars if total_chars > 0 else 10 parts = [] for i, word in enumerate(sorted_words): parts.append(word.get('text', '')) if i < len(sorted_words) - 1: next_word = sorted_words[i + 1] gap_px = next_word['left'] - (word['left'] + word['width']) num_spaces = max(1, round(gap_px / avg_char_width)) parts.append(' ' * num_spaces) result_lines.append(''.join(parts)) return '\n'.join(result_lines)