Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
134
klausur-service/backend/cv_ocr_word_assembly.py
Normal file
134
klausur-service/backend/cv_ocr_word_assembly.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Word assembly helpers for OCR output.
|
||||
|
||||
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
|
||||
into visual lines, rejoins hyphenated words, and produces reading-order
|
||||
text. All functions are pure standard-library; no NumPy or project
|
||||
imports required.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||
"""Group words by Y position into lines, sorted by X within each line."""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||
lines: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [sorted_words[0]]
|
||||
current_y = sorted_words[0]['top']
|
||||
|
||||
for word in sorted_words[1:]:
|
||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||
current_line.append(word)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
current_line = [word]
|
||||
current_y = word['top']
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns one string per visual line in the cell; words within a line
    are separated by single spaces.
    """
    if not words:
        return []

    grouped = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result = []
    for line in grouped:
        result.append(' '.join(word['text'] for word in line))
    return result
|
||||
|
||||
|
||||
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||||
"""Rejoin words split by line-break hyphenation.
|
||||
|
||||
E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
|
||||
['some text-', 'thing here'] \u2192 ['something here']
|
||||
"""
|
||||
if len(lines) <= 1:
|
||||
return lines
|
||||
|
||||
result = []
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
# If line ends with '-' and there's a next line, rejoin
|
||||
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||||
stripped = line.rstrip()
|
||||
# Get the word fragment before hyphen (last word)
|
||||
prefix = stripped[:-1] # remove trailing hyphen
|
||||
next_line = lines[i + 1]
|
||||
# Join: last word of this line + first word of next line
|
||||
prefix_words = prefix.rsplit(' ', 1)
|
||||
next_words = next_line.split(' ', 1)
|
||||
if len(prefix_words) > 1:
|
||||
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||||
else:
|
||||
joined = prefix_words[0] + next_words[0]
|
||||
remainder = next_words[1] if len(next_words) > 1 else ''
|
||||
if remainder:
|
||||
result.append(joined + ' ' + remainder)
|
||||
else:
|
||||
result.append(joined)
|
||||
i += 2
|
||||
else:
|
||||
result.append(line)
|
||||
i += 1
|
||||
return result
|
||||
|
||||
|
||||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR words as newline-separated text in reading order.

    Words are clustered into visual lines by Y tolerance, each line is
    ordered left-to-right, hyphenated line breaks are rejoined, and the
    resulting lines are joined with newlines.
    """
    assembled = _rejoin_hyphenated(
        _words_to_reading_order_lines(words, y_tolerance_px)
    )
    return '\n'.join(assembled)
|
||||
|
||||
|
||||
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based
    on the pixel gap between adjacent words relative to the average
    character width of the line. Useful for box sub-sections where the
    spatial layout matters.

    Args:
        words: OCR word dicts with 'top', 'left', 'width' and 'text' keys.
        y_tolerance_px: vertical tolerance used to group words into lines.

    Returns:
        Newline-joined text with gap-proportional spacing inside each line.
    """
    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []

    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Average character width, computed over the words carrying text.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = total_width / total_chars if total_chars > 0 else 10
        # Fix: zero-width OCR boxes (total_width == 0 with text present)
        # previously caused a ZeroDivisionError in the gap conversion below;
        # fall back to the same default as the no-text case.
        if avg_char_width <= 0:
            avg_char_width = 10

        parts = []
        for i, word in enumerate(sorted_words):
            parts.append(word.get('text', ''))
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                # At least one space, even for touching/overlapping boxes.
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
|
||||
Reference in New Issue
Block a user