diff --git a/klausur-service/backend/cv_layout.py b/klausur-service/backend/cv_layout.py index 22ae93a..b9709b8 100644 --- a/klausur-service/backend/cv_layout.py +++ b/klausur-service/backend/cv_layout.py @@ -21,6 +21,7 @@ from cv_vocab_types import ( PageZone, RowGeometry, ) +from cv_ocr_engines import _group_words_into_lines # noqa: E402 logger = logging.getLogger(__name__) diff --git a/klausur-service/backend/cv_ocr_engines.py b/klausur-service/backend/cv_ocr_engines.py index 2f630c3..112599b 100644 --- a/klausur-service/backend/cv_ocr_engines.py +++ b/klausur-service/backend/cv_ocr_engines.py @@ -37,6 +37,32 @@ except ImportError: # Pipeline Step 5: Word Grid from Columns × Rows # ============================================================================= +def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: + """Group words by Y position into lines, sorted by X within each line.""" + if not words: + return [] + + sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) + lines: List[List[Dict]] = [] + current_line: List[Dict] = [sorted_words[0]] + current_y = sorted_words[0]['top'] + + for word in sorted_words[1:]: + if abs(word['top'] - current_y) <= y_tolerance_px: + current_line.append(word) + else: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + current_line = [word] + current_y = word['top'] + + if current_line: + current_line.sort(key=lambda w: w['left']) + lines.append(current_line) + + return lines + + def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]: """Group OCR words into visual lines in reading order. diff --git a/klausur-service/backend/cv_review.py b/klausur-service/backend/cv_review.py index b3e0bc6..ee7fc7a 100644 --- a/klausur-service/backend/cv_review.py +++ b/klausur-service/backend/cv_review.py @@ -33,6 +33,7 @@ from cv_layout import ( ) from cv_ocr_engines import ( _fix_character_confusion, + _group_words_into_lines, ) logger = logging.getLogger(__name__) @@ -227,32 +228,6 @@ def run_multi_pass_ocr(ocr_img: np.ndarray, # Stage 7: Line Alignment → Vocabulary Entries # ============================================================================= -def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]: - """Group words by Y position into lines, sorted by X within each line.""" - if not words: - return [] - - sorted_words = sorted(words, key=lambda w: (w['top'], w['left'])) - lines: List[List[Dict]] = [] - current_line: List[Dict] = [sorted_words[0]] - current_y = sorted_words[0]['top'] - - for word in sorted_words[1:]: - if abs(word['top'] - current_y) <= y_tolerance_px: - current_line.append(word) - else: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - current_line = [word] - current_y = word['top'] - - if current_line: - current_line.sort(key=lambda w: w['left']) - lines.append(current_line) - - return lines - - def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]], regions: List[PageRegion], y_tolerance_px: int = 25) -> List[VocabRow]: