fix: _group_words_into_lines nach cv_ocr_engines.py verschieben

Die Funktion war nur in cv_review.py definiert, wurde aber auch in cv_ocr_engines.py und cv_layout.py benutzt, was dort zur Laufzeit zu einem NameError führte.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:24:56 +01:00
parent 60c4138660
commit cf9dde9876
3 changed files with 28 additions and 26 deletions
@@ -21,6 +21,7 @@ from cv_vocab_types import (
    PageZone,
    RowGeometry,
 )
+from cv_ocr_engines import _group_words_into_lines  # noqa: E402

 logger = logging.getLogger(__name__)

@@ -37,6 +37,32 @@ except ImportError:
 # Pipeline Step 5: Word Grid from Columns × Rows
 # =============================================================================

+def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
+    """Group words by Y position into lines, sorted by X within each line."""
+    if not words:
+        return []
+
+    sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
+    lines: List[List[Dict]] = []
+    current_line: List[Dict] = [sorted_words[0]]
+    current_y = sorted_words[0]['top']
+
+    for word in sorted_words[1:]:
+        if abs(word['top'] - current_y) <= y_tolerance_px:
+            current_line.append(word)
+        else:
+            current_line.sort(key=lambda w: w['left'])
+            lines.append(current_line)
+            current_line = [word]
+            current_y = word['top']
+
+    if current_line:
+        current_line.sort(key=lambda w: w['left'])
+        lines.append(current_line)
+
+    return lines
+
+
 def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

@@ -33,6 +33,7 @@ from cv_layout import (
 )
 from cv_ocr_engines import (
    _fix_character_confusion,
+    _group_words_into_lines,
 )

 logger = logging.getLogger(__name__)
@@ -227,32 +228,6 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
 # Stage 7: Line Alignment → Vocabulary Entries
 # =============================================================================

-def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
-    """Group words by Y position into lines, sorted by X within each line."""
-    if not words:
-        return []
-
-    sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
-    lines: List[List[Dict]] = []
-    current_line: List[Dict] = [sorted_words[0]]
-    current_y = sorted_words[0]['top']
-
-    for word in sorted_words[1:]:
-        if abs(word['top'] - current_y) <= y_tolerance_px:
-            current_line.append(word)
-        else:
-            current_line.sort(key=lambda w: w['left'])
-            lines.append(current_line)
-            current_line = [word]
-            current_y = word['top']
-
-    if current_line:
-        current_line.sort(key=lambda w: w['left'])
-        lines.append(current_line)
-
-    return lines
-
-
 def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                          regions: List[PageRegion],
                          y_tolerance_px: int = 25) -> List[VocabRow]: