fix: _group_words_into_lines nach cv_ocr_engines.py verschieben
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 21s
Funktion war nur in cv_review.py definiert, wurde aber auch in cv_ocr_engines.py und cv_layout.py benutzt — NameError zur Laufzeit. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from cv_vocab_types import (
|
||||
PageZone,
|
||||
RowGeometry,
|
||||
)
|
||||
from cv_ocr_engines import _group_words_into_lines # noqa: E402
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -37,6 +37,32 @@ except ImportError:
|
||||
# Pipeline Step 5: Word Grid from Columns × Rows
|
||||
# =============================================================================
|
||||
|
||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||
"""Group words by Y position into lines, sorted by X within each line."""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||
lines: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [sorted_words[0]]
|
||||
current_y = sorted_words[0]['top']
|
||||
|
||||
for word in sorted_words[1:]:
|
||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||
current_line.append(word)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
current_line = [word]
|
||||
current_y = word['top']
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
||||
"""Group OCR words into visual lines in reading order.
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ from cv_layout import (
|
||||
)
|
||||
from cv_ocr_engines import (
|
||||
_fix_character_confusion,
|
||||
_group_words_into_lines,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -227,32 +228,6 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
|
||||
# Stage 7: Line Alignment → Vocabulary Entries
|
||||
# =============================================================================
|
||||
|
||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||
"""Group words by Y position into lines, sorted by X within each line."""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||
lines: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [sorted_words[0]]
|
||||
current_y = sorted_words[0]['top']
|
||||
|
||||
for word in sorted_words[1:]:
|
||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||
current_line.append(word)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
current_line = [word]
|
||||
current_y = word['top']
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
|
||||
regions: List[PageRegion],
|
||||
y_tolerance_px: int = 25) -> List[VocabRow]:
|
||||
|
||||
Reference in New Issue
Block a user