fix: _group_words_into_lines nach cv_ocr_engines.py verschieben
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 21s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 21s
Die Funktion war nur in cv_review.py definiert, wurde aber auch in cv_ocr_engines.py und cv_layout.py benutzt — das führte zu einem NameError zur Laufzeit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from cv_vocab_types import (
|
|||||||
PageZone,
|
PageZone,
|
||||||
RowGeometry,
|
RowGeometry,
|
||||||
)
|
)
|
||||||
|
from cv_ocr_engines import _group_words_into_lines # noqa: E402
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|||||||
@@ -37,6 +37,32 @@ except ImportError:
|
|||||||
# Pipeline Step 5: Word Grid from Columns × Rows
|
# Pipeline Step 5: Word Grid from Columns × Rows
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||||
|
"""Group words by Y position into lines, sorted by X within each line."""
|
||||||
|
if not words:
|
||||||
|
return []
|
||||||
|
|
||||||
|
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||||
|
lines: List[List[Dict]] = []
|
||||||
|
current_line: List[Dict] = [sorted_words[0]]
|
||||||
|
current_y = sorted_words[0]['top']
|
||||||
|
|
||||||
|
for word in sorted_words[1:]:
|
||||||
|
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||||
|
current_line.append(word)
|
||||||
|
else:
|
||||||
|
current_line.sort(key=lambda w: w['left'])
|
||||||
|
lines.append(current_line)
|
||||||
|
current_line = [word]
|
||||||
|
current_y = word['top']
|
||||||
|
|
||||||
|
if current_line:
|
||||||
|
current_line.sort(key=lambda w: w['left'])
|
||||||
|
lines.append(current_line)
|
||||||
|
|
||||||
|
return lines
|
||||||
|
|
||||||
|
|
||||||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
||||||
"""Group OCR words into visual lines in reading order.
|
"""Group OCR words into visual lines in reading order.
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ from cv_layout import (
|
|||||||
)
|
)
|
||||||
from cv_ocr_engines import (
|
from cv_ocr_engines import (
|
||||||
_fix_character_confusion,
|
_fix_character_confusion,
|
||||||
|
_group_words_into_lines,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -227,32 +228,6 @@ def run_multi_pass_ocr(ocr_img: np.ndarray,
|
|||||||
# Stage 7: Line Alignment → Vocabulary Entries
|
# Stage 7: Line Alignment → Vocabulary Entries
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
|
||||||
"""Group words by Y position into lines, sorted by X within each line."""
|
|
||||||
if not words:
|
|
||||||
return []
|
|
||||||
|
|
||||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
|
||||||
lines: List[List[Dict]] = []
|
|
||||||
current_line: List[Dict] = [sorted_words[0]]
|
|
||||||
current_y = sorted_words[0]['top']
|
|
||||||
|
|
||||||
for word in sorted_words[1:]:
|
|
||||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
|
||||||
current_line.append(word)
|
|
||||||
else:
|
|
||||||
current_line.sort(key=lambda w: w['left'])
|
|
||||||
lines.append(current_line)
|
|
||||||
current_line = [word]
|
|
||||||
current_y = word['top']
|
|
||||||
|
|
||||||
if current_line:
|
|
||||||
current_line.sort(key=lambda w: w['left'])
|
|
||||||
lines.append(current_line)
|
|
||||||
|
|
||||||
return lines
|
|
||||||
|
|
||||||
|
|
||||||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
|
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
|
||||||
regions: List[PageRegion],
|
regions: List[PageRegion],
|
||||||
y_tolerance_px: int = 25) -> List[VocabRow]:
|
y_tolerance_px: int = 25) -> List[VocabRow]:
|
||||||
|
|||||||
Reference in New Issue
Block a user