Restructure: Move 47 cv_* files into ocr/ package
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 39s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m34s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 26s

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:03:54 +02:00
parent 45287b3541
commit cb1be59e46
95 changed files with 317 additions and 103 deletions
+1
View File
@@ -27,6 +27,7 @@
# Algorithmic monolith — detect_column_geometry() allein 411 LOC, nicht weiter teilbar # Algorithmic monolith — detect_column_geometry() allein 411 LOC, nicht weiter teilbar
**/cv_layout_columns.py | owner=klausur | reason=detect_column_geometry ist eine einzelne 411-LOC Funktion (Whitespace-Gap-Analyse) | review=2026-10-01 **/cv_layout_columns.py | owner=klausur | reason=detect_column_geometry ist eine einzelne 411-LOC Funktion (Whitespace-Gap-Analyse) | review=2026-10-01
**/ocr/layout/columns.py | owner=klausur | reason=Same file moved to ocr/ package | review=2026-10-01
# Two indivisible route handlers (~230 LOC each) that cannot be split further # Two indivisible route handlers (~230 LOC each) that cannot be split further
**/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01 **/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/box_detect.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.box_detect module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.box_detect")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/box_layout.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.box_layout module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.box_layout")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/cell_grid.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.cell_grid module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.cell_grid")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/build.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.build module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.build")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/helpers.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.helpers module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.helpers")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/legacy.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.legacy module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.legacy")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/merge.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.merge module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.merge")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/streaming.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.streaming module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.streaming")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/vocab.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.cell_grid.vocab module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.vocab")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/color_detect.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.color_detect module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.color_detect")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/doclayout_detect.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.doclayout_detect module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.doclayout_detect")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/graphic_detect.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.graphic_detect module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.graphic_detect")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/repair.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.gutter.repair module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.repair")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/core.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.gutter.core module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.core")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/grid.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.gutter.grid module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.grid")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/ipa_german.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.ipa_german module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.ipa_german")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/layout.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.layout module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.layout")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/analyze.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.analyze module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.analyze")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/classify.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.classify module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.classify")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/classify_position.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.classify_position module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.classify_position")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/column_refine.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.column_refine module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.column_refine")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/columns.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.columns module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.columns")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/detection.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.detection module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.detection")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/row_regularize.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.row_regularize module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.row_regularize")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/rows.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.rows module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.rows")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/scoring.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.layout.scoring module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.layout.scoring")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/cell_filter.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.cell_filter module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.cell_filter")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/cell_phonetics.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.cell_phonetics module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.cell_phonetics")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/engines.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.engines module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.engines")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/ipa_lookup.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.ipa_lookup module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.ipa_lookup")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/ipa_repair.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.ipa_repair module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.ipa_repair")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/vocab_postprocess.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.vocab_postprocess module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.vocab_postprocess")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/word_assembly.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.engines.word_assembly module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.engines.word_assembly")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/preprocessing.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.preprocessing.preprocessing module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.preprocessing")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/deskew.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.preprocessing.deskew module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.deskew")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/dewarp.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.preprocessing.dewarp module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.dewarp")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/review.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.review.review module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.review.review")
+4
View File
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/llm.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.review.llm module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.review.llm")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/pipeline.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.review.pipeline module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.review.pipeline")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/spell.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.review.spell module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.review.spell")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/core.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.syllable.core module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.core")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/detect.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.syllable.detect module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.detect")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/merge.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.detect.syllable.merge module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.merge")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/pipeline.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.pipeline module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.pipeline")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/types.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.types module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.types")
@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/words_first.py
# Replaces this shim's sys.modules entry with the relocated module, so
# importing the old name returns the ocr.words_first module object.
import importlib as _importlib
import sys as _sys
_sys.modules[__name__] = _importlib.import_module("ocr.words_first")
+9
View File
@@ -0,0 +1,9 @@
"""
OCR package — restructured from cv_* flat modules.
Backward-compatible re-exports: consumers can still use
``from cv_layout import ...`` etc. via the shim files in backend/.
"""
from .types import * # noqa: F401,F403
from .pipeline import * # noqa: F401,F403
@@ -0,0 +1,2 @@
"""Cell-grid construction sub-package."""
from .cell_grid import * # noqa: F401,F403
@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import PageRegion, RowGeometry from ..types import PageRegion, RowGeometry
from cv_ocr_engines import ( from ..engines.engines import (
RAPIDOCR_AVAILABLE, RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns, _assign_row_words_to_columns,
_clean_cell_text, _clean_cell_text,
@@ -22,7 +22,7 @@ from cv_ocr_engines import (
ocr_region_rapid, ocr_region_rapid,
ocr_region_trocr, ocr_region_trocr,
) )
from cv_cell_grid_helpers import ( from .helpers import (
_MIN_WORD_CONF, _MIN_WORD_CONF,
_ensure_minimum_crop_size, _ensure_minimum_crop_size,
_heal_row_gaps, _heal_row_gaps,
@@ -16,7 +16,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
""" """
# --- Helpers --- # --- Helpers ---
from cv_cell_grid_helpers import ( # noqa: F401 from .helpers import ( # noqa: F401
_MIN_WORD_CONF, _MIN_WORD_CONF,
_compute_cell_padding, _compute_cell_padding,
_ensure_minimum_crop_size, _ensure_minimum_crop_size,
@@ -26,26 +26,26 @@ from cv_cell_grid_helpers import ( # noqa: F401
) )
# --- v2 build (current default) --- # --- v2 build (current default) ---
from cv_cell_grid_build import ( # noqa: F401 from .build import ( # noqa: F401
_NARROW_COL_THRESHOLD_PCT, _NARROW_COL_THRESHOLD_PCT,
_ocr_cell_crop, _ocr_cell_crop,
build_cell_grid_v2, build_cell_grid_v2,
) )
# --- Legacy build (DEPRECATED) --- # --- Legacy build (DEPRECATED) ---
from cv_cell_grid_legacy import ( # noqa: F401 from .legacy import ( # noqa: F401
_ocr_single_cell, _ocr_single_cell,
build_cell_grid, build_cell_grid,
) )
# --- Streaming variants --- # --- Streaming variants ---
from cv_cell_grid_streaming import ( # noqa: F401 from .streaming import ( # noqa: F401
build_cell_grid_streaming, build_cell_grid_streaming,
build_cell_grid_v2_streaming, build_cell_grid_v2_streaming,
) )
# --- Row merging --- # --- Row merging ---
from cv_cell_grid_merge import ( # noqa: F401 from .merge import ( # noqa: F401
_PHONETIC_ONLY_RE, _PHONETIC_ONLY_RE,
_is_phonetic_only_text, _is_phonetic_only_text,
_merge_continuation_rows, _merge_continuation_rows,
@@ -54,7 +54,7 @@ from cv_cell_grid_merge import ( # noqa: F401
) )
# --- Vocab extraction --- # --- Vocab extraction ---
from cv_cell_grid_vocab import ( # noqa: F401 from .vocab import ( # noqa: F401
_cells_to_vocab_entries, _cells_to_vocab_entries,
build_word_grid, build_word_grid,
) )
@@ -13,7 +13,7 @@ from typing import List
import numpy as np import numpy as np
from cv_vocab_types import RowGeometry from ..types import RowGeometry
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -12,8 +12,8 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import PageRegion, RowGeometry from ..types import PageRegion, RowGeometry
from cv_ocr_engines import ( from ..engines.engines import (
RAPIDOCR_AVAILABLE, RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns, _assign_row_words_to_columns,
_clean_cell_text, _clean_cell_text,
@@ -22,7 +22,7 @@ from cv_ocr_engines import (
ocr_region_rapid, ocr_region_rapid,
ocr_region_trocr, ocr_region_trocr,
) )
from cv_cell_grid_helpers import ( from .helpers import (
_MIN_WORD_CONF, _MIN_WORD_CONF,
_compute_cell_padding, _compute_cell_padding,
_ensure_minimum_crop_size, _ensure_minimum_crop_size,
@@ -11,7 +11,7 @@ import logging
import re import re
from typing import Any, Dict, List from typing import Any, Dict, List
from cv_ocr_engines import _RE_ALPHA from ..engines.engines import _RE_ALPHA
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -13,17 +13,17 @@ from typing import Any, Dict, Generator, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import PageRegion, RowGeometry from ..types import PageRegion, RowGeometry
from cv_ocr_engines import ( from ..engines.engines import (
RAPIDOCR_AVAILABLE, RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns, _assign_row_words_to_columns,
) )
from cv_cell_grid_helpers import ( from .helpers import (
_heal_row_gaps, _heal_row_gaps,
_is_artifact_row, _is_artifact_row,
) )
from cv_cell_grid_build import _ocr_cell_crop from .build import _ocr_cell_crop
from cv_cell_grid_legacy import _ocr_single_cell from .legacy import _ocr_single_cell
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -10,13 +10,13 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging import logging
from typing import Any, Dict, List from typing import Any, Dict, List
from cv_ocr_engines import ( from ..engines.engines import (
_attach_example_sentences, _attach_example_sentences,
_fix_phonetic_brackets, _fix_phonetic_brackets,
_split_comma_entries, _split_comma_entries,
) )
from cv_cell_grid_legacy import build_cell_grid from .legacy import build_cell_grid
from cv_cell_grid_merge import ( from .merge import (
_merge_continuation_rows, _merge_continuation_rows,
_merge_phonetic_continuation_rows, _merge_phonetic_continuation_rows,
_merge_wrapped_rows, _merge_wrapped_rows,
@@ -0,0 +1,2 @@
"""Detection sub-package (boxes, graphics, colors, syllables, doclayout)."""
from .box_detect import * # noqa: F401,F403
@@ -21,7 +21,7 @@ from typing import List, Optional, Tuple
import cv2 import cv2
import numpy as np import numpy as np
from cv_vocab_types import DetectedBox, PageZone from ..types import DetectedBox, PageZone
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -127,7 +127,7 @@ def detect_graphic_elements(
backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto") backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
if backend in ("doclayout", "auto"): if backend in ("doclayout", "auto"):
try: try:
from cv_doclayout_detect import detect_layout_regions, is_doclayout_available from .doclayout_detect import detect_layout_regions, is_doclayout_available
if is_doclayout_available(): if is_doclayout_available():
regions = detect_layout_regions(img_bgr) regions = detect_layout_regions(img_bgr)
if regions: if regions:
@@ -0,0 +1,2 @@
"""Syllable detection sub-package."""
from .detect import * # noqa: F401,F403
@@ -10,7 +10,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
""" """
# Core: init, validation, autocorrect # Core: init, validation, autocorrect
from cv_syllable_core import ( # noqa: F401 from .core import ( # noqa: F401
_IPA_RE, _IPA_RE,
_STOP_WORDS, _STOP_WORDS,
_get_hyphenators, _get_hyphenators,
@@ -23,7 +23,7 @@ from cv_syllable_core import ( # noqa: F401
) )
# Merge: gap merging, syllabify, insert # Merge: gap merging, syllabify, insert
from cv_syllable_merge import ( # noqa: F401 from .merge import ( # noqa: F401
_try_merge_pipe_gaps, _try_merge_pipe_gaps,
merge_word_gaps_in_zones, merge_word_gaps_in_zones,
_try_merge_word_gaps, _try_merge_word_gaps,
@@ -13,7 +13,7 @@ from typing import Any, Dict, List, Optional
import numpy as np import numpy as np
from cv_syllable_core import ( from .core import (
_get_hyphenators, _get_hyphenators,
_hyphenate_word, _hyphenate_word,
_IPA_RE, _IPA_RE,
@@ -0,0 +1,2 @@
"""OCR engines sub-package."""
from .engines import * # noqa: F401,F403
@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Optional
import numpy as np import numpy as np
from cv_vocab_types import PageRegion, RowGeometry from ..types import PageRegion, RowGeometry
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -11,14 +11,14 @@ import logging
import re import re
from typing import Any, Dict, List from typing import Any, Dict, List
from cv_vocab_types import IPA_AVAILABLE from ..types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import ( from .ipa_lookup import (
_insert_missing_ipa, _insert_missing_ipa,
_replace_phonetics_in_text, _replace_phonetics_in_text,
_text_has_garbled_ipa, _text_has_garbled_ipa,
) )
from cv_ocr_ipa_repair import ( from .ipa_repair import (
_has_non_dict_trailing, _has_non_dict_trailing,
_insert_headword_ipa, _insert_headword_ipa,
_strip_post_bracket_garbled, _strip_post_bracket_garbled,
@@ -24,7 +24,7 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
IPA_AVAILABLE, IPA_AVAILABLE,
PageRegion, PageRegion,
RowGeometry, RowGeometry,
@@ -47,7 +47,7 @@ except ImportError:
# ── Re-exports from sub-modules (backward compatibility) ────────────────── # ── Re-exports from sub-modules (backward compatibility) ──────────────────
from cv_ocr_word_assembly import ( # noqa: F401 from .word_assembly import ( # noqa: F401
_group_words_into_lines, _group_words_into_lines,
_words_to_reading_order_lines, _words_to_reading_order_lines,
_rejoin_hyphenated, _rejoin_hyphenated,
@@ -55,7 +55,7 @@ from cv_ocr_word_assembly import ( # noqa: F401
_words_to_spaced_text, _words_to_spaced_text,
) )
from cv_ocr_vocab_postprocess import ( # noqa: F401 from .vocab_postprocess import ( # noqa: F401
_CHAR_CONFUSION_RULES, _CHAR_CONFUSION_RULES,
_DE_INDICATORS_FOR_EN_I, _DE_INDICATORS_FOR_EN_I,
_fix_character_confusion, _fix_character_confusion,
@@ -66,7 +66,7 @@ from cv_ocr_vocab_postprocess import ( # noqa: F401
_attach_example_sentences, _attach_example_sentences,
) )
from cv_ocr_ipa_lookup import ( # noqa: F401 from .ipa_lookup import ( # noqa: F401
_PHONETIC_BRACKET_RE, _PHONETIC_BRACKET_RE,
_IPA_CHARS, _IPA_CHARS,
_MIN_WORD_CONF, _MIN_WORD_CONF,
@@ -80,20 +80,20 @@ from cv_ocr_ipa_lookup import ( # noqa: F401
_insert_missing_ipa, _insert_missing_ipa,
) )
from cv_ocr_ipa_repair import ( # noqa: F401 from .ipa_repair import ( # noqa: F401
_has_non_dict_trailing, _has_non_dict_trailing,
_strip_post_bracket_garbled, _strip_post_bracket_garbled,
fix_ipa_continuation_cell, fix_ipa_continuation_cell,
_insert_headword_ipa, _insert_headword_ipa,
) )
from cv_ocr_cell_phonetics import ( # noqa: F401 from .cell_phonetics import ( # noqa: F401
fix_cell_phonetics, fix_cell_phonetics,
_has_ipa_gap, _has_ipa_gap,
_sync_word_boxes_after_ipa_insert, _sync_word_boxes_after_ipa_insert,
) )
from cv_ocr_cell_filter import ( # noqa: F401 from .cell_filter import ( # noqa: F401
_RE_REAL_WORD, _RE_REAL_WORD,
_RE_ALPHA, _RE_ALPHA,
_COMMON_SHORT_WORDS, _COMMON_SHORT_WORDS,
@@ -23,7 +23,7 @@ import logging
import re import re
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from cv_vocab_types import ( from ..types import (
IPA_AVAILABLE, IPA_AVAILABLE,
_britfone_dict, _britfone_dict,
_ipa_convert_american, _ipa_convert_american,
@@ -16,8 +16,8 @@ import logging
import re import re
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from cv_vocab_types import IPA_AVAILABLE from ..types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import ( from .ipa_lookup import (
_lookup_ipa, _lookup_ipa,
_GRAMMAR_BRACKET_WORDS, _GRAMMAR_BRACKET_WORDS,
) )
@@ -0,0 +1,2 @@
"""Gutter repair sub-package."""
from .repair import * # noqa: F401,F403
+1 -1
View File
@@ -11,7 +11,7 @@ import logging
import time import time
from typing import Any, Dict, List, Tuple from typing import Any, Dict, List, Tuple
from cv_gutter_repair_core import ( from .core import (
_init_spellcheckers, _init_spellcheckers,
_is_ipa_text, _is_ipa_text,
_is_known, _is_known,
+2 -2
View File
@@ -10,7 +10,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
""" """
# Core: spellchecker, data types, repair helpers # Core: spellchecker, data types, repair helpers
from cv_gutter_repair_core import ( # noqa: F401 from .core import ( # noqa: F401
_init_spellcheckers, _init_spellcheckers,
_is_known, _is_known,
_spell_candidates, _spell_candidates,
@@ -29,7 +29,7 @@ from cv_gutter_repair_core import ( # noqa: F401
) )
# Grid: analysis and application # Grid: analysis and application
from cv_gutter_repair_grid import ( # noqa: F401 from .grid import ( # noqa: F401
analyse_grid_for_gutter_repair, analyse_grid_for_gutter_repair,
apply_gutter_suggestions, apply_gutter_suggestions,
) )
+2 -2
View File
@@ -26,7 +26,7 @@ def _lookup_ipa_de(word: str) -> Optional[str]:
Returns IPA string or None if not found. Returns IPA string or None if not found.
""" """
from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE from .types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
if not DE_IPA_AVAILABLE and _epitran_de is None: if not DE_IPA_AVAILABLE and _epitran_de is None:
return None return None
@@ -109,7 +109,7 @@ def insert_german_ipa(
Returns: Returns:
Number of cells modified. Number of cells modified.
""" """
from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de from .types import DE_IPA_AVAILABLE, _epitran_de
if not DE_IPA_AVAILABLE and _epitran_de is None: if not DE_IPA_AVAILABLE and _epitran_de is None:
logger.warning("German IPA not available — skipping") logger.warning("German IPA not available — skipping")
@@ -0,0 +1,2 @@
"""Layout analysis sub-package."""
from .layout import * # noqa: F401,F403
@@ -13,8 +13,8 @@ from typing import List
import numpy as np import numpy as np
from cv_vocab_types import PageRegion from ..types import PageRegion
from cv_layout_detection import _find_content_bounds from .detection import _find_content_bounds
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -246,7 +246,7 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
# Add header/footer info (gap-based detection with fallback) # Add header/footer info (gap-based detection with fallback)
# Lazy import to avoid circular dependency with cv_layout.py # Lazy import to avoid circular dependency with cv_layout.py
from cv_layout_detection import _add_header_footer from .detection import _add_header_footer
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv) _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none') top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
@@ -11,16 +11,16 @@ from typing import Dict, List, Optional
import numpy as np import numpy as np
from cv_vocab_types import ColumnGeometry, PageRegion from ..types import ColumnGeometry, PageRegion
from cv_layout_scoring import ( from .scoring import (
_score_language, _score_language,
_score_role, _score_role,
_score_dictionary_signals, _score_dictionary_signals,
_classify_dictionary_columns, _classify_dictionary_columns,
) )
from cv_layout_classify_position import ( from .classify_position import (
_classify_by_position_enhanced, _classify_by_position_enhanced,
_classify_by_position_fallback, _classify_by_position_fallback,
) )
@@ -211,7 +211,7 @@ def classify_column_types(geometries: List[ColumnGeometry],
# _add_header_footer lives in cv_layout (avoids circular import at module # _add_header_footer lives in cv_layout (avoids circular import at module
# level). Lazy-import here so the module can be tested independently when # level). Lazy-import here so the module can be tested independently when
# cv_layout hasn't been modified yet. # cv_layout hasn't been modified yet.
from cv_layout_detection import _add_header_footer # noqa: E402 from .detection import _add_header_footer # noqa: E402
content_h = bottom_y - top_y content_h = bottom_y - top_y
@@ -11,7 +11,7 @@ Extracted from cv_layout_classify.py during file-size split.
import logging import logging
from typing import Dict, List, Optional from typing import Dict, List, Optional
from cv_vocab_types import ColumnGeometry, PageRegion from ..types import ColumnGeometry, PageRegion
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -16,7 +16,7 @@ from typing import Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ColumnGeometry from ..types import ColumnGeometry
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -19,8 +19,8 @@ from typing import Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ColumnGeometry from ..types import ColumnGeometry
from cv_layout_detection import _find_content_bounds from .detection import _find_content_bounds
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -13,7 +13,7 @@ from typing import List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
DocumentTypeResult, DocumentTypeResult,
PageRegion, PageRegion,
) )
+11 -11
View File
@@ -21,14 +21,14 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion from ..types import ColumnGeometry, DetectedBox, PageRegion
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ── Re-exports (backward compatibility) ─────────────────────────────────── # ── Re-exports (backward compatibility) ───────────────────────────────────
from cv_layout_detection import ( # noqa: F401 from .detection import ( # noqa: F401
detect_document_type, detect_document_type,
create_ocr_image, create_ocr_image,
create_layout_image, create_layout_image,
@@ -39,46 +39,46 @@ from cv_layout_detection import ( # noqa: F401
_add_header_footer, _add_header_footer,
) )
from cv_layout_analyze import ( # noqa: F401 from .analyze import ( # noqa: F401
analyze_layout, analyze_layout,
) )
from cv_layout_columns import ( # noqa: F401 from .columns import ( # noqa: F401
detect_column_geometry, detect_column_geometry,
_detect_columns_by_clustering, _detect_columns_by_clustering,
_build_geometries_from_starts, _build_geometries_from_starts,
) )
from cv_layout_column_refine import ( # noqa: F401 from .column_refine import ( # noqa: F401
_detect_sub_columns, _detect_sub_columns,
_split_broad_columns, _split_broad_columns,
expand_narrow_columns, expand_narrow_columns,
) )
from cv_layout_rows import ( # noqa: F401 from .rows import ( # noqa: F401
detect_row_geometry, detect_row_geometry,
_build_rows_from_word_grouping, _build_rows_from_word_grouping,
) )
from cv_layout_row_regularize import ( # noqa: F401 from .row_regularize import ( # noqa: F401
_regularize_row_grid, _regularize_row_grid,
) )
from cv_layout_scoring import ( # noqa: F401 from .scoring import ( # noqa: F401
_score_language, _score_language,
_score_role, _score_role,
_score_dictionary_signals, _score_dictionary_signals,
_classify_dictionary_columns, _classify_dictionary_columns,
) )
from cv_layout_classify import ( # noqa: F401 from .classify import ( # noqa: F401
_build_margin_regions, _build_margin_regions,
positional_column_regions, positional_column_regions,
classify_column_types, classify_column_types,
_classify_by_content, _classify_by_content,
) )
from cv_layout_classify_position import ( # noqa: F401 from .classify_position import ( # noqa: F401
_classify_by_position_enhanced, _classify_by_position_enhanced,
_classify_by_position_fallback, _classify_by_position_fallback,
) )
@@ -143,7 +143,7 @@ def detect_column_geometry_zoned(
per content zone on the corresponding sub-image. per content zone on the corresponding sub-image.
4. If no boxes: delegates entirely to detect_column_geometry(). 4. If no boxes: delegates entirely to detect_column_geometry().
""" """
from cv_box_detect import detect_boxes, split_page_into_zones from ..detect.box_detect import detect_boxes, split_page_into_zones
geo_result = detect_column_geometry(ocr_img, dewarped_bgr) geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result is None: if geo_result is None:
@@ -13,7 +13,7 @@ from typing import Dict, List
import numpy as np import numpy as np
from cv_vocab_types import RowGeometry from ..types import RowGeometry
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
+3 -3
View File
@@ -20,9 +20,9 @@ try:
except ImportError: except ImportError:
cv2 = None # type: ignore[assignment] cv2 = None # type: ignore[assignment]
from cv_vocab_types import RowGeometry from ..types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines from ..engines.word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid from .row_regularize import _regularize_row_grid
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -11,7 +11,7 @@ import logging
from collections import Counter from collections import Counter
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from cv_vocab_types import ( from ..types import (
ColumnGeometry, ColumnGeometry,
ENGLISH_FUNCTION_WORDS, ENGLISH_FUNCTION_WORDS,
GERMAN_FUNCTION_WORDS, GERMAN_FUNCTION_WORDS,
+12 -12
View File
@@ -14,24 +14,24 @@ Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
""" """
from cv_vocab_types import * # noqa: F401,F403 from .types import * # noqa: F401,F403
from cv_preprocessing import * # noqa: F401,F403 from .preprocessing.preprocessing import * # noqa: F401,F403
from cv_layout import * # noqa: F401,F403 from .layout.layout import * # noqa: F401,F403
from cv_ocr_engines import * # noqa: F401,F403 from .engines.engines import * # noqa: F401,F403
from cv_cell_grid import * # noqa: F401,F403 from .cell_grid.cell_grid import * # noqa: F401,F403
from cv_box_detect import * # noqa: F401,F403 from .detect.box_detect import * # noqa: F401,F403
from cv_review import * # noqa: F401,F403 from .review.review import * # noqa: F401,F403
# Private names used by consumers — not covered by wildcard re-exports. # Private names used by consumers — not covered by wildcard re-exports.
from cv_preprocessing import _apply_shear # noqa: F401 from .preprocessing.preprocessing import _apply_shear # noqa: F401
from cv_layout import ( # noqa: F401 from .layout.layout import ( # noqa: F401
_detect_header_footer_gaps, _detect_header_footer_gaps,
_detect_sub_columns, _detect_sub_columns,
_split_broad_columns, _split_broad_columns,
) )
from cv_ocr_engines import ( # noqa: F401 from .engines.engines import ( # noqa: F401
_fix_character_confusion, _fix_character_confusion,
_fix_phonetic_brackets, _fix_phonetic_brackets,
) )
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401 from .cell_grid.cell_grid import _cells_to_vocab_entries # noqa: F401
from cv_words_first import build_grid_from_words # noqa: F401 from .words_first import build_grid_from_words # noqa: F401
@@ -0,0 +1,2 @@
"""Preprocessing sub-package (deskew, dewarp, image I/O)."""
from .preprocessing import * # noqa: F401,F403
@@ -11,7 +11,7 @@ from typing import Any, Dict, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
CV2_AVAILABLE, CV2_AVAILABLE,
TESSERACT_AVAILABLE, TESSERACT_AVAILABLE,
) )
@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Tuple
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
CV2_AVAILABLE, CV2_AVAILABLE,
TESSERACT_AVAILABLE, TESSERACT_AVAILABLE,
) )
@@ -17,7 +17,7 @@ from typing import Tuple
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
CV2_AVAILABLE, CV2_AVAILABLE,
TESSERACT_AVAILABLE, TESSERACT_AVAILABLE,
) )
@@ -38,7 +38,7 @@ except ImportError:
Image = None # type: ignore[assignment,misc] Image = None # type: ignore[assignment,misc]
# Re-export all deskew functions # Re-export all deskew functions
from cv_preprocessing_deskew import ( # noqa: F401 from .deskew import ( # noqa: F401
deskew_image, deskew_image,
deskew_image_by_word_alignment, deskew_image_by_word_alignment,
deskew_image_iterative, deskew_image_iterative,
@@ -48,7 +48,7 @@ from cv_preprocessing_deskew import ( # noqa: F401
) )
# Re-export all dewarp functions # Re-export all dewarp functions
from cv_preprocessing_dewarp import ( # noqa: F401 from .dewarp import ( # noqa: F401
_apply_shear, _apply_shear,
_detect_shear_angle, _detect_shear_angle,
_detect_shear_by_hough, _detect_shear_by_hough,
@@ -0,0 +1,2 @@
"""Review sub-package (spell, LLM, pipeline orchestration)."""
from .review import * # noqa: F401,F403
+3 -3
View File
@@ -183,7 +183,7 @@ async def llm_review_entries(
model: str = None, model: str = None,
) -> Dict: ) -> Dict:
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm).""" """OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE from .spell import spell_review_entries_sync, _SPELL_AVAILABLE
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE: if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
return spell_review_entries_sync(entries) return spell_review_entries_sync(entries)
@@ -260,8 +260,8 @@ async def llm_review_entries_streaming(
Phase 0 (always): Run _fix_character_confusion and emit any changes. Phase 0 (always): Run _fix_character_confusion and emit any changes.
""" """
from cv_ocr_engines import _fix_character_confusion from ..engines.engines import _fix_character_confusion
from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE from .spell import spell_review_entries_streaming, _SPELL_AVAILABLE
_CONF_FIELDS = ('english', 'german', 'example') _CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries] originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
@@ -13,24 +13,24 @@ from typing import Any, Dict, List, Optional
import numpy as np import numpy as np
from cv_vocab_types import ( from ..types import (
CV_PIPELINE_AVAILABLE, CV_PIPELINE_AVAILABLE,
PageRegion, PageRegion,
PipelineResult, PipelineResult,
VocabRow, VocabRow,
) )
from cv_preprocessing import ( from ..preprocessing.preprocessing import (
deskew_image, deskew_image,
dewarp_image, dewarp_image,
render_image_high_res, render_image_high_res,
render_pdf_high_res, render_pdf_high_res,
) )
from cv_layout import ( from ..layout.layout import (
analyze_layout, analyze_layout,
create_layout_image, create_layout_image,
create_ocr_image, create_ocr_image,
) )
from cv_ocr_engines import ( from ..engines.engines import (
_group_words_into_lines, _group_words_into_lines,
) )
+3 -3
View File
@@ -12,7 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
""" """
# Re-export everything for backward compatibility # Re-export everything for backward compatibility
from cv_review_pipeline import ( # noqa: F401 from .pipeline import ( # noqa: F401
ocr_region, ocr_region,
run_multi_pass_ocr, run_multi_pass_ocr,
match_lines_to_vocab, match_lines_to_vocab,
@@ -20,7 +20,7 @@ from cv_review_pipeline import ( # noqa: F401
run_cv_pipeline, run_cv_pipeline,
) )
from cv_review_spell import ( # noqa: F401 from .spell import ( # noqa: F401
_SPELL_AVAILABLE, _SPELL_AVAILABLE,
_spell_dict_knows, _spell_dict_knows,
_spell_fix_field, _spell_fix_field,
@@ -31,7 +31,7 @@ from cv_review_spell import ( # noqa: F401
spell_review_entries_streaming, spell_review_entries_streaming,
) )
from cv_review_llm import ( # noqa: F401 from .llm import ( # noqa: F401
OLLAMA_REVIEW_MODEL, OLLAMA_REVIEW_MODEL,
REVIEW_ENGINE, REVIEW_ENGINE,
_REVIEW_BATCH_SIZE, _REVIEW_BATCH_SIZE,
+1 -1
View File
@@ -210,7 +210,7 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
Uses SmartSpellChecker for language-aware corrections with context-based Uses SmartSpellChecker for language-aware corrections with context-based
disambiguation (a/I), multi-digit substitution, and cross-language guard. disambiguation (a/I), multi-digit substitution, and cross-language guard.
""" """
from cv_review_llm import _entry_needs_review from .llm import _entry_needs_review
t0 = time.time() t0 = time.time()
changes: List[Dict] = [] changes: List[Dict] = []
+1 -1
View File
@@ -19,7 +19,7 @@ import re
import statistics import statistics
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import ( from .engines.engines import (
_group_words_into_lines, _group_words_into_lines,
_words_to_reading_order_text, _words_to_reading_order_text,
) )
@@ -0,0 +1,4 @@
# Backward-compatibility shim: the implementation moved to ocr/image_enhance.py.
# When this module is imported under its old name, it replaces its own entry in
# sys.modules with the relocated module, so code importing the old name is
# handed ocr.image_enhance itself.
import importlib as _importlib
import sys as _sys
# This assignment is the entire shim. import_module raises ImportError if
# "ocr.image_enhance" cannot be imported from the current sys.path.
_sys.modules[__name__] = _importlib.import_module("ocr.image_enhance")