Restructure: Move 47 cv_* files into ocr/ package
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 39s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m34s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 26s

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-25 21:03:54 +02:00
parent 45287b3541
commit cb1be59e46
95 changed files with 317 additions and 103 deletions

View File

@@ -27,6 +27,7 @@
# Algorithmic monolith — detect_column_geometry() alone is 411 LOC and cannot be split further
**/cv_layout_columns.py | owner=klausur | reason=detect_column_geometry ist eine einzelne 411-LOC Funktion (Whitespace-Gap-Analyse) | review=2026-10-01
**/ocr/layout/columns.py | owner=klausur | reason=Same file moved to ocr/ package | review=2026-10-01
# Two indivisible route handlers (~230 LOC each) that cannot be split further
**/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/box_detect.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.box_detect" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.box_detect")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/box_layout.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.box_layout" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.box_layout")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/cell_grid.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.cell_grid" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.cell_grid")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/build.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.build" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.build")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/helpers.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.helpers" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.helpers")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/legacy.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.legacy" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.legacy")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/merge.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.merge" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.merge")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/streaming.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.streaming" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.streaming")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/cell_grid/vocab.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.cell_grid.vocab" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.cell_grid.vocab")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/color_detect.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.color_detect" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.color_detect")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/doclayout_detect.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.doclayout_detect" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.doclayout_detect")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/graphic_detect.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.graphic_detect" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.graphic_detect")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/repair.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.gutter.repair" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.repair")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/core.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.gutter.core" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.core")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/gutter/grid.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.gutter.grid" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.gutter.grid")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/ipa_german.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.ipa_german" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.ipa_german")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/layout.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.layout" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.layout")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/analyze.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.analyze" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.analyze")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/classify.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.classify" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.classify")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/classify_position.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.classify_position" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.classify_position")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/column_refine.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.column_refine" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.column_refine")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/columns.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.columns" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.columns")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/detection.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.detection" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.detection")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/row_regularize.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.row_regularize" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.row_regularize")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/rows.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.rows" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.rows")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/layout/scoring.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.layout.scoring" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.layout.scoring")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/cell_filter.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.cell_filter" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.cell_filter")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/cell_phonetics.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.cell_phonetics" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.cell_phonetics")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/engines.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.engines" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.engines")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/ipa_lookup.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.ipa_lookup" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.ipa_lookup")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/ipa_repair.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.ipa_repair" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.ipa_repair")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/vocab_postprocess.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.vocab_postprocess" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.vocab_postprocess")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/engines/word_assembly.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.engines.word_assembly" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.engines.word_assembly")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/preprocessing.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.preprocessing.preprocessing" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.preprocessing")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/deskew.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.preprocessing.deskew" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.deskew")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/preprocessing/dewarp.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.preprocessing.dewarp" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.preprocessing.dewarp")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/review.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.review.review" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.review.review")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/llm.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.review.llm" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.review.llm")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/pipeline.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.review.pipeline" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.review.pipeline")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/review/spell.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.review.spell" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.review.spell")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/core.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.syllable.core" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.core")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/detect.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.syllable.detect" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.detect")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/detect/syllable/merge.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.detect.syllable.merge" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.detect.syllable.merge")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/pipeline.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.pipeline" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.pipeline")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/types.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.types" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.types")

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/words_first.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.words_first" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.words_first")

View File

@@ -0,0 +1,9 @@
"""
OCR package — restructured from cv_* flat modules.
Backward-compatible re-exports: consumers can still use
``from cv_layout import ...`` etc. via the shim files in backend/.
"""
from .types import * # noqa: F401,F403
from .pipeline import * # noqa: F401,F403

View File

@@ -0,0 +1,2 @@
"""Cell-grid construction sub-package."""
from .cell_grid import * # noqa: F401,F403

View File

@@ -10,8 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
from ..types import PageRegion, RowGeometry
from ..engines.engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
@@ -22,7 +22,7 @@ from cv_ocr_engines import (
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
from .helpers import (
_MIN_WORD_CONF,
_ensure_minimum_crop_size,
_heal_row_gaps,

View File

@@ -16,7 +16,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# --- Helpers ---
from cv_cell_grid_helpers import ( # noqa: F401
from .helpers import ( # noqa: F401
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,
@@ -26,26 +26,26 @@ from cv_cell_grid_helpers import ( # noqa: F401
)
# --- v2 build (current default) ---
from cv_cell_grid_build import ( # noqa: F401
from .build import ( # noqa: F401
_NARROW_COL_THRESHOLD_PCT,
_ocr_cell_crop,
build_cell_grid_v2,
)
# --- Legacy build (DEPRECATED) ---
from cv_cell_grid_legacy import ( # noqa: F401
from .legacy import ( # noqa: F401
_ocr_single_cell,
build_cell_grid,
)
# --- Streaming variants ---
from cv_cell_grid_streaming import ( # noqa: F401
from .streaming import ( # noqa: F401
build_cell_grid_streaming,
build_cell_grid_v2_streaming,
)
# --- Row merging ---
from cv_cell_grid_merge import ( # noqa: F401
from .merge import ( # noqa: F401
_PHONETIC_ONLY_RE,
_is_phonetic_only_text,
_merge_continuation_rows,
@@ -54,7 +54,7 @@ from cv_cell_grid_merge import ( # noqa: F401
)
# --- Vocab extraction ---
from cv_cell_grid_vocab import ( # noqa: F401
from .vocab import ( # noqa: F401
_cells_to_vocab_entries,
build_word_grid,
)

View File

@@ -13,7 +13,7 @@ from typing import List
import numpy as np
from cv_vocab_types import RowGeometry
from ..types import RowGeometry
logger = logging.getLogger(__name__)

View File

@@ -12,8 +12,8 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
from ..types import PageRegion, RowGeometry
from ..engines.engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
_clean_cell_text,
@@ -22,7 +22,7 @@ from cv_ocr_engines import (
ocr_region_rapid,
ocr_region_trocr,
)
from cv_cell_grid_helpers import (
from .helpers import (
_MIN_WORD_CONF,
_compute_cell_padding,
_ensure_minimum_crop_size,

View File

@@ -11,7 +11,7 @@ import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _RE_ALPHA
from ..engines.engines import _RE_ALPHA
logger = logging.getLogger(__name__)

View File

@@ -13,17 +13,17 @@ from typing import Any, Dict, Generator, List, Optional, Tuple
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from cv_ocr_engines import (
from ..types import PageRegion, RowGeometry
from ..engines.engines import (
RAPIDOCR_AVAILABLE,
_assign_row_words_to_columns,
)
from cv_cell_grid_helpers import (
from .helpers import (
_heal_row_gaps,
_is_artifact_row,
)
from cv_cell_grid_build import _ocr_cell_crop
from cv_cell_grid_legacy import _ocr_single_cell
from .build import _ocr_cell_crop
from .legacy import _ocr_single_cell
logger = logging.getLogger(__name__)

View File

@@ -10,13 +10,13 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging
from typing import Any, Dict, List
from cv_ocr_engines import (
from ..engines.engines import (
_attach_example_sentences,
_fix_phonetic_brackets,
_split_comma_entries,
)
from cv_cell_grid_legacy import build_cell_grid
from cv_cell_grid_merge import (
from .legacy import build_cell_grid
from .merge import (
_merge_continuation_rows,
_merge_phonetic_continuation_rows,
_merge_wrapped_rows,

View File

@@ -0,0 +1,2 @@
"""Detection sub-package (boxes, graphics, colors, syllables, doclayout)."""
from .box_detect import * # noqa: F401,F403

View File

@@ -21,7 +21,7 @@ from typing import List, Optional, Tuple
import cv2
import numpy as np
from cv_vocab_types import DetectedBox, PageZone
from ..types import DetectedBox, PageZone
logger = logging.getLogger(__name__)

View File

@@ -127,7 +127,7 @@ def detect_graphic_elements(
backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
if backend in ("doclayout", "auto"):
try:
from cv_doclayout_detect import detect_layout_regions, is_doclayout_available
from .doclayout_detect import detect_layout_regions, is_doclayout_available
if is_doclayout_available():
regions = detect_layout_regions(img_bgr)
if regions:

View File

@@ -0,0 +1,2 @@
"""Syllable detection sub-package."""
from .detect import * # noqa: F401,F403

View File

@@ -10,7 +10,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Core: init, validation, autocorrect
from cv_syllable_core import ( # noqa: F401
from .core import ( # noqa: F401
_IPA_RE,
_STOP_WORDS,
_get_hyphenators,
@@ -23,7 +23,7 @@ from cv_syllable_core import ( # noqa: F401
)
# Merge: gap merging, syllabify, insert
from cv_syllable_merge import ( # noqa: F401
from .merge import ( # noqa: F401
_try_merge_pipe_gaps,
merge_word_gaps_in_zones,
_try_merge_word_gaps,

View File

@@ -13,7 +13,7 @@ from typing import Any, Dict, List, Optional
import numpy as np
from cv_syllable_core import (
from .core import (
_get_hyphenators,
_hyphenate_word,
_IPA_RE,

View File

@@ -0,0 +1,2 @@
"""OCR engines sub-package."""
from .engines import * # noqa: F401,F403

View File

@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
from ..types import PageRegion, RowGeometry
logger = logging.getLogger(__name__)

View File

@@ -11,14 +11,14 @@ import logging
import re
from typing import Any, Dict, List
from cv_vocab_types import IPA_AVAILABLE
from ..types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
from .ipa_lookup import (
_insert_missing_ipa,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
from .ipa_repair import (
_has_non_dict_trailing,
_insert_headword_ipa,
_strip_post_bracket_garbled,

View File

@@ -24,7 +24,7 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
from ..types import (
IPA_AVAILABLE,
PageRegion,
RowGeometry,
@@ -47,7 +47,7 @@ except ImportError:
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
from cv_ocr_word_assembly import ( # noqa: F401
from .word_assembly import ( # noqa: F401
_group_words_into_lines,
_words_to_reading_order_lines,
_rejoin_hyphenated,
@@ -55,7 +55,7 @@ from cv_ocr_word_assembly import ( # noqa: F401
_words_to_spaced_text,
)
from cv_ocr_vocab_postprocess import ( # noqa: F401
from .vocab_postprocess import ( # noqa: F401
_CHAR_CONFUSION_RULES,
_DE_INDICATORS_FOR_EN_I,
_fix_character_confusion,
@@ -66,7 +66,7 @@ from cv_ocr_vocab_postprocess import ( # noqa: F401
_attach_example_sentences,
)
from cv_ocr_ipa_lookup import ( # noqa: F401
from .ipa_lookup import ( # noqa: F401
_PHONETIC_BRACKET_RE,
_IPA_CHARS,
_MIN_WORD_CONF,
@@ -80,20 +80,20 @@ from cv_ocr_ipa_lookup import ( # noqa: F401
_insert_missing_ipa,
)
from cv_ocr_ipa_repair import ( # noqa: F401
from .ipa_repair import ( # noqa: F401
_has_non_dict_trailing,
_strip_post_bracket_garbled,
fix_ipa_continuation_cell,
_insert_headword_ipa,
)
from cv_ocr_cell_phonetics import ( # noqa: F401
from .cell_phonetics import ( # noqa: F401
fix_cell_phonetics,
_has_ipa_gap,
_sync_word_boxes_after_ipa_insert,
)
from cv_ocr_cell_filter import ( # noqa: F401
from .cell_filter import ( # noqa: F401
_RE_REAL_WORD,
_RE_ALPHA,
_COMMON_SHORT_WORDS,

View File

@@ -23,7 +23,7 @@ import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
from ..types import (
IPA_AVAILABLE,
_britfone_dict,
_ipa_convert_american,

View File

@@ -16,8 +16,8 @@ import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
from ..types import IPA_AVAILABLE
from .ipa_lookup import (
_lookup_ipa,
_GRAMMAR_BRACKET_WORDS,
)

View File

@@ -0,0 +1,2 @@
"""Gutter repair sub-package."""
from .repair import * # noqa: F401,F403

View File

@@ -11,7 +11,7 @@ import logging
import time
from typing import Any, Dict, List, Tuple
from cv_gutter_repair_core import (
from .core import (
_init_spellcheckers,
_is_ipa_text,
_is_known,

View File

@@ -10,7 +10,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Core: spellchecker, data types, repair helpers
from cv_gutter_repair_core import ( # noqa: F401
from .core import ( # noqa: F401
_init_spellcheckers,
_is_known,
_spell_candidates,
@@ -29,7 +29,7 @@ from cv_gutter_repair_core import ( # noqa: F401
)
# Grid: analysis and application
from cv_gutter_repair_grid import ( # noqa: F401
from .grid import ( # noqa: F401
analyse_grid_for_gutter_repair,
apply_gutter_suggestions,
)

View File

@@ -26,7 +26,7 @@ def _lookup_ipa_de(word: str) -> Optional[str]:
Returns IPA string or None if not found.
"""
from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
from .types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE
if not DE_IPA_AVAILABLE and _epitran_de is None:
return None
@@ -109,7 +109,7 @@ def insert_german_ipa(
Returns:
Number of cells modified.
"""
from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de
from .types import DE_IPA_AVAILABLE, _epitran_de
if not DE_IPA_AVAILABLE and _epitran_de is None:
logger.warning("German IPA not available — skipping")

View File

@@ -0,0 +1,2 @@
"""Layout analysis sub-package."""
from .layout import * # noqa: F401,F403

View File

@@ -13,8 +13,8 @@ from typing import List
import numpy as np
from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds
from ..types import PageRegion
from .detection import _find_content_bounds
logger = logging.getLogger(__name__)
@@ -246,7 +246,7 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
# Add header/footer info (gap-based detection with fallback)
# Lazy import to avoid a circular dependency with ocr/layout/layout.py (formerly cv_layout.py)
from cv_layout_detection import _add_header_footer
from .detection import _add_header_footer
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')

View File

@@ -11,16 +11,16 @@ from typing import Dict, List, Optional
import numpy as np
from cv_vocab_types import ColumnGeometry, PageRegion
from ..types import ColumnGeometry, PageRegion
from cv_layout_scoring import (
from .scoring import (
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify_position import (
from .classify_position import (
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
@@ -211,7 +211,7 @@ def classify_column_types(geometries: List[ColumnGeometry],
# _add_header_footer is imported from .detection (formerly cv_layout_detection).
# The import is kept lazy (function-level) to avoid a circular import at module
# level and so this module can be tested independently.
from cv_layout_detection import _add_header_footer # noqa: E402
from .detection import _add_header_footer # noqa: E402
content_h = bottom_y - top_y

View File

@@ -11,7 +11,7 @@ Extracted from cv_layout_classify.py during file-size split.
import logging
from typing import Dict, List, Optional
from cv_vocab_types import ColumnGeometry, PageRegion
from ..types import ColumnGeometry, PageRegion
logger = logging.getLogger(__name__)

View File

@@ -16,7 +16,7 @@ from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
from ..types import ColumnGeometry
logger = logging.getLogger(__name__)

View File

@@ -19,8 +19,8 @@ from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds
from ..types import ColumnGeometry
from .detection import _find_content_bounds
logger = logging.getLogger(__name__)

View File

@@ -13,7 +13,7 @@ from typing import List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
from ..types import (
DocumentTypeResult,
PageRegion,
)

View File

@@ -21,14 +21,14 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion
from ..types import ColumnGeometry, DetectedBox, PageRegion
logger = logging.getLogger(__name__)
# ── Re-exports (backward compatibility) ───────────────────────────────────
from cv_layout_detection import ( # noqa: F401
from .detection import ( # noqa: F401
detect_document_type,
create_ocr_image,
create_layout_image,
@@ -39,46 +39,46 @@ from cv_layout_detection import ( # noqa: F401
_add_header_footer,
)
from cv_layout_analyze import ( # noqa: F401
from .analyze import ( # noqa: F401
analyze_layout,
)
from cv_layout_columns import ( # noqa: F401
from .columns import ( # noqa: F401
detect_column_geometry,
_detect_columns_by_clustering,
_build_geometries_from_starts,
)
from cv_layout_column_refine import ( # noqa: F401
from .column_refine import ( # noqa: F401
_detect_sub_columns,
_split_broad_columns,
expand_narrow_columns,
)
from cv_layout_rows import ( # noqa: F401
from .rows import ( # noqa: F401
detect_row_geometry,
_build_rows_from_word_grouping,
)
from cv_layout_row_regularize import ( # noqa: F401
from .row_regularize import ( # noqa: F401
_regularize_row_grid,
)
from cv_layout_scoring import ( # noqa: F401
from .scoring import ( # noqa: F401
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify import ( # noqa: F401
from .classify import ( # noqa: F401
_build_margin_regions,
positional_column_regions,
classify_column_types,
_classify_by_content,
)
from cv_layout_classify_position import ( # noqa: F401
from .classify_position import ( # noqa: F401
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
@@ -143,7 +143,7 @@ def detect_column_geometry_zoned(
per content zone on the corresponding sub-image.
4. If no boxes: delegates entirely to detect_column_geometry().
"""
from cv_box_detect import detect_boxes, split_page_into_zones
from ..detect.box_detect import detect_boxes, split_page_into_zones
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
if geo_result is None:

View File

@@ -13,7 +13,7 @@ from typing import Dict, List
import numpy as np
from cv_vocab_types import RowGeometry
from ..types import RowGeometry
logger = logging.getLogger(__name__)

View File

@@ -20,9 +20,9 @@ try:
except ImportError:
cv2 = None # type: ignore[assignment]
from cv_vocab_types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid
from ..types import RowGeometry
from ..engines.word_assembly import _group_words_into_lines
from .row_regularize import _regularize_row_grid
logger = logging.getLogger(__name__)

View File

@@ -11,7 +11,7 @@ import logging
from collections import Counter
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
from ..types import (
ColumnGeometry,
ENGLISH_FUNCTION_WORDS,
GERMAN_FUNCTION_WORDS,

View File

@@ -14,24 +14,24 @@ Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
from cv_vocab_types import * # noqa: F401,F403
from cv_preprocessing import * # noqa: F401,F403
from cv_layout import * # noqa: F401,F403
from cv_ocr_engines import * # noqa: F401,F403
from cv_cell_grid import * # noqa: F401,F403
from cv_box_detect import * # noqa: F401,F403
from cv_review import * # noqa: F401,F403
from .types import * # noqa: F401,F403
from .preprocessing.preprocessing import * # noqa: F401,F403
from .layout.layout import * # noqa: F401,F403
from .engines.engines import * # noqa: F401,F403
from .cell_grid.cell_grid import * # noqa: F401,F403
from .detect.box_detect import * # noqa: F401,F403
from .review.review import * # noqa: F401,F403
# Private names used by consumers — not covered by wildcard re-exports.
from cv_preprocessing import _apply_shear # noqa: F401
from cv_layout import ( # noqa: F401
from .preprocessing.preprocessing import _apply_shear # noqa: F401
from .layout.layout import ( # noqa: F401
_detect_header_footer_gaps,
_detect_sub_columns,
_split_broad_columns,
)
from cv_ocr_engines import ( # noqa: F401
from .engines.engines import ( # noqa: F401
_fix_character_confusion,
_fix_phonetic_brackets,
)
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
from cv_words_first import build_grid_from_words # noqa: F401
from .cell_grid.cell_grid import _cells_to_vocab_entries # noqa: F401
from .words_first import build_grid_from_words # noqa: F401

View File

@@ -0,0 +1,2 @@
"""Preprocessing sub-package (deskew, dewarp, image I/O)."""
from .preprocessing import * # noqa: F401,F403

View File

@@ -11,7 +11,7 @@ from typing import Any, Dict, Tuple
import numpy as np
from cv_vocab_types import (
from ..types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)

View File

@@ -16,7 +16,7 @@ from typing import Any, Dict, List, Tuple
import numpy as np
from cv_vocab_types import (
from ..types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)

View File

@@ -17,7 +17,7 @@ from typing import Tuple
import numpy as np
from cv_vocab_types import (
from ..types import (
CV2_AVAILABLE,
TESSERACT_AVAILABLE,
)
@@ -38,7 +38,7 @@ except ImportError:
Image = None # type: ignore[assignment,misc]
# Re-export all deskew functions
from cv_preprocessing_deskew import ( # noqa: F401
from .deskew import ( # noqa: F401
deskew_image,
deskew_image_by_word_alignment,
deskew_image_iterative,
@@ -48,7 +48,7 @@ from cv_preprocessing_deskew import ( # noqa: F401
)
# Re-export all dewarp functions
from cv_preprocessing_dewarp import ( # noqa: F401
from .dewarp import ( # noqa: F401
_apply_shear,
_detect_shear_angle,
_detect_shear_by_hough,

View File

@@ -0,0 +1,2 @@
"""Review sub-package (spell, LLM, pipeline orchestration)."""
from .review import * # noqa: F401,F403

View File

@@ -183,7 +183,7 @@ async def llm_review_entries(
model: str = None,
) -> Dict:
"""OCR error correction. Uses spell-checker (REVIEW_ENGINE=spell) or LLM (REVIEW_ENGINE=llm)."""
from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE
from .spell import spell_review_entries_sync, _SPELL_AVAILABLE
if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
return spell_review_entries_sync(entries)
@@ -260,8 +260,8 @@ async def llm_review_entries_streaming(
Phase 0 (always): Run _fix_character_confusion and emit any changes.
"""
from cv_ocr_engines import _fix_character_confusion
from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE
from ..engines.engines import _fix_character_confusion
from .spell import spell_review_entries_streaming, _SPELL_AVAILABLE
_CONF_FIELDS = ('english', 'german', 'example')
originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]

View File

@@ -13,24 +13,24 @@ from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import (
from ..types import (
CV_PIPELINE_AVAILABLE,
PageRegion,
PipelineResult,
VocabRow,
)
from cv_preprocessing import (
from ..preprocessing.preprocessing import (
deskew_image,
dewarp_image,
render_image_high_res,
render_pdf_high_res,
)
from cv_layout import (
from ..layout.layout import (
analyze_layout,
create_layout_image,
create_ocr_image,
)
from cv_ocr_engines import (
from ..engines.engines import (
_group_words_into_lines,
)

View File

@@ -12,7 +12,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
# Re-export everything for backward compatibility
from cv_review_pipeline import ( # noqa: F401
from .pipeline import ( # noqa: F401
ocr_region,
run_multi_pass_ocr,
match_lines_to_vocab,
@@ -20,7 +20,7 @@ from cv_review_pipeline import ( # noqa: F401
run_cv_pipeline,
)
from cv_review_spell import ( # noqa: F401
from .spell import ( # noqa: F401
_SPELL_AVAILABLE,
_spell_dict_knows,
_spell_fix_field,
@@ -31,7 +31,7 @@ from cv_review_spell import ( # noqa: F401
spell_review_entries_streaming,
)
from cv_review_llm import ( # noqa: F401
from .llm import ( # noqa: F401
OLLAMA_REVIEW_MODEL,
REVIEW_ENGINE,
_REVIEW_BATCH_SIZE,

View File

@@ -210,7 +210,7 @@ def spell_review_entries_sync(entries: List[Dict]) -> Dict:
Uses SmartSpellChecker for language-aware corrections with context-based
disambiguation (a/I), multi-digit substitution, and cross-language guard.
"""
from cv_review_llm import _entry_needs_review
from .llm import _entry_needs_review
t0 = time.time()
changes: List[Dict] = []

View File

@@ -19,7 +19,7 @@ import re
import statistics
from typing import Any, Dict, List, Optional, Tuple
from cv_ocr_engines import (
from .engines.engines import (
_group_words_into_lines,
_words_to_reading_order_text,
)

View File

@@ -0,0 +1,4 @@
# Backward-compat shim -- module moved to ocr/image_enhance.py
import importlib as _importlib
import sys as _sys
# Rebind this module's own sys.modules entry to the relocated module, so the
# legacy module name and "ocr.image_enhance" refer to one module object.
_sys.modules[__name__] = _importlib.import_module("ocr.image_enhance")