refactor: cv_vocab_pipeline.py in 6 Module aufteilen (8163 → 6 + Fassade)

Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module:
- cv_vocab_types.py (156 Z.): Datenklassen, Konstanten, IPA, Feature-Flags
- cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp
- cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation
- cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning
- cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung
- cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung

cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 23:46:47 +01:00
parent 931ab92c92
commit 9a5a35bff1
7 changed files with 8359 additions and 8153 deletions
@@ -0,0 +1,156 @@
 """
 Shared types, constants, and availability guards for the CV vocabulary pipeline.
 Lizenz: Apache 2.0 (kommerziell nutzbar)
 DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
 """
 import json
 import logging
 import os
 import re  # noqa: F401 — re-exported for downstream modules
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
 import numpy as np  # noqa: F401
 logger = logging.getLogger(__name__)
 # --- Availability Guards ---
# Each optional dependency is imported once while this module loads. When an
# import fails, the imported name is rebound to None, the matching
# *_AVAILABLE flag is set to False and a warning is logged; the ImportError
# itself is swallowed so loading continues.
try:
    import cv2  # noqa: F401
except ImportError:
    cv2 = None  # type: ignore[assignment]
    CV2_AVAILABLE = False
    logger.warning("OpenCV not available — CV pipeline disabled")
else:
    CV2_AVAILABLE = True

# pytesseract and Pillow are guarded together: if either import fails, both
# names are rebound to None.
try:
    import pytesseract  # noqa: F401
    from PIL import Image  # noqa: F401
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
else:
    TESSERACT_AVAILABLE = True

# True only when both OpenCV and the pytesseract/Pillow pair were imported.
CV_PIPELINE_AVAILABLE = TESSERACT_AVAILABLE and CV2_AVAILABLE
 # --- IPA Dictionary ---
# Two independent IPA sources are probed below; IPA_AVAILABLE ends up True as
# soon as at least one of them loads.
IPA_AVAILABLE = False
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}

# Source 1: the optional eng_to_ipa package; its convert() is kept as the
# American IPA lookup.
try:
    import eng_to_ipa as _eng_to_ipa
    _ipa_convert_american = _eng_to_ipa.convert
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")
else:
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")

# Source 2: Britfone dictionary (MIT license, ~15k British English IPA
# entries), read from data/britfone_ipa.json next to this module.
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if not os.path.exists(_britfone_path):
    logger.info("Britfone not found — British IPA disabled")
else:
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_dict = json.load(f)
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        # Best effort: a failed load is only logged, so importing this module
        # does not fail because of a broken dictionary file.
        logger.warning(f"Failed to load Britfone: {e}")
 # --- Language Detection Constants ---
# Lowercase German function words. Umlauts appear only in ASCII
# transliteration ('fuer', 'ueber'). The grouping below is purely for
# readability; the value is a plain, unordered set.
GERMAN_FUNCTION_WORDS = {
    # articles / determiners
    "der", "die", "das", "den", "dem", "ein", "eine",
    "dieser", "diese", "diesem",
    # pronouns
    "ich", "er", "sie", "es", "wir", "ihr", "sich",
    # conjunctions / particles
    "und", "oder", "aber", "wenn", "als", "wie", "auch", "noch", "nur",
    "nicht",
    # prepositions
    "von", "zu", "mit", "auf", "fuer", "nach", "bei", "aus", "ueber",
    # verbs / auxiliaries
    "ist", "sind", "war", "sein", "werden", "wird", "hat", "haben",
    "kann", "muss", "soll",
}

# Lowercase English function words, grouped the same way.
ENGLISH_FUNCTION_WORDS = {
    # articles / determiners
    "the", "a", "an", "this", "that", "which",
    # pronouns
    "it", "they", "you", "he", "she", "we",
    "my", "your", "his", "her", "its", "our", "their",
    # conjunctions / negation
    "and", "or", "but", "not",
    # prepositions
    "to", "of", "in", "for", "on", "with", "as", "at", "by", "from",
    # verbs / auxiliaries
    "is", "are", "was", "were", "be", "have", "has", "had",
    "do", "does", "did", "will", "would", "can", "could", "should",
    "may", "might",
}
 # --- Data Classes ---
@dataclass
class PageRegion:
    """A detected region on the page.

    ``x``, ``y``, ``width`` and ``height`` describe the region's box
    (presumably pixel coordinates in the page image; confirm against callers).
    """

    # Region kind, one of: 'column_en', 'column_de', 'column_example',
    # 'page_ref', 'column_marker', 'column_text', 'header', 'footer',
    # 'margin_top', 'margin_bottom'.
    type: str
    x: int
    y: int
    width: int
    height: int
    # Confidence of the classification, 0.0-1.0.
    classification_confidence: float = 1.0
    # How the type was assigned: 'content', 'position_enhanced' or
    # 'position_fallback'.
    classification_method: str = ""
@dataclass
class ColumnGeometry:
    """Geometrically detected column, before type classification."""

    index: int  # 0-based, left to right
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification."""

    index: int  # 0-based, top to bottom
    x: int  # absolute left (= content left_x)
    y: int  # absolute y start
    width: int  # content width
    height: int  # row height in px
    word_count: int
    words: List[Dict]
    row_type: str = 'content'  # 'content' | 'header' | 'footer'
    gap_before: int = 0  # gap in px above this row
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR.

    Every field has a default, so an empty row can be created with
    ``VocabRow()`` and filled in afterwards.
    """

    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    confidence: float = 0.0
    y_position: int = 0
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline.

    All fields have defaults; the list and dict fields use
    ``default_factory`` so each instance gets its own container.
    """

    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    # NOTE(review): presumably per-stage timings keyed by stage name; confirm
    # against the code that fills it.
    stages: Dict[str, float] = field(default_factory=dict)
    error: Optional[str] = None
    image_width: int = 0
    image_height: int = 0
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection."""

    doc_type: str  # 'vocab_table' | 'full_text' | 'generic_table'
    confidence: float  # 0.0-1.0
    pipeline: str  # 'cell_first' | 'full_page'
    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
    features: Dict[str, Any] = field(default_factory=dict)  # debug info