refactor: cv_vocab_pipeline.py in 6 Module aufteilen (8163 → 6 + Fassade)

Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module:
- cv_vocab_types.py (156 Z.): Datenklassen, Konstanten, IPA, Feature-Flags
- cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp
- cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation
- cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning
- cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung
- cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung
cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 23:46:47 +01:00
parent 931ab92c92
commit 9a5a35bff1
7 changed files with 8359 additions and 8153 deletions
@@ -0,0 +1,156 @@
+"""
+Shared types, constants, and availability guards for the CV vocabulary pipeline.
+
+Lizenz: Apache 2.0 (kommerziell nutzbar)
+DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
+"""
+
+import json
+import logging
+import os
+import re  # noqa: F401 — re-exported for downstream modules
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+import numpy as np  # noqa: F401
+
+logger = logging.getLogger(__name__)
+
+# --- Availability Guards ---
+#
+# Each optional third-party dependency is imported inside try/except so that
+# importing this module never fails on a missing package. On ImportError the
+# imported name is bound to None, the matching *_AVAILABLE flag is set to
+# False, and a warning is logged.
+
+try:
+    import cv2  # noqa: F401
+    CV2_AVAILABLE = True
+except ImportError:
+    # Keep the name `cv2` defined (as None) even when OpenCV is missing.
+    cv2 = None  # type: ignore[assignment]
+    CV2_AVAILABLE = False
+    logger.warning("OpenCV not available — CV pipeline disabled")
+
+# pytesseract and Pillow share one try block: if EITHER import fails, both
+# names are reset to None and TESSERACT_AVAILABLE is False.
+try:
+    import pytesseract  # noqa: F401
+    from PIL import Image  # noqa: F401
+    TESSERACT_AVAILABLE = True
+except ImportError:
+    pytesseract = None  # type: ignore[assignment]
+    Image = None  # type: ignore[assignment,misc]
+    TESSERACT_AVAILABLE = False
+    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
+
+# True only when both OpenCV and pytesseract/Pillow were imported successfully.
+CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
+
+# --- IPA Dictionary ---
+#
+# Two independent IPA sources are probed at import time. IPA_AVAILABLE starts
+# as False and is set to True as soon as at least one of them loads.
+
+IPA_AVAILABLE = False
+# Bound to eng_to_ipa.convert when that package is installed, otherwise None.
+_ipa_convert_american = None
+# Filled from the Britfone JSON file below; stays empty if the file is
+# missing or cannot be loaded.
+_britfone_dict: Dict[str, str] = {}
+
+try:
+    import eng_to_ipa as _eng_to_ipa
+    _ipa_convert_american = _eng_to_ipa.convert
+    IPA_AVAILABLE = True
+    logger.info("eng_to_ipa available — American IPA lookup enabled")
+except ImportError:
+    logger.info("eng_to_ipa not installed — American IPA disabled")
+
+# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
+# The file is looked up at data/britfone_ipa.json relative to this module.
+_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
+if os.path.exists(_britfone_path):
+    try:
+        # NOTE(review): `f` remains bound as a module-level name after this
+        # with-block.
+        with open(_britfone_path, 'r', encoding='utf-8') as f:
+            # NOTE(review): the parsed JSON is not checked to be a dict of
+            # str -> str — confirm against the shipped data file.
+            _britfone_dict = json.load(f)
+        IPA_AVAILABLE = True
+        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
+    except Exception as e:
+        # Best effort: any failure while opening or parsing is logged and
+        # swallowed; this branch then leaves IPA_AVAILABLE untouched.
+        logger.warning(f"Failed to load Britfone: {e}")
+else:
+    logger.info("Britfone not found — British IPA disabled")
+
+# --- Language Detection Constants ---
+#
+# Sets of lowercase function words for German and English.
+# NOTE(review): presumably matched against lowercased OCR tokens to guess a
+# text's language — confirm in the consuming modules.
+
+# German entries use ASCII transliterations instead of umlauts
+# (e.g. 'fuer', 'ueber').
+GERMAN_FUNCTION_WORDS = {'der', 'die', 'das', 'und', 'ist', 'ein', 'eine', 'nicht',
+    'von', 'zu', 'mit', 'auf', 'fuer', 'den', 'dem', 'sich', 'auch', 'wird',
+    'nach', 'bei', 'aus', 'wie', 'oder', 'wenn', 'noch', 'aber', 'hat', 'nur',
+    'ueber', 'kann', 'als', 'ich', 'er', 'sie', 'es', 'wir', 'ihr', 'haben',
+    'sein', 'werden', 'war', 'sind', 'muss', 'soll', 'dieser', 'diese', 'diesem'}
+
+ENGLISH_FUNCTION_WORDS = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'to', 'of',
+    'and', 'in', 'that', 'it', 'for', 'on', 'with', 'as', 'at', 'by', 'from',
+    'or', 'but', 'not', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
+    'would', 'can', 'could', 'should', 'may', 'might', 'this', 'they', 'you', 'he',
+    'she', 'we', 'my', 'your', 'his', 'her', 'its', 'our', 'their', 'which'}
+
+
+# --- Data Classes ---
+
+@dataclass
+class PageRegion:
+    """A detected region on the page, described by x/y/width/height.
+
+    ``type``, ``x``, ``y``, ``width`` and ``height`` are required; the two
+    classification fields have defaults.
+    """
+    # Region kind; listed values: 'column_en', 'column_de', 'column_example',
+    # 'page_ref', 'column_marker', 'column_text', 'header', 'footer',
+    # 'margin_top', 'margin_bottom'.
+    type: str           # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'column_text', 'header', 'footer', 'margin_top', 'margin_bottom'
+    # NOTE(review): x/y/width/height are presumably pixels in page-image
+    # coordinates — TODO confirm against the producers.
+    x: int
+    y: int
+    width: int
+    height: int
+    classification_confidence: float = 1.0   # 0.0-1.0
+    classification_method: str = ""          # 'content', 'position_enhanced', 'position_fallback'
+
+
+@dataclass
+class ColumnGeometry:
+    """Geometrically detected column, before type classification.
+
+    All fields except ``is_sub_column`` are required.
+    """
+    index: int              # 0-based, left to right
+    x: int
+    y: int
+    width: int
+    height: int
+    word_count: int
+    words: List[Dict]       # word dicts from Tesseract (text, conf, left, top, ...)
+    width_ratio: float      # width / content_width (0.0-1.0)
+    is_sub_column: bool = False  # True if created by _detect_sub_columns() split
+
+
+@dataclass
+class RowGeometry:
+    """Geometrically detected row with header/footer classification.
+
+    ``row_type`` and ``gap_before`` have defaults; all other fields are
+    required.
+    """
+    index: int              # 0-based, top to bottom
+    x: int                  # absolute left (= content left_x)
+    y: int                  # absolute y start
+    width: int              # content width
+    height: int             # row height in px
+    word_count: int
+    # NOTE(review): presumably the same Tesseract word-dict shape as
+    # ColumnGeometry.words — confirm.
+    words: List[Dict]
+    row_type: str = 'content'  # 'content' | 'header' | 'footer'
+    gap_before: int = 0     # gap in px above this row
+
+
+@dataclass
+class VocabRow:
+    """A single vocabulary entry assembled from multi-column OCR.
+
+    Every field has a default, so ``VocabRow()`` yields an empty entry.
+    """
+    english: str = ""
+    german: str = ""
+    example: str = ""
+    # NOTE(review): presumably the page-reference text for this entry (stored
+    # as a string, not a number) — confirm.
+    source_page: str = ""
+    # NOTE(review): range not stated here; other confidence fields in this
+    # module use 0.0-1.0 — confirm.
+    confidence: float = 0.0
+    # NOTE(review): presumably the vertical pixel position of the row on the
+    # page — confirm.
+    y_position: int = 0
+
+
+@dataclass
+class PipelineResult:
+    """Complete result of the CV pipeline.
+
+    Every field has a default. The list and dict fields use
+    ``default_factory`` so each instance gets its own container.
+    """
+    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
+    word_count: int = 0
+    columns_detected: int = 0
+    duration_seconds: float = 0.0
+    # NOTE(review): presumably maps stage name -> duration of that stage —
+    # confirm units with the orchestration code.
+    stages: Dict[str, float] = field(default_factory=dict)
+    # NOTE(review): presumably None on success and an error message on
+    # failure — confirm.
+    error: Optional[str] = None
+    image_width: int = 0
+    image_height: int = 0
+
+
+@dataclass
+class DocumentTypeResult:
+    """Result of automatic document type detection.
+
+    ``doc_type``, ``confidence`` and ``pipeline`` are required; ``skip_steps``
+    and ``features`` default to a fresh empty list / dict per instance.
+    """
+    doc_type: str           # 'vocab_table' | 'full_text' | 'generic_table'
+    confidence: float       # 0.0-1.0
+    pipeline: str           # 'cell_first' | 'full_page'
+    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
+    features: Dict[str, Any] = field(default_factory=dict)  # debug info