refactor: cv_vocab_pipeline.py in 6 Module aufteilen (8163 Zeilen → 6 Module + Fassade)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s

Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module:
- cv_vocab_types.py (156 Z.): Datenklassen, Konstanten, IPA, Feature-Flags
- cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp
- cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation
- cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning
- cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung
- cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung

cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) —
alle bestehenden Imports bleiben unveraendert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-08 23:46:47 +01:00
parent 931ab92c92
commit 9a5a35bff1
7 changed files with 8359 additions and 8153 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,156 @@
"""
Shared types, constants, and availability guards for the CV vocabulary pipeline.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import json
import logging
import os
import re # noqa: F401 — re-exported for downstream modules
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import numpy as np # noqa: F401
logger = logging.getLogger(__name__)

# --- Availability Guards ---
# Each optional dependency is imported defensively. On failure the missing
# names are bound to None, the matching *_AVAILABLE flag is set to False and
# a warning is logged, so importing this module itself never raises.

try:
    import cv2  # noqa: F401
except ImportError:
    cv2 = None  # type: ignore[assignment]
    CV2_AVAILABLE = False
    logger.warning("OpenCV not available — CV pipeline disabled")
else:
    CV2_AVAILABLE = True

# pytesseract and Pillow are treated as one unit: if either import fails,
# both names are reset to None.
try:
    import pytesseract  # noqa: F401
    from PIL import Image  # noqa: F401
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
else:
    TESSERACT_AVAILABLE = True

# The pipeline flag is True only when both dependency groups imported.
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
# --- IPA Dictionary ---
# Defaults for the "no IPA source loaded" state. IPA_AVAILABLE is switched to
# True once an IPA source has been loaded successfully.
IPA_AVAILABLE = False
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}

# American IPA: optional eng_to_ipa package; only its `convert` callable is kept.
try:
    import eng_to_ipa as _eng_to_ipa
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")
else:
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")
# Load the Britfone dictionary (MIT license, ~15k British English IPA entries).
# Loading is best-effort: any failure is logged as a warning and leaves
# _britfone_dict and IPA_AVAILABLE at the values they had before this section.
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_loaded = json.load(f)
        # json.load() returns whatever the top-level JSON value is (list,
        # str, None, ...). Only a JSON object can serve as the lookup table,
        # so anything else is rejected here instead of being published as
        # _britfone_dict with IPA_AVAILABLE set to True.
        if not isinstance(_britfone_loaded, dict):
            raise TypeError(
                f"expected a JSON object, got {type(_britfone_loaded).__name__}"
            )
        _britfone_dict = _britfone_loaded
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        # Deliberately broad: a broken data file must not prevent importing
        # this module.
        logger.warning(f"Failed to load Britfone: {e}")
else:
    logger.info("Britfone not found — British IPA disabled")
# --- Language Detection Constants ---
# Function-word sets for German and English, listed alphabetically so that
# duplicates and omissions are easy to spot. German entries use ASCII
# transliteration ('fuer', 'ueber').
GERMAN_FUNCTION_WORDS = {
    "aber", "als", "auch", "auf", "aus", "bei",
    "das", "dem", "den", "der", "die", "diese", "diesem", "dieser",
    "ein", "eine", "er", "es",
    "fuer",
    "haben", "hat",
    "ich", "ihr", "ist",
    "kann",
    "mit", "muss",
    "nach", "nicht", "noch", "nur",
    "oder",
    "sein", "sich", "sie", "sind", "soll",
    "ueber", "und",
    "von",
    "war", "wenn", "werden", "wie", "wir", "wird",
    "zu",
}

ENGLISH_FUNCTION_WORDS = {
    "a", "an", "and", "are", "as", "at",
    "be", "but", "by",
    "can", "could",
    "did", "do", "does",
    "for", "from",
    "had", "has", "have", "he", "her", "his",
    "in", "is", "it", "its",
    "may", "might", "my",
    "not",
    "of", "on", "or", "our",
    "she", "should",
    "that", "the", "their", "they", "this", "to",
    "was", "we", "were", "which", "will", "with", "would",
    "you", "your",
}
# --- Data Classes ---
@dataclass
class PageRegion:
    """One detected region of a page.

    Attributes:
        type: Region kind. Values named by the original author:
            'column_en', 'column_de', 'column_example', 'page_ref',
            'column_marker', 'column_text', 'header', 'footer',
            'margin_top', 'margin_bottom'.
        x, y, width, height: Position and size of the region (units are not
            stated in this module — presumably pixels; confirm with callers).
        classification_confidence: Value in the range 0.0-1.0; defaults to 1.0.
        classification_method: 'content', 'position_enhanced' or
            'position_fallback'; defaults to the empty string.
    """

    type: str
    x: int
    y: int
    width: int
    height: int
    classification_confidence: float = 1.0
    classification_method: str = ""
@dataclass
class ColumnGeometry:
    """Geometrically detected column, before type classification.

    Attributes:
        index: 0-based, counted left to right.
        x, y, width, height: Position and size of the column (presumably
            pixels — not stated in this module).
        word_count: Presumably the number of entries in ``words``; not
            enforced here.
        words: Word dicts from Tesseract (text, conf, left, top, ...).
        width_ratio: width / content_width (0.0-1.0).
        is_sub_column: True if created by a _detect_sub_columns() split.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    width_ratio: float
    is_sub_column: bool = False
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification.

    Attributes:
        index: 0-based, counted top to bottom.
        x: Absolute left edge (= content left_x).
        y: Absolute y start.
        width: Content width.
        height: Row height in px.
        word_count: Presumably the number of entries in ``words``; not
            enforced here.
        words: Word dicts belonging to this row (schema not shown here).
        row_type: 'content' | 'header' | 'footer'; defaults to 'content'.
        gap_before: Gap in px above this row; defaults to 0.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    row_type: str = 'content'
    gap_before: int = 0
@dataclass
class VocabRow:
    """One vocabulary entry assembled from multi-column OCR.

    Every field has a default, so ``VocabRow()`` yields an empty entry:
    the four text fields default to the empty string, ``confidence`` to 0.0
    and ``y_position`` to 0.
    """

    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    confidence: float = 0.0
    y_position: int = 0
@dataclass
class PipelineResult:
    """Complete result of one CV pipeline run.

    All fields are optional at construction time. ``vocabulary`` and
    ``stages`` get a fresh empty container per instance, the numeric fields
    start at zero and ``error`` starts as None.
    """

    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    stages: Dict[str, float] = field(default_factory=dict)
    error: Optional[str] = None
    image_width: int = 0
    image_height: int = 0
@dataclass
class DocumentTypeResult:
    """Outcome of automatic document type detection.

    Attributes:
        doc_type: 'vocab_table' | 'full_text' | 'generic_table'.
        confidence: Value in the range 0.0-1.0.
        pipeline: 'cell_first' | 'full_page'.
        skip_steps: Step names, e.g. ['columns', 'rows']; a fresh empty list
            per instance by default.
        features: Debug info; a fresh empty dict per instance by default.
    """

    doc_type: str
    confidence: float
    pipeline: str
    skip_steps: List[str] = field(default_factory=list)
    features: Dict[str, Any] = field(default_factory=dict)