refactor: cv_vocab_pipeline.py in 6 Module aufteilen (8163 → 6 + Fassade)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module:

- cv_vocab_types.py (156 Z.): Datenklassen, Konstanten, IPA, Feature-Flags
- cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp
- cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation
- cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning
- cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung
- cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung

cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,156 @@
|
||||
"""
|
||||
Shared types, constants, and availability guards for the CV vocabulary pipeline.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re # noqa: F401 — re-exported for downstream modules
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np # noqa: F401
|
||||
|
||||
logger = logging.getLogger(__name__)

# --- Availability Guards ---

# OpenCV is optional. When the import fails, `cv2` is bound to None and
# CV2_AVAILABLE is False; a warning is logged once at import time.
try:
    import cv2  # noqa: F401
except ImportError:
    cv2 = None  # type: ignore[assignment]
    CV2_AVAILABLE = False
    logger.warning("OpenCV not available — CV pipeline disabled")
else:
    CV2_AVAILABLE = True
|
||||
|
||||
# Tesseract OCR needs both the pytesseract wrapper and Pillow. If either
# import fails, both names are reset to None and the flag is False.
try:
    import pytesseract  # noqa: F401
    from PIL import Image  # noqa: F401
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
else:
    TESSERACT_AVAILABLE = True

# True only when both OpenCV and the Tesseract/Pillow pair imported.
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||||
|
||||
# --- IPA Dictionary ---
|
||||
|
||||
# Defaults used when no IPA source can be loaded.
IPA_AVAILABLE = False
_ipa_convert_american = None
_britfone_dict: Dict[str, str] = {}

# American IPA comes from the optional `eng_to_ipa` package; its `convert`
# function is kept as the lookup callable when the import succeeds.
try:
    import eng_to_ipa as _eng_to_ipa
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")
else:
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")
|
||||
|
||||
# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
    # The broad catch is deliberate: this runs at import time and an unreadable
    # or corrupt optional data file must not prevent the module from loading.
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_raw = json.load(f)
    except Exception as e:
        logger.warning("Failed to load Britfone: %s", e)
    else:
        # json.load may return any JSON type; only a JSON object is usable as
        # the lookup table, so validate before replacing the empty default and
        # before flagging IPA as available.
        if isinstance(_britfone_raw, dict):
            _britfone_dict = _britfone_raw
            IPA_AVAILABLE = True
            logger.info("Britfone loaded — %d British IPA entries", len(_britfone_dict))
        else:
            logger.warning(
                "Failed to load Britfone: expected a JSON object, got %s",
                type(_britfone_raw).__name__,
            )
else:
    logger.info("Britfone not found — British IPA disabled")
|
||||
|
||||
# --- Language Detection Constants ---
|
||||
|
||||
# Common German function words (umlauts transliterated, e.g. 'fuer', 'ueber').
GERMAN_FUNCTION_WORDS = {
    # articles and demonstratives
    "der", "die", "das", "den", "dem", "ein", "eine",
    "dieser", "diese", "diesem",
    # pronouns
    "ich", "er", "sie", "es", "wir", "ihr", "sich",
    # prepositions
    "von", "zu", "mit", "auf", "fuer", "nach", "bei", "aus", "ueber",
    # conjunctions and particles
    "und", "oder", "aber", "wenn", "wie", "als",
    "nicht", "auch", "noch", "nur",
    # verb forms
    "ist", "sind", "war", "sein", "hat", "haben",
    "wird", "werden", "kann", "muss", "soll",
}
|
||||
|
||||
# Common English function words.
ENGLISH_FUNCTION_WORDS = {
    # articles and determiners
    "the", "a", "an", "this", "that", "which",
    # pronouns and possessives
    "it", "they", "you", "he", "she", "we",
    "my", "your", "his", "her", "its", "our", "their",
    # prepositions
    "to", "of", "in", "for", "on", "with", "as", "at", "by", "from",
    # conjunctions and negation
    "and", "or", "but", "not",
    # auxiliary and modal verbs
    "is", "are", "was", "were", "be",
    "have", "has", "had", "do", "does", "did",
    "will", "would", "can", "could", "should", "may", "might",
}
|
||||
|
||||
|
||||
# --- Data Classes ---
|
||||
|
||||
@dataclass
class PageRegion:
    """A rectangular region detected on the page.

    Attributes:
        type: Region label, e.g. 'column_en', 'column_de', 'column_example',
            'page_ref', 'column_marker', 'column_text', 'header', 'footer',
            'margin_top', 'margin_bottom'.
        x: Horizontal position of the region.
        y: Vertical position of the region.
        width: Region width.
        height: Region height.
        classification_confidence: Confidence of the label, 0.0-1.0.
        classification_method: How the label was assigned: 'content',
            'position_enhanced' or 'position_fallback'.
    """

    type: str
    x: int
    y: int
    width: int
    height: int
    classification_confidence: float = 1.0
    classification_method: str = ""
|
||||
|
||||
|
||||
@dataclass
class ColumnGeometry:
    """A geometrically detected column, before type classification.

    Attributes:
        index: Zero-based column index, left to right.
        x: Horizontal position of the column.
        y: Vertical position of the column.
        width: Column width.
        height: Column height.
        word_count: Number of words in the column.
        words: Word dicts from Tesseract (text, conf, left, top, ...).
        width_ratio: width / content_width (0.0-1.0).
        is_sub_column: True if created by a _detect_sub_columns() split.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    width_ratio: float
    is_sub_column: bool = False
|
||||
|
||||
|
||||
@dataclass
class RowGeometry:
    """A geometrically detected row with header/footer classification.

    Attributes:
        index: Zero-based row index, top to bottom.
        x: Absolute left edge (= content left_x).
        y: Absolute y start.
        width: Content width.
        height: Row height in px.
        word_count: Number of words in the row.
        words: Word dicts belonging to the row.
        row_type: 'content', 'header' or 'footer'.
        gap_before: Gap in px above this row.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    row_type: str = 'content'
    gap_before: int = 0
|
||||
|
||||
|
||||
@dataclass
class VocabRow:
    """One vocabulary entry assembled from multi-column OCR.

    Every field has a default, so an empty row can be created with no
    arguments: text fields default to "", `confidence` to 0.0 and
    `y_position` to 0.
    """

    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    confidence: float = 0.0
    y_position: int = 0
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Complete result of one CV pipeline run.

    All fields are optional at construction time. The list and dict fields
    use per-instance factories, so results never share mutable state.
    `error` stays None unless a message is set.
    """

    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    stages: Dict[str, float] = field(default_factory=dict)
    error: Optional[str] = None
    image_width: int = 0
    image_height: int = 0
|
||||
|
||||
|
||||
@dataclass
class DocumentTypeResult:
    """Outcome of automatic document type detection.

    Attributes:
        doc_type: 'vocab_table', 'full_text' or 'generic_table'.
        confidence: Detection confidence, 0.0-1.0.
        pipeline: 'cell_first' or 'full_page'.
        skip_steps: Step names to skip, e.g. ['columns', 'rows'].
        features: Debug info.
    """

    doc_type: str
    confidence: float
    pipeline: str
    skip_steps: List[str] = field(default_factory=list)
    features: Dict[str, Any] = field(default_factory=dict)
|
||||
Reference in New Issue
Block a user