refactor: cv_vocab_pipeline.py in 6 Module aufteilen (8163 → 6 + Fassade)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 1m59s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 18s
Monolithische 8163-Zeilen-Datei aufgeteilt in fokussierte Module: - cv_vocab_types.py (156 Z.): Datenklassen, Konstanten, IPA, Feature-Flags - cv_preprocessing.py (1166 Z.): Bild-I/O, Orientierung, Deskew, Dewarp - cv_layout.py (3036 Z.): Dokumenttyp, Spalten, Zeilen, Klassifikation - cv_ocr_engines.py (1282 Z.): OCR-Engines, Vocab-Postprocessing, Text-Cleaning - cv_cell_grid.py (1510 Z.): Cell-Grid v2+Legacy, Vocab-Konvertierung - cv_review.py (1184 Z.): LLM/Spell Review, Pipeline-Orchestrierung cv_vocab_pipeline.py ist jetzt eine Re-Export-Fassade (35 Z.) — alle bestehenden Imports bleiben unveraendert. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1510
klausur-service/backend/cv_cell_grid.py
Normal file
1510
klausur-service/backend/cv_cell_grid.py
Normal file
File diff suppressed because it is too large
Load Diff
3036
klausur-service/backend/cv_layout.py
Normal file
3036
klausur-service/backend/cv_layout.py
Normal file
File diff suppressed because it is too large
Load Diff
1282
klausur-service/backend/cv_ocr_engines.py
Normal file
1282
klausur-service/backend/cv_ocr_engines.py
Normal file
File diff suppressed because it is too large
Load Diff
1166
klausur-service/backend/cv_preprocessing.py
Normal file
1166
klausur-service/backend/cv_preprocessing.py
Normal file
File diff suppressed because it is too large
Load Diff
1184
klausur-service/backend/cv_review.py
Normal file
1184
klausur-service/backend/cv_review.py
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
156
klausur-service/backend/cv_vocab_types.py
Normal file
156
klausur-service/backend/cv_vocab_types.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
"""
|
||||||
|
Shared types, constants, and availability guards for the CV vocabulary pipeline.
|
||||||
|
|
||||||
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||||
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re # noqa: F401 — re-exported for downstream modules
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import numpy as np # noqa: F401
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# --- Availability Guards ---
|
||||||
|
|
||||||
|
try:
|
||||||
|
import cv2 # noqa: F401
|
||||||
|
CV2_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
cv2 = None # type: ignore[assignment]
|
||||||
|
CV2_AVAILABLE = False
|
||||||
|
logger.warning("OpenCV not available — CV pipeline disabled")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pytesseract # noqa: F401
|
||||||
|
from PIL import Image # noqa: F401
|
||||||
|
TESSERACT_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
pytesseract = None # type: ignore[assignment]
|
||||||
|
Image = None # type: ignore[assignment,misc]
|
||||||
|
TESSERACT_AVAILABLE = False
|
||||||
|
logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
|
||||||
|
|
||||||
|
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||||||
|
|
||||||
|
# --- IPA Dictionary ---
|
||||||
|
|
||||||
|
IPA_AVAILABLE = False
|
||||||
|
_ipa_convert_american = None
|
||||||
|
_britfone_dict: Dict[str, str] = {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
import eng_to_ipa as _eng_to_ipa
|
||||||
|
_ipa_convert_american = _eng_to_ipa.convert
|
||||||
|
IPA_AVAILABLE = True
|
||||||
|
logger.info("eng_to_ipa available — American IPA lookup enabled")
|
||||||
|
except ImportError:
|
||||||
|
logger.info("eng_to_ipa not installed — American IPA disabled")
|
||||||
|
|
||||||
|
# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
|
||||||
|
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
|
||||||
|
if os.path.exists(_britfone_path):
|
||||||
|
try:
|
||||||
|
with open(_britfone_path, 'r', encoding='utf-8') as f:
|
||||||
|
_britfone_dict = json.load(f)
|
||||||
|
IPA_AVAILABLE = True
|
||||||
|
logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to load Britfone: {e}")
|
||||||
|
else:
|
||||||
|
logger.info("Britfone not found — British IPA disabled")
|
||||||
|
|
||||||
|
# --- Language Detection Constants ---
# High-frequency function words used to score whether an OCR text run is
# German or English.  Umlauts appear ASCII-transliterated (fuer, ueber)
# to match typical OCR output.

GERMAN_FUNCTION_WORDS = set(
    "der die das und ist ein eine nicht "
    "von zu mit auf fuer den dem sich auch wird "
    "nach bei aus wie oder wenn noch aber hat nur "
    "ueber kann als ich er sie es wir ihr haben "
    "sein werden war sind muss soll dieser diese diesem".split()
)

ENGLISH_FUNCTION_WORDS = set(
    "the a an is are was were to of "
    "and in that it for on with as at by from "
    "or but not be have has had do does did will "
    "would can could should may might this they you he "
    "she we my your his her its our their which".split()
)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data Classes ---
|
||||||
|
|
||||||
|
@dataclass
class PageRegion:
    """A detected region on the page."""
    # Region kind: 'column_en', 'column_de', 'column_example', 'page_ref',
    # 'column_marker', 'column_text', 'header', 'footer',
    # 'margin_top', 'margin_bottom'
    type: str
    x: int
    y: int
    width: int
    height: int
    classification_confidence: float = 1.0  # 0.0-1.0
    # How the type was decided: 'content', 'position_enhanced', 'position_fallback'
    classification_method: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ColumnGeometry:
    """Geometrically detected column, prior to type classification."""
    index: int  # 0-based, left-to-right
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    width_ratio: float  # width / content_width (0.0-1.0)
    is_sub_column: bool = False  # True if created by a _detect_sub_columns() split
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification."""
    index: int  # 0-based, top to bottom
    x: int  # absolute left (= content left_x)
    y: int  # absolute y start
    width: int  # content width
    height: int  # row height in px
    word_count: int
    words: List[Dict]  # word dicts from Tesseract (text, conf, left, top, ...)
    row_type: str = 'content'  # 'content' | 'header' | 'footer'
    gap_before: int = 0  # gap in px above this row
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR."""
    english: str = ""  # text from the English column
    german: str = ""  # text from the German column
    example: str = ""  # text from the example column
    source_page: str = ""  # page reference (cf. PageRegion type 'page_ref')
    confidence: float = 0.0  # presumably aggregated OCR confidence — confirm in assembler
    y_position: int = 0  # vertical position, used for row ordering
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline."""
    vocabulary: List[Dict[str, Any]] = field(default_factory=list)  # extracted entries
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0  # total pipeline runtime
    stages: Dict[str, float] = field(default_factory=dict)  # stage name -> seconds (inferred from types — confirm)
    error: Optional[str] = None  # set when the pipeline failed; None on success
    image_width: int = 0
    image_height: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection."""
    doc_type: str  # 'vocab_table' | 'full_text' | 'generic_table'
    confidence: float  # 0.0-1.0
    pipeline: str  # 'cell_first' | 'full_page'
    skip_steps: List[str] = field(default_factory=list)  # e.g. ['columns', 'rows']
    features: Dict[str, Any] = field(default_factory=dict)  # debug info
|
||||||
Reference in New Issue
Block a user