"""
Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.

Extracted from cv_gutter_repair.py for modularity.

License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""

import itertools
import logging
import re
import uuid
from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Spellchecker setup (lazy, cached)
# ---------------------------------------------------------------------------

_spell_de = None
_spell_en = None
_SPELL_AVAILABLE = False
# True once a load attempt has finished (successfully or with ImportError).
# Without this flag a missing pyspellchecker would be re-imported — and the
# warning re-logged — on every single dictionary lookup.
_SPELL_INIT_ATTEMPTED = False


def _init_spellcheckers():
    """Lazy-load the DE + EN spellcheckers (at most one attempt per process).

    On success the module globals ``_spell_de`` / ``_spell_en`` are populated
    and ``_SPELL_AVAILABLE`` becomes True.  If ``spellchecker`` cannot be
    imported, a single warning is logged and later calls return immediately.
    """
    global _spell_de, _spell_en, _SPELL_AVAILABLE, _SPELL_INIT_ATTEMPTED
    # ``_spell_de is not None`` is kept as a trigger so that checkers set up
    # before this function runs are still respected.
    if _SPELL_INIT_ATTEMPTED or _spell_de is not None:
        return
    try:
        from spellchecker import SpellChecker
    except ImportError:
        _SPELL_INIT_ATTEMPTED = True
        logger.warning("pyspellchecker not installed — gutter repair unavailable")
        return

    # Build both checkers before publishing either, so the globals are never
    # left half-initialised if the second constructor raises.
    spell_de = SpellChecker(language='de', distance=1)
    spell_en = SpellChecker(language='en', distance=1)
    _spell_de, _spell_en = spell_de, spell_en
    _SPELL_AVAILABLE = True
    _SPELL_INIT_ATTEMPTED = True
    logger.info("Gutter repair: spellcheckers loaded (DE + EN)")


def _is_known(word: str) -> bool:
    """Return True if *word* (lower-cased) is in the DE or the EN dictionary.

    Returns False when the spellcheckers are unavailable.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return False
    w = word.lower()
    return bool(_spell_de.known([w])) or bool(_spell_en.known([w]))


def _spell_candidates(word: str, lang: str = "both") -> List[str]:
    """Return the deduplicated spellchecker candidates for *word*.

    Args:
        word: The word to correct; it is lower-cased before lookup.
        lang: ``"both"`` queries DE then EN, ``"de"`` queries DE only, and any
            other value queries EN only.

    Returns:
        Candidate corrections, excluding the lower-cased input itself.
        Candidates of the first checker come first; within one checker they
        are sorted alphabetically so the result is reproducible across runs.
        Empty when the spellcheckers are unavailable.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return []

    if lang == "both":
        checkers = [_spell_de, _spell_en]
    elif lang == "de":
        checkers = [_spell_de]
    else:
        checkers = [_spell_en]

    w = word.lower()
    seen: set = set()
    results: List[str] = []
    for checker in checkers:
        if checker is None:
            continue
        cands = checker.candidates(w)
        if not cands:
            # Nothing found for this language (falsy result, e.g. None/empty).
            continue
        # Sorting removes any dependence on the iteration order of the
        # returned collection (presumably a set — verify against the
        # installed pyspellchecker version).
        for c in sorted(c for c in cands if c):
            if c != w and c not in seen:
                seen.add(c)
                results.append(c)
    return results
# ---------------------------------------------------------------------------
# Gutter position detection
# ---------------------------------------------------------------------------

# Minimum word length for spell-fix (very short words are often legitimate)
_MIN_WORD_LEN_SPELL = 3

# Minimum word length for hyphen-join candidates (fragments at the gutter
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
_MIN_WORD_LEN_HYPHEN = 2

# How close to the right column edge a word must be to count as "gutter-adjacent".
# Expressed as a fraction of the column width: a word qualifies when its right
# edge lies at or beyond this fraction (0.70 = rightmost 30% of the column).
_GUTTER_EDGE_THRESHOLD = 0.70

# Small common words / abbreviations that should NOT be repaired
_STOPWORDS = frozenset([
    # German
    "ab", "an", "am", "da", "er", "es", "im", "in", "ja", "ob", "so", "um",
    "zu", "wo", "du", "eh", "ei", "je", "na", "nu", "oh",
    # English
    "a", "am", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in",
    "is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us",
    "we",
])

# IPA / phonetic patterns — skip these cells
_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')


def _is_ipa_text(text: str) -> bool:
    """Return True if *text* contains any bracket, slash or IPA symbol."""
    return bool(_IPA_RE.search(text))


def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
    """Check whether a word's right edge is near the right boundary of its column.

    Args:
        word_bbox: Mapping with ``"left"`` and ``"width"`` entries; a missing
            entry counts as 0.
        col_x: Left x-coordinate of the column.
        col_width: Width of the column; non-positive widths yield False.

    Returns:
        True when the word's right edge, measured as a fraction of the column
        width from ``col_x``, is at least ``_GUTTER_EDGE_THRESHOLD``.
    """
    if col_width <= 0:
        return False
    word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
    relative_pos = (word_right - col_x) / col_width
    return relative_pos >= _GUTTER_EDGE_THRESHOLD


# ---------------------------------------------------------------------------
# Suggestion types
# ---------------------------------------------------------------------------

@dataclass
class GutterSuggestion:
    """A single correction suggestion."""

    # Short identifier: first 8 characters of a random UUID4 string.
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    type: str = ""  # "hyphen_join" | "spell_fix"
    zone_index: int = 0
    row_index: int = 0
    col_index: int = 0
    col_type: str = ""
    cell_id: str = ""
    original_text: str = ""
    suggested_text: str = ""
    # For hyphen_join:
    next_row_index: int = -1
    next_row_cell_id: str = ""
    next_row_text: str = ""
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)
    # Alternatives (other plausible corrections the user can pick from)
    alternatives: List[str] = field(default_factory=list)
    # Meta:
    confidence: float = 0.0
    reason: str = ""  # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)


# ---------------------------------------------------------------------------
# Core repair logic
# ---------------------------------------------------------------------------

_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')

# Letters tried as fill-ins for characters lost in the gutter, roughly ordered
# by frequency in German/English.  Only the first 15 are actually tried, which
# bounds the search at 15**max_missing dictionary lookups per length.
_COMMON_CHARS = "enristaldhgcmobwfkzpvjyxqu"
_HYPHEN_FILL_CHARS = _COMMON_CHARS[:15]


def _try_hyphen_join(
    word_text: str,
    next_word_text: str,
    max_missing: int = 3,
) -> Optional[Tuple[str, str, float]]:
    """Try joining two fragments with 0..max_missing interpolated chars.

    Trailing whitespace and hyphens are removed from the first fragment, and
    trailing punctuation from the continuation (e.g. "künden," → "künden"),
    so that the dictionary lookup can succeed.

    Args:
        word_text: Fragment at the end of the line (may end with "-").
        next_word_text: Continuation fragment from the following row.
        max_missing: Maximum number of characters to interpolate.

    Returns:
        ``(joined_word, missing_chars, confidence)`` for the first join found
        in the dictionary, or None if either fragment is empty after cleanup
        or no join is a known word.
    """
    # Strip whitespace BEFORE the hyphen: with the reverse order a fragment
    # such as "ver- " would keep its hyphen and could never be joined.
    base = word_text.rstrip().rstrip("-").rstrip()
    # Strip trailing punctuation from continuation (commas, periods, etc.)
    continuation = _TRAILING_PUNCT_RE.sub('', next_word_text.lstrip())

    if not base or not continuation:
        return None

    # 1. Direct join (no missing chars)
    direct = base + continuation
    if _is_known(direct):
        return (direct, "", 0.95)

    # 2. Try with 1..max_missing missing characters, shortest fill first
    for n_missing in range(1, max_missing + 1):
        for chars in itertools.product(_HYPHEN_FILL_CHARS, repeat=n_missing):
            missing = "".join(chars)
            candidate = base + missing + continuation
            if _is_known(candidate):
                # Confidence decreases with more missing chars
                conf = 0.90 - (n_missing - 1) * 0.10
                return (candidate, missing, conf)

    return None


def _try_spell_fix(
    word_text: str,
    col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
    """Try to fix a single garbled gutter word via spellchecker.

    Args:
        word_text: The word as recognised by OCR.
        col_type: Column type; selects the preferred language by substring
            ("en" → English, else "de" → German, else both).
            NOTE(review): this is a plain substring test, so any col_type
            containing "en" selects English — confirm against real values.

    Returns:
        ``(best_correction, confidence, alternatives_list)`` or None.  The
        alternatives list holds up to five other plausible corrections the
        user can choose from (e.g. "stammelt" vs "stammeln").  None is
        returned for words shorter than ``_MIN_WORD_LEN_SPELL``, for words
        that are already known once surrounding parentheses are removed, and
        when the spellchecker offers no candidates.
    """
    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # Strip trailing/leading parentheses and check if the bare word is valid.
    # Words like "probieren)" or "(Englisch" are valid words with punctuation,
    # not OCR errors. Don't suggest corrections for them.
    stripped = word_text.strip("()")
    if stripped and _is_known(stripped):
        return None

    # Determine language priority from column type
    if "en" in col_type:
        lang = "en"
    elif "de" in col_type:
        lang = "de"
    else:
        lang = "both"

    candidates = _spell_candidates(word_text, lang=lang)
    if not candidates and lang != "both":
        # Preferred language had nothing — fall back to both dictionaries.
        candidates = _spell_candidates(word_text, lang="both")
    if not candidates:
        return None

    # Preserve original casing (only the leading capital is restored)
    is_upper = word_text[0].isupper()

    def _preserve_case(w: str) -> str:
        if is_upper and w:
            return w[0].upper() + w[1:]
        return w

    # Sort candidates by edit distance (closest first).  The sort is stable,
    # so ties keep the order in which the candidates were supplied.
    lowered = word_text.lower()
    scored = [(_edit_distance(lowered, c.lower()), c) for c in candidates]
    scored.sort(key=lambda pair: pair[0])

    best_dist, best = scored[0]
    best = _preserve_case(best)
    conf = max(0.5, 1.0 - best_dist * 0.15)

    # Build alternatives (all other candidates, also case-preserved),
    # limited to the top 5.
    alts = [_preserve_case(c) for _, c in scored[1:] if c.lower() != best.lower()]
    alts = alts[:5]

    return (best, conf, alts)


def _edit_distance(a: str, b: str) -> int:
    """Return the Levenshtein distance between *a* and *b*."""
    # Keep the shorter string in the inner loop so each row stays small.
    if len(a) < len(b):
        a, b = b, a
    if len(b) == 0:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a):
        curr = [i + 1]
        for j, cb in enumerate(b):
            cost = 0 if ca == cb else 1
            # insertion, deletion, substitution/match
            curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
        prev = curr
    return prev[len(b)]