Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
275
klausur-service/backend/ocr/gutter/core.py
Normal file
275
klausur-service/backend/ocr/gutter/core.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""
|
||||
Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.
|
||||
|
||||
Extracted from cv_gutter_repair.py for modularity.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Spellchecker setup (lazy, cached)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_spell_de = None
|
||||
_spell_en = None
|
||||
_SPELL_AVAILABLE = False
|
||||
|
||||
def _init_spellcheckers():
|
||||
"""Lazy-load DE + EN spellcheckers (cached across calls)."""
|
||||
global _spell_de, _spell_en, _SPELL_AVAILABLE
|
||||
if _spell_de is not None:
|
||||
return
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
_spell_de = SpellChecker(language='de', distance=1)
|
||||
_spell_en = SpellChecker(language='en', distance=1)
|
||||
_SPELL_AVAILABLE = True
|
||||
logger.info("Gutter repair: spellcheckers loaded (DE + EN)")
|
||||
except ImportError:
|
||||
logger.warning("pyspellchecker not installed — gutter repair unavailable")
|
||||
|
||||
|
||||
def _is_known(word: str) -> bool:
    """Return True if *word* (case-insensitive) is in the DE or EN dictionary."""
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    return any(bool(checker.known([lowered])) for checker in (_spell_de, _spell_en))
|
||||
|
||||
|
||||
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
    """Collect plausible spellchecker candidates for *word*, deduplicated.

    ``lang`` selects the dictionaries consulted ("de", "both", otherwise EN).
    Candidates equal to the lowercased input are skipped; first-seen order
    is preserved.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return []

    lowered = word.lower()
    if lang == "both":
        checkers = [_spell_de, _spell_en]
    elif lang == "de":
        checkers = [_spell_de]
    else:
        checkers = [_spell_en]

    seen: set = set()
    unique: List[str] = []
    for checker in checkers:
        if checker is None:
            continue
        # candidates() may return None when the checker has nothing to offer.
        for cand in (checker.candidates(lowered) or ()):
            if cand and cand != lowered and cand not in seen:
                seen.add(cand)
                unique.append(cand)

    return unique
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gutter position detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Minimum word length for spell-fix (very short words are often legitimate)
|
||||
_MIN_WORD_LEN_SPELL = 3
|
||||
|
||||
# Minimum word length for hyphen-join candidates (fragments at the gutter
|
||||
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
|
||||
_MIN_WORD_LEN_HYPHEN = 2
|
||||
|
||||
# How close to the right column edge a word must be to count as "gutter-adjacent".
|
||||
# Expressed as fraction of column width (e.g. 0.75 = rightmost 25%).
|
||||
_GUTTER_EDGE_THRESHOLD = 0.70
|
||||
|
||||
# Small common words / abbreviations that should NOT be repaired
|
||||
_STOPWORDS = frozenset([
|
||||
# German
|
||||
"ab", "an", "am", "da", "er", "es", "im", "in", "ja", "ob", "so", "um",
|
||||
"zu", "wo", "du", "eh", "ei", "je", "na", "nu", "oh",
|
||||
# English
|
||||
"a", "am", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in",
|
||||
"is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us",
|
||||
"we",
|
||||
])
|
||||
|
||||
# IPA / phonetic patterns — skip these cells
|
||||
_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')
|
||||
|
||||
|
||||
def _is_ipa_text(text: str) -> bool:
|
||||
"""True if text looks like IPA transcription."""
|
||||
return bool(_IPA_RE.search(text))
|
||||
|
||||
|
||||
def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
    """Check if a word's right edge is near the right boundary of its column.

    Args:
        word_bbox: OCR word bounding box; only "left" and "width" are read,
            missing keys default to 0.
        col_x: Left x-coordinate of the column.
        col_width: Width of the column; non-positive widths yield False.

    Returns:
        True when the word's right edge lies within the rightmost
        ``(1 - _GUTTER_EDGE_THRESHOLD)`` fraction of the column.
    """
    if col_width <= 0:
        # Degenerate/unknown column geometry — position cannot be judged.
        return False
    word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
    # Right edge position as a fraction of the column width
    # (0.0 = column's left edge, 1.0 = its right edge).
    # Note: the original computed an unused `col_right` local; removed.
    relative_pos = (word_right - col_x) / col_width
    return relative_pos >= _GUTTER_EDGE_THRESHOLD
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Suggestion types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class GutterSuggestion:
    """A single correction suggestion produced by the gutter-repair pass.

    Instances are flattened to plain dicts via :meth:`to_dict`, so the
    field names double as the keys of the serialized payload.
    """
    # Short random identifier (first 8 hex chars of a UUID4) so a suggestion
    # can be referenced individually, e.g. when the user accepts/rejects it.
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    type: str = ""  # "hyphen_join" | "spell_fix"
    # Location of the affected cell within the detected table structure.
    zone_index: int = 0
    row_index: int = 0
    col_index: int = 0
    col_type: str = ""
    cell_id: str = ""
    # Text before and after the proposed repair.
    original_text: str = ""
    suggested_text: str = ""
    # For hyphen_join: the continuation row whose fragment was joined in
    # (-1 / empty when the suggestion is not a hyphen join).
    next_row_index: int = -1
    next_row_cell_id: str = ""
    next_row_text: str = ""
    # Characters interpolated between the two fragments (may be empty).
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)
    # Alternatives (other plausible corrections the user can pick from)
    alternatives: List[str] = field(default_factory=list)
    # Meta:
    confidence: float = 0.0
    reason: str = ""  # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation (recursive, via dataclasses.asdict)."""
        return asdict(self)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core repair logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')

# Letters tried as interpolated "missing" characters, ordered by approximate
# frequency in German/English text; only the 15 most common are brute-forced.
_COMMON_CHARS = "enristaldhgcmobwfkzpvjyxqu"


def _try_hyphen_join(
    word_text: str,
    next_word_text: str,
    max_missing: int = 3,
) -> Optional[Tuple[str, str, float]]:
    """Try joining two fragments with 0..max_missing interpolated chars.

    Strips trailing punctuation from the continuation word before testing
    (e.g. "künden," → "künden") so dictionary lookup succeeds.

    Args:
        word_text: Fragment ending at the gutter; may carry a trailing
            hyphen and/or surrounding whitespace.
        next_word_text: Continuation fragment from the next row.
        max_missing: Maximum number of characters assumed lost in the gutter.

    Returns:
        (joined_word, missing_chars, confidence) or None.
    """
    # BUGFIX: strip whitespace BEFORE the trailing hyphen — previously
    # "ver- " kept its hyphen (rstrip("-") saw the space first) and every
    # subsequent dictionary lookup failed.
    base = word_text.strip().rstrip("-").rstrip()
    # BUGFIX: strip() instead of lstrip() so trailing whitespace cannot
    # defeat the $-anchored punctuation regex (e.g. "künden, ").
    continuation = _TRAILING_PUNCT_RE.sub('', next_word_text.strip())

    if not base or not continuation:
        return None

    # 1. Direct join (no characters lost in the gutter).
    direct = base + continuation
    if _is_known(direct):
        return (direct, "", 0.95)

    # 2. Brute-force 1..max_missing interpolated characters, most frequent
    #    letters first; confidence decreases with each extra missing char.
    for n_missing in range(1, max_missing + 1):
        for chars in itertools.product(_COMMON_CHARS[:15], repeat=n_missing):
            candidate = base + "".join(chars) + continuation
            if _is_known(candidate):
                missing = "".join(chars)
                conf = 0.90 - (n_missing - 1) * 0.10
                return (candidate, missing, conf)

    return None
|
||||
|
||||
|
||||
def _try_spell_fix(
    word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
    """Propose a spellchecker correction for a garbled gutter word.

    Returns ``(best_correction, confidence, alternatives)`` or ``None`` when
    the word is too short, already valid (ignoring surrounding parentheses),
    or no candidate exists. Alternatives are further plausible corrections
    the user can choose from (e.g. "stammelt" vs "stammeln").
    """
    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # "probieren)" / "(Englisch" are valid words plus punctuation, not OCR
    # errors — don't suggest corrections for them.
    bare = word_text.strip("()")
    if bare and _is_known(bare):
        return None

    # Derive dictionary priority from the column type; fall back to both.
    if "en" in col_type:
        lang = "en"
    elif "de" in col_type:
        lang = "de"
    else:
        lang = "both"

    candidates = _spell_candidates(word_text, lang=lang)
    if not candidates and lang != "both":
        candidates = _spell_candidates(word_text, lang="both")

    if not candidates:
        return None

    starts_upper = word_text[0].isupper()

    def _match_case(candidate: str) -> str:
        # Re-apply the original word's leading capital, if it had one.
        if starts_upper and candidate:
            return candidate[0].upper() + candidate[1:]
        return candidate

    # Rank candidates by Levenshtein distance to the original; the sort is
    # stable, so ties keep the spellchecker's original order.
    lowered = word_text.lower()
    ranked = sorted(
        ((_edit_distance(lowered, cand.lower()), cand) for cand in candidates),
        key=lambda pair: pair[0],
    )

    best_dist, best_raw = ranked[0]
    best = _match_case(best_raw)
    confidence = max(0.5, 1.0 - best_dist * 0.15)

    # Up to five runner-up suggestions, case-adjusted and deduplicated
    # against the winner.
    alternatives = [
        _match_case(cand) for _, cand in ranked[1:] if cand.lower() != best.lower()
    ][:5]

    return (best, confidence, alternatives)
|
||||
|
||||
|
||||
def _edit_distance(a: str, b: str) -> int:
|
||||
"""Simple Levenshtein distance."""
|
||||
if len(a) < len(b):
|
||||
return _edit_distance(b, a)
|
||||
if len(b) == 0:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a):
|
||||
curr = [i + 1]
|
||||
for j, cb in enumerate(b):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
|
||||
prev = curr
|
||||
return prev[len(b)]
|
||||
Reference in New Issue
Block a user