Restructure: Move final 12 root files into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s

ocr/spell/  (3): smart_spell, core, text
upload/     (3): api, chunked, mobile
crawler/    (3): github, github_core, github_parsers
+ unified_grid → grid/, tesseract_extractor → ocr/engines/, htr_api → ocr/pipeline/

12 shims added. Only main.py, config.py, storage + RAG files remain at root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-04-25 23:19:11 +02:00
parent cba877c65a
commit d093a4d388
27 changed files with 3116 additions and 3049 deletions
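The "12 shims" from the commit message follow the same pattern as the ocr/spell/__init__.py shim shown further down: the old flat module simply star-re-exports the new package module. A minimal sketch of one such shim — the exact target path is an assumption derived from the tesseract_extractor → ocr/engines/ mapping above:

# tesseract_extractor.py at the old flat location (hypothetical shim sketch)
from ocr.engines.tesseract_extractor import *  # noqa: F401,F403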


@@ -0,0 +1,346 @@
"""
Tesseract-based OCR extraction with word-level bounding boxes.
Uses Tesseract for spatial information (WHERE text is) while
the Vision LLM handles semantic understanding (WHAT the text means).
Tesseract runs natively on ARM64 via Debian's apt package.
License: Apache 2.0 (commercially usable)
"""
import io
import logging
from typing import List, Dict, Any, Optional
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
try:
import pytesseract
from PIL import Image
TESSERACT_AVAILABLE = True
except ImportError:
TESSERACT_AVAILABLE = False
logger.warning("pytesseract or Pillow not installed - Tesseract OCR unavailable")
async def extract_bounding_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Run Tesseract OCR and return word-level bounding boxes.
Args:
image_bytes: PNG/JPEG image as bytes.
lang: Tesseract language string (e.g. "eng+deu").
Returns:
Dict with 'words' list and 'image_width'/'image_height'.
"""
if not TESSERACT_AVAILABLE:
return {"words": [], "image_width": 0, "image_height": 0, "error": "Tesseract not available"}
image = Image.open(io.BytesIO(image_bytes))
data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)
words = []
for i in range(len(data['text'])):
text = data['text'][i].strip()
        conf = int(float(data['conf'][i]))  # Tesseract 5 may report float confidences
if not text or conf < 20:
continue
words.append({
"text": text,
"left": data['left'][i],
"top": data['top'][i],
"width": data['width'][i],
"height": data['height'][i],
"conf": conf,
"block_num": data['block_num'][i],
"par_num": data['par_num'][i],
"line_num": data['line_num'][i],
"word_num": data['word_num'][i],
})
return {
"words": words,
"image_width": image.width,
"image_height": image.height,
}
def group_words_into_lines(words: List[dict], y_tolerance_px: int = 15) -> List[List[dict]]:
"""Group words by their Y position into lines.
Args:
words: List of word dicts from extract_bounding_boxes.
y_tolerance_px: Max pixel distance to consider words on the same line.
Returns:
List of lines, each line is a list of words sorted by X position.
"""
if not words:
return []
# Sort by Y then X
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[dict]] = []
current_line: List[dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
def detect_columns(lines: List[List[dict]], image_width: int) -> Dict[str, Any]:
"""Detect column boundaries from word positions.
Typical vocab table: Left=English, Middle=German, Right=Example sentences.
Returns:
Dict with column boundaries and type assignments.
"""
if not lines or image_width == 0:
return {"columns": [], "column_types": []}
# Collect all word X positions
all_x_positions = []
for line in lines:
for word in line:
all_x_positions.append(word['left'])
if not all_x_positions:
return {"columns": [], "column_types": []}
# Find X-position clusters (column starts)
all_x_positions.sort()
# Simple gap-based column detection
min_gap = image_width * 0.08 # 8% of page width = column gap
clusters = []
current_cluster = [all_x_positions[0]]
for x in all_x_positions[1:]:
if x - current_cluster[-1] > min_gap:
clusters.append(current_cluster)
current_cluster = [x]
else:
current_cluster.append(x)
if current_cluster:
clusters.append(current_cluster)
# Each cluster represents a column start
columns = []
for cluster in clusters:
col_start = min(cluster)
columns.append({
"x_start": col_start,
"x_start_pct": col_start / image_width * 100,
"word_count": len(cluster),
})
# Assign column types based on position (left→right: EN, DE, Example)
type_map = ["english", "german", "example"]
column_types = []
for i, col in enumerate(columns):
if i < len(type_map):
column_types.append(type_map[i])
else:
column_types.append("unknown")
return {
"columns": columns,
"column_types": column_types,
}
def words_to_vocab_entries(lines: List[List[dict]], columns: List[dict],
column_types: List[str], image_width: int,
image_height: int) -> List[dict]:
"""Convert grouped words into vocabulary entries using column positions.
Args:
lines: Grouped word lines from group_words_into_lines.
columns: Column boundaries from detect_columns.
column_types: Column type assignments.
image_width: Image width in pixels.
image_height: Image height in pixels.
Returns:
List of vocabulary entry dicts with english/german/example fields.
"""
if not columns or not lines:
return []
# Build column boundaries for word assignment
col_boundaries = []
for i, col in enumerate(columns):
start = col['x_start']
if i + 1 < len(columns):
end = columns[i + 1]['x_start']
else:
end = image_width
col_boundaries.append((start, end, column_types[i] if i < len(column_types) else "unknown"))
entries = []
for line in lines:
entry = {"english": "", "german": "", "example": ""}
line_words_by_col: Dict[str, List[str]] = {"english": [], "german": [], "example": []}
line_bbox: Dict[str, Optional[dict]] = {}
for word in line:
word_center_x = word['left'] + word['width'] / 2
assigned_type = "unknown"
for start, end, col_type in col_boundaries:
if start <= word_center_x < end:
assigned_type = col_type
break
if assigned_type in line_words_by_col:
line_words_by_col[assigned_type].append(word['text'])
# Track bounding box for the column
if assigned_type not in line_bbox or line_bbox[assigned_type] is None:
line_bbox[assigned_type] = {
"left": word['left'],
"top": word['top'],
"right": word['left'] + word['width'],
"bottom": word['top'] + word['height'],
}
else:
bb = line_bbox[assigned_type]
bb['left'] = min(bb['left'], word['left'])
bb['top'] = min(bb['top'], word['top'])
bb['right'] = max(bb['right'], word['left'] + word['width'])
bb['bottom'] = max(bb['bottom'], word['top'] + word['height'])
for col_type in ["english", "german", "example"]:
if line_words_by_col[col_type]:
entry[col_type] = " ".join(line_words_by_col[col_type])
if line_bbox.get(col_type):
bb = line_bbox[col_type]
entry[f"{col_type}_bbox"] = {
"x_pct": bb['left'] / image_width * 100,
"y_pct": bb['top'] / image_height * 100,
"w_pct": (bb['right'] - bb['left']) / image_width * 100,
"h_pct": (bb['bottom'] - bb['top']) / image_height * 100,
}
# Only add if at least one column has content
if entry["english"] or entry["german"]:
entries.append(entry)
return entries
def match_positions_to_vocab(tess_words: List[dict], llm_vocab: List[dict],
image_w: int, image_h: int,
threshold: float = 0.6) -> List[dict]:
"""Match Tesseract bounding boxes to LLM vocabulary entries.
For each LLM vocab entry, find the best-matching Tesseract word
and attach its bounding box coordinates.
Args:
tess_words: Word list from Tesseract with pixel coordinates.
llm_vocab: Vocabulary list from Vision LLM.
image_w: Image width in pixels.
image_h: Image height in pixels.
threshold: Minimum similarity ratio for a match.
Returns:
llm_vocab list with bbox_x_pct/bbox_y_pct/bbox_w_pct/bbox_h_pct added.
"""
if not tess_words or not llm_vocab or image_w == 0 or image_h == 0:
return llm_vocab
for entry in llm_vocab:
english = entry.get("english", "").lower().strip()
german = entry.get("german", "").lower().strip()
if not english and not german:
continue
# Try to match English word first, then German
for field in ["english", "german"]:
search_text = entry.get(field, "").lower().strip()
if not search_text:
continue
best_word = None
best_ratio = 0.0
for word in tess_words:
ratio = SequenceMatcher(None, search_text, word['text'].lower()).ratio()
if ratio > best_ratio:
best_ratio = ratio
best_word = word
if best_word and best_ratio >= threshold:
entry[f"bbox_x_pct"] = best_word['left'] / image_w * 100
entry[f"bbox_y_pct"] = best_word['top'] / image_h * 100
entry[f"bbox_w_pct"] = best_word['width'] / image_w * 100
entry[f"bbox_h_pct"] = best_word['height'] / image_h * 100
entry["bbox_match_field"] = field
entry["bbox_match_ratio"] = round(best_ratio, 3)
break # Found a match, no need to try the other field
return llm_vocab
async def run_tesseract_pipeline(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Full Tesseract pipeline: extract words, group lines, detect columns, build vocab.
Args:
image_bytes: PNG/JPEG image as bytes.
lang: Tesseract language string.
Returns:
Dict with 'vocabulary', 'words', 'lines', 'columns', 'image_width', 'image_height'.
"""
# Step 1: Extract bounding boxes
bbox_data = await extract_bounding_boxes(image_bytes, lang=lang)
if bbox_data.get("error"):
return bbox_data
words = bbox_data["words"]
image_w = bbox_data["image_width"]
image_h = bbox_data["image_height"]
# Step 2: Group into lines
lines = group_words_into_lines(words)
# Step 3: Detect columns
col_info = detect_columns(lines, image_w)
# Step 4: Build vocabulary entries
vocab = words_to_vocab_entries(
lines,
col_info["columns"],
col_info["column_types"],
image_w,
image_h,
)
return {
"vocabulary": vocab,
"words": words,
"lines_count": len(lines),
"columns": col_info["columns"],
"column_types": col_info["column_types"],
"image_width": image_w,
"image_height": image_h,
"word_count": len(words),
}
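A minimal driver sketch for the pipeline above — the import path is an assumption based on the tesseract_extractor → ocr/engines/ move in the commit message, and the input file name is illustrative:

import asyncio

from ocr.engines.tesseract_extractor import run_tesseract_pipeline  # assumed path

async def main() -> None:
    # Read a scanned vocabulary page and run the spatial OCR pipeline on it.
    with open("vocab_page.png", "rb") as fh:
        image_bytes = fh.read()
    result = await run_tesseract_pipeline(image_bytes, lang="eng+deu")
    print(result.get("word_count"), "words in", result.get("lines_count"), "lines")
    for entry in result.get("vocabulary", [])[:5]:
        print(entry["english"], "|", entry["german"], "|", entry["example"])

asyncio.run(main())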


@@ -0,0 +1,276 @@
"""
Handwriting HTR API - High-quality handwritten text recognition (HTR) for exam correction.
Endpoints:
- POST /api/v1/htr/recognize - upload an image → handwritten text
- POST /api/v1/htr/recognize-session - use an OCR pipeline session as the image source
Model strategy:
1. qwen2.5vl:32b via Ollama (primary, highest quality as a VLM)
2. microsoft/trocr-large-handwritten (fallback, offline, no Ollama required)
PRIVACY: All processing happens locally on the Mac Mini.
"""
import io
import os
import logging
import time
import base64
from typing import Optional
import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from pydantic import BaseModel
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")
# ---------------------------------------------------------------------------
# Pydantic Models
# ---------------------------------------------------------------------------
class HTRSessionRequest(BaseModel):
session_id: str
model: str = "auto" # "auto" | "qwen2.5vl" | "trocr-large"
use_clean: bool = True # Prefer clean_png (after handwriting removal)
# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray:
"""
CLAHE contrast enhancement + upscale to improve HTR accuracy.
Returns grayscale enhanced image.
"""
gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
# Upscale if image is too small
h, w = enhanced.shape
if min(h, w) < 800:
scale = 800 / min(h, w)
enhanced = cv2.resize(
enhanced, None, fx=scale, fy=scale,
interpolation=cv2.INTER_CUBIC
)
return enhanced
def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
"""Convert BGR ndarray to PNG bytes."""
success, buf = cv2.imencode(".png", img_bgr)
if not success:
raise RuntimeError("Failed to encode image to PNG")
return buf.tobytes()
def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
"""Load image, apply HTR preprocessing, return PNG bytes."""
arr = np.frombuffer(image_bytes, dtype=np.uint8)
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is None:
raise ValueError("Could not decode image")
enhanced = _preprocess_for_htr(img_bgr)
# Convert grayscale back to BGR for encoding
enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
return _bgr_to_png_bytes(enhanced_bgr)
# ---------------------------------------------------------------------------
# Backend: Ollama qwen2.5vl
# ---------------------------------------------------------------------------
async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
"""
Send image to Ollama qwen2.5vl:32b for HTR.
Returns extracted text or None on error.
"""
import httpx
lang_hint = {
"de": "Deutsch",
"en": "Englisch",
"de+en": "Deutsch und Englisch",
}.get(language, "Deutsch")
prompt = (
f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
"Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
"Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
)
img_b64 = base64.b64encode(image_bytes).decode("utf-8")
payload = {
"model": OLLAMA_HTR_MODEL,
"prompt": prompt,
"images": [img_b64],
"stream": False,
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
resp.raise_for_status()
data = resp.json()
return data.get("response", "").strip()
except Exception as e:
logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
return None
# ---------------------------------------------------------------------------
# Backend: TrOCR-large fallback
# ---------------------------------------------------------------------------
async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
"""
Use microsoft/trocr-large-handwritten via trocr_service.py.
Returns extracted text or None on error.
"""
try:
from services.trocr_service import run_trocr_ocr, _check_trocr_available
if not _check_trocr_available():
logger.warning("TrOCR not available for HTR fallback")
return None
text, confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
return text.strip() if text else None
except Exception as e:
logger.warning(f"TrOCR-large HTR failed: {e}")
return None
# ---------------------------------------------------------------------------
# Core recognition logic
# ---------------------------------------------------------------------------
async def _do_recognize(
image_bytes: bytes,
model: str = "auto",
preprocess: bool = True,
language: str = "de",
) -> dict:
"""
Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large.
Returns dict with text, model_used, processing_time_ms.
"""
t0 = time.monotonic()
if preprocess:
try:
image_bytes = _preprocess_image_bytes(image_bytes)
except Exception as e:
logger.warning(f"HTR preprocessing failed, using raw image: {e}")
    text: Optional[str] = None
    model_used: str = "none"
    use_qwen = model in ("auto", "qwen2.5vl")
    if use_qwen:
        text = await _recognize_with_qwen_vl(image_bytes, language)
        if text is not None:
            model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"
    # TrOCR serves both as the explicit "trocr-large" backend and as the
    # fallback whenever the Ollama backend was tried but returned nothing.
    if text is None and (use_qwen or model == "trocr-large"):
        text = await _recognize_with_trocr_large(image_bytes)
        if text is not None:
            model_used = "trocr-large-handwritten"
    if text is None:
        text = ""
        model_used = "none (all backends failed)"
elapsed_ms = int((time.monotonic() - t0) * 1000)
return {
"text": text,
"model_used": model_used,
"processing_time_ms": elapsed_ms,
"language": language,
"preprocessed": preprocess,
}
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/recognize")
async def recognize_handwriting(
file: UploadFile = File(...),
model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
language: str = Query("de", description="de | en | de+en"),
):
"""
Upload an image and get back the handwritten text as plain text.
Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten.
"""
if model not in ("auto", "qwen2.5vl", "trocr-large"):
raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
if language not in ("de", "en", "de+en"):
raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")
image_bytes = await file.read()
if not image_bytes:
raise HTTPException(status_code=400, detail="Empty file")
return await _do_recognize(image_bytes, model=model, preprocess=preprocess, language=language)
@router.post("/recognize-session")
async def recognize_from_session(req: HTRSessionRequest):
"""
Use an OCR-Pipeline session as image source for HTR.
Set use_clean=true to prefer the clean image (after handwriting removal step).
This is useful when you want to do HTR on isolated handwriting regions.
"""
from ocr_pipeline_session_store import get_session_db, get_session_image
session = await get_session_db(req.session_id)
if not session:
raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")
# Choose source image
image_bytes: Optional[bytes] = None
source_used: str = ""
if req.use_clean:
image_bytes = await get_session_image(req.session_id, "clean")
if image_bytes:
source_used = "clean"
if not image_bytes:
image_bytes = await get_session_image(req.session_id, "deskewed")
if image_bytes:
source_used = "deskewed"
if not image_bytes:
image_bytes = await get_session_image(req.session_id, "original")
source_used = "original"
if not image_bytes:
raise HTTPException(status_code=404, detail="No image available in session")
result = await _do_recognize(image_bytes, model=req.model)
result["session_id"] = req.session_id
result["source_image"] = source_used
return result
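A client-side sketch for the /recognize endpoint — host and port are assumptions, and the file name is illustrative:

import httpx

with open("exam_page.jpg", "rb") as fh:
    resp = httpx.post(
        "http://localhost:8000/api/v1/htr/recognize",  # assumed host/port
        params={"model": "auto", "preprocess": True, "language": "de"},
        files={"file": ("exam_page.jpg", fh, "image/jpeg")},
        timeout=180.0,  # qwen2.5vl:32b can take a while on large pages
    )
resp.raise_for_status()
result = resp.json()
print(result["model_used"], f"({result['processing_time_ms']} ms)")
print(result["text"])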


@@ -0,0 +1,7 @@
"""
OCR spell-checking sub-package — language-aware OCR correction.
Moved from backend/ flat modules (smart_spell*.py).
Backward-compatible shim files remain at the old locations.
"""
from .smart_spell import * # noqa: F401,F403


@@ -0,0 +1,298 @@
"""
SmartSpellChecker Core — init, data types, language detection, word correction.
Extracted from smart_spell.py for modularity.
License: Apache 2.0 (commercially usable)
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------
try:
from spellchecker import SpellChecker as _SpellChecker
_en_spell = _SpellChecker(language='en', distance=1)
_de_spell = _SpellChecker(language='de', distance=1)
_AVAILABLE = True
except ImportError:
_AVAILABLE = False
logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")
Lang = Literal["en", "de", "both", "unknown"]
# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------
# Words that commonly follow "I" (subject pronoun -> verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
"am", "was", "have", "had", "do", "did", "will", "would", "can",
"could", "should", "shall", "may", "might", "must",
"think", "know", "see", "want", "need", "like", "love", "hate",
"go", "went", "come", "came", "say", "said", "get", "got",
"make", "made", "take", "took", "give", "gave", "tell", "told",
"feel", "felt", "find", "found", "believe", "hope", "wish",
"remember", "forget", "understand", "mean", "meant",
"don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
"shouldn't", "haven't", "hadn't", "isn't", "wasn't",
"really", "just", "also", "always", "never", "often", "sometimes",
})
# Words that commonly follow "a" (article -> noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
"lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
"long", "short", "big", "small", "large", "huge", "tiny",
"nice", "beautiful", "wonderful", "terrible", "horrible",
"man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
"book", "car", "house", "room", "school", "teacher", "student",
"day", "week", "month", "year", "time", "place", "way",
"friend", "family", "person", "problem", "question", "story",
"very", "really", "quite", "rather", "pretty", "single",
})
# Digit->letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
'0': ['o', 'O'],
'1': ['l', 'I'],
'5': ['s', 'S'],
'6': ['g', 'G'],
'8': ['b', 'B'],
'|': ['I', 'l'],
'/': ['l'], # italic 'l' misread as slash (e.g. "p/" -> "pl")
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())
# Umlaut confusion: OCR drops the dots (ä->a, ö->o, ü->u)
_UMLAUT_MAP = {
'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc',
}
# Tokenizer -- includes | and / so OCR artifacts like "p/" are treated as words
_TOKEN_RE = re.compile(r"([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]*)")
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class CorrectionResult:
original: str
corrected: str
lang_detected: Lang
changed: bool
changes: List[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Core class — language detection and word-level correction
# ---------------------------------------------------------------------------
class _SmartSpellCoreBase:
"""Base class with language detection and single-word correction.
Not intended for direct use — SmartSpellChecker inherits from this.
"""
def __init__(self):
if not _AVAILABLE:
raise RuntimeError("pyspellchecker not installed")
self.en = _en_spell
self.de = _de_spell
# --- Language detection ---
def detect_word_lang(self, word: str) -> Lang:
"""Detect language of a single word using dual-dict heuristic."""
w = word.lower().strip(".,;:!?\"'()")
if not w:
return "unknown"
in_en = bool(self.en.known([w]))
in_de = bool(self.de.known([w]))
if in_en and in_de:
return "both"
if in_en:
return "en"
if in_de:
return "de"
return "unknown"
def detect_text_lang(self, text: str) -> Lang:
"""Detect dominant language of a text string (sentence/phrase)."""
words = re.findall(r"[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df]+", text)
if not words:
return "unknown"
en_count = 0
de_count = 0
for w in words:
lang = self.detect_word_lang(w)
if lang == "en":
en_count += 1
elif lang == "de":
de_count += 1
# "both" doesn't count for either
if en_count > de_count:
return "en"
if de_count > en_count:
return "de"
if en_count == de_count and en_count > 0:
return "both"
return "unknown"
# --- Single-word correction ---
def _known(self, word: str) -> bool:
"""True if word is known in EN or DE dictionary, or is a known abbreviation."""
w = word.lower()
if bool(self.en.known([w])) or bool(self.de.known([w])):
return True
# Also accept known abbreviations (sth, sb, adj, etc.)
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS
if w in _KNOWN_ABBREVIATIONS:
return True
except ImportError:
pass
return False
def _word_freq(self, word: str) -> float:
"""Get word frequency (max of EN and DE)."""
w = word.lower()
return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))
def _known_in(self, word: str, lang: str) -> bool:
"""True if word is known in a specific language dictionary."""
w = word.lower()
spell = self.en if lang == "en" else self.de
return bool(spell.known([w]))
def correct_word(self, word: str, lang: str = "en",
prev_word: str = "", next_word: str = "") -> Optional[str]:
"""Correct a single word for the given language.
Returns None if no correction needed, or the corrected string.
"""
if not word or not word.strip():
return None
# Skip numbers, abbreviations with dots, very short tokens
if word.isdigit() or '.' in word:
return None
# Skip IPA/phonetic content in brackets
if '[' in word or ']' in word:
return None
has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)
# 1. Already known -> no fix
if self._known(word):
# But check a/I disambiguation for single-char words
if word.lower() in ('l', '|') and next_word:
return self._disambiguate_a_I(word, next_word)
return None
# 2. Digit/pipe substitution
if has_suspicious:
if word == '|':
return 'I'
# Try single-char substitutions
for i, ch in enumerate(word):
if ch not in _DIGIT_SUBS:
continue
for replacement in _DIGIT_SUBS[ch]:
candidate = word[:i] + replacement + word[i + 1:]
if self._known(candidate):
return candidate
# Try multi-char substitution (e.g., "sch00l" -> "school")
multi = self._try_multi_digit_sub(word)
if multi:
return multi
# 3. Umlaut correction (German)
if lang == "de" and len(word) >= 3 and word.isalpha():
umlaut_fix = self._try_umlaut_fix(word)
if umlaut_fix:
return umlaut_fix
# 4. General spell correction
if not has_suspicious and len(word) >= 3 and word.isalpha():
# Safety: don't correct if the word is valid in the OTHER language
other_lang = "de" if lang == "en" else "en"
if self._known_in(word, other_lang):
return None
if other_lang == "de" and self._try_umlaut_fix(word):
return None # has a valid DE umlaut variant -> don't touch
spell = self.en if lang == "en" else self.de
correction = spell.correction(word.lower())
if correction and correction != word.lower():
if word[0].isupper():
correction = correction[0].upper() + correction[1:]
if self._known(correction):
return correction
return None
# --- Multi-digit substitution ---
def _try_multi_digit_sub(self, word: str) -> Optional[str]:
"""Try replacing multiple digits simultaneously using BFS."""
positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
if not positions or len(positions) > 4:
return None
# BFS over substitution combinations
queue = [list(word)]
for pos, ch in positions:
next_queue = []
for current in queue:
# Keep original
next_queue.append(current[:])
# Try each substitution
for repl in _DIGIT_SUBS[ch]:
variant = current[:]
variant[pos] = repl
next_queue.append(variant)
queue = next_queue
# Check which combinations produce known words
for combo in queue:
candidate = "".join(combo)
if candidate != word and self._known(candidate):
return candidate
return None
# --- Umlaut fix ---
def _try_umlaut_fix(self, word: str) -> Optional[str]:
"""Try single-char umlaut substitutions for German words."""
for i, ch in enumerate(word):
if ch in _UMLAUT_MAP:
candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
if self._known(candidate):
return candidate
return None
# --- a/I disambiguation ---
def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
"""Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|')."""
nw = next_word.lower().strip(".,;:!?")
if nw in _I_FOLLOWERS:
return "I"
if nw in _A_FOLLOWERS:
return "a"
return None # uncertain, don't change
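To make the word-level passes concrete, a small sketch (assuming pyspellchecker and its EN/DE dictionaries are installed; SmartSpellChecker is defined in the text module below and inherits these methods):

from ocr.spell import SmartSpellChecker  # package path per the commit message

checker = SmartSpellChecker()
# Multi-digit substitution: swapping both '0' characters for 'o' yields a known word.
print(checker.correct_word("sch00l", lang="en"))  # -> school
# A lone pipe is treated as an OCR misread of "I".
print(checker.correct_word("|", lang="en"))       # -> I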


@@ -0,0 +1,25 @@
"""
SmartSpellChecker — barrel re-export.
All implementation is split into:
    core — init, data types, language detection, word correction
    text — full text correction, boundary repair, context split
License: Apache 2.0 (commercially usable)
"""
# Core: data types, lang detection (re-exported for tests)
from .core import ( # noqa: F401
_AVAILABLE,
_DIGIT_SUBS,
_SUSPICIOUS_CHARS,
_UMLAUT_MAP,
_TOKEN_RE,
_I_FOLLOWERS,
_A_FOLLOWERS,
CorrectionResult,
Lang,
)
# Text: SmartSpellChecker class (the main public API)
from .text import SmartSpellChecker # noqa: F401
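Because the package __init__ (see the ocr/spell/__init__.py diff above) star-re-exports this barrel, both import paths resolve to the same class — a quick check, assuming the service root is on sys.path:

from ocr.spell import SmartSpellChecker as ViaPackage
from ocr.spell.smart_spell import SmartSpellChecker as ViaBarrel

assert ViaPackage is ViaBarrel  # one class object, re-exported twice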


@@ -0,0 +1,289 @@
"""
SmartSpellChecker Text — full text correction, boundary repair, context split.
Extracted from smart_spell.py for modularity.
License: Apache 2.0 (commercially usable)
"""
import re
from typing import Dict, List, Optional, Tuple
from .core import (
_SmartSpellCoreBase,
_TOKEN_RE,
CorrectionResult,
Lang,
)
class SmartSpellChecker(_SmartSpellCoreBase):
"""Language-aware OCR spell checker using pyspellchecker (no LLM).
Inherits single-word correction from _SmartSpellCoreBase.
Adds text-level passes: boundary repair, context split, full correction.
"""
# --- Boundary repair (shifted word boundaries) ---
def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
"""Fix shifted word boundaries between adjacent tokens.
OCR sometimes shifts the boundary: "at sth." -> "ats th."
Try moving 1-2 chars from end of word1 to start of word2 and vice versa.
Returns (fixed_word1, fixed_word2) or None.
"""
# Import known abbreviations for vocabulary context
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS
except ImportError:
_KNOWN_ABBREVIATIONS = set()
# Strip trailing punctuation for checking, preserve for result
w2_stripped = word2.rstrip(".,;:!?")
w2_punct = word2[len(w2_stripped):]
# Try shifting 1-2 chars from word1 -> word2
for shift in (1, 2):
if len(word1) <= shift:
continue
new_w1 = word1[:-shift]
new_w2_base = word1[-shift:] + w2_stripped
w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
if w1_ok and w2_ok:
return (new_w1, new_w2_base + w2_punct)
# Try shifting 1-2 chars from word2 -> word1
for shift in (1, 2):
if len(w2_stripped) <= shift:
continue
new_w1 = word1 + w2_stripped[:shift]
new_w2_base = w2_stripped[shift:]
w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS
if w1_ok and w2_ok:
return (new_w1, new_w2_base + w2_punct)
return None
# --- Context-based word split for ambiguous merges ---
# Patterns where a valid word is actually "a" + adjective/noun
_ARTICLE_SPLIT_CANDIDATES = {
# word -> (article, remainder) -- only when followed by a compatible word
"anew": ("a", "new"),
"areal": ("a", "real"),
"alive": None, # genuinely one word, never split
"alone": None,
"aware": None,
"alike": None,
"apart": None,
"aside": None,
"above": None,
"about": None,
"among": None,
"along": None,
}
def _try_context_split(self, word: str, next_word: str,
prev_word: str) -> Optional[str]:
"""Split words like 'anew' -> 'a new' when context indicates a merge.
Only splits when:
- The word is in the split candidates list
- The following word makes sense as a noun (for "a + adj + noun" pattern)
- OR the word is unknown and can be split into article + known word
"""
w_lower = word.lower()
# Check explicit candidates
if w_lower in self._ARTICLE_SPLIT_CANDIDATES:
split = self._ARTICLE_SPLIT_CANDIDATES[w_lower]
if split is None:
return None # explicitly marked as "don't split"
article, remainder = split
# Only split if followed by a word (noun pattern)
if next_word and next_word[0].islower():
return f"{article} {remainder}"
# Also split if remainder + next_word makes a common phrase
if next_word and self._known(next_word):
return f"{article} {remainder}"
# Generic: if word starts with 'a' and rest is a known adjective/word
if (len(word) >= 4 and word[0].lower() == 'a'
and not self._known(word) # only for UNKNOWN words
and self._known(word[1:])):
return f"a {word[1:]}"
return None
# --- Full text correction ---
def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
"""Correct a full text string (field value).
Three passes:
1. Boundary repair -- fix shifted word boundaries between adjacent tokens
2. Context split -- split ambiguous merges (anew -> a new)
3. Per-word correction -- spell check individual words
"""
if not text or not text.strip():
return CorrectionResult(text, text, "unknown", False)
detected = self.detect_text_lang(text) if lang == "auto" else lang
effective_lang = detected if detected in ("en", "de") else "en"
changes: List[str] = []
tokens = list(_TOKEN_RE.finditer(text))
# Extract token list: [(word, separator), ...]
token_list: List[List[str]] = [] # [[word, sep], ...]
for m in tokens:
token_list.append([m.group(1), m.group(2)])
# --- Pass 1: Boundary repair between adjacent unknown words ---
# Import abbreviations for the heuristic below
try:
from cv_ocr_engines import _KNOWN_ABBREVIATIONS as _ABBREVS
except ImportError:
_ABBREVS = set()
for i in range(len(token_list) - 1):
w1 = token_list[i][0]
w2_raw = token_list[i + 1][0]
# Skip boundary repair for IPA/bracket content
# Brackets may be in the token OR in the adjacent separators
sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
sep_after_w1 = token_list[i][1]
sep_after_w2 = token_list[i + 1][1]
has_bracket = (
'[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
or ']' in sep_after_w1 # w1 text was inside [brackets]
or '[' in sep_after_w1 # w2 starts a bracket
or ']' in sep_after_w2 # w2 text was inside [brackets]
or '[' in sep_before_w1 # w1 starts a bracket
)
if has_bracket:
continue
# Include trailing punct from separator in w2 for abbreviation matching
w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")
# Try boundary repair -- always, even if both words are valid.
# Use word-frequency scoring to decide if repair is better.
repair = self._try_boundary_repair(w1, w2_with_punct)
if not repair and w2_with_punct != w2_raw:
repair = self._try_boundary_repair(w1, w2_raw)
if repair:
new_w1, new_w2_full = repair
new_w2_base = new_w2_full.rstrip(".,;:!?")
# Frequency-based scoring: product of word frequencies
# Higher product = more common word pair = better
old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)
# Abbreviation bonus: if repair produces a known abbreviation
has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
if has_abbrev:
# Accept abbreviation repair ONLY if at least one of the
# original words is rare/unknown (prevents "Can I" -> "Ca nI"
# where both original words are common and correct).
RARE_THRESHOLD = 1e-6
orig_both_common = (
self._word_freq(w1) > RARE_THRESHOLD
and self._word_freq(w2_raw) > RARE_THRESHOLD
)
if not orig_both_common:
new_freq = max(new_freq, old_freq * 10)
else:
has_abbrev = False # both originals common -> don't trust
# Accept if repair produces a more frequent word pair
# (threshold: at least 5x more frequent to avoid false positives)
if new_freq > old_freq * 5:
new_w2_punct = new_w2_full[len(new_w2_base):]
changes.append(f"{w1} {w2_raw}\u2192{new_w1} {new_w2_base}")
token_list[i][0] = new_w1
token_list[i + 1][0] = new_w2_base
if new_w2_punct:
token_list[i + 1][1] = new_w2_punct + token_list[i + 1][1].lstrip(".,;:!?")
# --- Pass 2: Context split (anew -> a new) ---
expanded: List[List[str]] = []
for i, (word, sep) in enumerate(token_list):
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
prev_word = token_list[i - 1][0] if i > 0 else ""
split = self._try_context_split(word, next_word, prev_word)
if split and split != word:
changes.append(f"{word}\u2192{split}")
expanded.append([split, sep])
else:
expanded.append([word, sep])
token_list = expanded
# --- Pass 3: Per-word correction ---
parts: List[str] = []
# Preserve any leading text before the first token match
first_start = tokens[0].start() if tokens else 0
if first_start > 0:
parts.append(text[:first_start])
for i, (word, sep) in enumerate(token_list):
# Skip words inside IPA brackets (brackets land in separators)
prev_sep = token_list[i - 1][1] if i > 0 else ""
if '[' in prev_sep or ']' in sep:
parts.append(word)
parts.append(sep)
continue
next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
prev_word = token_list[i - 1][0] if i > 0 else ""
correction = self.correct_word(
word, lang=effective_lang,
prev_word=prev_word, next_word=next_word,
)
if correction and correction != word:
changes.append(f"{word}\u2192{correction}")
parts.append(correction)
else:
parts.append(word)
parts.append(sep)
# Append any trailing text
last_end = tokens[-1].end() if tokens else 0
if last_end < len(text):
parts.append(text[last_end:])
corrected = "".join(parts)
return CorrectionResult(
original=text,
corrected=corrected,
lang_detected=detected,
changed=corrected != text,
changes=changes,
)
# --- Vocabulary entry correction ---
def correct_vocab_entry(self, english: str, german: str,
example: str = "") -> Dict[str, CorrectionResult]:
"""Correct a full vocabulary entry (EN + DE + example).
Uses column position to determine language -- the most reliable signal.
"""
results = {}
results["english"] = self.correct_text(english, lang="en")
results["german"] = self.correct_text(german, lang="de")
if example:
# For examples, auto-detect language
results["example"] = self.correct_text(example, lang="auto")
return results
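An end-to-end sketch of correct_text — the context-split pass shown here is deterministic, so the example does not depend on dictionary frequencies (only on pyspellchecker being installed):

from ocr.spell import SmartSpellChecker

checker = SmartSpellChecker()
result = checker.correct_text("anew house", lang="en")
print(result.corrected)  # -> "a new house"   (context split: anew -> a new)
print(result.changes)    # -> ['anew→a new']
print(result.changed)    # -> True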