Restructure: Move final 12 root files into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 2m23s
CI / test-python-agent-core (push) Successful in 19s
CI / test-nodejs-website (push) Successful in 19s
ocr/spell/ (3): smart_spell, core, text
upload/ (3): api, chunked, mobile
crawler/ (3): github, github_core, github_parsers
+ unified_grid → grid/, tesseract_extractor → ocr/engines/, htr_api → ocr/pipeline/

12 shims added. Only main.py, config.py, storage + RAG files remain at root.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
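The 12 shims mentioned above are thin modules left at the old flat backend/ paths that simply re-export from the new package locations, the same pattern the new ocr/spell/__init__.py below uses. A minimal sketch of one such shim; the file name and target module are illustrative, not taken from this diff:

# backend/tesseract_extractor.py (hypothetical shim at the old flat location)
# Keeps old `import tesseract_extractor` call sites working after the move.
from ocr.engines.tesseract_extractor import *  # noqa: F401,F403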
klausur-service/backend/ocr/engines/tesseract_extractor.py (new file, 346 lines)
@@ -0,0 +1,346 @@
"""
Tesseract-based OCR extraction with word-level bounding boxes.

Uses Tesseract for spatial information (WHERE text is) while
the Vision LLM handles semantic understanding (WHAT the text means).

Tesseract runs natively on ARM64 via Debian's apt package.

License: Apache 2.0 (commercial use permitted)
"""

import io
import logging
from typing import List, Dict, Any, Optional
from difflib import SequenceMatcher

logger = logging.getLogger(__name__)

try:
    import pytesseract
    from PIL import Image
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract or Pillow not installed - Tesseract OCR unavailable")


async def extract_bounding_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Run Tesseract OCR and return word-level bounding boxes.

    Args:
        image_bytes: PNG/JPEG image as bytes.
        lang: Tesseract language string (e.g. "eng+deu").

    Returns:
        Dict with 'words' list and 'image_width'/'image_height'.
    """
    if not TESSERACT_AVAILABLE:
        return {"words": [], "image_width": 0, "image_height": 0, "error": "Tesseract not available"}

    image = Image.open(io.BytesIO(image_bytes))
    data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)

    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        conf = int(data['conf'][i])
        if not text or conf < 20:
            continue
        words.append({
            "text": text,
            "left": data['left'][i],
            "top": data['top'][i],
            "width": data['width'][i],
            "height": data['height'][i],
            "conf": conf,
            "block_num": data['block_num'][i],
            "par_num": data['par_num'][i],
            "line_num": data['line_num'][i],
            "word_num": data['word_num'][i],
        })

    return {
        "words": words,
        "image_width": image.width,
        "image_height": image.height,
    }


def group_words_into_lines(words: List[dict], y_tolerance_px: int = 15) -> List[List[dict]]:
    """Group words by their Y position into lines.

    Args:
        words: List of word dicts from extract_bounding_boxes.
        y_tolerance_px: Max pixel distance to consider words on the same line.

    Returns:
        List of lines, each line is a list of words sorted by X position.
    """
    if not words:
        return []

    # Sort by Y then X
    sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))

    lines: List[List[dict]] = []
    current_line: List[dict] = [sorted_words[0]]
    current_y = sorted_words[0]['top']

    for word in sorted_words[1:]:
        if abs(word['top'] - current_y) <= y_tolerance_px:
            current_line.append(word)
        else:
            current_line.sort(key=lambda w: w['left'])
            lines.append(current_line)
            current_line = [word]
            current_y = word['top']

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        lines.append(current_line)

    return lines


def detect_columns(lines: List[List[dict]], image_width: int) -> Dict[str, Any]:
    """Detect column boundaries from word positions.

    Typical vocab table: Left=English, Middle=German, Right=Example sentences.

    Returns:
        Dict with column boundaries and type assignments.
    """
    if not lines or image_width == 0:
        return {"columns": [], "column_types": []}

    # Collect all word X positions
    all_x_positions = []
    for line in lines:
        for word in line:
            all_x_positions.append(word['left'])

    if not all_x_positions:
        return {"columns": [], "column_types": []}

    # Find X-position clusters (column starts)
    all_x_positions.sort()

    # Simple gap-based column detection
    min_gap = image_width * 0.08  # 8% of page width = column gap
    clusters = []
    current_cluster = [all_x_positions[0]]

    for x in all_x_positions[1:]:
        if x - current_cluster[-1] > min_gap:
            clusters.append(current_cluster)
            current_cluster = [x]
        else:
            current_cluster.append(x)

    if current_cluster:
        clusters.append(current_cluster)

    # Each cluster represents a column start
    columns = []
    for cluster in clusters:
        col_start = min(cluster)
        columns.append({
            "x_start": col_start,
            "x_start_pct": col_start / image_width * 100,
            "word_count": len(cluster),
        })

    # Assign column types based on position (left→right: EN, DE, Example)
    type_map = ["english", "german", "example"]
    column_types = []
    for i, col in enumerate(columns):
        if i < len(type_map):
            column_types.append(type_map[i])
        else:
            column_types.append("unknown")

    return {
        "columns": columns,
        "column_types": column_types,
    }


def words_to_vocab_entries(lines: List[List[dict]], columns: List[dict],
                           column_types: List[str], image_width: int,
                           image_height: int) -> List[dict]:
    """Convert grouped words into vocabulary entries using column positions.

    Args:
        lines: Grouped word lines from group_words_into_lines.
        columns: Column boundaries from detect_columns.
        column_types: Column type assignments.
        image_width: Image width in pixels.
        image_height: Image height in pixels.

    Returns:
        List of vocabulary entry dicts with english/german/example fields.
    """
    if not columns or not lines:
        return []

    # Build column boundaries for word assignment
    col_boundaries = []
    for i, col in enumerate(columns):
        start = col['x_start']
        if i + 1 < len(columns):
            end = columns[i + 1]['x_start']
        else:
            end = image_width
        col_boundaries.append((start, end, column_types[i] if i < len(column_types) else "unknown"))

    entries = []
    for line in lines:
        entry = {"english": "", "german": "", "example": ""}
        line_words_by_col: Dict[str, List[str]] = {"english": [], "german": [], "example": []}
        line_bbox: Dict[str, Optional[dict]] = {}

        for word in line:
            word_center_x = word['left'] + word['width'] / 2
            assigned_type = "unknown"
            for start, end, col_type in col_boundaries:
                if start <= word_center_x < end:
                    assigned_type = col_type
                    break

            if assigned_type in line_words_by_col:
                line_words_by_col[assigned_type].append(word['text'])
                # Track bounding box for the column
                if assigned_type not in line_bbox or line_bbox[assigned_type] is None:
                    line_bbox[assigned_type] = {
                        "left": word['left'],
                        "top": word['top'],
                        "right": word['left'] + word['width'],
                        "bottom": word['top'] + word['height'],
                    }
                else:
                    bb = line_bbox[assigned_type]
                    bb['left'] = min(bb['left'], word['left'])
                    bb['top'] = min(bb['top'], word['top'])
                    bb['right'] = max(bb['right'], word['left'] + word['width'])
                    bb['bottom'] = max(bb['bottom'], word['top'] + word['height'])

        for col_type in ["english", "german", "example"]:
            if line_words_by_col[col_type]:
                entry[col_type] = " ".join(line_words_by_col[col_type])
                if line_bbox.get(col_type):
                    bb = line_bbox[col_type]
                    entry[f"{col_type}_bbox"] = {
                        "x_pct": bb['left'] / image_width * 100,
                        "y_pct": bb['top'] / image_height * 100,
                        "w_pct": (bb['right'] - bb['left']) / image_width * 100,
                        "h_pct": (bb['bottom'] - bb['top']) / image_height * 100,
                    }

        # Only add if at least one column has content
        if entry["english"] or entry["german"]:
            entries.append(entry)

    return entries


def match_positions_to_vocab(tess_words: List[dict], llm_vocab: List[dict],
                             image_w: int, image_h: int,
                             threshold: float = 0.6) -> List[dict]:
    """Match Tesseract bounding boxes to LLM vocabulary entries.

    For each LLM vocab entry, find the best-matching Tesseract word
    and attach its bounding box coordinates.

    Args:
        tess_words: Word list from Tesseract with pixel coordinates.
        llm_vocab: Vocabulary list from Vision LLM.
        image_w: Image width in pixels.
        image_h: Image height in pixels.
        threshold: Minimum similarity ratio for a match.

    Returns:
        llm_vocab list with bbox_x_pct/bbox_y_pct/bbox_w_pct/bbox_h_pct added.
    """
    if not tess_words or not llm_vocab or image_w == 0 or image_h == 0:
        return llm_vocab

    for entry in llm_vocab:
        english = entry.get("english", "").lower().strip()
        german = entry.get("german", "").lower().strip()

        if not english and not german:
            continue

        # Try to match English word first, then German
        for field in ["english", "german"]:
            search_text = entry.get(field, "").lower().strip()
            if not search_text:
                continue

            best_word = None
            best_ratio = 0.0

            for word in tess_words:
                ratio = SequenceMatcher(None, search_text, word['text'].lower()).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_word = word

            if best_word and best_ratio >= threshold:
                entry["bbox_x_pct"] = best_word['left'] / image_w * 100
                entry["bbox_y_pct"] = best_word['top'] / image_h * 100
                entry["bbox_w_pct"] = best_word['width'] / image_w * 100
                entry["bbox_h_pct"] = best_word['height'] / image_h * 100
                entry["bbox_match_field"] = field
                entry["bbox_match_ratio"] = round(best_ratio, 3)
                break  # Found a match, no need to try the other field

    return llm_vocab


async def run_tesseract_pipeline(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Full Tesseract pipeline: extract words, group lines, detect columns, build vocab.

    Args:
        image_bytes: PNG/JPEG image as bytes.
        lang: Tesseract language string.

    Returns:
        Dict with 'vocabulary', 'words', 'lines', 'columns', 'image_width', 'image_height'.
    """
    # Step 1: Extract bounding boxes
    bbox_data = await extract_bounding_boxes(image_bytes, lang=lang)

    if bbox_data.get("error"):
        return bbox_data

    words = bbox_data["words"]
    image_w = bbox_data["image_width"]
    image_h = bbox_data["image_height"]

    # Step 2: Group into lines
    lines = group_words_into_lines(words)

    # Step 3: Detect columns
    col_info = detect_columns(lines, image_w)

    # Step 4: Build vocabulary entries
    vocab = words_to_vocab_entries(
        lines,
        col_info["columns"],
        col_info["column_types"],
        image_w,
        image_h,
    )

    return {
        "vocabulary": vocab,
        "words": words,
        "lines_count": len(lines),
        "columns": col_info["columns"],
        "column_types": col_info["column_types"],
        "image_width": image_w,
        "image_height": image_h,
        "word_count": len(words),
    }
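A minimal usage sketch for the module above. The file name and the shape of the llm_vocab input are assumptions for illustration; only the function names and return keys come from the code in this diff:

import asyncio

from ocr.engines.tesseract_extractor import (
    match_positions_to_vocab,
    run_tesseract_pipeline,
)

async def demo() -> None:
    # Hypothetical scan of a vocabulary table page.
    with open("vocab_page.png", "rb") as f:
        image_bytes = f.read()

    result = await run_tesseract_pipeline(image_bytes, lang="eng+deu")
    print(result["word_count"], "words in", result["lines_count"], "lines")

    # Attach Tesseract box positions to entries produced by the Vision LLM.
    llm_vocab = [{"english": "school", "german": "Schule"}]  # assumed LLM output shape
    enriched = match_positions_to_vocab(
        result["words"], llm_vocab,
        result["image_width"], result["image_height"],
    )
    print(enriched[0].get("bbox_x_pct"), enriched[0].get("bbox_match_ratio"))

asyncio.run(demo())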
klausur-service/backend/ocr/pipeline/htr_api.py (new file, 276 lines)
@@ -0,0 +1,276 @@
"""
Handwriting HTR API - high-quality handwritten text recognition (HTR) for exam correction.

Endpoints:
- POST /api/v1/htr/recognize - upload an image → handwritten text
- POST /api/v1/htr/recognize-session - use an OCR pipeline session as the source

Model strategy:
1. qwen2.5vl:32b via Ollama (primary, highest quality as a VLM)
2. microsoft/trocr-large-handwritten (fallback, offline, no Ollama)

PRIVACY: all processing happens locally on the Mac Mini.
"""

import io
import os
import logging
import time
import base64
from typing import Optional

import cv2
import numpy as np
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from pydantic import BaseModel

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/api/v1/htr", tags=["HTR"])

OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
OLLAMA_HTR_MODEL = os.getenv("OLLAMA_HTR_MODEL", "qwen2.5vl:32b")
HTR_FALLBACK_MODEL = os.getenv("HTR_FALLBACK_MODEL", "trocr-large")


# ---------------------------------------------------------------------------
# Pydantic Models
# ---------------------------------------------------------------------------

class HTRSessionRequest(BaseModel):
    session_id: str
    model: str = "auto"  # "auto" | "qwen2.5vl" | "trocr-large"
    use_clean: bool = True  # Prefer clean_png (after handwriting removal)


# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------

def _preprocess_for_htr(img_bgr: np.ndarray) -> np.ndarray:
    """
    CLAHE contrast enhancement + upscale to improve HTR accuracy.
    Returns grayscale enhanced image.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # Upscale if image is too small
    h, w = enhanced.shape
    if min(h, w) < 800:
        scale = 800 / min(h, w)
        enhanced = cv2.resize(
            enhanced, None, fx=scale, fy=scale,
            interpolation=cv2.INTER_CUBIC
        )

    return enhanced


def _bgr_to_png_bytes(img_bgr: np.ndarray) -> bytes:
    """Convert BGR ndarray to PNG bytes."""
    success, buf = cv2.imencode(".png", img_bgr)
    if not success:
        raise RuntimeError("Failed to encode image to PNG")
    return buf.tobytes()


def _preprocess_image_bytes(image_bytes: bytes) -> bytes:
    """Load image, apply HTR preprocessing, return PNG bytes."""
    arr = np.frombuffer(image_bytes, dtype=np.uint8)
    img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise ValueError("Could not decode image")

    enhanced = _preprocess_for_htr(img_bgr)
    # Convert grayscale back to BGR for encoding
    enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2BGR)
    return _bgr_to_png_bytes(enhanced_bgr)


# ---------------------------------------------------------------------------
# Backend: Ollama qwen2.5vl
# ---------------------------------------------------------------------------

async def _recognize_with_qwen_vl(image_bytes: bytes, language: str) -> Optional[str]:
    """
    Send image to Ollama qwen2.5vl:32b for HTR.
    Returns extracted text or None on error.
    """
    import httpx

    lang_hint = {
        "de": "Deutsch",
        "en": "Englisch",
        "de+en": "Deutsch und Englisch",
    }.get(language, "Deutsch")

    prompt = (
        f"Du bist ein OCR-Experte fuer handgeschriebenen Text auf {lang_hint}. "
        "Lies den Text im Bild exakt ab — korrigiere KEINE Rechtschreibfehler. "
        "Antworte NUR mit dem erkannten Text, ohne Erklaerungen."
    )

    img_b64 = base64.b64encode(image_bytes).decode("utf-8")

    payload = {
        "model": OLLAMA_HTR_MODEL,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
            resp.raise_for_status()
            data = resp.json()
            return data.get("response", "").strip()
    except Exception as e:
        logger.warning(f"Ollama qwen2.5vl HTR failed: {e}")
        return None


# ---------------------------------------------------------------------------
# Backend: TrOCR-large fallback
# ---------------------------------------------------------------------------

async def _recognize_with_trocr_large(image_bytes: bytes) -> Optional[str]:
    """
    Use microsoft/trocr-large-handwritten via trocr_service.py.
    Returns extracted text or None on error.
    """
    try:
        from services.trocr_service import run_trocr_ocr, _check_trocr_available
        if not _check_trocr_available():
            logger.warning("TrOCR not available for HTR fallback")
            return None

        text, confidence = await run_trocr_ocr(image_bytes, handwritten=True, size="large")
        return text.strip() if text else None
    except Exception as e:
        logger.warning(f"TrOCR-large HTR failed: {e}")
        return None


# ---------------------------------------------------------------------------
# Core recognition logic
# ---------------------------------------------------------------------------

async def _do_recognize(
    image_bytes: bytes,
    model: str = "auto",
    preprocess: bool = True,
    language: str = "de",
) -> dict:
    """
    Core HTR logic: preprocess → try Ollama → fallback to TrOCR-large.
    Returns dict with text, model_used, processing_time_ms.
    """
    t0 = time.monotonic()

    if preprocess:
        try:
            image_bytes = _preprocess_image_bytes(image_bytes)
        except Exception as e:
            logger.warning(f"HTR preprocessing failed, using raw image: {e}")

    text: Optional[str] = None
    model_used: str = "none"

    use_qwen = model in ("auto", "qwen2.5vl")
    use_trocr = model in ("auto", "trocr-large") or (use_qwen and text is None)

    if use_qwen:
        text = await _recognize_with_qwen_vl(image_bytes, language)
        if text is not None:
            model_used = f"qwen2.5vl ({OLLAMA_HTR_MODEL})"

    if text is None and (use_trocr or model == "trocr-large"):
        text = await _recognize_with_trocr_large(image_bytes)
        if text is not None:
            model_used = "trocr-large-handwritten"

    if text is None:
        text = ""
        model_used = "none (all backends failed)"

    elapsed_ms = int((time.monotonic() - t0) * 1000)

    return {
        "text": text,
        "model_used": model_used,
        "processing_time_ms": elapsed_ms,
        "language": language,
        "preprocessed": preprocess,
    }


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------

@router.post("/recognize")
async def recognize_handwriting(
    file: UploadFile = File(...),
    model: str = Query("auto", description="auto | qwen2.5vl | trocr-large"),
    preprocess: bool = Query(True, description="Apply CLAHE + upscale before recognition"),
    language: str = Query("de", description="de | en | de+en"),
):
    """
    Upload an image and get back the handwritten text as plain text.

    Tries qwen2.5vl:32b via Ollama first, falls back to TrOCR-large-handwritten.
    """
    if model not in ("auto", "qwen2.5vl", "trocr-large"):
        raise HTTPException(status_code=400, detail="model must be one of: auto, qwen2.5vl, trocr-large")
    if language not in ("de", "en", "de+en"):
        raise HTTPException(status_code=400, detail="language must be one of: de, en, de+en")

    image_bytes = await file.read()
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Empty file")

    return await _do_recognize(image_bytes, model=model, preprocess=preprocess, language=language)


@router.post("/recognize-session")
async def recognize_from_session(req: HTRSessionRequest):
    """
    Use an OCR-Pipeline session as image source for HTR.

    Set use_clean=true to prefer the clean image (after handwriting removal step).
    This is useful when you want to do HTR on isolated handwriting regions.
    """
    from ocr_pipeline_session_store import get_session_db, get_session_image

    session = await get_session_db(req.session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {req.session_id} not found")

    # Choose source image
    image_bytes: Optional[bytes] = None
    source_used: str = ""

    if req.use_clean:
        image_bytes = await get_session_image(req.session_id, "clean")
        if image_bytes:
            source_used = "clean"

    if not image_bytes:
        image_bytes = await get_session_image(req.session_id, "deskewed")
        if image_bytes:
            source_used = "deskewed"

    if not image_bytes:
        image_bytes = await get_session_image(req.session_id, "original")
        source_used = "original"

    if not image_bytes:
        raise HTTPException(status_code=404, detail="No image available in session")

    result = await _do_recognize(image_bytes, model=req.model)
    result["session_id"] = req.session_id
    result["source_image"] = source_used
    return result
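A minimal client-side sketch for the /recognize endpoint above, using httpx. The host/port and file name are assumptions; the route, query parameters, and response keys come from the code in this diff:

import httpx

# Assumes the klausur-service backend is reachable on localhost:8000.
with open("handwritten_answer.jpg", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/api/v1/htr/recognize",
        params={"model": "auto", "language": "de", "preprocess": True},
        files={"file": ("handwritten_answer.jpg", f, "image/jpeg")},
        timeout=180.0,
    )
resp.raise_for_status()
data = resp.json()
print(data["model_used"], data["processing_time_ms"], "ms")
print(data["text"])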
klausur-service/backend/ocr/spell/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
"""
OCR spell-checking sub-package — language-aware OCR correction.

Moved from backend/ flat modules (smart_spell*.py).
Backward-compatible shim files remain at the old locations.
"""
from .smart_spell import *  # noqa: F401,F403
klausur-service/backend/ocr/spell/core.py (new file, 298 lines)
@@ -0,0 +1,298 @@
"""
SmartSpellChecker Core — init, data types, language detection, word correction.

Extracted from smart_spell.py for modularity.

License: Apache 2.0 (commercial use permitted)
"""

import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional, Set, Tuple

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Init
# ---------------------------------------------------------------------------

try:
    from spellchecker import SpellChecker as _SpellChecker
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _AVAILABLE = True
except ImportError:
    _AVAILABLE = False
    logger.warning("pyspellchecker not installed — SmartSpellChecker disabled")

Lang = Literal["en", "de", "both", "unknown"]

# ---------------------------------------------------------------------------
# Bigram context for a/I disambiguation
# ---------------------------------------------------------------------------

# Words that commonly follow "I" (subject pronoun -> verb/modal)
_I_FOLLOWERS: frozenset = frozenset({
    "am", "was", "have", "had", "do", "did", "will", "would", "can",
    "could", "should", "shall", "may", "might", "must",
    "think", "know", "see", "want", "need", "like", "love", "hate",
    "go", "went", "come", "came", "say", "said", "get", "got",
    "make", "made", "take", "took", "give", "gave", "tell", "told",
    "feel", "felt", "find", "found", "believe", "hope", "wish",
    "remember", "forget", "understand", "mean", "meant",
    "don't", "didn't", "can't", "won't", "couldn't", "wouldn't",
    "shouldn't", "haven't", "hadn't", "isn't", "wasn't",
    "really", "just", "also", "always", "never", "often", "sometimes",
})

# Words that commonly follow "a" (article -> noun/adjective)
_A_FOLLOWERS: frozenset = frozenset({
    "lot", "few", "little", "bit", "good", "bad", "great", "new", "old",
    "long", "short", "big", "small", "large", "huge", "tiny",
    "nice", "beautiful", "wonderful", "terrible", "horrible",
    "man", "woman", "boy", "girl", "child", "dog", "cat", "bird",
    "book", "car", "house", "room", "school", "teacher", "student",
    "day", "week", "month", "year", "time", "place", "way",
    "friend", "family", "person", "problem", "question", "story",
    "very", "really", "quite", "rather", "pretty", "single",
})

# Digit->letter substitutions (OCR confusion)
_DIGIT_SUBS: Dict[str, List[str]] = {
    '0': ['o', 'O'],
    '1': ['l', 'I'],
    '5': ['s', 'S'],
    '6': ['g', 'G'],
    '8': ['b', 'B'],
    '|': ['I', 'l'],
    '/': ['l'],  # italic 'l' misread as slash (e.g. "p/" -> "pl")
}
_SUSPICIOUS_CHARS = frozenset(_DIGIT_SUBS.keys())

# Umlaut confusion: OCR drops the dots (ü->u, ä->a, ö->o)
_UMLAUT_MAP = {
    'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
    'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc',
}

# Tokenizer -- includes | and / so OCR artifacts like "p/" are treated as words
_TOKEN_RE = re.compile(r"([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df'|/]*)")


# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclass
class CorrectionResult:
    original: str
    corrected: str
    lang_detected: Lang
    changed: bool
    changes: List[str] = field(default_factory=list)


# ---------------------------------------------------------------------------
# Core class — language detection and word-level correction
# ---------------------------------------------------------------------------

class _SmartSpellCoreBase:
    """Base class with language detection and single-word correction.

    Not intended for direct use — SmartSpellChecker inherits from this.
    """

    def __init__(self):
        if not _AVAILABLE:
            raise RuntimeError("pyspellchecker not installed")
        self.en = _en_spell
        self.de = _de_spell

    # --- Language detection ---

    def detect_word_lang(self, word: str) -> Lang:
        """Detect language of a single word using dual-dict heuristic."""
        w = word.lower().strip(".,;:!?\"'()")
        if not w:
            return "unknown"
        in_en = bool(self.en.known([w]))
        in_de = bool(self.de.known([w]))
        if in_en and in_de:
            return "both"
        if in_en:
            return "en"
        if in_de:
            return "de"
        return "unknown"

    def detect_text_lang(self, text: str) -> Lang:
        """Detect dominant language of a text string (sentence/phrase)."""
        words = re.findall(r"[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df]+", text)
        if not words:
            return "unknown"

        en_count = 0
        de_count = 0
        for w in words:
            lang = self.detect_word_lang(w)
            if lang == "en":
                en_count += 1
            elif lang == "de":
                de_count += 1
            # "both" doesn't count for either

        if en_count > de_count:
            return "en"
        if de_count > en_count:
            return "de"
        if en_count == de_count and en_count > 0:
            return "both"
        return "unknown"

    # --- Single-word correction ---

    def _known(self, word: str) -> bool:
        """True if word is known in EN or DE dictionary, or is a known abbreviation."""
        w = word.lower()
        if bool(self.en.known([w])) or bool(self.de.known([w])):
            return True
        # Also accept known abbreviations (sth, sb, adj, etc.)
        try:
            from cv_ocr_engines import _KNOWN_ABBREVIATIONS
            if w in _KNOWN_ABBREVIATIONS:
                return True
        except ImportError:
            pass
        return False

    def _word_freq(self, word: str) -> float:
        """Get word frequency (max of EN and DE)."""
        w = word.lower()
        return max(self.en.word_usage_frequency(w), self.de.word_usage_frequency(w))

    def _known_in(self, word: str, lang: str) -> bool:
        """True if word is known in a specific language dictionary."""
        w = word.lower()
        spell = self.en if lang == "en" else self.de
        return bool(spell.known([w]))

    def correct_word(self, word: str, lang: str = "en",
                     prev_word: str = "", next_word: str = "") -> Optional[str]:
        """Correct a single word for the given language.

        Returns None if no correction needed, or the corrected string.
        """
        if not word or not word.strip():
            return None

        # Skip numbers, abbreviations with dots, very short tokens
        if word.isdigit() or '.' in word:
            return None

        # Skip IPA/phonetic content in brackets
        if '[' in word or ']' in word:
            return None

        has_suspicious = any(ch in _SUSPICIOUS_CHARS for ch in word)

        # 1. Already known -> no fix
        if self._known(word):
            # But check a/I disambiguation for single-char words
            if word.lower() in ('l', '|') and next_word:
                return self._disambiguate_a_I(word, next_word)
            return None

        # 2. Digit/pipe substitution
        if has_suspicious:
            if word == '|':
                return 'I'
            # Try single-char substitutions
            for i, ch in enumerate(word):
                if ch not in _DIGIT_SUBS:
                    continue
                for replacement in _DIGIT_SUBS[ch]:
                    candidate = word[:i] + replacement + word[i + 1:]
                    if self._known(candidate):
                        return candidate
            # Try multi-char substitution (e.g., "sch00l" -> "school")
            multi = self._try_multi_digit_sub(word)
            if multi:
                return multi

        # 3. Umlaut correction (German)
        if lang == "de" and len(word) >= 3 and word.isalpha():
            umlaut_fix = self._try_umlaut_fix(word)
            if umlaut_fix:
                return umlaut_fix

        # 4. General spell correction
        if not has_suspicious and len(word) >= 3 and word.isalpha():
            # Safety: don't correct if the word is valid in the OTHER language
            other_lang = "de" if lang == "en" else "en"
            if self._known_in(word, other_lang):
                return None
            if other_lang == "de" and self._try_umlaut_fix(word):
                return None  # has a valid DE umlaut variant -> don't touch

            spell = self.en if lang == "en" else self.de
            correction = spell.correction(word.lower())
            if correction and correction != word.lower():
                if word[0].isupper():
                    correction = correction[0].upper() + correction[1:]
                if self._known(correction):
                    return correction

        return None

    # --- Multi-digit substitution ---

    def _try_multi_digit_sub(self, word: str) -> Optional[str]:
        """Try replacing multiple digits simultaneously using BFS."""
        positions = [(i, ch) for i, ch in enumerate(word) if ch in _DIGIT_SUBS]
        if not positions or len(positions) > 4:
            return None

        # BFS over substitution combinations
        queue = [list(word)]
        for pos, ch in positions:
            next_queue = []
            for current in queue:
                # Keep original
                next_queue.append(current[:])
                # Try each substitution
                for repl in _DIGIT_SUBS[ch]:
                    variant = current[:]
                    variant[pos] = repl
                    next_queue.append(variant)
            queue = next_queue

        # Check which combinations produce known words
        for combo in queue:
            candidate = "".join(combo)
            if candidate != word and self._known(candidate):
                return candidate

        return None

    # --- Umlaut fix ---

    def _try_umlaut_fix(self, word: str) -> Optional[str]:
        """Try single-char umlaut substitutions for German words."""
        for i, ch in enumerate(word):
            if ch in _UMLAUT_MAP:
                candidate = word[:i] + _UMLAUT_MAP[ch] + word[i + 1:]
                if self._known(candidate):
                    return candidate
        return None

    # --- a/I disambiguation ---

    def _disambiguate_a_I(self, token: str, next_word: str) -> Optional[str]:
        """Disambiguate 'a' vs 'I' (and OCR variants like 'l', '|')."""
        nw = next_word.lower().strip(".,;:!?")
        if nw in _I_FOLLOWERS:
            return "I"
        if nw in _A_FOLLOWERS:
            return "a"
        return None  # uncertain, don't change
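A small sketch of the word-level corrections implemented above, via the public SmartSpellChecker class that inherits them (added later in this commit in ocr/spell/text.py). It assumes backend/ is on sys.path and pyspellchecker is installed; exact outputs depend on the bundled dictionaries:

from ocr.spell import SmartSpellChecker

checker = SmartSpellChecker()

# Digit/letter OCR confusion, e.g. "sch00l" -> "school" via _DIGIT_SUBS
# (single-char substitution first, then the multi-char BFS).
print(checker.correct_word("sch00l", lang="en"))

# Dropped umlaut dots: "Tur" may come back as "Tür" via _UMLAUT_MAP (German only).
print(checker.correct_word("Tur", lang="de"))

# Dual-dictionary language vote over a short phrase.
print(checker.detect_text_lang("the weather is nice"))  # expected: "en"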
klausur-service/backend/ocr/spell/smart_spell.py (new file, 25 lines)
@@ -0,0 +1,25 @@
"""
SmartSpellChecker — barrel re-export.

All implementation split into:
    smart_spell_core — init, data types, language detection, word correction
    smart_spell_text — full text correction, boundary repair, context split

License: Apache 2.0 (commercial use permitted)
"""

# Core: data types, lang detection (re-exported for tests)
from .core import (  # noqa: F401
    _AVAILABLE,
    _DIGIT_SUBS,
    _SUSPICIOUS_CHARS,
    _UMLAUT_MAP,
    _TOKEN_RE,
    _I_FOLLOWERS,
    _A_FOLLOWERS,
    CorrectionResult,
    Lang,
)

# Text: SmartSpellChecker class (the main public API)
from .text import SmartSpellChecker  # noqa: F401
klausur-service/backend/ocr/spell/text.py (new file, 289 lines)
@@ -0,0 +1,289 @@
"""
SmartSpellChecker Text — full text correction, boundary repair, context split.

Extracted from smart_spell.py for modularity.

License: Apache 2.0 (commercial use permitted)
"""

import re
from typing import Dict, List, Optional, Tuple

from .core import (
    _SmartSpellCoreBase,
    _TOKEN_RE,
    CorrectionResult,
    Lang,
)


class SmartSpellChecker(_SmartSpellCoreBase):
    """Language-aware OCR spell checker using pyspellchecker (no LLM).

    Inherits single-word correction from _SmartSpellCoreBase.
    Adds text-level passes: boundary repair, context split, full correction.
    """

    # --- Boundary repair (shifted word boundaries) ---

    def _try_boundary_repair(self, word1: str, word2: str) -> Optional[Tuple[str, str]]:
        """Fix shifted word boundaries between adjacent tokens.

        OCR sometimes shifts the boundary: "at sth." -> "ats th."
        Try moving 1-2 chars from end of word1 to start of word2 and vice versa.
        Returns (fixed_word1, fixed_word2) or None.
        """
        # Import known abbreviations for vocabulary context
        try:
            from cv_ocr_engines import _KNOWN_ABBREVIATIONS
        except ImportError:
            _KNOWN_ABBREVIATIONS = set()

        # Strip trailing punctuation for checking, preserve for result
        w2_stripped = word2.rstrip(".,;:!?")
        w2_punct = word2[len(w2_stripped):]

        # Try shifting 1-2 chars from word1 -> word2
        for shift in (1, 2):
            if len(word1) <= shift:
                continue
            new_w1 = word1[:-shift]
            new_w2_base = word1[-shift:] + w2_stripped

            w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS

            if w1_ok and w2_ok:
                return (new_w1, new_w2_base + w2_punct)

        # Try shifting 1-2 chars from word2 -> word1
        for shift in (1, 2):
            if len(w2_stripped) <= shift:
                continue
            new_w1 = word1 + w2_stripped[:shift]
            new_w2_base = w2_stripped[shift:]

            w1_ok = self._known(new_w1) or new_w1.lower() in _KNOWN_ABBREVIATIONS
            w2_ok = self._known(new_w2_base) or new_w2_base.lower() in _KNOWN_ABBREVIATIONS

            if w1_ok and w2_ok:
                return (new_w1, new_w2_base + w2_punct)

        return None

    # --- Context-based word split for ambiguous merges ---

    # Patterns where a valid word is actually "a" + adjective/noun
    _ARTICLE_SPLIT_CANDIDATES = {
        # word -> (article, remainder) -- only when followed by a compatible word
        "anew": ("a", "new"),
        "areal": ("a", "real"),
        "alive": None,  # genuinely one word, never split
        "alone": None,
        "aware": None,
        "alike": None,
        "apart": None,
        "aside": None,
        "above": None,
        "about": None,
        "among": None,
        "along": None,
    }

    def _try_context_split(self, word: str, next_word: str,
                           prev_word: str) -> Optional[str]:
        """Split words like 'anew' -> 'a new' when context indicates a merge.

        Only splits when:
        - The word is in the split candidates list
        - The following word makes sense as a noun (for "a + adj + noun" pattern)
        - OR the word is unknown and can be split into article + known word
        """
        w_lower = word.lower()

        # Check explicit candidates
        if w_lower in self._ARTICLE_SPLIT_CANDIDATES:
            split = self._ARTICLE_SPLIT_CANDIDATES[w_lower]
            if split is None:
                return None  # explicitly marked as "don't split"
            article, remainder = split
            # Only split if followed by a word (noun pattern)
            if next_word and next_word[0].islower():
                return f"{article} {remainder}"
            # Also split if remainder + next_word makes a common phrase
            if next_word and self._known(next_word):
                return f"{article} {remainder}"

        # Generic: if word starts with 'a' and rest is a known adjective/word
        if (len(word) >= 4 and word[0].lower() == 'a'
                and not self._known(word)  # only for UNKNOWN words
                and self._known(word[1:])):
            return f"a {word[1:]}"

        return None

    # --- Full text correction ---

    def correct_text(self, text: str, lang: str = "en") -> CorrectionResult:
        """Correct a full text string (field value).

        Three passes:
        1. Boundary repair -- fix shifted word boundaries between adjacent tokens
        2. Context split -- split ambiguous merges (anew -> a new)
        3. Per-word correction -- spell check individual words
        """
        if not text or not text.strip():
            return CorrectionResult(text, text, "unknown", False)

        detected = self.detect_text_lang(text) if lang == "auto" else lang
        effective_lang = detected if detected in ("en", "de") else "en"

        changes: List[str] = []
        tokens = list(_TOKEN_RE.finditer(text))

        # Extract token list: [(word, separator), ...]
        token_list: List[List[str]] = []  # [[word, sep], ...]
        for m in tokens:
            token_list.append([m.group(1), m.group(2)])

        # --- Pass 1: Boundary repair between adjacent unknown words ---
        # Import abbreviations for the heuristic below
        try:
            from cv_ocr_engines import _KNOWN_ABBREVIATIONS as _ABBREVS
        except ImportError:
            _ABBREVS = set()

        for i in range(len(token_list) - 1):
            w1 = token_list[i][0]
            w2_raw = token_list[i + 1][0]

            # Skip boundary repair for IPA/bracket content
            # Brackets may be in the token OR in the adjacent separators
            sep_before_w1 = token_list[i - 1][1] if i > 0 else ""
            sep_after_w1 = token_list[i][1]
            sep_after_w2 = token_list[i + 1][1]
            has_bracket = (
                '[' in w1 or ']' in w1 or '[' in w2_raw or ']' in w2_raw
                or ']' in sep_after_w1  # w1 text was inside [brackets]
                or '[' in sep_after_w1  # w2 starts a bracket
                or ']' in sep_after_w2  # w2 text was inside [brackets]
                or '[' in sep_before_w1  # w1 starts a bracket
            )
            if has_bracket:
                continue

            # Include trailing punct from separator in w2 for abbreviation matching
            w2_with_punct = w2_raw + token_list[i + 1][1].rstrip(" ")

            # Try boundary repair -- always, even if both words are valid.
            # Use word-frequency scoring to decide if repair is better.
            repair = self._try_boundary_repair(w1, w2_with_punct)
            if not repair and w2_with_punct != w2_raw:
                repair = self._try_boundary_repair(w1, w2_raw)
            if repair:
                new_w1, new_w2_full = repair
                new_w2_base = new_w2_full.rstrip(".,;:!?")

                # Frequency-based scoring: product of word frequencies
                # Higher product = more common word pair = better
                old_freq = self._word_freq(w1) * self._word_freq(w2_raw)
                new_freq = self._word_freq(new_w1) * self._word_freq(new_w2_base)

                # Abbreviation bonus: if repair produces a known abbreviation
                has_abbrev = new_w1.lower() in _ABBREVS or new_w2_base.lower() in _ABBREVS
                if has_abbrev:
                    # Accept abbreviation repair ONLY if at least one of the
                    # original words is rare/unknown (prevents "Can I" -> "Ca nI"
                    # where both original words are common and correct).
                    RARE_THRESHOLD = 1e-6
                    orig_both_common = (
                        self._word_freq(w1) > RARE_THRESHOLD
                        and self._word_freq(w2_raw) > RARE_THRESHOLD
                    )
                    if not orig_both_common:
                        new_freq = max(new_freq, old_freq * 10)
                    else:
                        has_abbrev = False  # both originals common -> don't trust

                # Accept if repair produces a more frequent word pair
                # (threshold: at least 5x more frequent to avoid false positives)
                if new_freq > old_freq * 5:
                    new_w2_punct = new_w2_full[len(new_w2_base):]
                    changes.append(f"{w1} {w2_raw}\u2192{new_w1} {new_w2_base}")
                    token_list[i][0] = new_w1
                    token_list[i + 1][0] = new_w2_base
                    if new_w2_punct:
                        token_list[i + 1][1] = new_w2_punct + token_list[i + 1][1].lstrip(".,;:!?")

        # --- Pass 2: Context split (anew -> a new) ---
        expanded: List[List[str]] = []
        for i, (word, sep) in enumerate(token_list):
            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""
            split = self._try_context_split(word, next_word, prev_word)
            if split and split != word:
                changes.append(f"{word}\u2192{split}")
                expanded.append([split, sep])
            else:
                expanded.append([word, sep])
        token_list = expanded

        # --- Pass 3: Per-word correction ---
        parts: List[str] = []

        # Preserve any leading text before the first token match
        first_start = tokens[0].start() if tokens else 0
        if first_start > 0:
            parts.append(text[:first_start])

        for i, (word, sep) in enumerate(token_list):
            # Skip words inside IPA brackets (brackets land in separators)
            prev_sep = token_list[i - 1][1] if i > 0 else ""
            if '[' in prev_sep or ']' in sep:
                parts.append(word)
                parts.append(sep)
                continue

            next_word = token_list[i + 1][0] if i + 1 < len(token_list) else ""
            prev_word = token_list[i - 1][0] if i > 0 else ""

            correction = self.correct_word(
                word, lang=effective_lang,
                prev_word=prev_word, next_word=next_word,
            )
            if correction and correction != word:
                changes.append(f"{word}\u2192{correction}")
                parts.append(correction)
            else:
                parts.append(word)
            parts.append(sep)

        # Append any trailing text
        last_end = tokens[-1].end() if tokens else 0
        if last_end < len(text):
            parts.append(text[last_end:])

        corrected = "".join(parts)
        return CorrectionResult(
            original=text,
            corrected=corrected,
            lang_detected=detected,
            changed=corrected != text,
            changes=changes,
        )

    # --- Vocabulary entry correction ---

    def correct_vocab_entry(self, english: str, german: str,
                            example: str = "") -> Dict[str, CorrectionResult]:
        """Correct a full vocabulary entry (EN + DE + example).

        Uses column position to determine language -- the most reliable signal.
        """
        results = {}
        results["english"] = self.correct_text(english, lang="en")
        results["german"] = self.correct_text(german, lang="de")
        if example:
            # For examples, auto-detect language
            results["example"] = self.correct_text(example, lang="auto")
        return results
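A minimal end-to-end sketch for the text-level API above (correct_text and correct_vocab_entry). It assumes backend/ is on sys.path and pyspellchecker is installed; concrete corrections depend on the bundled EN/DE dictionaries, so outputs are indicative only:

from ocr.spell import SmartSpellChecker

checker = SmartSpellChecker()

# Full-field correction: boundary repair, context split, then per-word pass.
result = checker.correct_text("the weather was beautifull", lang="en")
print(result.corrected)   # likely "the weather was beautiful"
print(result.changes)     # e.g. ["beautifull→beautiful"]

# Column-aware correction of one vocabulary row (EN / DE / example).
fields = checker.correct_vocab_entry(
    english="the beautifull garden",
    german="der schone Garten",
    example="We sat in the garden.",
)
for name, res in fields.items():
    if res.changed:
        print(name, res.original, "->", res.corrected)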