Files
breakpilot-lehrer/klausur-service/backend/cv_ocr_engines.py
Benjamin Admin cf9dde9876
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 30s
CI / test-python-klausur (push) Failing after 2m4s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 21s
fix: _group_words_into_lines nach cv_ocr_engines.py verschieben
Funktion war nur in cv_review.py definiert, wurde aber auch in
cv_ocr_engines.py und cv_layout.py benutzt — NameError zur Laufzeit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 15:24:56 +01:00

1309 lines
49 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
OCR engines (RapidOCR, TrOCR, LightOn), vocab postprocessing, and text cleaning.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import io
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
IPA_AVAILABLE,
PageRegion,
RowGeometry,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment,misc]
# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
"""Group words by Y position into lines, sorted by X within each line."""
if not words:
return []
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[Dict]] = []
current_line: List[Dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Convert OCR word dicts into per-line strings in reading order.

    Each visual line (Y-grouped, X-sorted) becomes one space-joined string.
    Returns one string per visual line in the cell.
    """
    if not words:
        return []
    return [
        ' '.join(w['text'] for w in line)
        for line in _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    ]
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
"""Rejoin words split by line-break hyphenation.
E.g. ['Fuß-', 'boden'] → ['Fußboden']
['some text-', 'thing here'] → ['something here']
"""
if len(lines) <= 1:
return lines
result = []
i = 0
while i < len(lines):
line = lines[i]
# If line ends with '-' and there's a next line, rejoin
if i + 1 < len(lines) and line.rstrip().endswith('-'):
stripped = line.rstrip()
# Get the word fragment before hyphen (last word)
prefix = stripped[:-1] # remove trailing hyphen
next_line = lines[i + 1]
# Join: last word of this line + first word of next line
prefix_words = prefix.rsplit(' ', 1)
next_words = next_line.split(' ', 1)
if len(prefix_words) > 1:
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
else:
joined = prefix_words[0] + next_words[0]
remainder = next_words[1] if len(next_words) > 1 else ''
if remainder:
result.append(joined + ' ' + remainder)
else:
result.append(joined)
i += 2
else:
result.append(line)
i += 1
return result
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Render OCR word dicts as newline-joined text in reading order.

    Lines are built by Y-grouping plus X-sorting, hyphenated line breaks
    are rejoined, and the result is joined with '\\n'.
    """
    raw_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(raw_lines))
# --- RapidOCR integration (PaddleOCR models on ONNX Runtime) ---
_rapid_engine = None  # lazily created singleton; see _get_rapid_engine()
RAPIDOCR_AVAILABLE = False  # flipped to True below only if the import succeeds
try:
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
except ImportError:
    # Optional dependency — callers must check RAPIDOCR_AVAILABLE before use.
    logger.info("RapidOCR not installed — using Tesseract only")
def _get_rapid_engine():
    """Return the module-wide RapidOCR engine, creating it on first use.

    Configured with the PP-OCRv5 Latin server model so German umlauts
    (ä, ö, ü, ß) are recognized.
    """
    global _rapid_engine
    if _rapid_engine is not None:
        return _rapid_engine
    params = {
        # PP-OCRv5 Latin model — supports German umlauts (ä, ö, ü, ß)
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        # Tighter detection boxes to reduce word merging
        "Det.unclip_ratio": 1.3,
        # Lower threshold to detect small chars (periods, ellipsis, phonetics)
        "Det.box_thresh": 0.4,
        # Silence verbose logging
        "Global.log_level": "critical",
    }
    _rapid_engine = _RapidOCRClass(params=params)
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on one region and return Tesseract-style word dicts.

    Args:
        img_bgr: Full-page BGR image (NOT binarized — RapidOCR works on color/gray).
        region: Region to crop and OCR.

    Returns:
        List of word dicts with text, left, top, width, height, conf, region_type.
    """
    engine = _get_rapid_engine()
    # Crop the region of interest out of the full page.
    roi = img_bgr[region.y:region.y + region.height,
                  region.x:region.x + region.width]
    if roi.size == 0:
        return []
    out = engine(roi)
    if out is None or out.boxes is None or out.txts is None:
        return []
    collected: List[Dict[str, Any]] = []
    # out.boxes: (N, 4, 2) corner quads; out.txts / out.scores: per-line text & conf.
    for quad, txt, score in zip(out.boxes, out.txts, out.scores):
        text = (txt or '').strip()
        if not text:
            continue
        # quad is [[x1,y1],...,[x4,y4]] (clockwise) — reduce to axis-aligned bbox.
        xs = [pt[0] for pt in quad]
        ys = [pt[1] for pt in quad]
        x0 = int(min(xs))
        y0 = int(min(ys))
        collected.append({
            'text': text,
            'left': x0 + region.x,  # absolute page coordinates
            'top': y0 + region.y,
            'width': int(max(xs)) - x0,
            'height': int(max(ys)) - y0,
            'conf': int(score * 100),  # 0-100 scale like Tesseract
            'region_type': region.type,
        })
    return collected
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Uses trocr_service.get_trocr_model() + _split_into_lines() for line segmentation.
    Bboxes are approximated from equal line-height distribution within the region
    (TrOCR itself yields no geometry — only text per line image).
    Falls back to Tesseract if TrOCR is not available.

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.
        handwritten: Select the handwriting-tuned TrOCR model variant.

    Returns:
        List of line-level word dicts; empty list on failure or empty region.
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available
    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0:
            # ocr_region is presumably the Tesseract-based region OCR defined
            # elsewhere in this file — TODO confirm it is in scope here.
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
            if ocr_img_crop is not None:
                return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        return []
    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []
    try:
        import torch
        from PIL import Image as _PILImage
        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        # TrOCR works line-by-line: segment the crop into line images first.
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        lines = _split_into_lines(pil_crop)
        if not lines:
            lines = [pil_crop]  # fall back to treating the whole crop as one line
        device = next(model.parameters()).device
        all_text = []
        confidences = []
        for line_img in lines:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            text_line = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text_line:
                all_text.append(text_line)
                # Heuristic pseudo-confidence: longer lines are trusted more.
                confidences.append(0.85 if len(text_line) > 3 else 0.5)
        if not all_text:
            return []
        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Approximate geometry: distribute lines evenly over the region height.
        line_h = region.height // max(len(all_text), 1)
        words = []
        for i, line in enumerate(all_text):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            })
        return words
    except Exception as e:
        # Broad catch: any model/driver failure degrades to "no words" rather
        # than aborting the whole page pipeline.
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region. Returns line-level word dicts (same format as ocr_region_rapid).

    Falls back to RapidOCR or Tesseract if LightOnOCR is not available.
    Bboxes are approximated by distributing decoded text lines evenly over
    the region height (the VLM returns text only, no geometry).

    Args:
        img_bgr: Full-page BGR image.
        region: Region to crop and OCR.

    Returns:
        List of line-level word dicts; empty list on failure or empty region.
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available
    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        if RAPIDOCR_AVAILABLE and img_bgr is not None:
            return ocr_region_rapid(img_bgr, region)
        # ocr_region is presumably the Tesseract-based region OCR defined
        # elsewhere in this file — TODO confirm it is in scope here.
        ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY) if img_bgr is not None else None
        return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6) if ocr_img_crop is not None else []
    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []
    try:
        import io  # NOTE(review): appears unused in this function — candidate for removal
        import torch
        from PIL import Image as _PILImage
        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            if RAPIDOCR_AVAILABLE and img_bgr is not None:
                return ocr_region_rapid(img_bgr, region)
            ocr_img_crop = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(ocr_img_crop, region, lang="eng+deu", psm=6)
        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # LightOnOCR is a vision-language model: prompt it chat-style with the image.
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)
        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        # Approximate geometry: spread lines evenly over the region height.
        line_h = region.height // max(len(lines), 1)
        words = []
        for i, line in enumerate(lines):
            words.append({
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,  # fixed pseudo-confidence — the model reports none
                "region_type": region.type,
            })
        return words
    except Exception as e:
        # Broad catch: degrade to "no words" instead of failing the pipeline.
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
# =============================================================================
# Post-Processing: Deterministic Quality Fixes
# =============================================================================
# --- A. Character Confusion Fix (I/1/l) ---
# Common OCR confusion pairs in vocabulary context
_CHAR_CONFUSION_RULES = [
# "1" at word start followed by lowercase → likely "I" or "l"
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
# "|" → "I", but NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
(re.compile(r'(?<!\|)\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
]
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Fix common OCR character confusions using context.
Deterministic rules:
- "1" at word start → "I" or "l" based on context
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1""I"
- "y " artifact at word boundaries → remove (e.g. "y you""you")
"""
for entry in entries:
en = entry.get('english', '') or ''
de = entry.get('german', '') or ''
ex = entry.get('example', '') or ''
# Apply general rules to all fields
for pattern, replacement in _CHAR_CONFUSION_RULES:
en = pattern.sub(replacement, en)
de = pattern.sub(replacement, de)
ex = pattern.sub(replacement, ex)
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
de_lower_words = set(de.lower().replace(',', ' ').split())
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
# Any remaining "1" in EN that looks like "I"
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
# Fix "y " artifact before repeated word: "y you" → "you"
en = re.sub(r'\by\s+([a-z])', r'\1', en)
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
entry['english'] = en.strip()
entry['german'] = de.strip()
entry['example'] = ex.strip()
return entries
# --- B. Comma-Separated Word Form Splitting ---
def _is_singular_plural_pair(parts: List[str]) -> bool:
"""Detect if comma-separated parts are singular/plural forms of the same word.
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
"break, broke, broken" → False (different verb forms, OK to split).
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
"""
if len(parts) != 2:
return False
a, b = parts[0].lower().strip(), parts[1].lower().strip()
if not a or not b:
return False
# Common prefix heuristic: if words share >= 50% of the shorter word,
# they are likely forms of the same word (Maus/Mäuse, child/children).
min_len = min(len(a), len(b))
common = 0
for ca, cb in zip(a, b):
if ca == cb:
common += 1
else:
break
if common >= max(2, min_len * 0.5):
return True
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
umlaut_map = str.maketrans('aou', 'äöü')
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
return True
return False
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand comma-separated word-form rows into one entry per form.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    A row is split only when EN and DE have the same number of top-level
    comma parts, every part is short (<= 3 words, i.e. word forms rather
    than sentences), and the parts are not a singular/plural pair of one
    word ("mouse, mice" / "Maus, Mäuse" stays a single entry).
    """
    def _splittable(en_parts: List[str], de_parts: List[str]) -> bool:
        # One-line purpose: decide whether the EN/DE parts align as forms.
        if len(en_parts) < 2 or len(en_parts) != len(de_parts):
            return False
        if any(len(part.split()) > 3 for part in en_parts + de_parts):
            return False
        return not (_is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts))

    expanded: List[Dict[str, Any]] = []
    for entry in entries:
        en_parts = _split_by_comma((entry.get('english', '') or '').strip())
        de_parts = _split_by_comma((entry.get('german', '') or '').strip())
        if not _splittable(en_parts, de_parts):
            expanded.append(entry)
            continue
        # Fan the row out into one aligned entry per comma part.
        for en_form, de_form in zip(en_parts, de_parts):
            piece = dict(entry)  # shallow copy
            piece['english'] = en_form.strip()
            piece['german'] = de_form.strip()
            piece['example'] = ''  # examples get attached later
            piece['split_from_comma'] = True
            expanded.append(piece)
    # Re-number the rows after expansion.
    for idx, e in enumerate(expanded):
        e['row_index'] = idx
    return expanded
def _split_by_comma(text: str) -> List[str]:
"""Split text by commas, but not inside brackets [...] or parens (...)."""
if ',' not in text:
return [text]
parts = []
depth_bracket = 0
depth_paren = 0
current = []
for ch in text:
if ch == '[':
depth_bracket += 1
elif ch == ']':
depth_bracket = max(0, depth_bracket - 1)
elif ch == '(':
depth_paren += 1
elif ch == ')':
depth_paren = max(0, depth_paren - 1)
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
parts.append(''.join(current).strip())
current = []
continue
current.append(ch)
if current:
parts.append(''.join(current).strip())
# Filter empty parts
return [p for p in parts if p]
# --- C. Example Sentence Attachment ---
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
"""Find the vocab entry whose English word(s) best match the example sentence.
Returns index into vocab_entries, or -1 if no match found.
Uses word stem overlap: "a broken arm" matches "broken" or "break".
"""
if not vocab_entries or not example_text:
return -1
example_lower = example_text.lower()
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
best_idx = -1
best_score = 0
for i, entry in enumerate(vocab_entries):
en = (entry.get('english', '') or '').lower()
if not en:
continue
# Extract vocab words (split on space, comma, newline)
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
# Score: how many vocab words appear in the example?
# Also check if example words share a common stem (first 4 chars)
direct_matches = vocab_words & example_words
score = len(direct_matches) * 10
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
if score == 0:
for vw in vocab_words:
if len(vw) < 3:
continue
stem = vw[:4] if len(vw) >= 4 else vw[:3]
for ew in example_words:
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
score += 5
break
if score > best_score:
best_score = score
best_idx = i
return best_idx if best_score > 0 else -1
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fold EN-only sentence rows into the 'example' field of matching vocab rows.

    Vocabulary worksheets often interleave vocab rows with example rows:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm        (no DE → example for "broken")
        Row 4: egg / Ei            (has DE → new vocab entry)

    Deterministic rules:
    - A row is an example when it has EN text, no real DE translation
      (<= 1 char; "Ei" at 2 chars is real German, 1 char is OCR noise),
      looks like a sentence (>= 4 words or sentence punctuation), and at
      least one vocab entry precedes it.
    - It attaches to the best word-overlap match, falling back to the
      nearest preceding vocab entry.
    - Multiple examples are joined with " | ".
    """
    if not entries:
        return entries
    kept: List[Dict[str, Any]] = []          # real vocab rows, in order
    pending: Dict[int, List[str]] = {}       # kept-index → example texts
    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        # Short EN text without DE is more likely a vocab row whose DE was
        # missed by OCR, so require sentence-like shape before reclassifying.
        sentence_like = len(en.split()) >= 4 or en.rstrip().endswith(('.', '!', '?'))
        if en and len(de) <= 1 and sentence_like and kept:
            target = _find_best_vocab_match(en, kept)
            if target < 0:
                target = len(kept) - 1  # no word overlap → previous entry
            pending.setdefault(target, []).append(en)
        else:
            kept.append(entry)
    # Merge collected examples into their target entries.
    for target, texts in pending.items():
        if 0 <= target < len(kept):
            current = (kept[target].get('example', '') or '').strip()
            joined = ' | '.join(texts)
            kept[target]['example'] = f"{current} | {joined}" if current else joined
    # Re-number the surviving vocab rows.
    for i, e in enumerate(kept):
        e['row_index'] = i
    return kept
# --- D. Phonetic Bracket IPA Replacement ---
# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
# Group 1 = the word before the bracket, group 2 = the bracket content.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up the IPA transcription of *word* in the local dictionaries.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string, or None when no dictionary knows the word.
    """
    key = word.lower().strip()
    if not key:
        return None

    def from_britfone() -> Optional[str]:
        # British source (Britfone dictionary), if loaded.
        return _britfone_dict.get(key) if _britfone_dict else None

    def from_american() -> Optional[str]:
        # American source (eng_to_ipa/CMU); '*' marks unknown words.
        if not _ipa_convert_american:
            return None
        converted = _ipa_convert_american(key)
        if converted and '*' not in converted:
            return converted
        return None

    if pronunciation == 'british' and _britfone_dict:
        # Prefer Britfone, fall back to the American dictionary.
        return from_britfone() or from_american()
    if pronunciation == 'american' and _ipa_convert_american:
        # Prefer CMU, fall back to Britfone.
        return from_american() or from_britfone()
    # Preferred source not configured — try whatever is available.
    return from_britfone() or from_american()
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions for dictionary IPA.

    "dance [du:ns]" becomes the correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only the ENGLISH field is touched. German and example fields carry
    meaningful parentheses — "Eis (gefrorenes Wasser)", "(sich beschweren)" —
    that must never be treated as phonetics. A replacement happens only when
    the word before the bracket is found in the dictionary.
    """
    if not IPA_AVAILABLE:
        return entries
    fixed = 0
    for entry in entries:
        original = entry.get('english', '') or ''
        # Fast path: skip entries without any bracket opener.
        if not any(opener in original for opener in '[{('):
            continue
        updated = _replace_phonetics_in_text(original, pronunciation)
        if updated != original:
            logger.debug(f"_fix_phonetic_brackets: '{original}''{updated}'")
            fixed += 1
            entry['english'] = updated
    if fixed:
        logger.info(f"_fix_phonetic_brackets: {fixed} IPA replacements in {len(entries)} entries")
    return entries
# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# English grammar abbreviations used in vocab tables
'sth', 'sb', 'adj', 'adv',
})
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with)
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
Since we only process the English field, we only need to recognize
English grammar particles. Everything else is (garbled) IPA.
"""
if not content:
return False
# Split on / for patterns like (about/of), (on/with)
tokens = [t.strip().lower() for t in content.split('/') if t.strip()]
if not tokens:
return False
# ALL tokens must be known grammar words
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(text: str, pronunciation: str = 'british') -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.

    Args:
        text: English-field text possibly containing bracketed phonetics.
        pronunciation: Dictionary preference passed through to _lookup_ipa.

    Returns:
        Text with phonetic brackets normalized and garbled orphan brackets removed.
    """
    if not IPA_AVAILABLE:
        return text
    def replacer(match):
        # match: group(1)=word before the bracket, group(2)=bracket content.
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)
        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match
        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)
        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}''{word} [{ipa}]'")
            return f"{word} [{ipa}]"
        # No IPA for this word — keep as-is
        return full_match
    text = _PHONETIC_BRACKET_RE.sub(replacer, text)
    # Second pass: strip remaining orphan brackets that are garbled IPA.
    # These have no word before them (the main regex requires \b word \s* bracket).
    # Examples: "[mais]", "{'mani setva]", trailing "(kros]"
    # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
    def _strip_orphan_bracket(m):
        content = m.group(1).strip()
        # Keep grammar info: (sich beschweren), (about/of)
        if _is_grammar_bracket_content(content):
            return m.group(0)
        # Keep correct IPA (contains Unicode IPA characters)
        if any(ch in _IPA_CHARS for ch in content):
            return m.group(0)
        logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
        return ''
    text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
    text = text.strip()
    return text
def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word in a row to exactly one column.

    Uses a layered strategy:
    1. Overlap: assign each word to the column its bbox overlaps most.
    2. Midpoint ranges: for non-overlapping words, use column assignment
       ranges whose boundaries are the midpoints between adjacent columns.
    3. Nearest center: final fallback by distance to column centers.

    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.

    Args:
        row: Row with words (relative coordinates).
        columns: Sorted list of columns (absolute coordinates).

    Returns:
        Dict mapping col_index → list of words assigned to that column.
    """
    result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
    if not row.words or not columns:
        return result
    left_x = row.x  # content ROI left (absolute); converts column x to row-relative
    # Build non-overlapping column assignment ranges using midpoints.
    # For adjacent columns, the boundary is the midpoint between them.
    # This prevents words near column borders from being assigned to
    # the wrong column (e.g. "We" at the start of an example sentence
    # being stolen by the preceding DE column).
    n = len(columns)
    col_ranges_rel = []  # (assign_left, assign_right) per column, row-relative
    for ci, col in enumerate(columns):
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width
        # Left boundary: midpoint to previous column, or 0
        if ci == 0:
            assign_left = 0
        else:
            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
            assign_left = (prev_right + col_left_rel) / 2
        # Right boundary: midpoint to next column, or effectively unbounded
        if ci == n - 1:
            assign_right = row.width + 100  # generous for last column
        else:
            next_left = columns[ci + 1].x - left_x
            assign_right = (col_right_rel + next_left) / 2
        col_ranges_rel.append((assign_left, assign_right))
    for w in row.words:
        w_left = w['left']
        w_right = w_left + w['width']
        w_center_x = w_left + w['width'] / 2
        # Primary: overlap-based matching — assign to column with most overlap.
        # This is more robust than center-based for narrow columns (page_ref)
        # where the last character's center may fall into the next column.
        best_col = -1
        best_overlap = 0
        for ci, col in enumerate(columns):
            col_left_rel = col.x - left_x
            col_right_rel = col_left_rel + col.width
            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
            if overlap > best_overlap:
                best_overlap = overlap
                best_col = ci
        if best_col >= 0 and best_overlap > 0:
            result[best_col].append(w)
        else:
            # Fallback: center-based range matching
            assigned = False
            for ci, (al, ar) in enumerate(col_ranges_rel):
                if al <= w_center_x < ar:
                    result[ci].append(w)
                    assigned = True
                    break
            if not assigned:
                # Last resort: nearest column center
                best_col = 0
                col_left_0 = columns[0].x - left_x
                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
                for ci in range(1, n):
                    col_left = columns[ci].x - left_x
                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
                    if dist < best_dist:
                        best_dist = dist
                        best_col = ci
                result[best_col].append(w)
    return result
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Single letter from the same alphabet — used to extract alpha characters.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
_COMMON_SHORT_WORDS: set = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
    # EN dictionary meta-words
    'sth', 'sb', 'smth', 'smb', 'sbd',
    # EN general
    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
    # EN references / textbook
    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
    'ans', 'wb', 'tb', 'vocab',
    # EN parts of speech / grammar
    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
    'syn', 'ant', 'opp', 'var', 'orig',
    # EN titles
    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
    # EN pronunciation
    'br', 'am', 'brit', 'amer',
    # EN units
    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
    # DE general
    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
    'bes', 'insb', 'insbes', 'bspw', 'ca',
    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
    'inkl', 'exkl', 'zzgl', 'abzgl',
    # DE references
    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
    's', 'sp', 'zit', 'zs', 'vlg',
    # DE grammar
    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
    # DE regional
    'nordd', 'österr', 'schweiz',
    # Linguistic
    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
def _is_noise_tail_token(token: str) -> bool:
    """Check if a token at the END of cell text is trailing OCR noise.

    Trailing fragments are very common OCR artifacts from image edges,
    borders, and neighbouring cells. This is more aggressive than a
    general word filter: any short token that isn't in the dictionary
    of common EN/DE words is considered noise.

    Examples of noise: "Es)", "3", "ee", "B"
    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"

    Args:
        token: the last whitespace-separated token of a cell's text.

    Returns:
        True if the token should be stripped as noise, False to keep it.
    """
    t = token.strip()
    if not t:
        return True
    # Keep ellipsis (ASCII and Unicode forms).
    if t in ('...', '…'):
        return False
    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
    # A plain startswith('[') also covers the '["' and "['" prefixes that
    # were previously tested separately — those checks were unreachable.
    if t.startswith('[') or t.endswith(']'):
        return False
    # Pure non-alpha → noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
    if not alpha_chars:
        return True
    # Alpha-only form used for dictionary lookups.
    cleaned = ''.join(alpha_chars)
    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep.
    if cleaned.lower() in _KNOWN_ABBREVIATIONS:
        return False
    # Strip normal trailing punctuation before checking for internal noise:
    # "cupcakes." → "cupcakes".  (The `or t` fallback is defensive only: a
    # token consisting purely of punctuation was already rejected above.)
    t_check = re.sub(r'[.,;:!?]+$', '', t) or t
    # Check for legitimate punctuation patterns vs. real noise.
    # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
    #             "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
    # Noise: "3d", "B|", "x7"
    # Strategy: strip common dictionary punctuation (parens, hyphens,
    # slashes, dots), THEN check if the residue is purely alphabetic.
    t_inner = re.sub(r'[()\-/.,;:!?]', '', t_check)
    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
    has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False
    # Long alpha words (4+ chars) without internal noise are likely real.
    if len(cleaned) >= 4 and not has_internal_noise:
        return False
    # Short words: check the common-word dictionary (alpha chars only).
    if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
        return False
    # Default: short or suspicious → noise.
    return True
def _is_garbage_text(text: str) -> bool:
    """Return True when an entire cell text is OCR garbage from image areas.

    "Garbage" means no recognizable dictionary word is present; this
    catches fragments like "(ci]oeu" or "uanoaain.".
    """
    candidates = _RE_REAL_WORD.findall(text)
    if not candidates:
        # No word-shaped token at all — only a known dotted abbreviation
        # (e.g. "e.g.") can rescue the text.
        alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
        return alpha_only not in _KNOWN_ABBREVIATIONS
    for word in candidates:
        lowered = word.lower()
        # A known short word or abbreviation means the text is not garbage.
        if lowered in _COMMON_SHORT_WORDS or lowered in _KNOWN_ABBREVIATIONS:
            return False
        # For longer words (>= 4 chars), judge by vowel/consonant balance:
        # real EN/DE words carry roughly 20-60% vowels, while garbage such
        # as "uanoaain" or "cioeu" falls outside that band.
        if len(lowered) >= 4:
            vowel_count = sum(ch in 'aeiouäöü' for ch in lowered)
            if 0.15 <= vowel_count / len(lowered) <= 0.65:
                return False  # plausible vowel ratio → real word
    return True
def _clean_cell_text(text: str) -> str:
    """Remove OCR noise from cell text.

    Applies three generic filters in order:
      1. Clear the text if it contains no real alphabetic word (>= 2
         letters) and is not a known dotted abbreviation.
      2. Clear the text if it is garbage (no dictionary word at all).
      3. Strip trailing noise tokens from the end of the text.
    """
    cell = text.strip()
    if not cell:
        return ''
    # --- Filter 1: no real word at all ---
    if not _RE_REAL_WORD.search(cell):
        # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
        if ''.join(_RE_ALPHA.findall(cell)).lower() not in _KNOWN_ABBREVIATIONS:
            return ''
    # --- Filter 2: entire text is garbage ---
    if _is_garbage_text(cell):
        return ''
    # --- Filter 3: strip trailing noise tokens ---
    parts = cell.split()
    while parts and _is_noise_tail_token(parts[-1]):
        del parts[-1]
    return ' '.join(parts) if parts else ''
def _clean_cell_text_lite(text: str) -> str:
    """Simplified noise filter for cell-first OCR (isolated cell crops).

    Each cell is OCR'd in isolation (no neighbour content visible), so
    trailing-noise stripping is unnecessary. Only two filters remain:
      1. No real alphabetic word (>= 2 letters) and not a known
         abbreviation → empty string.
      2. Entire text is garbage (no dictionary word) → empty string.
    """
    cell = text.strip()
    if not cell:
        return ''
    # --- Filter 1: no real word at all ---
    if _RE_REAL_WORD.search(cell) is None:
        alpha = ''.join(_RE_ALPHA.findall(cell)).lower()
        if alpha not in _KNOWN_ABBREVIATIONS:
            return ''
    # --- Filter 2: entire text is garbage ---
    return '' if _is_garbage_text(cell) else cell
# ---------------------------------------------------------------------------
# Bold detection via stroke-width analysis (relative / page-level)
# ---------------------------------------------------------------------------
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
    """Measure mean stroke width in a binarised cell crop.

    Returns a DPI-normalised value (mean stroke width as % of crop
    height), or 0.0 if measurement is not possible (empty/tiny crop,
    almost no ink pixels, or nothing survives the erosion pass).
    """
    if gray_crop is None or gray_crop.size == 0:
        return 0.0
    height, width = gray_crop.shape[:2]
    if height < 10 or width < 10:
        return 0.0
    # Otsu binarisation with inversion: text → white (255), paper → black (0).
    _, ink = cv2.threshold(gray_crop, 0, 255,
                           cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(ink) < 20:
        return 0.0
    # Distance transform: each white pixel holds its distance to the
    # nearest black pixel.
    dist_map = cv2.distanceTransform(ink, cv2.DIST_L2, 3)
    # Approximate a skeleton by repeated cross-kernel erosion, stopping
    # before the glyphs vanish (fewer than 5 pixels would remain).
    cross = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    core = ink.copy()
    for _ in range(max(1, min(height, width) // 6)):
        shrunk = cv2.erode(core, cross)
        if cv2.countNonZero(shrunk) < 5:
            break
        core = shrunk
    core_mask = core > 0
    if not np.any(core_mask):
        return 0.0
    mean_stroke = float(np.mean(dist_map[core_mask]))
    # Normalise to a percentage of the cell height.
    return mean_stroke / max(height, 1) * 100
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
img_w: int, img_h: int) -> None:
"""Two-pass bold detection: measure all cells, then compare against median.
Cells with stroke width > 1.4× the page median are marked as bold.
This adapts automatically to font, DPI and scan quality.
Modifies cells in-place (sets 'is_bold' key).
"""
if ocr_img is None:
return
# Pass 1: measure stroke width for every cell with text
metrics: List[float] = []
cell_strokes: List[float] = []
for cell in cells:
sw = 0.0
if cell.get('text', '').strip():
bp = cell['bbox_px']
y1 = max(0, bp['y'])
y2 = min(img_h, bp['y'] + bp['h'])
x1 = max(0, bp['x'])
x2 = min(img_w, bp['x'] + bp['w'])
if y2 > y1 and x2 > x1:
sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
cell_strokes.append(sw)
if sw > 0:
metrics.append(sw)
if len(metrics) < 3:
# Too few cells to compare — leave all as non-bold
return
median_sw = float(np.median(metrics))
if median_sw <= 0:
return
# Pass 2: cells significantly above median → bold
for cell, sw in zip(cells, cell_strokes):
cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
# ---------------------------------------------------------------------------