[split-required] Split remaining Python monoliths (Phase 1 continued)
klausur-service (7 monoliths):
- grid_editor_helpers.py (1,737 → 5 files: columns, filters, headers, zones)
- cv_cell_grid.py (1,675 → 7 files: build, legacy, streaming, merge, vocab)
- worksheet_editor_api.py (1,305 → 4 files: models, AI, reconstruct, routes)
- legal_corpus_ingestion.py (1,280 → 3 files: registry, chunking, ingestion)
- cv_review.py (1,248 → 4 files: pipeline, spell, LLM, barrel)
- cv_preprocessing.py (1,166 → 3 files: deskew, dewarp, barrel)
- rbac.py, admin_api.py, routes/eh.py remain (next batch)

backend-lehrer (1 monolith):
- classroom_engine/repository.py (1,705 → 7 files by domain)

All re-export barrels preserve backward compatibility. Zero import errors verified.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
200
klausur-service/backend/cv_cell_grid_vocab.py
Normal file
200
klausur-service/backend/cv_cell_grid_vocab.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
|
||||
|
||||
Extracted from cv_cell_grid.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_attach_example_sentences,
|
||||
_fix_phonetic_brackets,
|
||||
_split_comma_entries,
|
||||
)
|
||||
from cv_cell_grid_legacy import build_cell_grid
|
||||
from cv_cell_grid_merge import (
|
||||
_merge_continuation_rows,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_wrapped_rows,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cells_to_vocab_entries(
|
||||
cells: List[Dict[str, Any]],
|
||||
columns_meta: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Map generic cells to vocab entries with english/german/example fields.
|
||||
|
||||
Groups cells by row_index, maps col_type -> field name, and produces
|
||||
one entry per row (only rows with at least one non-empty field).
|
||||
"""
|
||||
col_type_to_field = {
|
||||
'column_en': 'english',
|
||||
'column_de': 'german',
|
||||
'column_example': 'example',
|
||||
'page_ref': 'source_page',
|
||||
'column_marker': 'marker',
|
||||
'column_text': 'text', # generic single-column (box sub-sessions)
|
||||
}
|
||||
bbox_key_map = {
|
||||
'column_en': 'bbox_en',
|
||||
'column_de': 'bbox_de',
|
||||
'column_example': 'bbox_ex',
|
||||
'page_ref': 'bbox_ref',
|
||||
'column_marker': 'bbox_marker',
|
||||
'column_text': 'bbox_text',
|
||||
}
|
||||
|
||||
# Group cells by row_index
|
||||
rows: Dict[int, List[Dict]] = {}
|
||||
for cell in cells:
|
||||
ri = cell['row_index']
|
||||
rows.setdefault(ri, []).append(cell)
|
||||
|
||||
entries: List[Dict[str, Any]] = []
|
||||
for row_idx in sorted(rows.keys()):
|
||||
row_cells = rows[row_idx]
|
||||
entry: Dict[str, Any] = {
|
||||
'row_index': row_idx,
|
||||
'english': '',
|
||||
'german': '',
|
||||
'example': '',
|
||||
'text': '', # generic single-column (box sub-sessions)
|
||||
'source_page': '',
|
||||
'marker': '',
|
||||
'confidence': 0.0,
|
||||
'bbox': None,
|
||||
'bbox_en': None,
|
||||
'bbox_de': None,
|
||||
'bbox_ex': None,
|
||||
'bbox_ref': None,
|
||||
'bbox_marker': None,
|
||||
'bbox_text': None,
|
||||
'ocr_engine': row_cells[0].get('ocr_engine', '') if row_cells else '',
|
||||
}
|
||||
|
||||
confidences = []
|
||||
for cell in row_cells:
|
||||
col_type = cell['col_type']
|
||||
field = col_type_to_field.get(col_type)
|
||||
if field:
|
||||
entry[field] = cell['text']
|
||||
bbox_field = bbox_key_map.get(col_type)
|
||||
if bbox_field:
|
||||
entry[bbox_field] = cell['bbox_pct']
|
||||
if cell['confidence'] > 0:
|
||||
confidences.append(cell['confidence'])
|
||||
|
||||
# Compute row-level bbox as union of all cell bboxes
|
||||
all_bboxes = [c['bbox_pct'] for c in row_cells if c.get('bbox_pct')]
|
||||
if all_bboxes:
|
||||
min_x = min(b['x'] for b in all_bboxes)
|
||||
min_y = min(b['y'] for b in all_bboxes)
|
||||
max_x2 = max(b['x'] + b['w'] for b in all_bboxes)
|
||||
max_y2 = max(b['y'] + b['h'] for b in all_bboxes)
|
||||
entry['bbox'] = {
|
||||
'x': round(min_x, 2),
|
||||
'y': round(min_y, 2),
|
||||
'w': round(max_x2 - min_x, 2),
|
||||
'h': round(max_y2 - min_y, 2),
|
||||
}
|
||||
|
||||
entry['confidence'] = round(
|
||||
sum(confidences) / len(confidences), 1
|
||||
) if confidences else 0.0
|
||||
|
||||
# Only include if at least one mapped field has text
|
||||
has_content = any(
|
||||
entry.get(f)
|
||||
for f in col_type_to_field.values()
|
||||
)
|
||||
if has_content:
|
||||
entries.append(entry)
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific: Cell-Grid + Vocab-Mapping + Post-Processing.

    Wrapper around build_cell_grid() that adds vocabulary-specific logic:
    - Maps cells to english/german/example entries.
    - Merges continuation rows, replaces OCR'd phonetics with dictionary IPA,
      splits comma-separated word forms and attaches example sentences.
    - Falls back to returning the raw cells if no vocab columns are detected.

    Character-confusion fixes are NOT applied here; per the pipeline note
    below they run in llm_review_entries_streaming.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: 'tesseract', 'rapid', or 'auto'.
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        List of entry dicts with english/german/example text and bbox info
        (percent). An empty list if the cell grid is empty; the raw cell
        dicts if neither an English nor a German column was detected.
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )

    if not cells:
        return []

    # Check if vocab layout is present
    col_types = {c['type'] for c in columns_meta}
    if not (col_types & {'column_en', 'column_de'}):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    # Vocab mapping: cells -> entries
    entries = _cells_to_vocab_entries(cells, columns_meta)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    n_raw = len(entries)

    # 0. Merge cell-wrap continuation rows (empty primary column = text wrap)
    entries = _merge_wrapped_rows(entries)

    # 0a. Merge phonetic-only continuation rows into previous entry
    entries = _merge_phonetic_continuation_rows(entries)

    # 0b. Merge multi-line continuation rows (lowercase EN, empty DE)
    entries = _merge_continuation_rows(entries)

    # 1. Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
    #    llm_review_entries_streaming so changes are visible to the user in Step 6.

    # 2. Replace OCR'd phonetics with dictionary IPA
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)

    # 3. Split comma-separated word forms (break, broke, broken -> 3 entries)
    entries = _split_comma_entries(entries)

    # 4. Attach example sentences (rows without DE -> examples for preceding entry)
    entries = _attach_example_sentences(entries)

    # cells is guaranteed non-empty here (early return above).
    engine_name = cells[0].get('ocr_engine', 'unknown')
    # Report the raw count and the final count once each; lazy %-args avoid
    # formatting the message when INFO logging is disabled.
    logger.info(
        "build_word_grid: %d raw entries -> %d after post-processing (engine=%s)",
        n_raw, len(entries), engine_name,
    )

    return entries
|
||||
Reference in New Issue
Block a user