Files
breakpilot-lehrer/klausur-service/backend/cv_gutter_repair_grid.py
Benjamin Admin bd4b956e3c [split-required] Split final 43 files (500-668 LOC) to complete refactoring
klausur-service (11 files):
- cv_gutter_repair, ocr_pipeline_regression, upload_api
- ocr_pipeline_sessions, smart_spell, nru_worksheet_generator
- ocr_pipeline_overlays, mail/aggregator, zeugnis_api
- cv_syllable_detect, self_rag

backend-lehrer (17 files):
- classroom_engine/suggestions, generators/quiz_generator
- worksheets_api, llm_gateway/comparison, state_engine_api
- classroom/models (→ 4 submodules), services/file_processor
- alerts_agent/api/wizard+digests+routes, content_generators/pdf
- classroom/routes/sessions, llm_gateway/inference
- classroom_engine/analytics, auth/keycloak_auth
- alerts_agent/processing/rule_engine, ai_processor/print_versions

agent-core (5 files):
- brain/memory_store, brain/knowledge_graph, brain/context_manager
- orchestrator/supervisor, sessions/session_manager

admin-lehrer (5 components):
- GridOverlay, StepGridReview, DevOpsPipelineSidebar
- DataFlowDiagram, sbom/wizard/page

website (2 files):
- DependencyMap, lehrer/abitur-archiv

Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 09:41:42 +02:00

357 lines
13 KiB
Python

"""
Gutter Repair Grid — grid analysis and suggestion application.
Extracted from cv_gutter_repair.py for modularity.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import time
from typing import Any, Dict, List, Tuple
from cv_gutter_repair_core import (
_init_spellcheckers,
_is_ipa_text,
_is_known,
_MIN_WORD_LEN_HYPHEN,
_SPELL_AVAILABLE,
_STOPWORDS,
_TRAILING_PUNCT_RE,
_try_hyphen_join,
_try_spell_fix,
_word_is_at_gutter_edge,
GutterSuggestion,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Grid analysis
# ---------------------------------------------------------------------------
def _build_column_lookup(columns: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
    """Map each column index to its left edge ("x"), "width" and "type"."""
    lookup: Dict[int, Dict[str, Any]] = {}
    for col in columns:
        ci = col.get("index", col.get("col_index", -1))
        x_min = col.get("x_min_px", col.get("x", 0))
        if "x_max_px" in col:
            width = col["x_max_px"] - x_min
        else:
            # Fallback schema: "width" is already an extent, not a right edge,
            # so the left edge must NOT be subtracted from it.
            width = col.get("width", 0)
        lookup[ci] = {
            "x": x_min,
            "width": width,
            "type": col.get("type", col.get("col_type", "")),
        }
    return lookup


def _hyphen_display_part(last_word_clean: str, missing: str, ends_with_hyphen: bool) -> str:
    """Return the restored first half of a broken word, ending in exactly one "-"."""
    # An existing trailing hyphen is dropped first so the restored characters
    # are inserted before the hyphen rather than after it.
    base = last_word_clean.rstrip("-") if ends_with_hyphen else last_word_clean
    return f"{base}{missing or ''}-"


def analyse_grid_for_gutter_repair(
    grid_data: Dict[str, Any],
    image_width: int = 0,
) -> Dict[str, Any]:
    """Analyse a structured grid and return gutter repair suggestions.

    For every non-empty, non-IPA cell the LAST word is inspected. It becomes a
    gutter candidate when it is not a stopword, is long enough, sits at the
    edge of its column and is not a known word. For each candidate a hyphen
    join with the cell directly below (same column, next row) is tried first;
    if that yields nothing, a single-word spell fix is tried.

    Args:
        grid_data: The grid_editor_result from the session (zones→cells structure).
        image_width: Image width in pixels (intended for determining the gutter
            side). Currently unused: all columns are checked.

    Returns:
        Dict with "suggestions" (each serialised via ``to_dict()``), "stats"
        and "duration_seconds". If the spellchecker is unavailable, the
        suggestions list is empty and "stats" carries an "error" message.
    """
    # Function-scope import of a module the file already depends on; used only
    # to read the availability flag below.
    import cv_gutter_repair_core as _core

    # perf_counter is monotonic, so the measured duration cannot go negative
    # when the wall clock is adjusted.
    t0 = time.perf_counter()
    _init_spellcheckers()
    # A name bound via ``from ... import`` is a snapshot taken at import time.
    # NOTE(review): if _init_spellcheckers() rebinds the flag inside
    # cv_gutter_repair_core, the snapshot stays stale — so read it through the
    # module and only fall back to the snapshot. Confirm against the core module.
    if not getattr(_core, "_SPELL_AVAILABLE", _SPELL_AVAILABLE):
        return {
            "suggestions": [],
            "stats": {"error": "pyspellchecker not installed"},
            "duration_seconds": 0,
        }

    zones = grid_data.get("zones", [])
    suggestions: List[GutterSuggestion] = []
    words_checked = 0
    gutter_candidates = 0

    for zi, zone in enumerate(zones):
        columns = zone.get("columns", [])
        cells = zone.get("cells", [])
        if not columns or not cells:
            continue

        # Column lookup: col_index → {x, width, type}
        col_info = _build_column_lookup(columns)

        # (row, col) → cell lookup; a later duplicate position overrides an earlier one.
        cell_map: Dict[Tuple[int, int], Dict] = {
            (cell.get("row_index", 0), cell.get("col_index", 0)): cell
            for cell in cells
        }

        # Determine which columns are at the gutter edge.
        # For a left page: rightmost content columns.
        # For now, check ALL columns — a word is a candidate if it's at the
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if _is_ipa_text(text):
                continue
            words_checked += 1

            col = col_info.get(ci, {})
            col_type = col.get("type", "")

            # Check the LAST word in the cell (rightmost, closest to gutter)
            cell_words = text.split()
            if not cell_words:
                continue
            last_word = cell_words[-1]

            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue
            last_word_clean = last_word.rstrip(".,;:!?)(")
            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Check if the last word is at the gutter edge. Prefer the last
            # word box; without word boxes fall back to the cell bbox.
            word_boxes = cell.get("word_boxes", [])
            if word_boxes:
                edge_box = word_boxes[-1]
            else:
                bbox = cell.get("bbox_px") or {}
                edge_box = {"left": bbox.get("x", 0), "width": bbox.get("w", 0)}
            if not _word_is_at_gutter_edge(
                edge_box, col.get("x", 0), col.get("width", 1)
            ):
                continue

            # Word is at gutter edge — check if it's a known word
            if _is_known(last_word_clean):
                continue

            # Check if the word ends with "-" (explicit hyphen break)
            ends_with_hyphen = last_word.endswith("-")
            # If the word already ends with "-" and the stem (without
            # the hyphen) is a known word, this is a VALID line-break
            # hyphenation — not a gutter error. Gutter problems cause
            # the hyphen to be LOST ("ve" instead of "ver-"), so a
            # visible hyphen + known stem = intentional word-wrap.
            # Example: "wunder-" → "wunder" is known → skip.
            if ends_with_hyphen:
                stem = last_word_clean.rstrip("-")
                if stem and _is_known(stem):
                    continue

            gutter_candidates += 1
            cell_id = cell.get("cell_id", f"R{ri:02d}_C{ci}")

            # --- Strategy 1: Hyphen join with next row ---
            next_cell = cell_map.get((ri + 1, ci))
            if next_cell:
                next_text = (next_cell.get("text") or "").strip()
                next_words = next_text.split()
                if next_words:
                    first_next = next_words[0]
                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
                    first_alpha = next((c for c in first_next if c.isalpha()), "")

                    # Also skip if the joined word is known (covers compound
                    # words where the stem alone might not be in the dictionary)
                    if ends_with_hyphen and first_next_clean:
                        direct = last_word_clean.rstrip("-") + first_next_clean
                        if _is_known(direct):
                            continue

                    # Continuation likely if:
                    # - explicit hyphen, OR
                    # - next row starts lowercase (= not a new entry)
                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
                        result = _try_hyphen_join(last_word_clean, first_next)
                        if result:
                            joined, missing, conf = result
                            # Display parts keep the hyphenation of the original layout.
                            display_p1 = _hyphen_display_part(
                                last_word_clean, missing, ends_with_hyphen
                            )
                            suggestions.append(GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
                                row_index=ri,
                                col_index=ci,
                                col_type=col_type,
                                cell_id=cell_id,
                                original_text=last_word,
                                suggested_text=joined,
                                next_row_index=ri + 1,
                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
                                next_row_text=next_text,
                                missing_chars=missing,
                                display_parts=[display_p1, first_next],
                                confidence=conf,
                                reason="gutter_truncation" if missing else "hyphen_continuation",
                            ))
                            continue  # skip spell_fix if hyphen_join found

            # --- Strategy 2: Single-word spell fix (only for longer words) ---
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
                corrected, conf, alts = fix_result
                suggestions.append(GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
                    row_index=ri,
                    col_index=ci,
                    col_type=col_type,
                    cell_id=cell_id,
                    original_text=last_word,
                    suggested_text=corrected,
                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                ))

    duration = round(time.perf_counter() - t0, 3)
    logger.info(
        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
        words_checked, gutter_candidates, len(suggestions), duration,
    )
    return {
        "suggestions": [s.to_dict() for s in suggestions],
        "stats": {
            "words_checked": words_checked,
            "gutter_candidates": gutter_candidates,
            "suggestions_found": len(suggestions),
        },
        "duration_seconds": duration,
    }
def apply_gutter_suggestions(
    grid_data: Dict[str, Any],
    accepted_ids: List[str],
    suggestions: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Apply accepted gutter repair suggestions to the grid data.

    Modifies cells in-place and returns a summary of changes. For both
    suggestion types only the cell of the suggestion itself is edited: the
    last occurrence of the suggestion's ``original_text`` in the cell text is
    replaced. A ``spell_fix`` inserts ``suggested_text``; a ``hyphen_join``
    inserts the first entry of ``display_parts`` (e.g. "ve" → "ver-") and
    leaves the next row untouched, because the continuation belongs there in
    the original layout.

    A suggestion is skipped (and not counted) when its type is unknown, when
    required fields are empty, when its zone index is out of range, when no
    cell matches its row/column, or when ``original_text`` does not occur in
    the cell text.

    Args:
        grid_data: The grid_editor_result (zones→cells).
        accepted_ids: List of suggestion IDs the user accepted.
        suggestions: The full suggestions list (from analyse_grid_for_gutter_repair).

    Returns:
        Dict with "applied_count" (number of suggestions actually applied,
        i.e. ``len(changes)``) and the "changes" list.
    """
    accepted_set = set(accepted_ids)
    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]
    zones = grid_data.get("zones", [])
    changes: List[Dict[str, Any]] = []

    for s in accepted_suggestions:
        zi = s.get("zone_index", 0)
        ri = s.get("row_index", 0)
        ci = s.get("col_index", 0)
        stype = s.get("type", "")
        original_word = s.get("original_text", "")

        # Work out what replaces the original word, plus type-specific extras
        # that are appended to the change record.
        extra: Dict[str, Any] = {}
        if stype == "spell_fix":
            replacement = s.get("suggested_text", "")
        elif stype == "hyphen_join":
            joined = s.get("suggested_text", "")
            display_parts = s.get("display_parts", [])
            if not joined or not display_parts:
                continue
            # The first display part is what goes in the current row.
            replacement = display_parts[0]
            extra["joined_word"] = joined
        else:
            continue
        if not original_word or not replacement:
            continue

        # Reject negative indices too: they would silently wrap around and
        # edit a different zone.
        if not isinstance(zi, int) or not 0 <= zi < len(zones):
            continue

        # Find the target cell
        target_cell = next(
            (
                cell
                for cell in zones[zi].get("cells", [])
                if cell.get("row_index") == ri and cell.get("col_index") == ci
            ),
            None,
        )
        if not target_cell:
            continue

        # A cell may carry ``"text": None``; treat that like empty text.
        old_text = target_cell.get("text") or ""

        # Replace from the right (last occurrence)
        idx = old_text.rfind(original_word)
        if idx < 0:
            continue
        new_text = old_text[:idx] + replacement + old_text[idx + len(original_word):]
        target_cell["text"] = new_text

        change: Dict[str, Any] = {
            "type": stype,
            "zone_index": zi,
            "row_index": ri,
            "col_index": ci,
            "cell_id": target_cell.get("cell_id", ""),
            "old_text": old_text,
            "new_text": new_text,
        }
        change.update(extra)
        changes.append(change)

    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))
    return {
        # Count what was really applied, not what was merely accepted.
        "applied_count": len(changes),
        "changes": changes,
    }