klausur-service (11 files): - cv_gutter_repair, ocr_pipeline_regression, upload_api - ocr_pipeline_sessions, smart_spell, nru_worksheet_generator - ocr_pipeline_overlays, mail/aggregator, zeugnis_api - cv_syllable_detect, self_rag backend-lehrer (17 files): - classroom_engine/suggestions, generators/quiz_generator - worksheets_api, llm_gateway/comparison, state_engine_api - classroom/models (→ 4 submodules), services/file_processor - alerts_agent/api/wizard+digests+routes, content_generators/pdf - classroom/routes/sessions, llm_gateway/inference - classroom_engine/analytics, auth/keycloak_auth - alerts_agent/processing/rule_engine, ai_processor/print_versions agent-core (5 files): - brain/memory_store, brain/knowledge_graph, brain/context_manager - orchestrator/supervisor, sessions/session_manager admin-lehrer (5 components): - GridOverlay, StepGridReview, DevOpsPipelineSidebar - DataFlowDiagram, sbom/wizard/page website (2 files): - DependencyMap, lehrer/abitur-archiv Other: nibis_ingestion, grid_detection_service, export-doclayout-onnx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
357 lines
13 KiB
Python
"""
|
|
Gutter Repair Grid — grid analysis and suggestion application.
|
|
|
|
Extracted from cv_gutter_repair.py for modularity.
|
|
|
|
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
from cv_gutter_repair_core import (
|
|
_init_spellcheckers,
|
|
_is_ipa_text,
|
|
_is_known,
|
|
_MIN_WORD_LEN_HYPHEN,
|
|
_SPELL_AVAILABLE,
|
|
_STOPWORDS,
|
|
_TRAILING_PUNCT_RE,
|
|
_try_hyphen_join,
|
|
_try_spell_fix,
|
|
_word_is_at_gutter_edge,
|
|
GutterSuggestion,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Grid analysis
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def analyse_grid_for_gutter_repair(
    grid_data: Dict[str, Any],
    image_width: int = 0,
) -> Dict[str, Any]:
    """Analyse a structured grid and return gutter repair suggestions.

    Scans every non-empty cell for a last word that sits at the right
    (gutter) edge of its column and is not a known dictionary word, then
    tries two repair strategies in order:

    1. ``hyphen_join`` — join the word with the first word of the cell in
       the next row (line-wrap continuation across the gutter).
    2. ``spell_fix`` — single-word spell correction for gutter blur.

    Args:
        grid_data: The grid_editor_result from the session (zones→cells structure).
        image_width: Image width in pixels (for determining gutter side).
            NOTE(review): currently unused by this implementation — kept for
            interface compatibility; confirm before removing.

    Returns:
        Dict with "suggestions" list (each a ``GutterSuggestion.to_dict()``),
        "stats" (counters, or an "error" key when the spellchecker is
        unavailable), and "duration_seconds".
    """
    t0 = time.time()
    _init_spellcheckers()

    # Without pyspellchecker neither strategy can run — bail out early
    # with an explicit error in the stats rather than raising.
    if not _SPELL_AVAILABLE:
        return {
            "suggestions": [],
            "stats": {"error": "pyspellchecker not installed"},
            "duration_seconds": 0,
        }

    zones = grid_data.get("zones", [])
    suggestions: List[GutterSuggestion] = []
    words_checked = 0
    gutter_candidates = 0

    for zi, zone in enumerate(zones):
        columns = zone.get("columns", [])
        cells = zone.get("cells", [])
        if not columns or not cells:
            continue

        # Build column lookup: col_index → {x, width, type}.
        # Both key spellings are supported because upstream producers
        # differ ("index"/"x_min_px" vs "col_index"/"x"/"width").
        col_info: Dict[int, Dict] = {}
        for col in columns:
            ci = col.get("index", col.get("col_index", -1))
            col_info[ci] = {
                "x": col.get("x_min_px", col.get("x", 0)),
                "width": col.get("x_max_px", col.get("width", 0)) - col.get("x_min_px", col.get("x", 0)),
                "type": col.get("type", col.get("col_type", "")),
            }

        # Build row→col→cell lookup
        cell_map: Dict[Tuple[int, int], Dict] = {}
        max_row = 0  # NOTE(review): computed but not read below — confirm intent
        for cell in cells:
            ri = cell.get("row_index", 0)
            ci = cell.get("col_index", 0)
            cell_map[(ri, ci)] = cell
            if ri > max_row:
                max_row = ri

        # Determine which columns are at the gutter edge.
        # For a left page: rightmost content columns.
        # For now, check ALL columns — a word is a candidate if it's at the
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            # IPA transcriptions would trip the spellchecker constantly.
            if _is_ipa_text(text):
                continue

            words_checked += 1
            col = col_info.get(ci, {})
            col_type = col.get("type", "")

            # Get word boxes to check position
            word_boxes = cell.get("word_boxes", [])

            # Check the LAST word in the cell (rightmost, closest to gutter)
            cell_words = text.split()
            if not cell_words:
                continue

            last_word = cell_words[-1]

            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue

            last_word_clean = last_word.rstrip(".,;:!?)(")
            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Check if the last word is at the gutter edge
            is_at_edge = False
            if word_boxes:
                last_wb = word_boxes[-1]
                is_at_edge = _word_is_at_gutter_edge(
                    last_wb, col.get("x", 0), col.get("width", 1)
                )
            else:
                # No word boxes — use cell bbox
                bbox = cell.get("bbox_px", {})
                is_at_edge = _word_is_at_gutter_edge(
                    {"left": bbox.get("x", 0), "width": bbox.get("w", 0)},
                    col.get("x", 0), col.get("width", 1)
                )

            if not is_at_edge:
                continue

            # Word is at gutter edge — check if it's a known word
            if _is_known(last_word_clean):
                continue

            # Check if the word ends with "-" (explicit hyphen break)
            ends_with_hyphen = last_word.endswith("-")

            # If the word already ends with "-" and the stem (without
            # the hyphen) is a known word, this is a VALID line-break
            # hyphenation — not a gutter error. Gutter problems cause
            # the hyphen to be LOST ("ve" instead of "ver-"), so a
            # visible hyphen + known stem = intentional word-wrap.
            # Example: "wunder-" → "wunder" is known → skip.
            if ends_with_hyphen:
                stem = last_word_clean.rstrip("-")
                if stem and _is_known(stem):
                    continue

            gutter_candidates += 1

            # --- Strategy 1: Hyphen join with next row ---
            next_cell = cell_map.get((ri + 1, ci))
            if next_cell:
                next_text = (next_cell.get("text") or "").strip()
                next_words = next_text.split()
                if next_words:
                    first_next = next_words[0]
                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
                    # First alphabetic char decides "continuation" (lowercase).
                    first_alpha = next((c for c in first_next if c.isalpha()), "")

                    # Also skip if the joined word is known (covers compound
                    # words where the stem alone might not be in the dictionary)
                    if ends_with_hyphen and first_next_clean:
                        direct = last_word_clean.rstrip("-") + first_next_clean
                        if _is_known(direct):
                            continue

                    # Continuation likely if:
                    # - explicit hyphen, OR
                    # - next row starts lowercase (= not a new entry)
                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
                        result = _try_hyphen_join(last_word_clean, first_next)
                        if result:
                            joined, missing, conf = result
                            # Build display parts: show hyphenation for original layout
                            if ends_with_hyphen:
                                display_p1 = last_word_clean.rstrip("-")
                                if missing:
                                    display_p1 += missing
                                display_p1 += "-"
                            else:
                                display_p1 = last_word_clean
                                if missing:
                                    display_p1 += missing + "-"
                                else:
                                    display_p1 += "-"

                            suggestion = GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
                                row_index=ri,
                                col_index=ci,
                                col_type=col_type,
                                cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                                original_text=last_word,
                                suggested_text=joined,
                                next_row_index=ri + 1,
                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
                                next_row_text=next_text,
                                missing_chars=missing,
                                display_parts=[display_p1, first_next],
                                confidence=conf,
                                reason="gutter_truncation" if missing else "hyphen_continuation",
                            )
                            suggestions.append(suggestion)
                            continue  # skip spell_fix if hyphen_join found

            # --- Strategy 2: Single-word spell fix (only for longer words) ---
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
                corrected, conf, alts = fix_result
                suggestion = GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
                    row_index=ri,
                    col_index=ci,
                    col_type=col_type,
                    cell_id=cell.get("cell_id", f"R{ri:02d}_C{ci}"),
                    original_text=last_word,
                    suggested_text=corrected,
                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                )
                suggestions.append(suggestion)

    duration = round(time.time() - t0, 3)

    logger.info(
        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
        words_checked, gutter_candidates, len(suggestions), duration,
    )

    return {
        "suggestions": [s.to_dict() for s in suggestions],
        "stats": {
            "words_checked": words_checked,
            "gutter_candidates": gutter_candidates,
            "suggestions_found": len(suggestions),
        },
        "duration_seconds": duration,
    }
|
|
|
|
|
|
def apply_gutter_suggestions(
    grid_data: Dict[str, Any],
    accepted_ids: List[str],
    suggestions: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Apply accepted gutter repair suggestions to the grid data.

    Modifies cells in-place and returns a summary of changes. Suggestions
    that cannot be applied (zone index out of range, target cell missing,
    or the original word no longer present in the cell text) are skipped
    silently and excluded from the result.

    Args:
        grid_data: The grid_editor_result (zones→cells). Mutated in place.
        accepted_ids: List of suggestion IDs the user accepted.
        suggestions: The full suggestions list (from analyse_grid_for_gutter_repair).

    Returns:
        Dict with "applied_count" (number of changes actually made) and
        "changes" list (one entry per modified cell).
    """
    accepted_set = set(accepted_ids)
    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]

    zones = grid_data.get("zones", [])
    changes: List[Dict[str, Any]] = []

    for s in accepted_suggestions:
        zi = s.get("zone_index", 0)
        ri = s.get("row_index", 0)
        ci = s.get("col_index", 0)
        stype = s.get("type", "")

        # Guard both ends: a negative zone_index would silently index from
        # the end of the list and patch the wrong zone.
        if not 0 <= zi < len(zones):
            continue
        zone_cells = zones[zi].get("cells", [])

        # Find the target cell
        target_cell = None
        for cell in zone_cells:
            if cell.get("row_index") == ri and cell.get("col_index") == ci:
                target_cell = cell
                break

        if not target_cell:
            continue

        old_text = target_cell.get("text", "")

        if stype == "spell_fix":
            # Replace the last word in the cell text
            original_word = s.get("original_text", "")
            corrected = s.get("suggested_text", "")
            if original_word and corrected:
                # Replace from the right (last occurrence) — the suggestion
                # targets the word closest to the gutter edge.
                idx = old_text.rfind(original_word)
                if idx >= 0:
                    new_text = old_text[:idx] + corrected + old_text[idx + len(original_word):]
                    target_cell["text"] = new_text
                    changes.append({
                        "type": "spell_fix",
                        "zone_index": zi,
                        "row_index": ri,
                        "col_index": ci,
                        "cell_id": target_cell.get("cell_id", ""),
                        "old_text": old_text,
                        "new_text": new_text,
                    })

        elif stype == "hyphen_join":
            # Current cell: replace last word with the hyphenated first part
            original_word = s.get("original_text", "")
            joined = s.get("suggested_text", "")
            display_parts = s.get("display_parts", [])
            next_ri = s.get("next_row_index", -1)

            if not original_word or not joined or not display_parts:
                continue

            # The first display part is what goes in the current row
            first_part = display_parts[0] if display_parts else ""

            # Replace the last word in current cell with the restored form.
            # The next row is NOT modified — "künden" stays in its row
            # because the original book layout has it there. We only fix
            # the truncated word in the current row (e.g. "ve" → "ver-").
            idx = old_text.rfind(original_word)
            if idx >= 0:
                new_text = old_text[:idx] + first_part + old_text[idx + len(original_word):]
                target_cell["text"] = new_text
                changes.append({
                    "type": "hyphen_join",
                    "zone_index": zi,
                    "row_index": ri,
                    "col_index": ci,
                    "cell_id": target_cell.get("cell_id", ""),
                    "old_text": old_text,
                    "new_text": new_text,
                    "joined_word": joined,
                })

    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))

    return {
        # BUGFIX: previously reported len(accepted_suggestions), counting
        # suggestions that were skipped (missing zone/cell/word). Report the
        # number of changes actually made, matching the log line above.
        "applied_count": len(changes),
        "changes": changes,
    }
|