diff --git a/.claude/rules/loc-exceptions.txt b/.claude/rules/loc-exceptions.txt index 78f0efc..486a0e1 100644 --- a/.claude/rules/loc-exceptions.txt +++ b/.claude/rules/loc-exceptions.txt @@ -31,6 +31,7 @@ # Two indivisible route handlers (~230 LOC each) that cannot be split further **/vocab_worksheet_compare_api.py | owner=klausur | reason=compare_ocr_methods (234 LOC) + analyze_grid (255 LOC), each a single cohesive handler | review=2026-10-01 +**/vocab/worksheet/compare_api.py | owner=klausur | reason=Same file moved to vocab/ package | review=2026-10-01 # TypeScript Data Catalogs (admin-lehrer/lib/sdk/) # Pure exported const arrays/objects with type definitions, no business logic. diff --git a/klausur-service/backend/grid/__init__.py b/klausur-service/backend/grid/__init__.py new file mode 100644 index 0000000..51be601 --- /dev/null +++ b/klausur-service/backend/grid/__init__.py @@ -0,0 +1,10 @@ +""" +Grid package — restructured from grid_* flat modules. + +Backward-compatible re-exports: consumers can still use +``from grid_build_core import ...`` etc. via the shim files in backend/. + +Sub-packages: + - grid.build — grid construction pipeline (_build_grid_core and phases) + - grid.editor — FastAPI endpoints, helper functions, column/zone logic +""" diff --git a/klausur-service/backend/grid/build/__init__.py b/klausur-service/backend/grid/build/__init__.py new file mode 100644 index 0000000..29b0fb7 --- /dev/null +++ b/klausur-service/backend/grid/build/__init__.py @@ -0,0 +1,11 @@ +""" +Grid Build sub-package — grid construction pipeline. + +Modules: + - core — _build_grid_core() main entry point + - zones — image loading, graphic/box detection, zone-aware grid building + - cleanup — junk row removal, artifact cleanup, pipe dividers + - text_ops — color annotation, heading detection, IPA correction + - cell_ops — bullet removal, garbled cells, word-box reordering + - finalize — dictionary detection, spell checking, result assembly +""" diff --git a/klausur-service/backend/grid/build/cell_ops.py b/klausur-service/backend/grid/build/cell_ops.py new file mode 100644 index 0000000..57bc721 --- /dev/null +++ b/klausur-service/backend/grid/build/cell_ops.py @@ -0,0 +1,305 @@ +""" +Grid Build Cell Ops — Cell-level operations: bullet/artifact removal, +garbled cell cleanup, word-box reordering, and max_columns enforcement. + +Extracted from grid_build_core.py for maintainability. +""" + +import logging +import re +from typing import Any, Dict, List, Tuple + +from cv_ocr_engines import ( + _words_to_reading_order_text, _group_words_into_lines, _lookup_ipa, +) + +logger = logging.getLogger(__name__) + + +def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None: + """Remove blue bullet/artifact word_boxes (Step 5i). + + Handles tiny coloured symbols, overlapping word_boxes, duplicate text, + and syllable-split word merging. 
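+
+    Example (illustrative, with hypothetical word_boxes): of a heavily
+    overlapping pair, the lower-confidence duplicate is dropped. The
+    overlap here is deliberately below the 0.90 dictionary-lookup
+    threshold, so the result does not depend on the IPA dictionary:
+
+        >>> zones = [{"cells": [{"word_boxes": [
+        ...     {"text": "tiger", "conf": 90, "left": 0, "top": 0,
+        ...      "width": 100, "height": 20, "color_name": "black"},
+        ...     {"text": "tigcr", "conf": 40, "left": 20, "top": 0,
+        ...      "width": 100, "height": 20, "color_name": "black"},
+        ... ]}]}]
+        >>> _remove_bullets_and_artifacts(zones)
+        >>> [wb["text"] for wb in zones[0]["cells"][0]["word_boxes"]]
+        ['tiger']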
+ """ + _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$') + _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'} + + bullet_removed = 0 + for z in zones_data: + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + if len(wbs) < 2: + continue + to_remove: set = set() + + # Rule (a): tiny coloured symbols + for i, wb in enumerate(wbs): + cn = wb.get("color_name", "black") + if (cn != "black" + and wb.get("width", 0) * wb.get("height", 0) < 200 + and wb.get("conf", 100) < 85): + to_remove.add(i) + + # Rule (a2): isolated non-alphanumeric symbols + for i, wb in enumerate(wbs): + t = (wb.get("text") or "").strip() + if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: + if t in _REMOVE_SYMBOLS: + to_remove.add(i) + + # Rule (b) + (c): overlap and duplicate detection + to_merge: List[Tuple[int, int]] = [] + indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0)) + for p in range(len(indexed) - 1): + i1, w1 = indexed[p] + i2, w2 = indexed[p + 1] + x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0) + x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0) + overlap = max(0, min(x1e, x2e) - max(x1s, x2s)) + min_w = min(w1.get("width", 1), w2.get("width", 1)) + gap = x2s - x1e + overlap_pct = overlap / min_w if min_w > 0 else 0 + + if overlap_pct > 0.20: + t1 = (w1.get("text") or "").strip() + t2 = (w2.get("text") or "").strip() + + # Syllable-split words + if (overlap_pct <= 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2)): + to_merge.append((i1, i2)) + continue + + # High overlap with short prefix + if (overlap_pct > 0.75 + and _ALPHA_WORD_RE.match(t1) + and _ALPHA_WORD_RE.match(t2) + and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower() + and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4): + to_merge.append((i1, i2)) + continue + + if overlap_pct <= 0.40: + continue + + c1 = w1.get("conf", 50) + c2 = w2.get("conf", 50) + + # Very high overlap: prefer IPA-dictionary word + if overlap_pct > 0.90 and t1.lower() != t2.lower(): + in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False + in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False + if in_dict_1 and not in_dict_2: + to_remove.add(i2) + continue + elif in_dict_2 and not in_dict_1: + to_remove.add(i1) + continue + + if c1 < c2: + to_remove.add(i1) + elif c2 < c1: + to_remove.add(i2) + else: + if w1.get("height", 0) > w2.get("height", 0): + to_remove.add(i1) + else: + to_remove.add(i2) + + elif (gap < 6 + and w1.get("color_name") == "blue" + and w2.get("color_name") == "blue" + and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()): + c1 = w1.get("conf", 50) + c2 = w2.get("conf", 50) + to_remove.add(i1 if c1 <= c2 else i2) + + # Execute merges first (syllable-split words) + if to_merge: + merge_parent: Dict[int, int] = {} + for mi1, mi2 in to_merge: + actual_mi1 = mi1 + while actual_mi1 in merge_parent: + actual_mi1 = merge_parent[actual_mi1] + if actual_mi1 in to_remove or mi2 in to_remove: + continue + if mi2 in merge_parent: + continue + mw1, mw2 = wbs[actual_mi1], wbs[mi2] + mt1 = (mw1.get("text") or "").rstrip(".,;:!?") + mt2 = (mw2.get("text") or "").strip() + merged_text = mt1 + mt2 + mx = min(mw1["left"], mw2["left"]) + my = min(mw1["top"], mw2["top"]) + mr = max(mw1["left"] + mw1["width"], + mw2["left"] + mw2["width"]) + mb = max(mw1["top"] + mw1["height"], + mw2["top"] + 
mw2["height"]) + mw1["text"] = merged_text + mw1["left"] = mx + mw1["top"] = my + mw1["width"] = mr - mx + mw1["height"] = mb - my + mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 + to_remove.add(mi2) + merge_parent[mi2] = actual_mi1 + bullet_removed -= 1 + + if to_remove: + bullet_removed += len(to_remove) + filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] + cell["word_boxes"] = filtered + if not cell.get("_ipa_corrected"): + cell["text"] = _words_to_reading_order_text(filtered) + + if bullet_removed: + for z in zones_data: + z["cells"] = [c for c in z.get("cells", []) + if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) + + +def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None: + """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).""" + _COMMON_SHORT_WORDS = { + "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja", + "ob", "so", "um", "zu", "wo", "je", "oh", "or", + "die", "der", "das", "dem", "den", "des", "ein", "und", + "auf", "aus", "bei", "bis", "für", "mit", "nur", "von", + "a", "i", "an", "as", "at", "be", "by", "do", "go", "he", + "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok", + "on", "or", "so", "to", "up", "us", "we", + "the", "and", "but", "for", "not", + } + _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$') + artifact_cells_removed = 0 + + for z in zones_data: + before = len(z.get("cells", [])) + kept = [] + for cell in z.get("cells", []): + text = (cell.get("text") or "").strip() + core = text.rstrip(".,;:!?'\"") + is_artifact = False + if not core: + is_artifact = True + elif _PURE_JUNK_RE.match(core): + if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'): + is_artifact = True + elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): + is_artifact = True + elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: + is_artifact = True + elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core) + and not re.match(r'^[pPsS]\.?\d+$', core)): + is_artifact = True + if is_artifact: + kept.append(None) + else: + kept.append(cell) + z["cells"] = [c for c in kept if c is not None] + artifact_cells_removed += before - len(z["cells"]) + + if artifact_cells_removed: + for z in zones_data: + cell_ris = {c.get("row_index") for c in z.get("cells", [])} + z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris] + logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed) + + +def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None: + """Normalise word_box order to reading order (Step 5j).""" + wb_reordered = 0 + for z in zones_data: + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + if len(wbs) < 2: + continue + lines = _group_words_into_lines(wbs, y_tolerance_px=15) + sorted_wbs = [w for line in lines for w in line] + if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]: + cell["word_boxes"] = sorted_wbs + wb_reordered += 1 + if wb_reordered: + logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered) + + +def _enforce_max_columns( + zones_data: List[Dict[str, Any]], + max_columns: int, +) -> None: + """Enforce max_columns by merging narrowest columns (Step 5k).""" + for z in zones_data: + if z.get("zone_type") != "content": + continue + cols = z.get("columns", []) + cells = z.get("cells", []) + if len(cols) <= max_columns: + 
continue + + logger.info( + "max_columns=%d: zone %s has %d columns -> merging", + max_columns, z.get("zone_index"), len(cols), + ) + + cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0))) + + while len(cols) > max_columns: + narrowest = cols_by_width.pop(0) + ni = narrowest["index"] + + sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0))) + pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni) + if pos + 1 < len(sorted_by_x): + merge_target = sorted_by_x[pos + 1] + elif pos > 0: + merge_target = sorted_by_x[pos - 1] + else: + break + + ti = merge_target["index"] + + merge_target["x_min_px"] = min( + merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)), + narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)), + ) + merge_target["x_max_px"] = max( + merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)), + narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)), + ) + if "x_min_pct" in merge_target and "x_min_pct" in narrowest: + merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"]) + merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"]) + + for cell in cells: + if cell.get("col_index") == ni: + cell["col_index"] = ti + existing = next( + (c for c in cells if c["col_index"] == ti + and c["row_index"] == cell["row_index"] + and c is not cell), + None, + ) + if existing: + existing["text"] = ( + (existing.get("text", "") + " " + cell.get("text", "")).strip() + ) + existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", []) + cell["_merged"] = True + + z["cells"] = [c for c in cells if not c.get("_merged")] + cells = z["cells"] + cols.remove(narrowest) + cols_by_width = [c for c in cols_by_width if c["index"] != ni] + + # Re-index columns 0..N-1 + for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))): + old_idx = col["index"] + col["index"] = new_idx + for cell in cells: + if cell.get("col_index") == old_idx: + cell["col_index"] = new_idx + + logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols)) diff --git a/klausur-service/backend/grid/build/cleanup.py b/klausur-service/backend/grid/build/cleanup.py new file mode 100644 index 0000000..39a60d8 --- /dev/null +++ b/klausur-service/backend/grid/build/cleanup.py @@ -0,0 +1,390 @@ +""" +Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe +divider removal, connector normalization, border strip detection, and +alphabet sidebar removal. + +Extracted from grid_build_core.py for maintainability. +""" + +import logging +import re +from typing import Any, Dict, List + +from cv_ocr_engines import _words_to_reading_order_text + +logger = logging.getLogger(__name__) + +_PIPE_RE = re.compile(r"^\|+$") + + +def _cleanup_zones( + zones_data: List[Dict[str, Any]], + border_prefiltered: bool, + session_id: str, +) -> bool: + """Clean up zone data: remove junk rows, artifacts, pipes, border strips. + + Args: + zones_data: List of zone dicts (modified in place). + border_prefiltered: Whether border words were already pre-filtered. + session_id: For logging. + + Returns: + Updated border_prefiltered flag. 
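+
+    Usage sketch (illustrative): the flag is returned rather than
+    mutated, so callers must reassign it:
+
+        border_prefiltered = _cleanup_zones(zones_data, False, "sess-1")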
+    """
+    _remove_junk_rows(zones_data)
+    _remove_artifact_cells(zones_data)
+    _remove_oversized_word_boxes(zones_data)
+    _remove_pipe_dividers(zones_data)
+    _normalize_connector_columns(zones_data)
+    border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
+    _remove_alphabet_sidebars(zones_data)
+    return border_prefiltered
+
+
+def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
+    """Remove rows where ALL cells contain only short, low-confidence text.
+
+    Also removes 'oversized stub' rows and 'scattered debris' rows.
+    """
+    _JUNK_CONF_THRESHOLD = 50
+    _JUNK_MAX_TEXT_LEN = 3
+
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        if not cells or not rows:
+            continue
+
+        # Compute median word height across the zone for oversized detection
+        all_wb_heights = [
+            wb["height"]
+            for cell in cells
+            for wb in cell.get("word_boxes") or []
+            if wb.get("height", 0) > 0
+        ]
+        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
+
+        junk_row_indices = set()
+        for row in rows:
+            ri = row["index"]
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            if not row_cells:
+                continue
+
+            row_wbs = [
+                wb for cell in row_cells
+                for wb in cell.get("word_boxes") or []
+            ]
+            if not row_wbs:
+                # Text-only cells carry no word_boxes; never junk such rows
+                # (Rule 3 below would otherwise call max() on an empty list).
+                continue
+
+            # Rule 1: ALL word_boxes are low-conf AND short text
+            all_junk = True
+            for wb in row_wbs:
+                text = (wb.get("text") or "").strip()
+                conf = wb.get("conf", 0)
+                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
+                    all_junk = False
+                    break
+            if all_junk:
+                junk_row_indices.add(ri)
+                continue
+
+            # Rule 2: oversized stub -- <=3 words, short total text,
+            # and word height > 1.8x median
+            if len(row_wbs) <= 3:
+                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
+                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
+                has_page_ref = any(
+                    re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
+                    for wb in row_wbs
+                )
+                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
+                    junk_row_indices.add(ri)
+                    continue
+
+            # Rule 3: scattered debris -- rows with only tiny fragments
+            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
+            if longest <= 2:
+                junk_row_indices.add(ri)
+                continue
+
+        if junk_row_indices:
+            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
+            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
+            logger.info(
+                "build-grid: removed %d junk rows from zone %d: %s",
+                len(junk_row_indices), z["zone_index"],
+                sorted(junk_row_indices),
+            )
+
+
+def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
+    """Remove individual cells with a single very-short, low-conf word."""
+    _ARTIFACT_MAX_LEN = 2
+    _ARTIFACT_CONF_THRESHOLD = 65
+
+    for z in zones_data:
+        cells = z.get("cells", [])
+        if not cells:
+            continue
+        artifact_ids = set()
+        for cell in cells:
+            wbs = cell.get("word_boxes") or []
+            if len(wbs) != 1:
+                continue
+            wb = wbs[0]
+            text = (wb.get("text") or "").strip()
+            conf = wb.get("conf", 100)
+            if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
+                artifact_ids.add(cell.get("cell_id"))
+        if artifact_ids:
+            z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
+            logger.info(
+                "build-grid: removed %d artifact cells from zone %d: %s",
+                len(artifact_ids), z.get("zone_index", 0),
+                [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
+            )
+
+
+def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> 
None: + """Remove word_boxes whose height is 3x+ the median (graphic artifacts).""" + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wh = [ + wb["height"] + for cell in cells + for wb in cell.get("word_boxes") or [] + if wb.get("height", 0) > 0 + ] + if not all_wh: + continue + med_h = sorted(all_wh)[len(all_wh) // 2] + oversized_threshold = med_h * 3 + removed_oversized = 0 + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] + if len(filtered) < len(wbs): + removed_oversized += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + if removed_oversized: + z["cells"] = [c for c in cells if c.get("word_boxes")] + logger.info( + "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", + removed_oversized, oversized_threshold, z.get("zone_index", 0), + ) + + +def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None: + """Remove pipe-character word_boxes (column divider artifacts).""" + for z in zones_data: + if z.get("vsplit_group") is not None: + continue # pipes already removed before split + removed_pipes = 0 + for cell in z.get("cells", []): + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())] + if len(filtered) < len(wbs): + removed_pipes += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + if removed_pipes: + z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] + logger.info( + "build-grid: removed %d pipe-divider word_boxes from zone %d", + removed_pipes, z.get("zone_index", 0), + ) + + # Strip pipe chars ONLY from cell edges (OCR artifacts). + # Preserve pipes embedded in words as syllable separators. + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if "|" in text: + cleaned = text.strip("|").strip() + if cleaned != text.strip(): + cell["text"] = cleaned + + +def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None: + """Normalize narrow connector columns where OCR appends noise chars. + + In synonym dictionaries a narrow column repeats the same word + (e.g. "oder") in every row. OCR sometimes appends noise chars. 
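+
+    Example (illustrative):
+
+        >>> z = {"zone_index": 0, "columns": [{"index": 0}],
+        ...      "cells": [{"col_index": 0, "text": t}
+        ...                for t in ["oder", "oder", "oderr", "oder"]]}
+        >>> _normalize_connector_columns([z])
+        >>> [c["text"] for c in z["cells"]]
+        ['oder', 'oder', 'oder', 'oder']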
+ """ + for z in zones_data: + cols = z.get("columns", []) + cells = z.get("cells", []) + if not cols or not cells: + continue + for col in cols: + ci = col.get("index") + col_cells = [c for c in cells if c.get("col_index") == ci] + if len(col_cells) < 3: + continue + text_counts: Dict[str, int] = {} + for c in col_cells: + t = (c.get("text") or "").strip() + if t: + text_counts[t] = text_counts.get(t, 0) + 1 + if not text_counts: + continue + dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] + dominant_count = text_counts[dominant_text] + if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: + continue + fixed = 0 + for c in col_cells: + t = (c.get("text") or "").strip() + if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: + c["text"] = dominant_text + wbs = c.get("word_boxes") or [] + if len(wbs) == 1: + wbs[0]["text"] = dominant_text + fixed += 1 + if fixed: + logger.info( + "build-grid: normalized %d outlier cells in connector column %d " + "(dominant='%s') zone %d", + fixed, ci, dominant_text, z.get("zone_index", 0), + ) + + +def _remove_border_strips( + zones_data: List[Dict[str, Any]], + border_prefiltered: bool, +) -> bool: + """Detect and remove page-border decoration strips. + + Returns updated border_prefiltered flag. + """ + border_strip_removed = 0 + if border_prefiltered: + logger.info("Step 4e: skipped (border pre-filter already applied)") + return border_prefiltered + + for z in zones_data: + cells = z.get("cells", []) + if not cells: + continue + all_wbs_with_cell: list = [] + for cell in cells: + for wb in cell.get("word_boxes") or []: + all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) + if len(all_wbs_with_cell) < 10: + continue + all_wbs_with_cell.sort(key=lambda t: t[0]) + total = len(all_wbs_with_cell) + + # -- Left-edge scan -- + left_strip_count = 0 + left_gap = 0 + running_right = 0 + for gi in range(total - 1): + running_right = max( + running_right, + all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), + ) + gap = all_wbs_with_cell[gi + 1][0] - running_right + if gap > 30: + left_strip_count = gi + 1 + left_gap = gap + break + + # -- Right-edge scan -- + right_strip_count = 0 + right_gap = 0 + running_left = all_wbs_with_cell[-1][0] + for gi in range(total - 1, 0, -1): + running_left = min(running_left, all_wbs_with_cell[gi][0]) + prev_right = ( + all_wbs_with_cell[gi - 1][0] + + all_wbs_with_cell[gi - 1][1].get("width", 0) + ) + gap = running_left - prev_right + if gap > 30: + right_strip_count = total - gi + right_gap = gap + break + + strip_wbs: set = set() + strip_side = "" + strip_gap = 0 + strip_count = 0 + if left_strip_count > 0 and left_strip_count / total < 0.20: + strip_side = "left" + strip_count = left_strip_count + strip_gap = left_gap + strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} + elif right_strip_count > 0 and right_strip_count / total < 0.20: + strip_side = "right" + strip_count = right_strip_count + strip_gap = right_gap + strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]} + + if not strip_wbs: + continue + for cell in cells: + wbs = cell.get("word_boxes") or [] + filtered = [wb for wb in wbs if id(wb) not in strip_wbs] + if len(filtered) < len(wbs): + border_strip_removed += len(wbs) - len(filtered) + cell["word_boxes"] = filtered + cell["text"] = _words_to_reading_order_text(filtered) + z["cells"] = [c for c in cells + if (c.get("word_boxes") or c.get("text", "").strip())] + 
logger.info( + "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " + "(gap=%dpx, strip=%d/%d wbs)", + border_strip_removed, strip_side, z.get("zone_index", 0), + strip_gap, strip_count, total, + ) + + return border_prefiltered + + +def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None: + """Remove decorative edge columns (alphabet sidebar safety net). + + Dictionary pages have A-Z letter sidebars that OCR reads as single- + character word_boxes. + """ + for z in zones_data: + columns = z.get("columns", []) + cells = z.get("cells", []) + if len(columns) < 3 or not cells: + continue + col_cells: Dict[str, List[Dict]] = {} + for cell in cells: + ct = cell.get("col_type", "") + if ct.startswith("column_"): + col_cells.setdefault(ct, []).append(cell) + col_types_ordered = sorted(col_cells.keys()) + if len(col_types_ordered) < 3: + continue + for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: + edge_cells_list = col_cells.get(edge_ct, []) + if len(edge_cells_list) < 3: + continue + texts = [(c.get("text") or "").strip() for c in edge_cells_list] + avg_len = sum(len(t) for t in texts) / len(texts) + single_char = sum(1 for t in texts if len(t) <= 1) + single_ratio = single_char / len(texts) + if avg_len > 1.5: + continue + if single_ratio < 0.7: + continue + removed_count = len(edge_cells_list) + edge_ids = {id(c) for c in edge_cells_list} + z["cells"] = [c for c in cells if id(c) not in edge_ids] + z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] + logger.info( + "Step 4f: removed decorative edge column '%s' from zone %d " + "(%d cells, avg_len=%.1f, single_char=%.0f%%)", + edge_ct, z.get("zone_index", 0), removed_count, + avg_len, single_ratio * 100, + ) + break # only remove one edge per zone diff --git a/klausur-service/backend/grid/build/core.py b/klausur-service/backend/grid/build/core.py new file mode 100644 index 0000000..0b22263 --- /dev/null +++ b/klausur-service/backend/grid/build/core.py @@ -0,0 +1,213 @@ +""" +Grid Build Core — the main _build_grid_core() function. + +Extracted from grid_editor_api.py for maintainability. +Takes merged OCR word positions and builds a structured, zone-aware grid. + +The function delegates to phase-specific modules: +- grid_build_zones.py — image loading, graphic/box detection, zone grids +- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips +- grid_build_text_ops.py — color, headings, IPA, page refs +- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result +""" + +import logging +import time +from typing import Any, Dict, List, Optional + +from grid.editor.filters import ( + _flatten_word_boxes, + _get_content_bounds, + _filter_decorative_margin, + _filter_footer_words, + _filter_header_junk, +) + +from .zones import _build_zones +from .cleanup import _cleanup_zones +from .text_ops import _process_text +from .finalize import _finalize_grid + +logger = logging.getLogger(__name__) + + +async def _build_grid_core( + session_id: str, + session: dict, + *, + ipa_mode: str = "auto", + syllable_mode: str = "auto", + enhance: bool = True, + max_columns: Optional[int] = None, + min_conf: Optional[int] = None, +) -> dict: + """Core grid building logic — pure computation, no HTTP or DB side effects. + + Args: + session_id: Session identifier (for logging and image loading). + session: Full session dict from get_session_db(). 
+ ipa_mode: "auto" (only when English headwords detected), "all" + (force IPA on all content columns), "en" (English column only), + "de" (German/definition columns only), or "none" (skip entirely). + syllable_mode: "auto" (only when original has pipe dividers), + "all" (force syllabification on all words), "en" (English only), + "de" (German only), or "none" (skip). + + Returns: + StructuredGrid result dict. + + Raises: + ValueError: If session data is incomplete. + """ + t0 = time.time() + + # ── Phase 1: Input Validation & Word Filtering ────────────────── + + # 1. Validate and load word results + word_result = session.get("word_result") + if not word_result or not word_result.get("cells"): + raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.") + + img_w = word_result.get("image_width", 0) + img_h = word_result.get("image_height", 0) + if not img_w or not img_h: + raise ValueError("Missing image dimensions in word_result") + + # 2. Flatten all word boxes from cells + all_words = _flatten_word_boxes(word_result["cells"]) + if not all_words: + raise ValueError("No word boxes found in cells") + + # 2a-pre. Apply min_conf filter if specified + if min_conf and min_conf > 0: + before = len(all_words) + all_words = [w for w in all_words if w.get('conf', 100) >= min_conf] + removed = before - len(all_words) + if removed: + logger.info("build-grid session %s: min_conf=%d removed %d/%d words", + session_id, min_conf, removed, before) + + logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)", + session_id, len(all_words), len(word_result["cells"]), + enhance, max_columns, min_conf) + + # 2b. Filter decorative margin columns (alphabet graphics) + margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id) + margin_strip_detected = margin_strip_info.get("found", False) + + # Read document_category from session + document_category = session.get("document_category") + + # 2c. Filter footer rows (page numbers at the very bottom) + page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) + + # 2c2. Filter OCR junk from header illustrations + _filter_header_junk(all_words, img_h, logger, session_id) + + # 2d. Filter words inside user-defined exclude regions + structure_result = session.get("structure_result") + exclude_rects = [] + if structure_result: + for er in structure_result.get("exclude_regions", []): + exclude_rects.append({ + "x": er["x"], "y": er["y"], + "w": er["w"], "h": er["h"], + }) + if exclude_rects: + before = len(all_words) + filtered = [] + for w in all_words: + w_cx = w["left"] + w.get("width", 0) / 2 + w_cy = w["top"] + w.get("height", 0) / 2 + inside = any( + er["x"] <= w_cx <= er["x"] + er["w"] + and er["y"] <= w_cy <= er["y"] + er["h"] + for er in exclude_rects + ) + if not inside: + filtered.append(w) + removed = before - len(filtered) + if removed: + all_words = filtered + logger.info( + "build-grid session %s: removed %d words inside %d user exclude region(s)", + session_id, removed, len(exclude_rects), + ) + + # 2e. 
Hard-filter words inside graphic/image regions from structure step + graphic_rects: List[Dict[str, int]] = [] + if structure_result: + for g in structure_result.get("graphics", []): + graphic_rects.append({ + "x": g["x"], "y": g["y"], + "w": g["w"], "h": g["h"], + }) + if graphic_rects: + before = len(all_words) + all_words = [ + w for w in all_words + if not any( + gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + ] + removed = before - len(all_words) + if removed: + logger.info( + "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)", + session_id, removed, len(graphic_rects), + ) + + content_x, content_y, content_w, content_h = _get_content_bounds(all_words) + + # ── Phase 2: Image Processing & Zone Detection ────────────────── + + zone_result = await _build_zones( + session_id, session, all_words, graphic_rects, + content_x, content_y, content_w, content_h, + img_w, img_h, + ) + zones_data = zone_result["zones_data"] + boxes_detected = zone_result["boxes_detected"] + recovered_count = zone_result["recovered_count"] + border_prefiltered = zone_result["border_prefiltered"] + img_bgr = zone_result["img_bgr"] + + # ── Phase 3: Junk Removal & Cell Cleanup ──────────────────────── + + border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id) + + # ── Phase 4+5a: Color, Headings, IPA, Page Refs ───────────────── + + text_result = _process_text( + zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info, + ) + + # ── Phase 5b+6: Finalize & Result Assembly ────────────────────── + + duration = time.time() - t0 + + result = _finalize_grid( + zones_data=zones_data, + all_words=all_words, + img_bgr=img_bgr, + img_w=img_w, + img_h=img_h, + session_id=session_id, + max_columns=max_columns, + ipa_mode=ipa_mode, + syllable_mode=syllable_mode, + en_col_type=text_result["en_col_type"], + ipa_target_cols=text_result["ipa_target_cols"], + all_content_cols=text_result["all_content_cols"], + skip_ipa=text_result["skip_ipa"], + document_category=document_category, + margin_strip_detected=margin_strip_detected, + page_number_info=text_result["page_number_info"], + boxes_detected=boxes_detected, + recovered_count=recovered_count, + duration=duration, + ) + + return result diff --git a/klausur-service/backend/grid/build/finalize.py b/klausur-service/backend/grid/build/finalize.py new file mode 100644 index 0000000..857df5f --- /dev/null +++ b/klausur-service/backend/grid/build/finalize.py @@ -0,0 +1,452 @@ +""" +Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations, +dictionary detection, syllable dividers, spell checking, empty column +removal, and result assembly. + +Extracted from grid_build_core.py for maintainability. 
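+
+Typical call (illustrative; every keyword comes from the earlier pipeline
+phases, and the literal values below are placeholders):
+
+    result = _finalize_grid(
+        zones_data=zones_data, all_words=all_words, img_bgr=img_bgr,
+        img_w=2480, img_h=3508, session_id="sess-1", max_columns=None,
+        ipa_mode="auto", syllable_mode="auto", en_col_type=None,
+        ipa_target_cols=set(), all_content_cols=set(), skip_ipa=False,
+        document_category=None, margin_strip_detected=False,
+        page_number_info=None, boxes_detected=0, recovered_count=0,
+        duration=0.0,
+    )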
+""" + +import logging +import re +from typing import Any, Dict, List, Optional + +from .cell_ops import ( + _remove_bullets_and_artifacts, + _remove_garbled_cells, + _normalize_word_order, + _enforce_max_columns, +) + +logger = logging.getLogger(__name__) + + +def _finalize_grid( + zones_data: List[Dict[str, Any]], + all_words: List[Dict[str, Any]], + img_bgr: Any, + img_w: int, + img_h: int, + session_id: str, + max_columns: Optional[int], + ipa_mode: str, + syllable_mode: str, + en_col_type: Optional[str], + ipa_target_cols: set, + all_content_cols: set, + skip_ipa: bool, + document_category: Optional[str], + margin_strip_detected: bool, + page_number_info: Optional[Dict], + boxes_detected: int, + recovered_count: int, + duration: float, +) -> dict: + """Run final processing steps and assemble result dict. + + Handles: bullet removal, artifact cells, word ordering, max_columns, + dictionary detection, syllable dividers, spell check, empty columns, + internal flag cleanup, and result assembly. + """ + total_cols = sum(len(z.get("columns", [])) for z in zones_data) + + # 5i. Remove blue bullet/artifact word_boxes + _remove_bullets_and_artifacts(zones_data) + + # 5j-pre. Remove cells whose text is entirely garbled / artifact noise + _remove_garbled_cells(zones_data) + + # 5j. Normalise word_box order to reading order + _normalize_word_order(zones_data) + + # 5k. Enforce max_columns by merging narrowest columns + if max_columns and max_columns > 0: + _enforce_max_columns(zones_data, max_columns) + + # --- Dictionary detection on assembled grid --- + dict_detection = _detect_dictionary( + zones_data, img_w, img_h, document_category, margin_strip_detected + ) + + # --- Word-gap merge --- + try: + from cv_syllable_detect import merge_word_gaps_in_zones + merge_word_gaps_in_zones(zones_data, session_id) + except Exception as e: + logger.warning("Word-gap merge failed: %s", e) + + # --- Pipe auto-correction --- + try: + from cv_syllable_detect import autocorrect_pipe_artifacts + autocorrect_pipe_artifacts(zones_data, session_id) + except Exception as e: + logger.warning("Pipe autocorrect failed: %s", e) + + # --- Syllable divider insertion --- + syllable_insertions = _insert_syllable_dividers( + zones_data, img_bgr, session_id, syllable_mode, dict_detection, + en_col_type, all_content_cols, total_cols, + ) + + # --- Split merged words --- + _split_merged_words(zones_data, session_id) + + # --- Ensure space before IPA/phonetic brackets --- + _fix_ipa_spacing(zones_data) + + # --- SmartSpellChecker --- + _run_spell_checker(zones_data, session_id, en_col_type, total_cols) + + # --- Debug log cell counts per column --- + for z in zones_data: + if z.get("zone_type") == "content": + from collections import Counter as _Counter + _cc = _Counter(c.get("col_index") for c in z.get("cells", [])) + _cols = z.get("columns", []) + logger.info( + "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s", + z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())), + ) + + # --- Remove empty columns --- + _remove_empty_columns(zones_data) + + # Clean up internal flags before returning + for z in zones_data: + for cell in z.get("cells", []): + cell.pop("_ipa_corrected", None) + + # 6. 
Build result + return _assemble_result( + zones_data, all_words, img_w, img_h, session_id, + ipa_mode, syllable_mode, ipa_target_cols, skip_ipa, + dict_detection, page_number_info, boxes_detected, + recovered_count, duration, syllable_insertions, + ) + + +def _detect_dictionary( + zones_data: List[Dict[str, Any]], + img_w: int, + img_h: int, + document_category: Optional[str], + margin_strip_detected: bool, +) -> Dict[str, Any]: + """Run dictionary detection on the assembled grid.""" + from cv_layout import _score_dictionary_signals + dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0} + try: + from cv_vocab_types import ColumnGeometry + for z in zones_data: + zone_cells = z.get("cells", []) + zone_cols = z.get("columns", []) + if len(zone_cols) < 2 or len(zone_cells) < 10: + continue + pseudo_geoms = [] + for col in zone_cols: + ci = col["index"] + col_cells = [c for c in zone_cells if c.get("col_index") == ci] + col_words = [] + for cell in col_cells: + for wb in cell.get("word_boxes") or []: + col_words.append({ + "text": wb.get("text", ""), + "conf": wb.get("conf", 0), + "top": wb.get("top", 0), + "left": wb.get("left", 0), + "height": wb.get("height", 0), + "width": wb.get("width", 0), + }) + if not cell.get("word_boxes") and cell.get("text"): + col_words.append({ + "text": cell["text"], + "conf": cell.get("confidence", 50), + "top": cell.get("bbox_px", {}).get("y", 0), + "left": cell.get("bbox_px", {}).get("x", 0), + "height": cell.get("bbox_px", {}).get("h", 20), + "width": cell.get("bbox_px", {}).get("w", 50), + }) + col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0) + pseudo_geoms.append(ColumnGeometry( + index=ci, x=col.get("x_min_px", 0), y=0, + width=max(col_w, 1), height=img_h, + word_count=len(col_words), words=col_words, + width_ratio=col_w / max(img_w, 1), + )) + if len(pseudo_geoms) >= 2: + dd = _score_dictionary_signals( + pseudo_geoms, + document_category=document_category, + margin_strip_detected=margin_strip_detected, + ) + if dd["confidence"] > dict_detection["confidence"]: + dict_detection = dd + except Exception as e: + logger.warning("Dictionary detection failed: %s", e) + return dict_detection + + +def _insert_syllable_dividers( + zones_data: List[Dict[str, Any]], + img_bgr: Any, + session_id: str, + syllable_mode: str, + dict_detection: Dict[str, Any], + en_col_type: Optional[str], + all_content_cols: set, + total_cols: int, +) -> int: + """Insert syllable dividers for dictionary pages. 
Returns insertion count.""" + syllable_insertions = 0 + if syllable_mode == "none" or img_bgr is None: + if syllable_mode == "none": + for z in zones_data: + for cell in z.get("cells", []): + t = cell.get("text", "") + if "|" in t: + cell["text"] = t.replace("|", "") + return syllable_insertions + + _syllable_eligible = False + if syllable_mode in ("all", "de", "en"): + _syllable_eligible = True + elif (dict_detection.get("is_dictionary") + and dict_detection.get("article_col_index") is not None): + _syllable_eligible = True + + _syllable_col_filter: Optional[set] = None + if syllable_mode == "en": + _syllable_col_filter = {en_col_type} if en_col_type else set() + elif syllable_mode == "de": + if en_col_type and total_cols >= 3: + _syllable_col_filter = all_content_cols - {en_col_type} + + if _syllable_eligible: + try: + from cv_syllable_detect import insert_syllable_dividers + force_syllables = (syllable_mode in ("all", "de", "en")) + syllable_insertions = insert_syllable_dividers( + zones_data, img_bgr, session_id, + force=force_syllables, + col_filter=_syllable_col_filter, + ) + except Exception as e: + logger.warning("Syllable insertion failed: %s", e) + + return syllable_insertions + + +def _split_merged_words( + zones_data: List[Dict[str, Any]], + session_id: str, +) -> None: + """Split merged words using dictionary lookup.""" + try: + from cv_review import _try_split_merged_word, _SPELL_AVAILABLE + if not _SPELL_AVAILABLE: + return + split_count = 0 + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if not text: + continue + parts = [] + changed = False + for token in text.split(): + clean = token + bracket_pos = clean.find('[') + suffix_ipa = "" + if bracket_pos > 0: + suffix_ipa = clean[bracket_pos:] + clean = clean[:bracket_pos] + suffix_punct = "" + stripped = clean.rstrip(".,!?;:'\")") + if stripped != clean: + suffix_punct = clean[len(stripped):] + clean = stripped + suffix = suffix_punct + suffix_ipa + contraction = "" + if "'" in clean and clean.index("'") >= 2: + apos_pos = clean.index("'") + contraction = clean[apos_pos:] + clean = clean[:apos_pos] + suffix = contraction + suffix + if len(clean) >= 4 and clean.isalpha(): + split = _try_split_merged_word(clean) + if split: + parts.append(split + suffix) + changed = True + continue + parts.append(token) + if changed: + cell["text"] = " ".join(parts) + split_count += 1 + if split_count: + logger.info("build-grid session %s: split %d merged words", session_id, split_count) + except ImportError: + pass + + +def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None: + """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'.""" + _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if text and "[" in text: + fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text) + if fixed != text: + cell["text"] = fixed + + +def _run_spell_checker( + zones_data: List[Dict[str, Any]], + session_id: str, + en_col_type: Optional[str], + total_cols: int, +) -> None: + """Run SmartSpellChecker on all cells.""" + try: + from smart_spell import SmartSpellChecker + _ssc = SmartSpellChecker() + spell_fix_count = 0 + + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if not text or not text.strip(): + continue + ct = cell.get("col_type", "") + if not ct.startswith("column_"): + continue + + if total_cols >= 3 and en_col_type: + lang = "en" if ct == en_col_type else 
"de" + elif total_cols <= 2: + lang = "auto" + else: + lang = "auto" + + result = _ssc.correct_text(text, lang=lang) + if result.changed: + cell["text"] = result.corrected + spell_fix_count += 1 + + if spell_fix_count: + logger.info( + "build-grid session %s: SmartSpellChecker fixed %d cells", + session_id, spell_fix_count, + ) + except ImportError: + logger.debug("SmartSpellChecker not available in build-grid") + except Exception as e: + logger.warning("SmartSpellChecker error in build-grid: %s", e) + + +def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None: + """Remove columns that have no cells assigned.""" + for z in zones_data: + cells = z.get("cells", []) + used_col_indices = {c.get("col_index") for c in cells} + old_cols = z.get("columns", []) + new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices] + if len(new_cols) < len(old_cols): + old_to_new = {} + for new_i, col in enumerate(new_cols): + old_i = col.get("col_index", col.get("index", new_i)) + old_to_new[old_i] = new_i + col["col_index"] = new_i + col["index"] = new_i + col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text" + for cell in cells: + old_ci = cell.get("col_index", 0) + cell["col_index"] = old_to_new.get(old_ci, old_ci) + cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text" + z["columns"] = new_cols + + +def _assemble_result( + zones_data: List[Dict[str, Any]], + all_words: List[Dict[str, Any]], + img_w: int, + img_h: int, + session_id: str, + ipa_mode: str, + syllable_mode: str, + ipa_target_cols: set, + skip_ipa: bool, + dict_detection: Dict[str, Any], + page_number_info: Optional[Dict], + boxes_detected: int, + recovered_count: int, + duration: float, + syllable_insertions: int, +) -> dict: + """Build the final result dict (Phase 6).""" + total_cells = sum(len(z.get("cells", [])) for z in zones_data) + total_columns = sum(len(z.get("columns", [])) for z in zones_data) + total_rows = sum(len(z.get("rows", [])) for z in zones_data) + + # Collect color statistics + color_stats: Dict[str, int] = {} + for z in zones_data: + for cell in z.get("cells", []): + for wb in cell.get("word_boxes", []): + cn = wb.get("color_name", "black") + color_stats[cn] = color_stats.get(cn, 0) + 1 + + # Compute layout metrics + all_content_row_heights: List[float] = [] + for z in zones_data: + for row in z.get("rows", []): + if not row.get("is_header", False): + h = row.get("y_max_px", 0) - row.get("y_min_px", 0) + if h > 0: + all_content_row_heights.append(h) + avg_row_height = ( + sum(all_content_row_heights) / len(all_content_row_heights) + if all_content_row_heights else 30.0 + ) + font_size_suggestion = max(10, int(avg_row_height * 0.6)) + + return { + "session_id": session_id, + "image_width": img_w, + "image_height": img_h, + "zones": zones_data, + "boxes_detected": boxes_detected, + "summary": { + "total_zones": len(zones_data), + "total_columns": total_columns, + "total_rows": total_rows, + "total_cells": total_cells, + "total_words": len(all_words), + "recovered_colored": recovered_count, + "color_stats": color_stats, + }, + "formatting": { + "bold_columns": [], + "header_rows": [], + }, + "layout_metrics": { + "page_width_px": img_w, + "page_height_px": img_h, + "avg_row_height_px": round(avg_row_height, 1), + "font_size_suggestion_px": font_size_suggestion, + }, + "dictionary_detection": { + "is_dictionary": dict_detection.get("is_dictionary", False), + "confidence": dict_detection.get("confidence", 0.0), + 
"signals": dict_detection.get("signals", {}), + "article_col_index": dict_detection.get("article_col_index"), + "headword_col_index": dict_detection.get("headword_col_index"), + }, + "processing_modes": { + "ipa_mode": ipa_mode, + "syllable_mode": syllable_mode, + "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, + "syllables_applied": syllable_insertions > 0, + }, + "page_number": page_number_info, + "duration_seconds": round(duration, 2), + } diff --git a/klausur-service/backend/grid/build/text_ops.py b/klausur-service/backend/grid/build/text_ops.py new file mode 100644 index 0000000..dc0bf9c --- /dev/null +++ b/klausur-service/backend/grid/build/text_ops.py @@ -0,0 +1,489 @@ +""" +Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection, +parenthesis fix, IPA phonetic correction, page ref extraction, and +slash-IPA conversion. + +Extracted from grid_build_core.py for maintainability. +""" + +import logging +import re +from typing import Any, Dict, List, Optional, Set, Tuple + +from cv_color_detect import detect_word_colors +from cv_ocr_engines import ( + fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, + _lookup_ipa, +) +from grid.editor.headers import ( + _detect_heading_rows_by_color, + _detect_heading_rows_by_single_cell, +) + +logger = logging.getLogger(__name__) + + +def _process_text( + zones_data: List[Dict[str, Any]], + img_bgr: Any, + img_w: int, + img_h: int, + ipa_mode: str, + page_number_info: Optional[Dict], +) -> Dict[str, Any]: + """Run color annotation, heading detection, IPA correction, and page refs. + + Args: + zones_data: List of zone dicts (modified in place). + img_bgr: BGR image array (or None). + img_w: Image width. + img_h: Image height. + ipa_mode: IPA processing mode. + page_number_info: Existing page number metadata (may be None). + + Returns: + Dict with keys: en_col_type, ipa_target_cols, all_content_cols, + skip_ipa, page_number_info. + """ + # 5. Color annotation on final word_boxes in cells + if img_bgr is not None: + all_wb: List[Dict] = [] + for z in zones_data: + for cell in z.get("cells", []): + all_wb.extend(cell.get("word_boxes", [])) + detect_word_colors(img_bgr, all_wb) + + # 5a. Heading detection by color + height + heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h) + if heading_count: + logger.info("Detected %d heading rows by color+height", heading_count) + + # 5b. Fix unmatched parentheses in cell text + for z in zones_data: + for cell in z.get("cells", []): + text = cell.get("text", "") + if ")" in text and "(" not in text: + cell["text"] = "(" + text + + # 5c. 
IPA phonetic correction
+    all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
+    total_cols = sum(len(z.get("columns", [])) for z in zones_data)
+    en_col_type = None
+    ipa_target_cols: set = set()
+    all_content_cols: set = set()
+    skip_ipa = (ipa_mode == "none")
+
+    # When ipa_mode=none, strip ALL square brackets from ALL content columns
+    if skip_ipa:
+        _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
+            text = cell.get("text", "")
+            if "[" in text:
+                stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
+                if stripped != text:
+                    cell["text"] = stripped.strip()
+                    cell["_ipa_corrected"] = True
+
+    if not skip_ipa and total_cols >= 3:
+        en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
+            all_cells, total_cols, ipa_mode, zones_data
+        )
+    elif not skip_ipa:
+        # Collect all_content_cols even when <3 cols (needed by finalize)
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if ct.startswith("column_") and (cell.get("text") or "").strip():
+                all_content_cols.add(ct)
+
+    # 5e. Heading detection by single-cell rows
+    single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
+    if single_heading_count:
+        logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
+
+    # 5f. Strip IPA from headings
+    for z in zones_data:
+        for cell in z.get("cells", []):
+            if cell.get("col_type") != "heading":
+                continue
+            text = cell.get("text", "")
+            stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
+            if stripped and stripped != text:
+                cell["text"] = stripped
+
+    # 5g. Extract page_ref cells and footer rows. The helper returns a
+    # possibly-new page_number_info dict, so reassign it here; otherwise a
+    # page number found in a footer row would be lost.
+    page_number_info = _extract_page_refs_and_footers(zones_data, page_number_info)
+
+    # 5h. Convert slash-delimited IPA to bracket notation
+    _convert_slash_ipa(zones_data, skip_ipa, en_col_type)
+
+    return {
+        "en_col_type": en_col_type,
+        "ipa_target_cols": ipa_target_cols,
+        "all_content_cols": all_content_cols,
+        "skip_ipa": skip_ipa,
+        "page_number_info": page_number_info,
+    }
+
+
+def _run_ipa_correction(
+    all_cells: List[Dict],
+    total_cols: int,
+    ipa_mode: str,
+    zones_data: List[Dict[str, Any]],
+) -> Tuple[Optional[str], set, set]:
+    """Run IPA correction on cells.
+
+    Returns (en_col_type, ipa_target_cols, all_content_cols)."""
+    en_col_type = None
+    all_content_cols: set = set()
+
+    # Detect English headword column via IPA signals
+    col_ipa_count: Dict[str, int] = {}
+    for cell in all_cells:
+        ct = cell.get("col_type", "")
+        if not ct.startswith("column_"):
+            continue
+        txt = cell.get("text", "") or ""
+        if txt.strip():
+            all_content_cols.add(ct)
+        if '[' in txt or _text_has_garbled_ipa(txt):
+            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
+    if col_ipa_count:
+        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
+    elif ipa_mode == "all":
+        col_cell_count: Dict[str, int] = {}
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if ct.startswith("column_") and (cell.get("text") or "").strip():
+                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
+        if col_cell_count:
+            en_col_type = max(col_cell_count, key=col_cell_count.get)
+
+    # Decide which columns to process based on ipa_mode
+    en_ipa_target_cols: set = set()
+    de_ipa_target_cols: set = set()
+    if ipa_mode in ("auto", "en"):
+        if en_col_type:
+            en_ipa_target_cols.add(en_col_type)
+    elif ipa_mode == "de":
+        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
+    elif ipa_mode == "all":
+        if en_col_type:
+            en_ipa_target_cols.add(en_col_type)
+        de_ipa_target_cols = all_content_cols - en_ipa_target_cols
+
+    # Snapshot cell text before any IPA mutation; the change-marking loops
+    # below compare against it, including the final one, which runs even
+    # when the English IPA block is skipped.
+    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
+
+    # --- Strip IPA from columns NOT in the target set ---
+    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
+    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
+    if strip_en_ipa or ipa_mode == "none":
+        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
+        for cell in all_cells:
+            ct = cell.get("col_type", "")
+            if ct not in strip_cols:
+                continue
+            text = cell.get("text", "")
+            if "[" in text:
+                stripped = _SQUARE_BRACKET_RE.sub("", text)
+                if stripped != text:
+                    cell["text"] = stripped.strip()
+                    cell["_ipa_corrected"] = True
+
+    # --- English IPA (Britfone + eng_to_ipa) ---
+    if en_ipa_target_cols:
+        for cell in all_cells:
+            ct = cell.get("col_type")
+            if ct in en_ipa_target_cols:
+                cell["_orig_col_type"] = ct
+                cell["col_type"] = "column_en"
+        fix_cell_phonetics(all_cells, pronunciation="british")
+        for cell in all_cells:
+            orig = cell.pop("_orig_col_type", None)
+            if orig:
+                cell["col_type"] = orig
+            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+                cell["_ipa_corrected"] = True
+
+    # --- German IPA (wiki-pronunciation-dict + epitran) ---
+    if de_ipa_target_cols:
+        from cv_ipa_german import insert_german_ipa
+        insert_german_ipa(all_cells, de_ipa_target_cols)
+
+    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
+
+    # Mark cells whose text was changed by IPA correction
+    for cell in all_cells:
+        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
+            cell["_ipa_corrected"] = True
+
+    # 5d. 
Fix IPA continuation cells
+    skip_ipa = (ipa_mode == "none")
+    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+    ipa_cont_fixed = 0
+    for z in ([] if skip_ipa else zones_data):
+        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
+        z_cells = z.get("cells", [])
+        for idx, row in enumerate(rows_sorted):
+            if idx == 0:
+                continue
+            ri = row["index"]
+            row_cells = [c for c in z_cells if c.get("row_index") == ri]
+            for cell in row_cells:
+                ct = cell.get("col_type", "")
+                if not ct.startswith("column_"):
+                    continue
+                cell_text = (cell.get("text") or "").strip()
+                if not cell_text:
+                    wb_texts = [w.get("text", "")
+                                for w in cell.get("word_boxes", [])]
+                    cell_text = " ".join(wb_texts).strip()
+                if not cell_text:
+                    continue
+
+                is_bracketed = (
+                    cell_text.startswith('[') and cell_text.endswith(']')
+                )
+
+                if is_bracketed:
+                    if not _text_has_garbled_ipa(cell_text):
+                        continue
+                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
+                        continue
+                else:
+                    content_cells_in_row = [
+                        c for c in row_cells
+                        if c.get("col_type", "").startswith("column_")
+                        and c.get("col_type") != "column_1"
+                    ]
+                    if len(content_cells_in_row) != 1:
+                        continue
+                    if not _text_has_garbled_ipa(cell_text):
+                        continue
+                    if any(c in _REAL_IPA_CHARS for c in cell_text):
+                        continue
+                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
+                    if len(_words_in_text) >= 3:
+                        continue
+
+                # Find headword in previous row, same column
+                prev_ri = rows_sorted[idx - 1]["index"]
+                prev_same_col = [
+                    c for c in z_cells
+                    if c.get("row_index") == prev_ri
+                    and c.get("col_type") == ct
+                ]
+                if not prev_same_col:
+                    continue
+                prev_text = prev_same_col[0].get("text", "")
+                fixed = fix_ipa_continuation_cell(
+                    cell_text, prev_text, pronunciation="british",
+                )
+                if fixed != cell_text:
+                    cell["text"] = fixed
+                    ipa_cont_fixed += 1
+                    logger.info(
+                        "IPA continuation R%d %s: '%s' -> '%s'",
+                        ri, ct, cell_text, fixed,
+                    )
+    if ipa_cont_fixed:
+        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
+
+    return en_col_type, ipa_target_cols, all_content_cols
+
+
+def _extract_page_refs_and_footers(
+    zones_data: List[Dict[str, Any]],
+    page_number_info: Optional[Dict],
+) -> Optional[Dict]:
+    """Extract page_ref cells and footer rows from content zones.
+
+    Modifies zones_data in place. Returns the page_number_info dict,
+    creating one when a page-number footer is found and none was passed
+    in; callers must keep the returned value.
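+
+    Example (illustrative; a lone trailing "123" row is removed and
+    reported as the page number):
+
+        >>> z = {"zone_type": "content", "zone_index": 0,
+        ...      "rows": [{"index": 0}, {"index": 1}],
+        ...      "cells": [
+        ...          {"row_index": 0, "col_type": "column_2", "text": "tiger"},
+        ...          {"row_index": 1, "col_type": "column_2", "text": "123"},
+        ...      ]}
+        >>> _extract_page_refs_and_footers([z], None)
+        {'text': '123', 'y_pct': 95, 'number': 123}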
+ """ + _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") + _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') + _NUMBER_WORDS = { + "one", "two", "three", "four", "five", "six", "seven", + "eight", "nine", "ten", "eleven", "twelve", "thirteen", + "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", + "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", + "seventy", "eighty", "ninety", "hundred", "thousand", "and", + "einhundert", "zweihundert", "dreihundert", "vierhundert", + "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", + } + + for z in zones_data: + if z.get("zone_type") != "content": + continue + cells = z.get("cells", []) + rows = z.get("rows", []) + if not rows: + continue + + # Extract column_1 cells that look like page references + page_refs = [] + page_ref_cell_ids = set() + for cell in cells: + if cell.get("col_type") != "column_1": + continue + text = (cell.get("text") or "").strip() + if not text: + continue + if not _PAGE_REF_RE.match(text): + continue + page_refs.append({ + "row_index": cell.get("row_index"), + "text": text, + "bbox_pct": cell.get("bbox_pct", {}), + }) + page_ref_cell_ids.add(cell.get("cell_id")) + + # Detect footer: last non-header row if it has only 1 cell + footer_rows = [] + non_header_rows = [r for r in rows if not r.get("is_header")] + if non_header_rows: + last_row = non_header_rows[-1] + last_ri = last_row["index"] + last_cells = [c for c in z["cells"] + if c.get("row_index") == last_ri] + if len(last_cells) == 1: + text = (last_cells[0].get("text") or "").strip() + has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) + has_commas = ',' in text + text_words = set(text.lower().split()) + is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) + is_page_number = len(text) <= 20 or is_written_number + if (text and not has_real_ipa and not has_commas + and is_page_number + and last_cells[0].get("col_type") != "heading"): + footer_rows.append({ + "row_index": last_ri, + "text": text, + "bbox_pct": last_cells[0].get("bbox_pct", {}), + }) + + # Classify footer rows + page_number_footers = [] + other_footers = [] + for fr in footer_rows: + ft = fr["text"].strip() + digits = "".join(c for c in ft if c.isdigit()) + if digits and re.match(r'^[\d\s.]+$', ft): + page_number_footers.append(fr) + elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): + page_number_footers.append(fr) + else: + other_footers.append(fr) + + # Remove page-number footer rows from grid entirely + if page_number_footers: + pn_ris = {fr["row_index"] for fr in page_number_footers} + z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] + z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] + pn_text = page_number_footers[0]["text"].strip() + pn_digits = "".join(c for c in pn_text if c.isdigit()) + if not page_number_info: + page_number_info = { + "text": pn_text, + "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), + } + if pn_digits: + page_number_info["number"] = int(pn_digits) + + # Mark remaining footer rows + if other_footers: + footer_ris = {fr["row_index"] for fr in other_footers} + for r in z["rows"]: + if r["index"] in footer_ris: + r["is_footer"] = True + for c in z["cells"]: + if c.get("row_index") in footer_ris: + c["col_type"] = "footer" + + if page_refs or footer_rows: + logger.info( + "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", + len(page_refs), len(footer_rows), len(page_number_footers), + z.get("zone_index", 0), + ) + + if 
page_refs: + z["page_refs"] = page_refs + if other_footers: + z["footer"] = other_footers + + +def _convert_slash_ipa( + zones_data: List[Dict[str, Any]], + skip_ipa: bool, + en_col_type: Optional[str], +) -> None: + """Convert slash-delimited IPA to bracket notation. + + Dictionary-style pages print IPA between slashes: "tiger /'taiga/" + """ + _SLASH_IPA_RE = re.compile( + r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1) + r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars + ) + _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') + _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') + slash_ipa_fixed = 0 + + for z in ([] if skip_ipa else zones_data): + for cell in z.get("cells", []): + if en_col_type and cell.get("col_type") != en_col_type: + continue + text = cell.get("text", "") + if "/" not in text: + continue + + def _replace_slash_ipa(m: re.Match) -> str: + nonlocal slash_ipa_fixed + headword = m.group(1) + ocr_ipa = m.group(2) + inner_raw = ocr_ipa.strip("/").strip() + if _SLASH_IPA_REJECT_RE.search(inner_raw): + return m.group(0) + clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() + ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None + if ipa: + slash_ipa_fixed += 1 + return f"{headword} [{ipa}]" + inner = inner_raw.lstrip("'").strip() + if inner: + slash_ipa_fixed += 1 + return f"{headword} [{inner}]" + return m.group(0) + + new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) + + _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') + + def _replace_trailing_slash(m: re.Match) -> str: + nonlocal slash_ipa_fixed + inner = m.group(1).strip("/").strip().lstrip("'").strip() + if _SLASH_IPA_REJECT_RE.search(inner): + return m.group(0) + if inner: + slash_ipa_fixed += 1 + return f" [{inner}]" + return m.group(0) + new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) + + if new_text == text: + m = _STANDALONE_SLASH_IPA_RE.match(text) + if m: + inner = m.group(1).strip() + if not _SLASH_IPA_REJECT_RE.search(inner): + inner = inner.lstrip("'").strip() + if inner: + new_text = "[" + inner + "]" + text[m.end():] + slash_ipa_fixed += 1 + + if new_text != text: + cell["text"] = new_text + + if slash_ipa_fixed: + logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed) diff --git a/klausur-service/backend/grid/build/zones.py b/klausur-service/backend/grid/build/zones.py new file mode 100644 index 0000000..8616e3a --- /dev/null +++ b/klausur-service/backend/grid/build/zones.py @@ -0,0 +1,464 @@ +""" +Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone +detection and zone-aware grid building. + +Extracted from grid_build_core.py for maintainability. 
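+
+Throughout this module a word is assigned to (or excluded from) a rect
+or zone by its *center point*; for a graphic rect ``gr`` the test used
+below is::
+
+    cx = w["left"] + w.get("width", 0) / 2
+    cy = w["top"] + w.get("height", 0) / 2
+    inside = (gr["x"] <= cx <= gr["x"] + gr["w"]
+              and gr["y"] <= cy <= gr["y"] + gr["h"])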
+""" + +import logging +from typing import Any, Dict, List, Optional + +import cv2 +import numpy as np + +from cv_box_detect import detect_boxes, split_page_into_zones +from cv_graphic_detect import detect_graphic_elements +from cv_color_detect import recover_colored_text +from cv_vocab_types import PageZone +from ocr_pipeline_session_store import get_session_image + +from grid.editor.filters import ( + _filter_border_strip_words, + _filter_border_ghosts, + _words_in_zone, +) +from grid.editor.zones import ( + _PIPE_RE_VSPLIT, + _detect_vertical_dividers, + _split_zone_at_vertical_dividers, + _merge_content_zones_across_boxes, + _build_zone_grid, +) + +logger = logging.getLogger(__name__) + + +async def _build_zones( + session_id: str, + session: dict, + all_words: List[Dict[str, Any]], + graphic_rects: List[Dict[str, int]], + content_x: int, + content_y: int, + content_w: int, + content_h: int, + img_w: int, + img_h: int, +) -> Dict[str, Any]: + """Load image, detect graphics/boxes, build zone-aware grids. + + Returns a dict with keys: + zones_data, boxes_detected, recovered_count, border_prefiltered, + img_bgr, all_words (modified in-place but returned for clarity). + """ + zones_data: List[Dict[str, Any]] = [] + boxes_detected = 0 + recovered_count = 0 + border_prefiltered = False + img_bgr = None + + # 3. Load image for box detection + img_png = await get_session_image(session_id, "cropped") + if not img_png: + img_png = await get_session_image(session_id, "dewarped") + if not img_png: + img_png = await get_session_image(session_id, "original") + + if img_png: + # Decode image for color detection + box detection + arr = np.frombuffer(img_png, dtype=np.uint8) + img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) + + if img_bgr is not None: + # --- 3a. 
Detect graphic/image regions via CV and hard-filter --- + sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3] + fresh_graphics = detect_graphic_elements(img_bgr, sig_words) + if fresh_graphics: + fresh_rects = [ + {"x": g.x, "y": g.y, "w": g.width, "h": g.height} + for g in fresh_graphics + ] + graphic_rects.extend(fresh_rects) + logger.info( + "build-grid session %s: detected %d graphic region(s) via CV", + session_id, len(fresh_graphics), + ) + # Hard-filter words inside newly detected graphic regions + before = len(all_words) + all_words[:] = [ + w for w in all_words + if not any( + gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in fresh_rects + ) + ] + removed = before - len(all_words) + if removed: + logger.info( + "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)", + session_id, removed, len(fresh_rects), + ) + + # --- Recover colored text that OCR missed (before grid building) --- + recovered = recover_colored_text(img_bgr, all_words) + if recovered and graphic_rects: + # Filter recovered chars inside graphic regions + recovered = [ + r for r in recovered + if not any( + gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] + and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] + for gr in graphic_rects + ) + ] + if recovered: + recovered_count = len(recovered) + all_words.extend(recovered) + logger.info( + "build-grid session %s: +%d recovered colored words", + session_id, recovered_count, + ) + + # Detect bordered boxes + boxes = detect_boxes( + img_bgr, + content_x=content_x, + content_w=content_w, + content_y=content_y, + content_h=content_h, + ) + boxes_detected = len(boxes) + + if boxes: + # Filter border ghost words before grid building + all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes) + if ghost_count: + all_words[:] = all_words_new + logger.info( + "build-grid session %s: removed %d border ghost words", + session_id, ghost_count, + ) + + # Split page into zones + page_zones = split_page_into_zones( + content_x, content_y, content_w, content_h, boxes + ) + + # Merge content zones separated by box zones + page_zones = _merge_content_zones_across_boxes( + page_zones, content_x, content_w + ) + + # 3b. 
Detect vertical dividers and split content zones + page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers( + page_zones, all_words + ) + + # --- First pass: build grids per zone independently --- + zone_grids = _build_grids_per_zone( + page_zones, all_words, img_w, img_h + ) + border_prefiltered = border_prefiltered or any( + zg.get("_border_prefiltered") for zg in zone_grids + ) + + # --- Second pass: merge column boundaries from all content zones --- + _merge_content_zone_columns( + zone_grids, all_words, content_w, img_w, img_h, session_id + ) + + # --- Build zones_data from zone_grids --- + for zg in zone_grids: + pz = zg["pz"] + grid = zg["grid"] + grid.pop("_raw_columns", None) + + zone_entry: Dict[str, Any] = { + "zone_index": pz.index, + "zone_type": pz.zone_type, + "bbox_px": { + "x": pz.x, "y": pz.y, + "w": pz.width, "h": pz.height, + }, + "bbox_pct": { + "x": round(pz.x / img_w * 100, 2) if img_w else 0, + "y": round(pz.y / img_h * 100, 2) if img_h else 0, + "w": round(pz.width / img_w * 100, 2) if img_w else 0, + "h": round(pz.height / img_h * 100, 2) if img_h else 0, + }, + "border": None, + "word_count": len(zg["words"]), + **grid, + } + + if pz.box: + zone_entry["border"] = { + "thickness": pz.box.border_thickness, + "confidence": pz.box.confidence, + } + + if pz.image_overlays: + zone_entry["image_overlays"] = pz.image_overlays + + if pz.layout_hint: + zone_entry["layout_hint"] = pz.layout_hint + if pz.vsplit_group is not None: + zone_entry["vsplit_group"] = pz.vsplit_group + + zones_data.append(zone_entry) + + # 4. Fallback: no boxes detected -> single zone with all words + if not zones_data: + before = len(all_words) + filtered_words = [ + w for w in all_words + if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) + ] + removed = before - len(filtered_words) + if removed: + logger.info( + "build-grid session %s: filtered %d recovered artifacts (fallback zone)", + session_id, removed, + ) + filtered_words, bs_removed = _filter_border_strip_words(filtered_words) + if bs_removed: + border_prefiltered = True + logger.info( + "build-grid session %s: pre-filtered %d border-strip words", + session_id, bs_removed, + ) + grid = _build_zone_grid( + filtered_words, content_x, content_y, content_w, content_h, + 0, img_w, img_h, + ) + grid.pop("_raw_columns", None) + zones_data.append({ + "zone_index": 0, + "zone_type": "content", + "bbox_px": { + "x": content_x, "y": content_y, + "w": content_w, "h": content_h, + }, + "bbox_pct": { + "x": round(content_x / img_w * 100, 2) if img_w else 0, + "y": round(content_y / img_h * 100, 2) if img_h else 0, + "w": round(content_w / img_w * 100, 2) if img_w else 0, + "h": round(content_h / img_h * 100, 2) if img_h else 0, + }, + "border": None, + "word_count": len(all_words), + **grid, + }) + + return { + "zones_data": zones_data, + "boxes_detected": boxes_detected, + "recovered_count": recovered_count, + "border_prefiltered": border_prefiltered, + "img_bgr": img_bgr, + } + + +def _detect_and_split_vertical_dividers( + page_zones: List[PageZone], + all_words: List[Dict[str, Any]], +) -> tuple: + """Detect vertical dividers and split content zones. + + Returns (expanded_zones, border_prefiltered_from_vsplit). 
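+
+    Illustrative flow for a two-column worksheet separated by a run of
+    "|" glyphs (the divider x value here is hypothetical)::
+
+        divider_xs = _detect_vertical_dividers(zone_words, pz.x, pz.width,
+                                               pz.y, pz.height)
+        # e.g. [612.0] -> the zone splits into two sub-zones sharing one
+        # vsplit_group id, and the pipe words are dropped from all_words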
+ """ + vsplit_group_counter = 0 + expanded_zones: List = [] + for pz in page_zones: + if pz.zone_type != "content": + expanded_zones.append(pz) + continue + zone_words = _words_in_zone( + all_words, pz.y, pz.height, pz.x, pz.width + ) + divider_xs = _detect_vertical_dividers( + zone_words, pz.x, pz.width, pz.y, pz.height + ) + if divider_xs: + sub_zones = _split_zone_at_vertical_dividers( + pz, divider_xs, vsplit_group_counter + ) + expanded_zones.extend(sub_zones) + vsplit_group_counter += 1 + # Remove pipe words so they don't appear in sub-zones + pipe_ids = set( + id(w) for w in zone_words + if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) + ) + all_words[:] = [w for w in all_words if id(w) not in pipe_ids] + logger.info( + "build-grid: vertical split zone %d at x=%s -> %d sub-zones", + pz.index, [int(x) for x in divider_xs], len(sub_zones), + ) + else: + expanded_zones.append(pz) + # Re-index zones + for i, pz in enumerate(expanded_zones): + pz.index = i + return expanded_zones, False + + +def _build_grids_per_zone( + page_zones: List[PageZone], + all_words: List[Dict[str, Any]], + img_w: int, + img_h: int, +) -> List[Dict[str, Any]]: + """Build grids for each zone independently (first pass).""" + zone_grids: List[Dict] = [] + + for pz in page_zones: + zone_words = _words_in_zone( + all_words, pz.y, pz.height, pz.x, pz.width + ) + if pz.zone_type == "content": + logger.info( + "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words", + pz.index, pz.zone_type, + pz.x, pz.x + pz.width, pz.y, pz.y + pz.height, + len(zone_words), len(all_words), + ) + # Filter recovered single-char artifacts in ALL zones + before = len(zone_words) + zone_words = [ + w for w in zone_words + if not ( + w.get("recovered") + and len(w.get("text", "").strip()) <= 2 + ) + ] + removed = before - len(zone_words) + if removed: + logger.info( + "build-grid: filtered %d recovered artifacts from %s zone %d", + removed, pz.zone_type, pz.index, + ) + # Filter words inside image overlay regions (merged box zones) + if pz.image_overlays: + before_ov = len(zone_words) + zone_words = [ + w for w in zone_words + if not any( + ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] + and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] + for ov in pz.image_overlays + ) + ] + ov_removed = before_ov - len(zone_words) + if ov_removed: + logger.info( + "build-grid: filtered %d words inside image overlays from zone %d", + ov_removed, pz.index, + ) + zone_words, bs_removed = _filter_border_strip_words(zone_words) + bp = False + if bs_removed: + bp = True + logger.info( + "build-grid: pre-filtered %d border-strip words from zone %d", + bs_removed, pz.index, + ) + grid = _build_zone_grid( + zone_words, pz.x, pz.y, pz.width, pz.height, + pz.index, img_w, img_h, + skip_first_row_header=bool(pz.image_overlays), + ) + zone_grids.append({ + "pz": pz, "words": zone_words, "grid": grid, + "_border_prefiltered": bp, + }) + + return zone_grids + + +def _merge_content_zone_columns( + zone_grids: List[Dict[str, Any]], + all_words: List[Dict[str, Any]], + content_w: int, + img_w: int, + img_h: int, + session_id: str, +) -> None: + """Second pass: merge column boundaries from all content zones. + + Modifies zone_grids in place. 
+ """ + content_zones = [ + zg for zg in zone_grids + if zg["pz"].zone_type == "content" + and zg["pz"].vsplit_group is None + ] + if len(content_zones) <= 1: + return + + # Collect column split points (x_min of non-first columns) + all_split_xs: List[float] = [] + for zg in content_zones: + raw_cols = zg["grid"].get("_raw_columns", []) + for col in raw_cols[1:]: + all_split_xs.append(col["x_min"]) + + if not all_split_xs: + return + + all_split_xs.sort() + merge_distance = max(25, int(content_w * 0.03)) + merged_xs = [all_split_xs[0]] + for x in all_split_xs[1:]: + if x - merged_xs[-1] < merge_distance: + merged_xs[-1] = (merged_xs[-1] + x) / 2 + else: + merged_xs.append(x) + + total_cols = len(merged_xs) + 1 + max_zone_cols = max( + len(zg["grid"].get("_raw_columns", [])) + for zg in content_zones + ) + + if total_cols < max_zone_cols: + return + + cx_min = min(w["left"] for w in all_words) + cx_max = max(w["left"] + w["width"] for w in all_words) + merged_columns: List[Dict[str, Any]] = [] + prev_x = cx_min + for i, sx in enumerate(merged_xs): + merged_columns.append({ + "index": i, + "type": f"column_{i + 1}", + "x_min": prev_x, + "x_max": sx, + }) + prev_x = sx + merged_columns.append({ + "index": len(merged_xs), + "type": f"column_{len(merged_xs) + 1}", + "x_min": prev_x, + "x_max": cx_max, + }) + + # Re-build ALL content zones with merged columns + for zg in zone_grids: + pz = zg["pz"] + if pz.zone_type == "content": + grid = _build_zone_grid( + zg["words"], pz.x, pz.y, + pz.width, pz.height, + pz.index, img_w, img_h, + global_columns=merged_columns, + skip_first_row_header=bool(pz.image_overlays), + ) + zg["grid"] = grid + logger.info( + "build-grid session %s: union of %d content " + "zones -> %d merged columns (max single zone: %d)", + session_id, len(content_zones), + total_cols, max_zone_cols, + ) diff --git a/klausur-service/backend/grid/editor/__init__.py b/klausur-service/backend/grid/editor/__init__.py new file mode 100644 index 0000000..1745f51 --- /dev/null +++ b/klausur-service/backend/grid/editor/__init__.py @@ -0,0 +1,15 @@ +""" +Grid Editor sub-package — FastAPI endpoints and helper functions. + +Modules: + - api — barrel re-export (combined router + _build_grid_core) + - api_grid — build-grid, save-grid, get-grid endpoints + - api_gutter — gutter-repair endpoints + - api_box — build-box-grids endpoints + - api_unified — build-unified-grid endpoints + - helpers — barrel re-export of all helper symbols + - columns — column detection, cross-column splitting + - filters — word/zone filtering, border ghosts + - headers — header/heading detection, colspan detection + - zones — vertical dividers, zone splitting/merging +""" diff --git a/klausur-service/backend/grid/editor/api.py b/klausur-service/backend/grid/editor/api.py new file mode 100644 index 0000000..aef3113 --- /dev/null +++ b/klausur-service/backend/grid/editor/api.py @@ -0,0 +1,31 @@ +""" +Grid Editor API — barrel re-export. + +The actual endpoints live in: + - grid_editor_api_grid.py (build-grid, rerun-ocr, save-grid, get-grid) + - grid_editor_api_gutter.py (gutter-repair, gutter-repair/apply) + - grid_editor_api_box.py (build-box-grids) + - grid_editor_api_unified.py (build-unified-grid, unified-grid) + +This module re-exports the combined router and key symbols so that +existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core` +continue to work unchanged. 
+""" + +from fastapi import APIRouter + +from .api_grid import router as _grid_router +from .api_gutter import router as _gutter_router +from .api_box import router as _box_router +from .api_unified import router as _unified_router + +# Re-export _build_grid_core so callers that do +# `from grid_editor_api import _build_grid_core` keep working. +from grid.build.core import _build_grid_core # noqa: F401 + +# Merge all sub-routers into one combined router +router = APIRouter() +router.include_router(_grid_router) +router.include_router(_gutter_router) +router.include_router(_box_router) +router.include_router(_unified_router) diff --git a/klausur-service/backend/grid/editor/api_box.py b/klausur-service/backend/grid/editor/api_box.py new file mode 100644 index 0000000..94a5d0a --- /dev/null +++ b/klausur-service/backend/grid/editor/api_box.py @@ -0,0 +1,177 @@ +""" +Grid Editor API — box-grid-review endpoints. +""" + +import logging + +from fastapi import APIRouter, HTTPException, Request + +from .filters import _words_in_zone +from ocr_pipeline_session_store import ( + get_session_db, + update_session_db, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) + + +@router.post("/sessions/{session_id}/build-box-grids") +async def build_box_grids(session_id: str, request: Request): + """Rebuild grid structure for all detected boxes with layout-aware detection. + + Uses structure_result.boxes (from Step 7) as the source of box coordinates, + and raw_paddle_words as OCR word source. Creates or updates box zones in + the grid_editor_result. + + Optional body: { "overrides": { "0": "bullet_list" } } + Maps box_index -> forced layout_type. + """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + grid_data = session.get("grid_editor_result") + if not grid_data: + raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.") + + # Get raw OCR words (with top/left/width/height keys) + word_result = session.get("word_result") or {} + all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or [] + if not all_words: + raise HTTPException(status_code=400, detail="No raw OCR words available.") + + # Get detected boxes from structure_result + structure_result = session.get("structure_result") or {} + gt = session.get("ground_truth") or {} + if not structure_result: + structure_result = gt.get("structure_result") or {} + detected_boxes = structure_result.get("boxes") or [] + if not detected_boxes: + return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"} + + # Filter out false-positive boxes in header/footer margins. 
+ img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0) + if img_h_for_filter > 0: + margin_frac = 0.07 # 7% of image height + margin_top = img_h_for_filter * margin_frac + margin_bottom = img_h_for_filter * (1 - margin_frac) + filtered = [] + for box in detected_boxes: + by = box.get("y", 0) + bh = box.get("h", 0) + box_center_y = by + bh / 2 + if box_center_y < margin_top or box_center_y > margin_bottom: + logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)", + by, bh, box_center_y, margin_top, margin_bottom) + continue + filtered.append(box) + detected_boxes = filtered + + body = {} + try: + body = await request.json() + except Exception: + pass + layout_overrides = body.get("overrides", {}) + + from cv_box_layout import build_box_zone_grid + + img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0) + img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0) + + zones = grid_data.get("zones", []) + + # Find highest existing zone_index + max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1) + + # Remove old box zones (we'll rebuild them) + zones = [z for z in zones if z.get("zone_type") != "box"] + + box_count = 0 + spell_fixes = 0 + + for box_idx, box in enumerate(detected_boxes): + bx = box.get("x", 0) + by = box.get("y", 0) + bw = box.get("w", 0) + bh = box.get("h", 0) + + if bw <= 0 or bh <= 0: + continue + + # Filter raw OCR words inside this box + zone_words = _words_in_zone(all_words, by, bh, bx, bw) + if not zone_words: + logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh) + continue + + zone_idx = max_zone_idx + 1 + box_idx + forced_layout = layout_overrides.get(str(box_idx)) + + # Build box grid + box_grid = build_box_zone_grid( + zone_words, bx, by, bw, bh, + zone_idx, img_w, img_h, + layout_type=forced_layout, + ) + + # Apply SmartSpellChecker to all box cells + try: + from smart_spell import SmartSpellChecker + ssc = SmartSpellChecker() + for cell in box_grid.get("cells", []): + text = cell.get("text", "") + if not text: + continue + result = ssc.correct_text(text, lang="auto") + if result.changed: + cell["text"] = result.corrected + spell_fixes += 1 + except ImportError: + pass + + # Build zone entry + zone_entry = { + "zone_index": zone_idx, + "zone_type": "box", + "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh}, + "bbox_pct": { + "x": round(bx / img_w * 100, 2) if img_w else 0, + "y": round(by / img_h * 100, 2) if img_h else 0, + "w": round(bw / img_w * 100, 2) if img_w else 0, + "h": round(bh / img_h * 100, 2) if img_h else 0, + }, + "border": None, + "word_count": len(zone_words), + "columns": box_grid["columns"], + "rows": box_grid["rows"], + "cells": box_grid["cells"], + "header_rows": box_grid.get("header_rows", []), + "box_layout_type": box_grid.get("box_layout_type", "flowing"), + "box_grid_reviewed": False, + "box_bg_color": box.get("bg_color_name", ""), + "box_bg_hex": box.get("bg_color_hex", ""), + } + zones.append(zone_entry) + box_count += 1 + + # Sort zones by y-position for correct reading order + zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0)) + + grid_data["zones"] = zones + await update_session_db(session_id, grid_editor_result=grid_data) + + logger.info( + "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected", + session_id, box_count, spell_fixes, len(detected_boxes), + ) + + return { + "session_id": session_id, + 
"box_zones_rebuilt": box_count, + "total_detected_boxes": len(detected_boxes), + "spell_fixes": spell_fixes, + "zones": zones, + } diff --git a/klausur-service/backend/grid/editor/api_grid.py b/klausur-service/backend/grid/editor/api_grid.py new file mode 100644 index 0000000..685d789 --- /dev/null +++ b/klausur-service/backend/grid/editor/api_grid.py @@ -0,0 +1,334 @@ +""" +Grid Editor API — grid build, save, and retrieve endpoints. +""" + +import logging + +from fastapi import APIRouter, HTTPException, Query, Request + +from grid.build.core import _build_grid_core +from ocr_pipeline_session_store import ( + get_session_db, + update_session_db, +) +from ocr_pipeline_common import ( + _cache, + _load_session_to_cache, + _get_cached, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) + + +@router.post("/sessions/{session_id}/build-grid") +async def build_grid( + session_id: str, + ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), + syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), + enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"), + max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"), + min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"), +): + """Build a structured, zone-aware grid from existing Kombi word results. + + Requires that paddle-kombi or rapid-kombi has already been run on the session. + Uses the image for box detection and the word positions for grid structuring. + + Query params: + ipa_mode: "auto" (only when English IPA detected), "all" (force), "none" (skip) + syllable_mode: "auto" (only when original has dividers), "all" (force), "none" (skip) + + Returns a StructuredGrid with zones, each containing their own + columns, rows, and cells — ready for the frontend Excel-like editor. 
+ """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + try: + result = await _build_grid_core( + session_id, session, + ipa_mode=ipa_mode, syllable_mode=syllable_mode, + enhance=enhance, + max_columns=max_cols if max_cols > 0 else None, + min_conf=min_conf if min_conf > 0 else None, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + # Save automatic grid snapshot for later comparison with manual corrections + # Lazy import to avoid circular dependency with ocr_pipeline_regression + from ocr_pipeline_regression import _build_reference_snapshot + + wr = session.get("word_result") or {} + engine = wr.get("ocr_engine", "") + if engine in ("kombi", "rapid_kombi"): + auto_pipeline = "kombi" + elif engine == "paddle_direct": + auto_pipeline = "paddle-direct" + else: + auto_pipeline = "pipeline" + auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline) + + gt = session.get("ground_truth") or {} + gt["auto_grid_snapshot"] = auto_snapshot + + # Persist to DB and advance current_step to 11 (reconstruction complete) + await update_session_db(session_id, grid_editor_result=result, ground_truth=gt, current_step=11) + + logger.info( + "build-grid session %s: %d zones, %d cols, %d rows, %d cells, " + "%d boxes in %.2fs", + session_id, + len(result.get("zones", [])), + result.get("summary", {}).get("total_columns", 0), + result.get("summary", {}).get("total_rows", 0), + result.get("summary", {}).get("total_cells", 0), + result.get("boxes_detected", 0), + result.get("duration_seconds", 0), + ) + + return result + + +@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid") +async def rerun_ocr_and_build_grid( + session_id: str, + ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), + syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), + enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"), + max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"), + min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"), + vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"), + doc_category: str = Query("", description="Document type for Vision-LLM prompt context"), +): + """Re-run OCR with quality settings, then rebuild the grid. + + Unlike build-grid (which only rebuilds from existing words), + this endpoint re-runs the full OCR pipeline on the cropped image + with optional CLAHE enhancement, then builds the grid. + + Steps executed: Image Enhancement -> OCR -> Grid Build + """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + import time as _time + t0 = _time.time() + + # 1. Load the cropped/dewarped image from cache or session + if session_id not in _cache: + await _load_session_to_cache(session_id) + cached = _get_cached(session_id) + + dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr") + if dewarped_bgr is None: + raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.") + + img_h, img_w = dewarped_bgr.shape[:2] + ocr_input = dewarped_bgr.copy() + + # 2. 
Scan quality assessment + scan_quality_info = {} + try: + from scan_quality import score_scan_quality + quality_report = score_scan_quality(ocr_input) + scan_quality_info = quality_report.to_dict() + actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf + except Exception as e: + logger.warning(f"rerun-ocr: scan quality failed: {e}") + actual_min_conf = min_conf if min_conf > 0 else 40 + + # 3. Image enhancement (Step 3) + is_degraded = scan_quality_info.get("is_degraded", False) + if enhance and is_degraded: + try: + from ocr_image_enhance import enhance_for_ocr + ocr_input = enhance_for_ocr(ocr_input, is_degraded=True) + logger.info("rerun-ocr: CLAHE enhancement applied") + except Exception as e: + logger.warning(f"rerun-ocr: enhancement failed: {e}") + + # 4. Run dual-engine OCR + from PIL import Image + import pytesseract + + # RapidOCR + rapid_words = [] + try: + from cv_ocr_engines import ocr_region_rapid + from cv_vocab_types import PageRegion + full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h) + rapid_words = ocr_region_rapid(ocr_input, full_region) or [] + except Exception as e: + logger.warning(f"rerun-ocr: RapidOCR failed: {e}") + + # Tesseract + pil_img = Image.fromarray(ocr_input[:, :, ::-1]) + data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT) + tess_words = [] + for i in range(len(data["text"])): + text = (data["text"][i] or "").strip() + conf_raw = str(data["conf"][i]) + conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 + if not text or conf < actual_min_conf: + continue + tess_words.append({ + "text": text, "left": data["left"][i], "top": data["top"][i], + "width": data["width"][i], "height": data["height"][i], "conf": conf, + }) + + # 5. Merge OCR results + from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words + rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else [] + if rapid_split or tess_words: + merged_words = _merge_paddle_tesseract(rapid_split, tess_words) + merged_words = _deduplicate_words(merged_words) + else: + merged_words = tess_words + + # 6. Store updated word_result in session + cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"], + "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)} + for w in merged_words] + word_result = { + "cells": [{"text": " ".join(w["text"] for w in merged_words), + "word_boxes": cells_for_storage}], + "image_width": img_w, + "image_height": img_h, + "ocr_engine": "rapid_kombi", + "word_count": len(merged_words), + "raw_paddle_words": rapid_words, + } + # 6b. 
Vision-LLM Fusion (Step 4) — correct OCR using Vision model + vision_applied = False + if vision_fusion: + try: + from vision_ocr_fusion import vision_fuse_ocr + category = doc_category or session.get("document_category") or "vokabelseite" + logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})") + merged_words = await vision_fuse_ocr(ocr_input, merged_words, category) + vision_applied = True + # Rebuild storage from fused words + cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"], + "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)} + for w in merged_words] + word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words), + "word_boxes": cells_for_storage}] + word_result["word_count"] = len(merged_words) + word_result["ocr_engine"] = "vision_fusion" + except Exception as e: + logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}") + + await update_session_db(session_id, word_result=word_result) + + # Reload session with updated word_result + session = await get_session_db(session_id) + + ocr_duration = _time.time() - t0 + logger.info( + "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs " + "(enhance=%s, min_conf=%d, quality=%s)", + session_id, len(merged_words), len(rapid_words), len(tess_words), + len(merged_words), ocr_duration, enhance, actual_min_conf, + scan_quality_info.get("quality_pct", "?"), + ) + + # 7. Build grid from new words + try: + result = await _build_grid_core( + session_id, session, + ipa_mode=ipa_mode, syllable_mode=syllable_mode, + enhance=enhance, + max_columns=max_cols if max_cols > 0 else None, + min_conf=min_conf if min_conf > 0 else None, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + # Persist grid + await update_session_db(session_id, grid_editor_result=result, current_step=11) + + # Add quality info to response + result["scan_quality"] = scan_quality_info + result["ocr_stats"] = { + "rapid_words": len(rapid_words), + "tess_words": len(tess_words), + "merged_words": len(merged_words), + "min_conf_used": actual_min_conf, + "enhance_applied": enhance and is_degraded, + "vision_fusion_applied": vision_applied, + "document_category": doc_category or session.get("document_category", ""), + "ocr_duration_seconds": round(ocr_duration, 1), + } + + total_duration = _time.time() - t0 + logger.info( + "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs", + session_id, + len(result.get("zones", [])), + result.get("summary", {}).get("total_columns", 0), + result.get("summary", {}).get("total_cells", 0), + total_duration, + ) + + return result + + +@router.post("/sessions/{session_id}/save-grid") +async def save_grid(session_id: str, request: Request): + """Save edited grid data from the frontend Excel-like editor. + + Receives the full StructuredGrid with user edits (text changes, + formatting changes like bold columns, header rows, etc.) and + persists it to the session's grid_editor_result. 
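+
+    Minimal body shape (values hypothetical)::
+
+        {"zones": [...], "image_width": 2480, "image_height": 3508,
+         "summary": {...}, "formatting": {...}}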
+ """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + body = await request.json() + + # Validate basic structure + if "zones" not in body: + raise HTTPException(status_code=400, detail="Missing 'zones' in request body") + + # Preserve metadata from the original build + existing = session.get("grid_editor_result") or {} + result = { + "session_id": session_id, + "image_width": body.get("image_width", existing.get("image_width", 0)), + "image_height": body.get("image_height", existing.get("image_height", 0)), + "zones": body["zones"], + "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)), + "summary": body.get("summary", existing.get("summary", {})), + "formatting": body.get("formatting", existing.get("formatting", {})), + "duration_seconds": existing.get("duration_seconds", 0), + "edited": True, + } + + await update_session_db(session_id, grid_editor_result=result, current_step=11) + + logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"])) + + return {"session_id": session_id, "saved": True} + + +@router.get("/sessions/{session_id}/grid-editor") +async def get_grid(session_id: str): + """Retrieve the current grid editor state for a session.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + result = session.get("grid_editor_result") + if not result: + raise HTTPException( + status_code=404, + detail="No grid editor data. Run build-grid first.", + ) + + return result diff --git a/klausur-service/backend/grid/editor/api_gutter.py b/klausur-service/backend/grid/editor/api_gutter.py new file mode 100644 index 0000000..7dfbd9f --- /dev/null +++ b/klausur-service/backend/grid/editor/api_gutter.py @@ -0,0 +1,110 @@ +""" +Grid Editor API — gutter repair endpoints. +""" + +import logging + +from fastapi import APIRouter, HTTPException, Request + +from ocr_pipeline_session_store import ( + get_session_db, + update_session_db, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) + + +@router.post("/sessions/{session_id}/gutter-repair") +async def gutter_repair(session_id: str): + """Analyse grid for gutter-edge OCR errors and return repair suggestions. + + Detects: + - Words truncated/blurred at the book binding (spell_fix) + - Words split across rows with missing hyphen chars (hyphen_join) + """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + grid_data = session.get("grid_editor_result") + if not grid_data: + raise HTTPException( + status_code=400, + detail="No grid data. 
Run build-grid first.", + ) + + from cv_gutter_repair import analyse_grid_for_gutter_repair + + image_width = grid_data.get("image_width", 0) + result = analyse_grid_for_gutter_repair(grid_data, image_width=image_width) + + # Persist suggestions in ground_truth.gutter_repair (avoids DB migration) + gt = session.get("ground_truth") or {} + gt["gutter_repair"] = result + await update_session_db(session_id, ground_truth=gt) + + logger.info( + "gutter-repair session %s: %d suggestions in %.2fs", + session_id, + result.get("stats", {}).get("suggestions_found", 0), + result.get("duration_seconds", 0), + ) + + return result + + +@router.post("/sessions/{session_id}/gutter-repair/apply") +async def gutter_repair_apply(session_id: str, request: Request): + """Apply accepted gutter repair suggestions to the grid. + + Body: { "accepted": ["suggestion_id_1", "suggestion_id_2", ...] } + """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + grid_data = session.get("grid_editor_result") + if not grid_data: + raise HTTPException(status_code=400, detail="No grid data.") + + gt = session.get("ground_truth") or {} + gutter_result = gt.get("gutter_repair") + if not gutter_result: + raise HTTPException( + status_code=400, + detail="No gutter repair data. Run gutter-repair first.", + ) + + body = await request.json() + accepted_ids = body.get("accepted", []) + if not accepted_ids: + return {"applied_count": 0, "changes": []} + + # text_overrides: { suggestion_id: "alternative_text" } + # Allows the user to pick a different correction from the alternatives list + text_overrides = body.get("text_overrides", {}) + + from cv_gutter_repair import apply_gutter_suggestions + + suggestions = gutter_result.get("suggestions", []) + + # Apply user-selected alternatives before passing to apply + for s in suggestions: + sid = s.get("id", "") + if sid in text_overrides and text_overrides[sid]: + s["suggested_text"] = text_overrides[sid] + + result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions) + + # Save updated grid back to session + await update_session_db(session_id, grid_editor_result=grid_data) + + logger.info( + "gutter-repair/apply session %s: %d changes applied", + session_id, + result.get("applied_count", 0), + ) + + return result diff --git a/klausur-service/backend/grid/editor/api_unified.py b/klausur-service/backend/grid/editor/api_unified.py new file mode 100644 index 0000000..9ee83b8 --- /dev/null +++ b/klausur-service/backend/grid/editor/api_unified.py @@ -0,0 +1,71 @@ +""" +Grid Editor API — unified grid endpoints. +""" + +import logging + +from fastapi import APIRouter, HTTPException + +from ocr_pipeline_session_store import ( + get_session_db, + update_session_db, +) + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) + + +@router.post("/sessions/{session_id}/build-unified-grid") +async def build_unified_grid_endpoint(session_id: str): + """Build a single-zone unified grid merging content + box zones. + + Takes the existing multi-zone grid_editor_result and produces a + unified grid where boxes are integrated into the main row sequence. + Persists as unified_grid_result (preserves original multi-zone data). 
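+
+    Typical flow (both routes live on this router)::
+
+        POST /api/v1/ocr-pipeline/sessions/{session_id}/build-unified-grid
+        GET  /api/v1/ocr-pipeline/sessions/{session_id}/unified-grid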
+ """ + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + grid_data = session.get("grid_editor_result") + if not grid_data: + raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.") + + from unified_grid import build_unified_grid + + result = build_unified_grid( + zones=grid_data.get("zones", []), + image_width=grid_data.get("image_width", 0), + image_height=grid_data.get("image_height", 0), + layout_metrics=grid_data.get("layout_metrics", {}), + ) + + # Persist as separate field (don't overwrite original multi-zone grid) + await update_session_db(session_id, unified_grid_result=result) + + logger.info( + "build-unified-grid session %s: %d rows, %d cells", + session_id, + result.get("summary", {}).get("total_rows", 0), + result.get("summary", {}).get("total_cells", 0), + ) + + return result + + +@router.get("/sessions/{session_id}/unified-grid") +async def get_unified_grid(session_id: str): + """Retrieve the unified grid for a session.""" + session = await get_session_db(session_id) + if not session: + raise HTTPException(status_code=404, detail=f"Session {session_id} not found") + + result = session.get("unified_grid_result") + if not result: + raise HTTPException( + status_code=404, + detail="No unified grid. Run build-unified-grid first.", + ) + + return result diff --git a/klausur-service/backend/grid/editor/columns.py b/klausur-service/backend/grid/editor/columns.py new file mode 100644 index 0000000..6731798 --- /dev/null +++ b/klausur-service/backend/grid/editor/columns.py @@ -0,0 +1,492 @@ +""" +Grid Editor — column detection, cross-column splitting, marker merging. + +Split from grid_editor_helpers.py for maintainability. +All functions are pure computation — no HTTP, DB, or session side effects. + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import logging +import re +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Cross-column word splitting +# --------------------------------------------------------------------------- + +_spell_cache: Optional[Any] = None +_spell_loaded = False + + +def _is_recognized_word(text: str) -> bool: + """Check if *text* is a recognized German or English word. + + Uses the spellchecker library (same as cv_syllable_detect.py). + Returns True for real words like "oder", "Kabel", "Zeitung". + Returns False for OCR merge artifacts like "sichzie", "dasZimmer". + """ + global _spell_cache, _spell_loaded + if not text or len(text) < 2: + return False + + if not _spell_loaded: + _spell_loaded = True + try: + from spellchecker import SpellChecker + _spell_cache = SpellChecker(language="de") + except Exception: + pass + + if _spell_cache is None: + return False + + return text.lower() in _spell_cache + + +def _split_cross_column_words( + words: List[Dict], + columns: List[Dict], +) -> List[Dict]: + """Split word boxes that span across column boundaries. + + When OCR merges adjacent words from different columns (e.g. "sichzie" + spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary), + split the word box at the column boundary so each piece is assigned + to the correct column. 
+ + Only splits when: + - The word has significant overlap (>15% of its width) on both sides + - AND the word is not a recognized real word (OCR merge artifact), OR + the word contains a case transition (lowercase->uppercase) near the + boundary indicating two merged words like "dasZimmer". + """ + if len(columns) < 2: + return words + + # Column boundaries = midpoints between adjacent column edges + boundaries = [] + for i in range(len(columns) - 1): + boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2 + boundaries.append(boundary) + + new_words: List[Dict] = [] + split_count = 0 + for w in words: + w_left = w["left"] + w_width = w["width"] + w_right = w_left + w_width + text = (w.get("text") or "").strip() + + if not text or len(text) < 4 or w_width < 10: + new_words.append(w) + continue + + # Find the first boundary this word straddles significantly + split_boundary = None + for b in boundaries: + if w_left < b < w_right: + left_part = b - w_left + right_part = w_right - b + # Both sides must have at least 15% of the word width + if left_part > w_width * 0.15 and right_part > w_width * 0.15: + split_boundary = b + break + + if split_boundary is None: + new_words.append(w) + continue + + # Compute approximate split position in the text. + left_width = split_boundary - w_left + split_ratio = left_width / w_width + approx_pos = len(text) * split_ratio + + # Strategy 1: look for a case transition (lowercase->uppercase) near + # the approximate split point — e.g. "dasZimmer" splits at 'Z'. + split_char = None + search_lo = max(1, int(approx_pos) - 3) + search_hi = min(len(text), int(approx_pos) + 2) + for i in range(search_lo, search_hi): + if text[i - 1].islower() and text[i].isupper(): + split_char = i + break + + # Strategy 2: if no case transition, only split if the whole word + # is NOT a real word (i.e. it's an OCR merge artifact like "sichzie"). + # Real words like "oder", "Kabel", "Zeitung" must not be split. + if split_char is None: + clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct + if _is_recognized_word(clean): + new_words.append(w) + continue + # Not a real word — use floor of proportional position + split_char = max(1, min(len(text) - 1, int(approx_pos))) + + left_text = text[:split_char].rstrip() + right_text = text[split_char:].lstrip() + + if len(left_text) < 2 or len(right_text) < 2: + new_words.append(w) + continue + + right_width = w_width - round(left_width) + new_words.append({ + **w, + "text": left_text, + "width": round(left_width), + }) + new_words.append({ + **w, + "text": right_text, + "left": round(split_boundary), + "width": right_width, + }) + split_count += 1 + logger.info( + "split cross-column word %r -> %r + %r at boundary %.0f", + text, left_text, right_text, split_boundary, + ) + + if split_count: + logger.info("split %d cross-column word(s)", split_count) + return new_words + + +def _cluster_columns_by_alignment( + words: List[Dict], + zone_w: int, + rows: List[Dict], +) -> List[Dict[str, Any]]: + """Detect columns by clustering left-edge alignment across rows. + + Hybrid approach: + 1. Group words by row, find "group start" positions within each row + (words preceded by a large gap or first word in row) + 2. Cluster group-start left-edges by X-proximity across rows + 3. Filter by row coverage (how many rows have a group start here) + 4. Merge nearby clusters + 5. Build column boundaries + + This filters out mid-phrase word positions (e.g. 
IPA transcriptions, + second words in multi-word entries) by only considering positions + where a new word group begins within a row. + """ + if not words or not rows: + return [] + + total_rows = len(rows) + if total_rows == 0: + return [] + + # --- Group words by row --- + row_words: Dict[int, List[Dict]] = {} + for w in words: + y_center = w["top"] + w["height"] / 2 + best = min(rows, key=lambda r: abs(r["y_center"] - y_center)) + row_words.setdefault(best["index"], []).append(w) + + # --- Compute adaptive gap threshold for group-start detection --- + all_gaps: List[float] = [] + for ri, rw_list in row_words.items(): + sorted_rw = sorted(rw_list, key=lambda w: w["left"]) + for i in range(len(sorted_rw) - 1): + right = sorted_rw[i]["left"] + sorted_rw[i]["width"] + gap = sorted_rw[i + 1]["left"] - right + if gap > 0: + all_gaps.append(gap) + + if all_gaps: + sorted_gaps = sorted(all_gaps) + median_gap = sorted_gaps[len(sorted_gaps) // 2] + heights = [w["height"] for w in words if w.get("height", 0) > 0] + median_h = sorted(heights)[len(heights) // 2] if heights else 25 + + # For small word counts (boxes, sub-zones): PaddleOCR returns + # multi-word blocks, so ALL inter-word gaps are potential column + # boundaries. Use a low threshold based on word height — any gap + # wider than ~1x median word height is a column separator. + if len(words) <= 60: + gap_threshold = max(median_h * 1.0, 25) + logger.info( + "alignment columns (small zone): gap_threshold=%.0f " + "(median_h=%.0f, %d words, %d gaps: %s)", + gap_threshold, median_h, len(words), len(sorted_gaps), + [int(g) for g in sorted_gaps[:10]], + ) + else: + # Standard approach for large zones (full pages) + gap_threshold = max(median_gap * 3, median_h * 1.5, 30) + # Cap at 25% of zone width + max_gap = zone_w * 0.25 + if gap_threshold > max_gap > 30: + logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) + gap_threshold = max_gap + else: + gap_threshold = 50 + + # --- Find group-start positions (left-edges that begin a new column) --- + start_positions: List[tuple] = [] # (left_edge, row_index) + for ri, rw_list in row_words.items(): + sorted_rw = sorted(rw_list, key=lambda w: w["left"]) + # First word in row is always a group start + start_positions.append((sorted_rw[0]["left"], ri)) + for i in range(1, len(sorted_rw)): + right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"] + gap = sorted_rw[i]["left"] - right_prev + if gap >= gap_threshold: + start_positions.append((sorted_rw[i]["left"], ri)) + + start_positions.sort(key=lambda x: x[0]) + + logger.info( + "alignment columns: %d group-start positions from %d words " + "(gap_threshold=%.0f, %d rows)", + len(start_positions), len(words), gap_threshold, total_rows, + ) + + if not start_positions: + x_min = min(w["left"] for w in words) + x_max = max(w["left"] + w["width"] for w in words) + return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}] + + # --- Cluster group-start positions by X-proximity --- + tolerance = max(10, int(zone_w * 0.01)) + clusters: List[Dict[str, Any]] = [] + cur_edges = [start_positions[0][0]] + cur_rows = {start_positions[0][1]} + + for left, row_idx in start_positions[1:]: + if left - cur_edges[-1] <= tolerance: + cur_edges.append(left) + cur_rows.add(row_idx) + else: + clusters.append({ + "mean_x": int(sum(cur_edges) / len(cur_edges)), + "min_edge": min(cur_edges), + "max_edge": max(cur_edges), + "count": len(cur_edges), + "distinct_rows": len(cur_rows), + 
"row_coverage": len(cur_rows) / total_rows, + }) + cur_edges = [left] + cur_rows = {row_idx} + clusters.append({ + "mean_x": int(sum(cur_edges) / len(cur_edges)), + "min_edge": min(cur_edges), + "max_edge": max(cur_edges), + "count": len(cur_edges), + "distinct_rows": len(cur_rows), + "row_coverage": len(cur_rows) / total_rows, + }) + + # --- Filter by row coverage --- + # These thresholds must be high enough to avoid false columns in flowing + # text (random inter-word gaps) while still detecting real columns in + # vocabulary worksheets (which typically have >80% row coverage). + MIN_COVERAGE_PRIMARY = 0.35 + MIN_COVERAGE_SECONDARY = 0.12 + MIN_WORDS_SECONDARY = 4 + MIN_DISTINCT_ROWS = 3 + + # Content boundary for left-margin detection + content_x_min = min(w["left"] for w in words) + content_x_max = max(w["left"] + w["width"] for w in words) + content_span = content_x_max - content_x_min + + primary = [ + c for c in clusters + if c["row_coverage"] >= MIN_COVERAGE_PRIMARY + and c["distinct_rows"] >= MIN_DISTINCT_ROWS + ] + primary_ids = {id(c) for c in primary} + secondary = [ + c for c in clusters + if id(c) not in primary_ids + and c["row_coverage"] >= MIN_COVERAGE_SECONDARY + and c["count"] >= MIN_WORDS_SECONDARY + and c["distinct_rows"] >= MIN_DISTINCT_ROWS + ] + + # Tertiary: narrow left-margin columns (page refs, markers) that have + # too few rows for secondary but are clearly left-aligned and separated + # from the main content. These appear at the far left or far right and + # have a large gap to the nearest significant cluster. + used_ids = {id(c) for c in primary} | {id(c) for c in secondary} + sig_xs = [c["mean_x"] for c in primary + secondary] + + # Tertiary: clusters that are clearly to the LEFT of the first + # significant column (or RIGHT of the last). If words consistently + # start at a position left of the established first column boundary, + # they MUST be a separate column — regardless of how few rows they + # cover. The only requirement is a clear spatial gap. 
+ MIN_COVERAGE_TERTIARY = 0.02 # at least 1 row effectively + tertiary = [] + for c in clusters: + if id(c) in used_ids: + continue + if c["distinct_rows"] < 1: + continue + if c["row_coverage"] < MIN_COVERAGE_TERTIARY: + continue + # Must be near left or right content margin (within 15%) + rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5 + if not (rel_pos < 0.15 or rel_pos > 0.85): + continue + # Must have significant gap to nearest significant cluster + if sig_xs: + min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs) + if min_dist < max(30, content_span * 0.02): + continue + tertiary.append(c) + + if tertiary: + for c in tertiary: + logger.info( + " tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)", + c["mean_x"], c["min_edge"], c["max_edge"], + c["count"], c["distinct_rows"], c["row_coverage"] * 100, + ) + + significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"]) + + for c in significant: + logger.info( + " significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)", + c["mean_x"], c["min_edge"], c["max_edge"], + c["count"], c["distinct_rows"], c["row_coverage"] * 100, + ) + logger.info( + "alignment columns: %d clusters, %d primary, %d secondary -> %d significant", + len(clusters), len(primary), len(secondary), len(significant), + ) + + if not significant: + # Fallback: single column covering all content + x_min = min(w["left"] for w in words) + x_max = max(w["left"] + w["width"] for w in words) + return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}] + + # --- Merge nearby clusters --- + merge_distance = max(25, int(zone_w * 0.03)) + merged = [significant[0].copy()] + for s in significant[1:]: + if s["mean_x"] - merged[-1]["mean_x"] < merge_distance: + prev = merged[-1] + total = prev["count"] + s["count"] + prev["mean_x"] = ( + prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"] + ) // total + prev["count"] = total + prev["min_edge"] = min(prev["min_edge"], s["min_edge"]) + prev["max_edge"] = max(prev["max_edge"], s["max_edge"]) + prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"]) + else: + merged.append(s.copy()) + + logger.info( + "alignment columns: %d after merge (distance=%d)", + len(merged), merge_distance, + ) + + # --- Build column boundaries --- + margin = max(5, int(zone_w * 0.005)) + content_x_min = min(w["left"] for w in words) + content_x_max = max(w["left"] + w["width"] for w in words) + + columns: List[Dict[str, Any]] = [] + for i, cluster in enumerate(merged): + x_min = max(content_x_min, cluster["min_edge"] - margin) + if i + 1 < len(merged): + x_max = merged[i + 1]["min_edge"] - margin + else: + x_max = content_x_max + + columns.append({ + "index": i, + "type": f"column_{i + 1}" if len(merged) > 1 else "column_text", + "x_min": x_min, + "x_max": x_max, + }) + + return columns + + +_MARKER_CHARS = set("*-+#>") + + +def _merge_inline_marker_columns( + columns: List[Dict], + words: List[Dict], +) -> List[Dict]: + """Merge narrow marker columns (bullets, numbering) into adjacent text. + + Bullet points (*, -) and numbering (1., 2.) create narrow columns + at the left edge of a zone. These are inline markers that indent text, + not real separate columns. Merge them with their right neighbour. + + Does NOT merge columns containing alphabetic words like "to", "in", + "der", "die", "das" — those are legitimate content columns. 
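+
+    Example: a 40 px column holding only "*" and "-" merges into its
+    right neighbour (avg_len <= 2, zero alphabetic words); a 40 px
+    column of "to" / "in" is kept because its alpha ratio is 100 %.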
+ """ + if len(columns) < 2: + return columns + + merged: List[Dict] = [] + skip: set = set() + + for i, col in enumerate(columns): + if i in skip: + continue + + # Find words in this column + col_words = [ + w for w in words + if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"] + ] + col_width = col["x_max"] - col["x_min"] + + # Narrow column with mostly short words -> MIGHT be inline markers + if col_words and col_width < 80: + avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words) + if avg_len <= 2 and i + 1 < len(columns): + # Check if words are actual markers (symbols/numbers) vs + # real alphabetic words like "to", "in", "der", "die" + texts = [(w.get("text") or "").strip() for w in col_words] + alpha_count = sum( + 1 for t in texts + if t and t[0].isalpha() and t not in _MARKER_CHARS + ) + alpha_ratio = alpha_count / len(texts) if texts else 0 + + # If >=50% of words are alphabetic, this is a real column + if alpha_ratio >= 0.5: + logger.info( + " kept narrow column %d (w=%d, avg_len=%.1f, " + "alpha=%.0f%%) -- contains real words", + i, col_width, avg_len, alpha_ratio * 100, + ) + else: + # Merge into next column + next_col = columns[i + 1].copy() + next_col["x_min"] = col["x_min"] + merged.append(next_col) + skip.add(i + 1) + logger.info( + " merged inline marker column %d (w=%d, avg_len=%.1f) " + "into column %d", + i, col_width, avg_len, i + 1, + ) + continue + + merged.append(col) + + # Re-index + for i, col in enumerate(merged): + col["index"] = i + col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text" + + return merged diff --git a/klausur-service/backend/grid/editor/filters.py b/klausur-service/backend/grid/editor/filters.py new file mode 100644 index 0000000..c938569 --- /dev/null +++ b/klausur-service/backend/grid/editor/filters.py @@ -0,0 +1,402 @@ +""" +Grid Editor — word/zone filtering, border ghosts, decorative margins, footers. + +Split from grid_editor_helpers.py for maintainability. +All functions are pure computation — no HTTP, DB, or session side effects. + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import logging +from typing import Any, Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]: + """Remove page-border decoration strip words BEFORE column detection. + + Scans from each page edge inward to find the first significant x-gap + (>30 px). If the edge cluster contains <15 % of total words, those + words are removed as border-strip artifacts (alphabet letters, + illustration fragments). + + Must run BEFORE ``_build_zone_grid`` so that column detection only + sees real content words and doesn't produce inflated row counts. 
+ """ + if len(words) < 10: + return words, 0 + + sorted_words = sorted(words, key=lambda w: w.get("left", 0)) + total = len(sorted_words) + + # -- Left-edge scan (running max right-edge) -- + left_count = 0 + running_right = 0 + for gi in range(total - 1): + running_right = max( + running_right, + sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0), + ) + if sorted_words[gi + 1].get("left", 0) - running_right > 30: + left_count = gi + 1 + break + + # -- Right-edge scan (running min left) -- + right_count = 0 + running_left = sorted_words[-1].get("left", 0) + for gi in range(total - 1, 0, -1): + running_left = min(running_left, sorted_words[gi].get("left", 0)) + prev_right = ( + sorted_words[gi - 1].get("left", 0) + + sorted_words[gi - 1].get("width", 0) + ) + if running_left - prev_right > 30: + right_count = total - gi + break + + # Validate candidate strip: real border decorations are mostly short + # words (alphabet letters like "A", "Bb", stray marks). Multi-word + # content like "der Ranzen" or "die Schals" (continuation of German + # translations) must NOT be removed. + def _is_decorative_strip(candidates: List[Dict]) -> bool: + if not candidates: + return False + short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2) + return short / len(candidates) >= 0.45 + + strip_ids: set = set() + if left_count > 0 and left_count / total < 0.20: + candidates = sorted_words[:left_count] + if _is_decorative_strip(candidates): + strip_ids = {id(w) for w in candidates} + elif right_count > 0 and right_count / total < 0.20: + candidates = sorted_words[total - right_count:] + if _is_decorative_strip(candidates): + strip_ids = {id(w) for w in candidates} + + if not strip_ids: + return words, 0 + + return [w for w in words if id(w) not in strip_ids], len(strip_ids) + + +# Characters that are typically OCR artefacts from box border lines. +# Intentionally excludes ! (red markers) and . , ; (real punctuation). +_GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+") + + +def _filter_border_ghosts( + words: List[Dict], + boxes: List, +) -> tuple: + """Remove words sitting on box borders that are OCR artefacts. + + Returns (filtered_words, removed_count). 
+ """ + if not boxes or not words: + return words, 0 + + # Build border bands from detected boxes + x_bands: List[tuple] = [] + y_bands: List[tuple] = [] + for b in boxes: + bt = ( + b.border_thickness + if hasattr(b, "border_thickness") + else b.get("border_thickness", 3) + ) + # Skip borderless boxes (images/graphics) -- no border line to produce ghosts + if bt == 0: + continue + bx = b.x if hasattr(b, "x") else b.get("x", 0) + by = b.y if hasattr(b, "y") else b.get("y", 0) + bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0)) + bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0)) + margin = max(bt * 2, 10) + 6 + x_bands.append((bx - margin, bx + margin)) + x_bands.append((bx + bw - margin, bx + bw + margin)) + y_bands.append((by - margin, by + margin)) + y_bands.append((by + bh - margin, by + bh + margin)) + + def _is_ghost(w: Dict) -> bool: + text = (w.get("text") or "").strip() + if not text: + return False + # Check if any word edge (not just center) touches a border band + w_left = w["left"] + w_right = w["left"] + w["width"] + w_top = w["top"] + w_bottom = w["top"] + w["height"] + on_border = ( + any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands) + or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands) + ) + if not on_border: + return False + if len(text) == 1 and text in _GRID_GHOST_CHARS: + return True + return False + + filtered = [w for w in words if not _is_ghost(w)] + return filtered, len(words) - len(filtered) + + +def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]: + """Extract all word_boxes from cells into a flat list of word dicts.""" + words: List[Dict] = [] + for cell in cells: + for wb in cell.get("word_boxes") or []: + if wb.get("text", "").strip(): + words.append({ + "text": wb["text"], + "left": wb["left"], + "top": wb["top"], + "width": wb["width"], + "height": wb["height"], + "conf": wb.get("conf", 0), + }) + return words + + +def _words_in_zone( + words: List[Dict], + zone_y: int, + zone_h: int, + zone_x: int, + zone_w: int, +) -> List[Dict]: + """Filter words whose Y-center falls within a zone's bounds.""" + zone_y_end = zone_y + zone_h + zone_x_end = zone_x + zone_w + result = [] + for w in words: + cy = w["top"] + w["height"] / 2 + cx = w["left"] + w["width"] / 2 + if zone_y <= cy <= zone_y_end and zone_x <= cx <= zone_x_end: + result.append(w) + return result + + +def _get_content_bounds(words: List[Dict]) -> tuple: + """Get content bounds from word positions.""" + if not words: + return 0, 0, 0, 0 + x_min = min(w["left"] for w in words) + y_min = min(w["top"] for w in words) + x_max = max(w["left"] + w["width"] for w in words) + y_max = max(w["top"] + w["height"] for w in words) + return x_min, y_min, x_max - x_min, y_max - y_min + + +def _filter_decorative_margin( + words: List[Dict], + img_w: int, + log: Any, + session_id: str, +) -> Dict[str, Any]: + """Remove words that belong to a decorative alphabet strip on a margin. + + Some vocabulary worksheets have a vertical A-Z alphabet graphic along + the left or right edge. OCR reads each letter as an isolated single- + character word. These decorative elements are not content and confuse + column/row detection. 
+
+    Detection criteria (phase 1 -- find the strip using short words):
+      - Candidate words are in the outer 30% of the page (left or right)
+      - Candidate words are at most 2 characters long (letters or digits)
+      - At least 6 such words form a vertical strip (>=6 distinct Y buckets)
+      - The strip is horizontally compact (left edges span <= 80 px)
+
+    Phase 2 -- once a strip is confirmed, also remove any short word (<=3
+    chars) in the same narrow x-range. This catches multi-char OCR
+    artifacts like "Vv" that belong to the same decorative element.
+
+    Modifies *words* in place.
+
+    Returns:
+        Dict with 'found' (bool), 'side' (str), 'letters_detected' (int).
+    """
+    no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0}
+    if not words or img_w <= 0:
+        return no_strip
+
+    margin_cutoff = img_w * 0.30
+    # Phase 1: find candidate strips using short words (1-2 chars).
+    # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb")
+    # rather than singles, so accept <=2-char words as strip candidates.
+    left_strip = [
+        w for w in words
+        if len((w.get("text") or "").strip()) <= 2
+        and w["left"] + w.get("width", 0) / 2 < margin_cutoff
+    ]
+    right_strip = [
+        w for w in words
+        if len((w.get("text") or "").strip()) <= 2
+        and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
+    ]
+
+    for strip, side in [(left_strip, "left"), (right_strip, "right")]:
+        if len(strip) < 6:
+            continue
+        # Check vertical distribution: should have many distinct Y positions
+        y_centers = sorted(set(
+            int(w["top"] + w.get("height", 0) / 2) // 20 * 20  # bucket
+            for w in strip
+        ))
+        if len(y_centers) < 6:
+            continue
+        # Check horizontal compactness
+        x_positions = [w["left"] for w in strip]
+        x_min = min(x_positions)
+        x_max = max(x_positions)
+        x_spread = x_max - x_min
+        if x_spread > 80:
+            continue
+
+        # Phase 2: strip confirmed -- also collect short words in same x-range
+        # Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U")
+        strip_x_lo = x_min - 20
+        strip_x_hi = x_max + 60  # word width + tolerance
+        all_strip_words = [
+            w for w in words
+            if len((w.get("text") or "").strip()) <= 3
+            and strip_x_lo <= w["left"] <= strip_x_hi
+            and (w["left"] + w.get("width", 0) / 2 < margin_cutoff
+                 if side == "left"
+                 else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff)
+        ]
+
+        strip_set = set(id(w) for w in all_strip_words)
+        before = len(words)
+        words[:] = [w for w in words if id(w) not in strip_set]
+        removed = before - len(words)
+        if removed:
+            log.info(
+                "build-grid session %s: removed %d decorative %s-margin words "
+                "(strip x=%d-%d)",
+                session_id, removed, side, strip_x_lo, strip_x_hi,
+            )
+        return {"found": True, "side": side, "letters_detected": len(strip)}
+
+    return no_strip
+
+
+def _filter_footer_words(
+    words: List[Dict],
+    img_h: int,
+    log: Any,
+    session_id: str,
+) -> Optional[Dict]:
+    """Remove isolated words in the bottom 5% of the page (page numbers).
+
+    Modifies *words* in place and returns a page_number metadata dict
+    if a page number was extracted, or None.
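+
+    Illustrative return value for a lone page number "17" near the
+    bottom edge (numbers made up):
+
+        {"text": "17", "y_pct": 96.3, "number": 17}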
+ """ + if not words or img_h <= 0: + return None + footer_y = img_h * 0.95 + footer_words = [ + w for w in words + if w["top"] + w.get("height", 0) / 2 > footer_y + ] + if not footer_words: + return None + # Only remove if footer has very few words (<= 3) with short text + total_text = "".join((w.get("text") or "").strip() for w in footer_words) + if len(footer_words) <= 3 and len(total_text) <= 10: + # Extract page number metadata before removing + page_number_info = { + "text": total_text.strip(), + "y_pct": round(footer_words[0]["top"] / img_h * 100, 1), + } + # Try to parse as integer + digits = "".join(c for c in total_text if c.isdigit()) + if digits: + page_number_info["number"] = int(digits) + + footer_set = set(id(w) for w in footer_words) + words[:] = [w for w in words if id(w) not in footer_set] + log.info( + "build-grid session %s: extracted page number '%s' and removed %d footer words", + session_id, total_text, len(footer_words), + ) + return page_number_info + return None + + +def _filter_header_junk( + words: List[Dict], + img_h: int, + log: Any, + session_id: str, +) -> None: + """Remove OCR junk from header illustrations above the real content. + + Textbook pages often have decorative header graphics (illustrations, + icons) that OCR reads as low-confidence junk characters. Real content + typically starts further down the page. + + Algorithm: + 1. Find the "content start" -- the first Y position where a dense + horizontal row of 3+ high-confidence words begins. + 2. Above that line, remove words with conf < 75 and text <= 3 chars. + These are almost certainly OCR artifacts from illustrations. + + Modifies *words* in place. + """ + if not words or img_h <= 0: + return + + # --- Find content start: first horizontal row with >=3 high-conf words --- + # Sort words by Y + sorted_by_y = sorted(words, key=lambda w: w["top"]) + content_start_y = 0 + _ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row + _MIN_ROW_WORDS = 3 + _MIN_CONF = 80 + + i = 0 + while i < len(sorted_by_y): + row_y = sorted_by_y[i]["top"] + # Collect words in this row band + row_words = [] + j = i + while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE: + row_words.append(sorted_by_y[j]) + j += 1 + # Count high-confidence words with real text (> 1 char) + high_conf = [ + w for w in row_words + if w.get("conf", 0) >= _MIN_CONF + and len((w.get("text") or "").strip()) > 1 + ] + if len(high_conf) >= _MIN_ROW_WORDS: + content_start_y = row_y + break + i = j if j > i else i + 1 + + if content_start_y <= 0: + return # no clear content start found + + # --- Remove low-conf short junk above content start --- + junk = [ + w for w in words + if w["top"] + w.get("height", 0) < content_start_y + and w.get("conf", 0) < 75 + and len((w.get("text") or "").strip()) <= 3 + ] + if not junk: + return + + junk_set = set(id(w) for w in junk) + before = len(words) + words[:] = [w for w in words if id(w) not in junk_set] + removed = before - len(words) + if removed: + log.info( + "build-grid session %s: removed %d header junk words above y=%d " + "(content start)", + session_id, removed, content_start_y, + ) diff --git a/klausur-service/backend/grid/editor/headers.py b/klausur-service/backend/grid/editor/headers.py new file mode 100644 index 0000000..6d6cb43 --- /dev/null +++ b/klausur-service/backend/grid/editor/headers.py @@ -0,0 +1,499 @@ +""" +Grid Editor — header/heading detection and colspan (merged cell) detection. +Split from grid_editor_helpers.py. 
Pure computation, no HTTP/DB side effects. +Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import logging +import re +from typing import Dict, List, Optional + +from cv_ocr_engines import _text_has_garbled_ipa + +logger = logging.getLogger(__name__) + + +def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int: + """Detect heading rows by color + height after color annotation. + + A row is a heading if: + 1. ALL word_boxes have color_name != 'black' (typically 'blue') + 2. Mean word height > 1.2x median height of all words in the zone + + Detected heading rows are merged into a single spanning cell. + Returns count of headings detected. + """ + heading_count = 0 + + for z in zones_data: + cells = z.get("cells", []) + rows = z.get("rows", []) + columns = z.get("columns", []) + if not cells or not rows or len(columns) < 2: + continue + + # Compute median word height across the zone + all_heights = [] + for cell in cells: + for wb in cell.get("word_boxes") or []: + h = wb.get("height", 0) + if h > 0: + all_heights.append(h) + if not all_heights: + continue + all_heights_sorted = sorted(all_heights) + median_h = all_heights_sorted[len(all_heights_sorted) // 2] + + heading_row_indices = [] + for row in rows: + if row.get("is_header"): + continue # already detected as header + ri = row["index"] + row_cells = [c for c in cells if c.get("row_index") == ri] + row_wbs = [ + wb for cell in row_cells + for wb in cell.get("word_boxes") or [] + ] + if not row_wbs: + continue + + # Condition 1: ALL words are non-black + all_colored = all( + wb.get("color_name", "black") != "black" + for wb in row_wbs + ) + if not all_colored: + continue + + # Condition 2: mean height > 1.2x median + mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs) + if mean_h <= median_h * 1.2: + continue + + heading_row_indices.append(ri) + + # Merge heading cells into spanning cells + for hri in heading_row_indices: + header_cells = [c for c in cells if c.get("row_index") == hri] + if len(header_cells) <= 1: + # Single cell -- just mark it as heading + if header_cells: + header_cells[0]["col_type"] = "heading" + heading_count += 1 + # Mark row as header + for row in rows: + if row["index"] == hri: + row["is_header"] = True + continue + + # Collect all word_boxes and text from all columns + all_wb = [] + all_text_parts = [] + for hc in sorted(header_cells, key=lambda c: c["col_index"]): + all_wb.extend(hc.get("word_boxes", [])) + if hc.get("text", "").strip(): + all_text_parts.append(hc["text"].strip()) + + # Remove all cells for this row, replace with one spanning cell + z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] + + if all_wb: + x_min = min(wb["left"] for wb in all_wb) + y_min = min(wb["top"] for wb in all_wb) + x_max = max(wb["left"] + wb["width"] for wb in all_wb) + y_max = max(wb["top"] + wb["height"] for wb in all_wb) + + # Use the actual starting col_index from the first cell + first_col = min(hc["col_index"] for hc in header_cells) + zone_idx = z.get("zone_index", 0) + z["cells"].append({ + "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}", + "zone_index": zone_idx, + "row_index": hri, + "col_index": first_col, + "col_type": "heading", + "text": " ".join(all_text_parts), + "confidence": 0.0, + "bbox_px": {"x": x_min, "y": y_min, + "w": x_max - x_min, "h": y_max - y_min}, + "bbox_pct": { + "x": round(x_min / img_w * 100, 2) if img_w else 0, + "y": round(y_min / img_h * 100, 2) if img_h else 0, + "w": round((x_max - x_min) / img_w 
* 100, 2) if img_w else 0,
+                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
+                    },
+                    "word_boxes": all_wb,
+                    "ocr_engine": "words_first",
+                    "is_bold": True,
+                })
+
+            # Mark row as header
+            for row in rows:
+                if row["index"] == hri:
+                    row["is_header"] = True
+            heading_count += 1
+
+    return heading_count
+
+
+def _detect_heading_rows_by_single_cell(
+    zones_data: List[Dict], img_w: int, img_h: int,
+) -> int:
+    """Detect heading rows that have only a single content cell.
+
+    Black headings like "Theme" have normal color and height, so they are
+    missed by ``_detect_heading_rows_by_color``. The distinguishing signal
+    is that they occupy only one column while normal vocabulary rows fill
+    at least 2-3 columns.
+
+    A row qualifies as a heading if:
+      1. It is not already marked as a header/heading.
+      2. It has exactly ONE cell whose col_type starts with ``column_``
+         (excluding column_1 / page_ref, which only carry page numbers).
+      3. That single cell is NOT in the last column (continuation/example
+         lines like "2. Veränderung, Wechsel" often sit alone in column_4).
+      4. The text does not start with ``[`` (IPA continuation).
+      5. The zone has >=3 columns and >=5 rows (avoids false positives in
+         tiny zones).
+      6. At least 40 % of the zone's rows have >=2 content cells (ensures
+         we are in a multi-column vocab layout).
+    """
+    heading_count = 0
+
+    for z in zones_data:
+        cells = z.get("cells", [])
+        rows = z.get("rows", [])
+        columns = z.get("columns", [])
+        if len(columns) < 3 or len(rows) < 5:
+            continue
+
+        # Determine the last col_index (example/sentence column)
+        col_indices = sorted(set(c.get("col_index", 0) for c in cells))
+        if not col_indices:
+            continue
+        last_col = col_indices[-1]
+
+        # Count content cells per row (column_* but not column_1/page_ref).
+        # Exception: column_1 cells that contain a dictionary article word
+        # (die/der/das etc.) ARE content -- they appear in dictionary layouts
+        # where the leftmost column holds grammatical articles.
+        _ARTICLE_WORDS = {
+            "die", "der", "das", "dem", "den", "des", "ein", "eine",
+            "the", "a", "an",
+        }
+        row_content_counts: Dict[int, int] = {}
+        for cell in cells:
+            ct = cell.get("col_type", "")
+            if not ct.startswith("column_"):
+                continue
+            if ct == "column_1":
+                ctext = (cell.get("text") or "").strip().lower()
+                if ctext not in _ARTICLE_WORDS:
+                    continue
+            ri = cell.get("row_index", -1)
+            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
+
+        # At least 40% of rows must have >=2 content cells
+        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
+        if multi_col_rows < len(rows) * 0.4:
+            continue
+
+        # Exclude first and last non-header rows -- these are typically
+        # page numbers or footer text, not headings.
+        non_header_rows = [r for r in rows if not r.get("is_header")]
+        if len(non_header_rows) < 3:
+            continue
+        first_ri = non_header_rows[0]["index"]
+        last_ri = non_header_rows[-1]["index"]
+
+        heading_row_indices = []
+        for row in rows:
+            if row.get("is_header"):
+                continue
+            ri = row["index"]
+            if ri == first_ri or ri == last_ri:
+                continue
+            row_cells = [c for c in cells if c.get("row_index") == ri]
+            content_cells = [
+                c for c in row_cells
+                if c.get("col_type", "").startswith("column_")
+                and (c.get("col_type") != "column_1"
+                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
+            ]
+            if len(content_cells) != 1:
+                continue
+            cell = content_cells[0]
+            # Not in the last column (continuation/example lines)
+            if cell.get("col_index") == last_col:
+                continue
+            text = (cell.get("text") or "").strip()
+            if not text or text.startswith("["):
+                continue
+            # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)"
+            if text.startswith("("):
+                continue
+            # Single cell NOT in the first content column is likely a
+            # continuation/overflow line, not a heading. Real headings
+            # ("Theme 1", "Unit 3: ...") appear in the first or second
+            # content column.
+            first_content_col = col_indices[0] if col_indices else 0
+            if cell.get("col_index", 0) > first_content_col + 1:
+                continue
+            # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz")
+            # but NOT text with real IPA symbols (e.g. "Theme [θˈiːm]")
+            _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
+            if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text):
+                continue
+            # Guard: dictionary section headings are short (1-4 alpha chars
+            # like "A", "Ab", "Zi", "Sch"). Longer text that starts
+            # lowercase is a regular vocabulary word (e.g. "zentral") that
+            # happens to appear alone in its row.
+            alpha_only = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ]', '', text)
+            if len(alpha_only) > 4 and text[0].islower():
+                continue
+            heading_row_indices.append(ri)
+
+        # Guard: if >25% of eligible rows would become headings, the
+        # heuristic is misfiring (e.g. sparse single-column layout where
+        # most rows naturally have only 1 content cell).
+ eligible_rows = len(non_header_rows) - 2 # minus first/last excluded + if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25: + logger.debug( + "Skipping single-cell heading detection for zone %s: " + "%d/%d rows would be headings (>25%%)", + z.get("zone_index"), len(heading_row_indices), eligible_rows, + ) + continue + + for hri in heading_row_indices: + header_cells = [c for c in cells if c.get("row_index") == hri] + if not header_cells: + continue + + # Collect all word_boxes and text + all_wb = [] + all_text_parts = [] + for hc in sorted(header_cells, key=lambda c: c["col_index"]): + all_wb.extend(hc.get("word_boxes", [])) + if hc.get("text", "").strip(): + all_text_parts.append(hc["text"].strip()) + + first_col_idx = min(hc["col_index"] for hc in header_cells) + + # Remove old cells for this row, add spanning heading cell + z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] + + if all_wb: + x_min = min(wb["left"] for wb in all_wb) + y_min = min(wb["top"] for wb in all_wb) + x_max = max(wb["left"] + wb["width"] for wb in all_wb) + y_max = max(wb["top"] + wb["height"] for wb in all_wb) + else: + # Fallback to first cell bbox + bp = header_cells[0].get("bbox_px", {}) + x_min = bp.get("x", 0) + y_min = bp.get("y", 0) + x_max = x_min + bp.get("w", 0) + y_max = y_min + bp.get("h", 0) + + zone_idx = z.get("zone_index", 0) + z["cells"].append({ + "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}", + "zone_index": zone_idx, + "row_index": hri, + "col_index": first_col_idx, + "col_type": "heading", + "text": " ".join(all_text_parts), + "confidence": 0.0, + "bbox_px": {"x": x_min, "y": y_min, + "w": x_max - x_min, "h": y_max - y_min}, + "bbox_pct": { + "x": round(x_min / img_w * 100, 2) if img_w else 0, + "y": round(y_min / img_h * 100, 2) if img_h else 0, + "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, + "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, + }, + "word_boxes": all_wb, + "ocr_engine": "words_first", + "is_bold": False, + }) + + for row in rows: + if row["index"] == hri: + row["is_header"] = True + heading_count += 1 + + return heading_count + + +def _detect_header_rows( + rows: List[Dict], + zone_words: List[Dict], + zone_y: int, + columns: Optional[List[Dict]] = None, + skip_first_row_header: bool = False, +) -> List[int]: + """Detect header rows: first-row heuristic + spanning header detection. + + A "spanning header" is a row whose words stretch across multiple column + boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns). 
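+
+    Sketch of the call (``zone_y`` and ``columns`` are accepted but not
+    used by the currently active heuristics):
+
+        header_rows = _detect_header_rows(rows, zone_words, zone_y)
+        # -> [0] if the first row is gap-separated or noticeably taller
+        #    than the median word height, else []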
+ """ + if len(rows) < 2: + return [] + + headers = [] + + if not skip_first_row_header: + first_row = rows[0] + second_row = rows[1] + + # Gap between first and second row > 0.5x average row height + avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) + gap = second_row["y_min"] - first_row["y_max"] + if gap > avg_h * 0.5: + headers.append(0) + + # Also check if first row words are taller than average (bold/header text) + all_heights = [w["height"] for w in zone_words] + median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 + first_row_words = [ + w for w in zone_words + if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"] + ] + if first_row_words: + first_h = max(w["height"] for w in first_row_words) + if first_h > median_h * 1.3: + if 0 not in headers: + headers.append(0) + + # Note: Spanning-header detection (rows spanning all columns) has been + # disabled because it produces too many false positives on vocabulary + # worksheets where IPA transcriptions or short entries naturally span + # multiple columns with few words. The first-row heuristic above is + # sufficient for detecting real headers. + + return headers + + +def _detect_colspan_cells( + zone_words: List[Dict], + columns: List[Dict], + rows: List[Dict], + cells: List[Dict], + img_w: int, + img_h: int, +) -> List[Dict]: + """Detect and merge cells that span multiple columns (colspan). + + A word-block (PaddleOCR phrase) that extends significantly past a column + boundary into the next column indicates a merged cell. This replaces + the incorrectly split cells with a single cell spanning multiple columns. + + Works for both full-page scans and box zones. + """ + if len(columns) < 2 or not zone_words or not rows: + return cells + + from cv_words_first import _assign_word_to_row + + # Column boundaries (midpoints between adjacent columns) + col_boundaries = [] + for ci in range(len(columns) - 1): + col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2) + + def _cols_covered(w_left: float, w_right: float) -> List[int]: + """Return list of column indices that a word-block covers.""" + covered = [] + for col in columns: + col_mid = (col["x_min"] + col["x_max"]) / 2 + # Word covers a column if it extends past the column's midpoint + if w_left < col_mid < w_right: + covered.append(col["index"]) + # Also include column if word starts within it + elif col["x_min"] <= w_left < col["x_max"]: + covered.append(col["index"]) + return sorted(set(covered)) + + # Group original word-blocks by row + row_word_blocks: Dict[int, List[Dict]] = {} + for w in zone_words: + ri = _assign_word_to_row(w, rows) + row_word_blocks.setdefault(ri, []).append(w) + + # For each row, check if any word-block spans multiple columns + rows_to_merge: Dict[int, List[Dict]] = {} # row_index -> list of spanning word-blocks + + for ri, wblocks in row_word_blocks.items(): + spanning = [] + for w in wblocks: + w_left = w["left"] + w_right = w_left + w["width"] + covered = _cols_covered(w_left, w_right) + if len(covered) >= 2: + spanning.append({"word": w, "cols": covered}) + if spanning: + rows_to_merge[ri] = spanning + + if not rows_to_merge: + return cells + + # Merge cells for spanning rows + new_cells = [] + for cell in cells: + ri = cell.get("row_index", -1) + if ri not in rows_to_merge: + new_cells.append(cell) + continue + + # Check if this cell's column is part of a spanning block + ci = cell.get("col_index", -1) + is_part_of_span = False + for span in rows_to_merge[ri]: + if ci in 
span["cols"]: + is_part_of_span = True + # Only emit the merged cell for the FIRST column in the span + if ci == span["cols"][0]: + # Use the ORIGINAL word-block text (not the split cell texts + # which may have broken words like "euros a" + "nd cents") + orig_word = span["word"] + merged_text = orig_word.get("text", "").strip() + all_wb = [orig_word] + + # Compute merged bbox + if all_wb: + x_min = min(wb["left"] for wb in all_wb) + y_min = min(wb["top"] for wb in all_wb) + x_max = max(wb["left"] + wb["width"] for wb in all_wb) + y_max = max(wb["top"] + wb["height"] for wb in all_wb) + else: + x_min = y_min = x_max = y_max = 0 + + new_cells.append({ + "cell_id": cell["cell_id"], + "row_index": ri, + "col_index": span["cols"][0], + "col_type": "spanning_header", + "colspan": len(span["cols"]), + "text": merged_text, + "confidence": cell.get("confidence", 0), + "bbox_px": {"x": x_min, "y": y_min, + "w": x_max - x_min, "h": y_max - y_min}, + "bbox_pct": { + "x": round(x_min / img_w * 100, 2) if img_w else 0, + "y": round(y_min / img_h * 100, 2) if img_h else 0, + "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, + "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, + }, + "word_boxes": all_wb, + "ocr_engine": cell.get("ocr_engine", ""), + "is_bold": cell.get("is_bold", False), + }) + logger.info( + "colspan detected: row %d, cols %s -> merged %d cells (%r)", + ri, span["cols"], len(span["cols"]), merged_text[:50], + ) + break + if not is_part_of_span: + new_cells.append(cell) + + return new_cells diff --git a/klausur-service/backend/grid/editor/helpers.py b/klausur-service/backend/grid/editor/helpers.py new file mode 100644 index 0000000..209e7ff --- /dev/null +++ b/klausur-service/backend/grid/editor/helpers.py @@ -0,0 +1,58 @@ +""" +Grid Editor helper functions — barrel re-export module. + +This file re-exports all public symbols from the split sub-modules +so that existing ``from grid_editor_helpers import ...`` statements +continue to work without changes. + +Sub-modules: + - columns — column detection, cross-column splitting, marker merging + - filters — word/zone filtering, border ghosts, decorative margins + - headers — header/heading detection, colspan detection + - zones — vertical dividers, zone splitting/merging, zone grid building + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. 
+""" + +# --- Re-export: columns --------------------------------------------------- +from .columns import ( # noqa: F401 + _is_recognized_word, + _split_cross_column_words, + _cluster_columns_by_alignment, + _MARKER_CHARS, + _merge_inline_marker_columns, +) + +# --- Re-export: filters ---------------------------------------------------- +from .filters import ( # noqa: F401 + _filter_border_strip_words, + _GRID_GHOST_CHARS, + _filter_border_ghosts, + _flatten_word_boxes, + _words_in_zone, + _get_content_bounds, + _filter_decorative_margin, + _filter_footer_words, + _filter_header_junk, +) + +# --- Re-export: headers ---------------------------------------------------- +from .headers import ( # noqa: F401 + _detect_heading_rows_by_color, + _detect_heading_rows_by_single_cell, + _detect_header_rows, + _detect_colspan_cells, +) + +# --- Re-export: zones ------------------------------------------------------- +from .zones import ( # noqa: F401 + _PIPE_RE_VSPLIT, + _detect_vertical_dividers, + _split_zone_at_vertical_dividers, + _merge_content_zones_across_boxes, + _build_zone_grid, +) + +# --- Re-export from cv_words_first (used by cv_box_layout.py) --------------- +from cv_words_first import _cluster_rows # noqa: F401 diff --git a/klausur-service/backend/grid/editor/zones.py b/klausur-service/backend/grid/editor/zones.py new file mode 100644 index 0000000..e77a1c5 --- /dev/null +++ b/klausur-service/backend/grid/editor/zones.py @@ -0,0 +1,389 @@ +""" +Grid Editor — vertical divider detection, zone splitting/merging, zone grid building. + +Split from grid_editor_helpers.py for maintainability. +All functions are pure computation — no HTTP, DB, or session side effects. + +Lizenz: Apache 2.0 (kommerziell nutzbar) +DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. +""" + +import logging +import re +from typing import Any, Dict, List, Optional + +from cv_vocab_types import PageZone +from cv_words_first import _cluster_rows, _build_cells + +from .columns import ( + _cluster_columns_by_alignment, + _merge_inline_marker_columns, + _split_cross_column_words, +) +from .headers import ( + _detect_header_rows, + _detect_colspan_cells, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Vertical divider detection and zone splitting +# --------------------------------------------------------------------------- + +_PIPE_RE_VSPLIT = re.compile(r"^\|+$") + + +def _detect_vertical_dividers( + words: List[Dict], + zone_x: int, + zone_w: int, + zone_y: int, + zone_h: int, +) -> List[float]: + """Detect vertical divider lines from pipe word_boxes at consistent x. + + Returns list of divider x-positions (empty if no dividers found). 
+ """ + if not words or zone_w <= 0 or zone_h <= 0: + return [] + + # Collect pipe word_boxes + pipes = [ + w for w in words + if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) + ] + if len(pipes) < 5: + return [] + + # Cluster pipe x-centers by proximity + tolerance = max(15, int(zone_w * 0.02)) + pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes) + + clusters: List[List[float]] = [[pipe_xs[0]]] + for x in pipe_xs[1:]: + if x - clusters[-1][-1] <= tolerance: + clusters[-1].append(x) + else: + clusters.append([x]) + + dividers: List[float] = [] + for cluster in clusters: + if len(cluster) < 5: + continue + mean_x = sum(cluster) / len(cluster) + # Must be between 15% and 85% of zone width + rel_pos = (mean_x - zone_x) / zone_w + if rel_pos < 0.15 or rel_pos > 0.85: + continue + # Check vertical coverage: pipes must span >= 50% of zone height + cluster_pipes = [ + w for w in pipes + if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance + ] + ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes] + y_span = max(ys) - min(ys) if ys else 0 + if y_span < zone_h * 0.5: + continue + dividers.append(mean_x) + + return sorted(dividers) + + +def _split_zone_at_vertical_dividers( + zone: "PageZone", + divider_xs: List[float], + vsplit_group_id: int, +) -> List["PageZone"]: + """Split a PageZone at vertical divider positions into sub-zones.""" + boundaries = [zone.x] + divider_xs + [zone.x + zone.width] + hints = [] + for i in range(len(boundaries) - 1): + if i == 0: + hints.append("left_of_vsplit") + elif i == len(boundaries) - 2: + hints.append("right_of_vsplit") + else: + hints.append("middle_of_vsplit") + + sub_zones = [] + for i in range(len(boundaries) - 1): + x_start = int(boundaries[i]) + x_end = int(boundaries[i + 1]) + sub = PageZone( + index=0, # re-indexed later + zone_type=zone.zone_type, + y=zone.y, + height=zone.height, + x=x_start, + width=x_end - x_start, + box=zone.box, + image_overlays=zone.image_overlays, + layout_hint=hints[i], + vsplit_group=vsplit_group_id, + ) + sub_zones.append(sub) + + return sub_zones + + +def _merge_content_zones_across_boxes( + zones: List, + content_x: int, + content_w: int, +) -> List: + """Merge content zones separated by box zones into single zones. + + Box zones become image_overlays on the merged content zone. + Pattern: [content, box*, content] -> [merged_content with overlay] + Box zones NOT between two content zones stay as standalone zones. + """ + if len(zones) < 3: + return zones + + # Group consecutive runs of [content, box+, content] + result: List = [] + i = 0 + while i < len(zones): + z = zones[i] + if z.zone_type != "content": + result.append(z) + i += 1 + continue + + # Start of a potential merge group: content zone + group_contents = [z] + group_boxes = [] + j = i + 1 + # Absorb [box, content] pairs -- only absorb a box if it's + # confirmed to be followed by another content zone. 
+ while j < len(zones): + if (zones[j].zone_type == "box" + and j + 1 < len(zones) + and zones[j + 1].zone_type == "content"): + group_boxes.append(zones[j]) + group_contents.append(zones[j + 1]) + j += 2 + else: + break + + if len(group_contents) >= 2 and group_boxes: + # Merge: create one large content zone spanning all + y_min = min(c.y for c in group_contents) + y_max = max(c.y + c.height for c in group_contents) + overlays = [] + for bz in group_boxes: + overlay = { + "y": bz.y, + "height": bz.height, + "x": bz.x, + "width": bz.width, + } + if bz.box: + overlay["box"] = { + "x": bz.box.x, + "y": bz.box.y, + "width": bz.box.width, + "height": bz.box.height, + "confidence": bz.box.confidence, + "border_thickness": bz.box.border_thickness, + } + overlays.append(overlay) + + merged = PageZone( + index=0, # re-indexed below + zone_type="content", + y=y_min, + height=y_max - y_min, + x=content_x, + width=content_w, + image_overlays=overlays, + ) + result.append(merged) + i = j + else: + # No merge possible -- emit just the content zone + result.append(z) + i += 1 + + # Re-index zones + for idx, z in enumerate(result): + z.index = idx + + logger.info( + "zone-merge: %d zones -> %d zones after merging across boxes", + len(zones), len(result), + ) + return result + + +def _build_zone_grid( + zone_words: List[Dict], + zone_x: int, + zone_y: int, + zone_w: int, + zone_h: int, + zone_index: int, + img_w: int, + img_h: int, + global_columns: Optional[List[Dict]] = None, + skip_first_row_header: bool = False, +) -> Dict[str, Any]: + """Build columns, rows, cells for a single zone from its words. + + Args: + global_columns: If provided, use these pre-computed column boundaries + instead of detecting columns per zone. Used for content zones so + that all content zones (above/between/below boxes) share the same + column structure. Box zones always detect columns independently. + """ + if not zone_words: + return { + "columns": [], + "rows": [], + "cells": [], + "header_rows": [], + } + + # Cluster rows first (needed for column alignment analysis) + rows = _cluster_rows(zone_words) + + # Diagnostic logging for small/medium zones (box zones typically have 40-60 words) + if len(zone_words) <= 60: + import statistics as _st + _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0] + _med_h = _st.median(_heights) if _heights else 20 + _y_tol = max(_med_h * 0.5, 5) + logger.info( + "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows", + zone_index, len(zone_words), _med_h, _y_tol, len(rows), + ) + for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])): + logger.info( + " zone %d word: y=%d x=%d h=%d w=%d '%s'", + zone_index, w['top'], w['left'], w['height'], w['width'], + w.get('text', '')[:40], + ) + for r in rows: + logger.info( + " zone %d row %d: y_min=%d y_max=%d y_center=%.0f", + zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'], + ) + + # Use global columns if provided, otherwise detect per zone + columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows) + + # Merge inline marker columns (bullets, numbering) into adjacent text + if not global_columns: + columns = _merge_inline_marker_columns(columns, zone_words) + + if not columns or not rows: + return { + "columns": [], + "rows": [], + "cells": [], + "header_rows": [], + } + + # Split word boxes that straddle column boundaries (e.g. "sichzie" + # spanning Col 1 + Col 2). Must happen after column detection and + # before cell assignment. 
+    # Keep original words for colspan detection (split destroys span info).
+    original_zone_words = zone_words
+    if len(columns) >= 2:
+        zone_words = _split_cross_column_words(zone_words, columns)
+
+    # Build cells
+    cells = _build_cells(zone_words, columns, rows, img_w, img_h)
+
+    # --- Detect colspan (merged cells spanning multiple columns) ---
+    # Uses the ORIGINAL (pre-split) words to detect word-blocks that span
+    # multiple columns. _split_cross_column_words would have destroyed
+    # this information by cutting words at column boundaries.
+    if len(columns) >= 2:
+        cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h)
+
+    # Prefix cell IDs with zone index
+    for cell in cells:
+        cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}"
+        cell["zone_index"] = zone_index
+
+    # Detect header rows (first-row heuristics only; spanning-header
+    # detection is disabled inside _detect_header_rows)
+    header_rows = _detect_header_rows(rows, zone_words, zone_y, columns,
+                                      skip_first_row_header=skip_first_row_header)
+
+    # Merge cells in spanning header rows into a single col-0 cell
+    if header_rows and len(columns) >= 2:
+        for hri in header_rows:
+            header_cells = [c for c in cells if c["row_index"] == hri]
+            if len(header_cells) <= 1:
+                continue
+            # Collect all word_boxes and text from all columns
+            all_wb = []
+            all_text_parts = []
+            for hc in sorted(header_cells, key=lambda c: c["col_index"]):
+                all_wb.extend(hc.get("word_boxes", []))
+                if hc.get("text", "").strip():
+                    all_text_parts.append(hc["text"].strip())
+            # Remove all header cells, replace with one spanning cell.
+            # The zone prefix is applied directly here, because the
+            # prefix loop above has already run for the other cells.
+            cells = [c for c in cells if c["row_index"] != hri]
+            if all_wb:
+                x_min = min(wb["left"] for wb in all_wb)
+                y_min = min(wb["top"] for wb in all_wb)
+                x_max = max(wb["left"] + wb["width"] for wb in all_wb)
+                y_max = max(wb["top"] + wb["height"] for wb in all_wb)
+                cells.append({
+                    "cell_id": f"Z{zone_index}_R{hri:02d}_C0",
+                    "zone_index": zone_index,
+                    "row_index": hri,
+                    "col_index": 0,
+                    "col_type": "spanning_header",
+                    "text": " ".join(all_text_parts),
+                    "confidence": 0.0,
+                    "bbox_px": {"x": x_min, "y": y_min,
+                                "w": x_max - x_min, "h": y_max - y_min},
+                    "bbox_pct": {
+                        "x": round(x_min / img_w * 100, 2) if img_w else 0,
+                        "y": round(y_min / img_h * 100, 2) if img_h else 0,
+                        "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
+                        "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
+                    },
+                    "word_boxes": all_wb,
+                    "ocr_engine": "words_first",
+                    "is_bold": True,
+                })
+
+    # Convert columns to output format with percentages
+    out_columns = []
+    for col in columns:
+        x_min = col["x_min"]
+        x_max = col["x_max"]
+        out_columns.append({
+            "index": col["index"],
+            "label": col["type"],
+            "x_min_px": round(x_min),
+            "x_max_px": round(x_max),
+            "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0,
+            "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0,
+            "bold": False,
+        })
+
+    # Convert rows to output format with percentages
+    out_rows = []
+    for row in rows:
+        out_rows.append({
+            "index": row["index"],
+            "y_min_px": round(row["y_min"]),
+            "y_max_px": round(row["y_max"]),
+            "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0,
+            "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0,
+            "is_header": row["index"] in header_rows,
+        })
+
+    return {
+        "columns": out_columns,
+        "rows": out_rows,
+        "cells": cells,
+        "header_rows": header_rows,
+        "_raw_columns": columns,  # internal: for propagation to other zones
+    }
diff --git a/klausur-service/backend/grid_build_cell_ops.py b/klausur-service/backend/grid_build_cell_ops.py
index 57bc721..0d2c74d 100644
--- 
a/klausur-service/backend/grid_build_cell_ops.py +++ b/klausur-service/backend/grid_build_cell_ops.py @@ -1,305 +1,4 @@ -""" -Grid Build Cell Ops — Cell-level operations: bullet/artifact removal, -garbled cell cleanup, word-box reordering, and max_columns enforcement. - -Extracted from grid_build_core.py for maintainability. -""" - -import logging -import re -from typing import Any, Dict, List, Tuple - -from cv_ocr_engines import ( - _words_to_reading_order_text, _group_words_into_lines, _lookup_ipa, -) - -logger = logging.getLogger(__name__) - - -def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None: - """Remove blue bullet/artifact word_boxes (Step 5i). - - Handles tiny coloured symbols, overlapping word_boxes, duplicate text, - and syllable-split word merging. - """ - _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$') - _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'} - - bullet_removed = 0 - for z in zones_data: - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - if len(wbs) < 2: - continue - to_remove: set = set() - - # Rule (a): tiny coloured symbols - for i, wb in enumerate(wbs): - cn = wb.get("color_name", "black") - if (cn != "black" - and wb.get("width", 0) * wb.get("height", 0) < 200 - and wb.get("conf", 100) < 85): - to_remove.add(i) - - # Rule (a2): isolated non-alphanumeric symbols - for i, wb in enumerate(wbs): - t = (wb.get("text") or "").strip() - if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2: - if t in _REMOVE_SYMBOLS: - to_remove.add(i) - - # Rule (b) + (c): overlap and duplicate detection - to_merge: List[Tuple[int, int]] = [] - indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0)) - for p in range(len(indexed) - 1): - i1, w1 = indexed[p] - i2, w2 = indexed[p + 1] - x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0) - x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0) - overlap = max(0, min(x1e, x2e) - max(x1s, x2s)) - min_w = min(w1.get("width", 1), w2.get("width", 1)) - gap = x2s - x1e - overlap_pct = overlap / min_w if min_w > 0 else 0 - - if overlap_pct > 0.20: - t1 = (w1.get("text") or "").strip() - t2 = (w2.get("text") or "").strip() - - # Syllable-split words - if (overlap_pct <= 0.75 - and _ALPHA_WORD_RE.match(t1) - and _ALPHA_WORD_RE.match(t2)): - to_merge.append((i1, i2)) - continue - - # High overlap with short prefix - if (overlap_pct > 0.75 - and _ALPHA_WORD_RE.match(t1) - and _ALPHA_WORD_RE.match(t2) - and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower() - and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4): - to_merge.append((i1, i2)) - continue - - if overlap_pct <= 0.40: - continue - - c1 = w1.get("conf", 50) - c2 = w2.get("conf", 50) - - # Very high overlap: prefer IPA-dictionary word - if overlap_pct > 0.90 and t1.lower() != t2.lower(): - in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False - in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False - if in_dict_1 and not in_dict_2: - to_remove.add(i2) - continue - elif in_dict_2 and not in_dict_1: - to_remove.add(i1) - continue - - if c1 < c2: - to_remove.add(i1) - elif c2 < c1: - to_remove.add(i2) - else: - if w1.get("height", 0) > w2.get("height", 0): - to_remove.add(i1) - else: - to_remove.add(i2) - - elif (gap < 6 - and w1.get("color_name") == "blue" - and w2.get("color_name") == "blue" - and (w1.get("text") or "").strip() == 
(w2.get("text") or "").strip()): - c1 = w1.get("conf", 50) - c2 = w2.get("conf", 50) - to_remove.add(i1 if c1 <= c2 else i2) - - # Execute merges first (syllable-split words) - if to_merge: - merge_parent: Dict[int, int] = {} - for mi1, mi2 in to_merge: - actual_mi1 = mi1 - while actual_mi1 in merge_parent: - actual_mi1 = merge_parent[actual_mi1] - if actual_mi1 in to_remove or mi2 in to_remove: - continue - if mi2 in merge_parent: - continue - mw1, mw2 = wbs[actual_mi1], wbs[mi2] - mt1 = (mw1.get("text") or "").rstrip(".,;:!?") - mt2 = (mw2.get("text") or "").strip() - merged_text = mt1 + mt2 - mx = min(mw1["left"], mw2["left"]) - my = min(mw1["top"], mw2["top"]) - mr = max(mw1["left"] + mw1["width"], - mw2["left"] + mw2["width"]) - mb = max(mw1["top"] + mw1["height"], - mw2["top"] + mw2["height"]) - mw1["text"] = merged_text - mw1["left"] = mx - mw1["top"] = my - mw1["width"] = mr - mx - mw1["height"] = mb - my - mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 - to_remove.add(mi2) - merge_parent[mi2] = actual_mi1 - bullet_removed -= 1 - - if to_remove: - bullet_removed += len(to_remove) - filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove] - cell["word_boxes"] = filtered - if not cell.get("_ipa_corrected"): - cell["text"] = _words_to_reading_order_text(filtered) - - if bullet_removed: - for z in zones_data: - z["cells"] = [c for c in z.get("cells", []) - if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed) - - -def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None: - """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre).""" - _COMMON_SHORT_WORDS = { - "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja", - "ob", "so", "um", "zu", "wo", "je", "oh", "or", - "die", "der", "das", "dem", "den", "des", "ein", "und", - "auf", "aus", "bei", "bis", "für", "mit", "nur", "von", - "a", "i", "an", "as", "at", "be", "by", "do", "go", "he", - "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok", - "on", "or", "so", "to", "up", "us", "we", - "the", "and", "but", "for", "not", - } - _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$') - artifact_cells_removed = 0 - - for z in zones_data: - before = len(z.get("cells", [])) - kept = [] - for cell in z.get("cells", []): - text = (cell.get("text") or "").strip() - core = text.rstrip(".,;:!?'\"") - is_artifact = False - if not core: - is_artifact = True - elif _PURE_JUNK_RE.match(core): - if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'): - is_artifact = True - elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha(): - is_artifact = True - elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS: - is_artifact = True - elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core) - and not re.match(r'^[pPsS]\.?\d+$', core)): - is_artifact = True - if is_artifact: - kept.append(None) - else: - kept.append(cell) - z["cells"] = [c for c in kept if c is not None] - artifact_cells_removed += before - len(z["cells"]) - - if artifact_cells_removed: - for z in zones_data: - cell_ris = {c.get("row_index") for c in z.get("cells", [])} - z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris] - logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed) - - -def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None: - """Normalise word_box order to reading order (Step 5j).""" - wb_reordered = 
0 - for z in zones_data: - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - if len(wbs) < 2: - continue - lines = _group_words_into_lines(wbs, y_tolerance_px=15) - sorted_wbs = [w for line in lines for w in line] - if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]: - cell["word_boxes"] = sorted_wbs - wb_reordered += 1 - if wb_reordered: - logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered) - - -def _enforce_max_columns( - zones_data: List[Dict[str, Any]], - max_columns: int, -) -> None: - """Enforce max_columns by merging narrowest columns (Step 5k).""" - for z in zones_data: - if z.get("zone_type") != "content": - continue - cols = z.get("columns", []) - cells = z.get("cells", []) - if len(cols) <= max_columns: - continue - - logger.info( - "max_columns=%d: zone %s has %d columns -> merging", - max_columns, z.get("zone_index"), len(cols), - ) - - cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0))) - - while len(cols) > max_columns: - narrowest = cols_by_width.pop(0) - ni = narrowest["index"] - - sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0))) - pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni) - if pos + 1 < len(sorted_by_x): - merge_target = sorted_by_x[pos + 1] - elif pos > 0: - merge_target = sorted_by_x[pos - 1] - else: - break - - ti = merge_target["index"] - - merge_target["x_min_px"] = min( - merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)), - narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)), - ) - merge_target["x_max_px"] = max( - merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)), - narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)), - ) - if "x_min_pct" in merge_target and "x_min_pct" in narrowest: - merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"]) - merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"]) - - for cell in cells: - if cell.get("col_index") == ni: - cell["col_index"] = ti - existing = next( - (c for c in cells if c["col_index"] == ti - and c["row_index"] == cell["row_index"] - and c is not cell), - None, - ) - if existing: - existing["text"] = ( - (existing.get("text", "") + " " + cell.get("text", "")).strip() - ) - existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", []) - cell["_merged"] = True - - z["cells"] = [c for c in cells if not c.get("_merged")] - cells = z["cells"] - cols.remove(narrowest) - cols_by_width = [c for c in cols_by_width if c["index"] != ni] - - # Re-index columns 0..N-1 - for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))): - old_idx = col["index"] - col["index"] = new_idx - for cell in cells: - if cell.get("col_index") == old_idx: - cell["col_index"] = new_idx - - logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols)) +# Backward-compat shim -- module moved to grid/build/cell_ops.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.cell_ops") diff --git a/klausur-service/backend/grid_build_cleanup.py b/klausur-service/backend/grid_build_cleanup.py index 39a60d8..b1d25a1 100644 --- a/klausur-service/backend/grid_build_cleanup.py +++ b/klausur-service/backend/grid_build_cleanup.py @@ -1,390 +1,4 @@ -""" -Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe -divider removal, connector normalization, 
border strip detection, and -alphabet sidebar removal. - -Extracted from grid_build_core.py for maintainability. -""" - -import logging -import re -from typing import Any, Dict, List - -from cv_ocr_engines import _words_to_reading_order_text - -logger = logging.getLogger(__name__) - -_PIPE_RE = re.compile(r"^\|+$") - - -def _cleanup_zones( - zones_data: List[Dict[str, Any]], - border_prefiltered: bool, - session_id: str, -) -> bool: - """Clean up zone data: remove junk rows, artifacts, pipes, border strips. - - Args: - zones_data: List of zone dicts (modified in place). - border_prefiltered: Whether border words were already pre-filtered. - session_id: For logging. - - Returns: - Updated border_prefiltered flag. - """ - _remove_junk_rows(zones_data) - _remove_artifact_cells(zones_data) - _remove_oversized_word_boxes(zones_data) - _remove_pipe_dividers(zones_data) - _normalize_connector_columns(zones_data) - border_prefiltered = _remove_border_strips(zones_data, border_prefiltered) - _remove_alphabet_sidebars(zones_data) - return border_prefiltered - - -def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None: - """Remove rows where ALL cells contain only short, low-confidence text. - - Also removes 'oversized stub' rows and 'scattered debris' rows. - """ - _JUNK_CONF_THRESHOLD = 50 - _JUNK_MAX_TEXT_LEN = 3 - - for z in zones_data: - cells = z.get("cells", []) - rows = z.get("rows", []) - if not cells or not rows: - continue - - # Compute median word height across the zone for oversized detection - all_wb_heights = [ - wb["height"] - for cell in cells - for wb in cell.get("word_boxes") or [] - if wb.get("height", 0) > 0 - ] - median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28 - - junk_row_indices = set() - for row in rows: - ri = row["index"] - row_cells = [c for c in cells if c.get("row_index") == ri] - if not row_cells: - continue - - row_wbs = [ - wb for cell in row_cells - for wb in cell.get("word_boxes") or [] - ] - - # Rule 1: ALL word_boxes are low-conf AND short text - all_junk = True - for wb in row_wbs: - text = (wb.get("text") or "").strip() - conf = wb.get("conf", 0) - if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN: - all_junk = False - break - if all_junk and row_wbs: - junk_row_indices.add(ri) - continue - - # Rule 2: oversized stub -- <=3 words, short total text, - # and word height > 1.8x median - if len(row_wbs) <= 3: - total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs) - max_h = max((wb.get("height", 0) for wb in row_wbs), default=0) - has_page_ref = any( - re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip()) - for wb in row_wbs - ) - if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref: - junk_row_indices.add(ri) - continue - - # Rule 3: scattered debris -- rows with only tiny fragments - longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs) - if longest <= 2: - junk_row_indices.add(ri) - continue - - if junk_row_indices: - z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices] - z["rows"] = [r for r in rows if r["index"] not in junk_row_indices] - logger.info( - "build-grid: removed %d junk rows from zone %d: %s", - len(junk_row_indices), z["zone_index"], - sorted(junk_row_indices), - ) - - -def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None: - """Remove individual cells with a single very-short, low-conf word.""" - _ARTIFACT_MAX_LEN = 2 - _ARTIFACT_CONF_THRESHOLD = 65 - - for z in zones_data: - cells 
= z.get("cells", []) - if not cells: - continue - artifact_ids = set() - for cell in cells: - wbs = cell.get("word_boxes") or [] - if len(wbs) != 1: - continue - wb = wbs[0] - text = (wb.get("text") or "").strip() - conf = wb.get("conf", 100) - if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD: - artifact_ids.add(cell.get("cell_id")) - if artifact_ids: - z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids] - logger.info( - "build-grid: removed %d artifact cells from zone %d: %s", - len(artifact_ids), z.get("zone_index", 0), - [c.get("text") for c in cells if c.get("cell_id") in artifact_ids], - ) - - -def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None: - """Remove word_boxes whose height is 3x+ the median (graphic artifacts).""" - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - all_wh = [ - wb["height"] - for cell in cells - for wb in cell.get("word_boxes") or [] - if wb.get("height", 0) > 0 - ] - if not all_wh: - continue - med_h = sorted(all_wh)[len(all_wh) // 2] - oversized_threshold = med_h * 3 - removed_oversized = 0 - for cell in cells: - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold] - if len(filtered) < len(wbs): - removed_oversized += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - if removed_oversized: - z["cells"] = [c for c in cells if c.get("word_boxes")] - logger.info( - "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d", - removed_oversized, oversized_threshold, z.get("zone_index", 0), - ) - - -def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None: - """Remove pipe-character word_boxes (column divider artifacts).""" - for z in zones_data: - if z.get("vsplit_group") is not None: - continue # pipes already removed before split - removed_pipes = 0 - for cell in z.get("cells", []): - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())] - if len(filtered) < len(wbs): - removed_pipes += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - if removed_pipes: - z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())] - logger.info( - "build-grid: removed %d pipe-divider word_boxes from zone %d", - removed_pipes, z.get("zone_index", 0), - ) - - # Strip pipe chars ONLY from cell edges (OCR artifacts). - # Preserve pipes embedded in words as syllable separators. - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if "|" in text: - cleaned = text.strip("|").strip() - if cleaned != text.strip(): - cell["text"] = cleaned - - -def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None: - """Normalize narrow connector columns where OCR appends noise chars. - - In synonym dictionaries a narrow column repeats the same word - (e.g. "oder") in every row. OCR sometimes appends noise chars. 
- """ - for z in zones_data: - cols = z.get("columns", []) - cells = z.get("cells", []) - if not cols or not cells: - continue - for col in cols: - ci = col.get("index") - col_cells = [c for c in cells if c.get("col_index") == ci] - if len(col_cells) < 3: - continue - text_counts: Dict[str, int] = {} - for c in col_cells: - t = (c.get("text") or "").strip() - if t: - text_counts[t] = text_counts.get(t, 0) + 1 - if not text_counts: - continue - dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type] - dominant_count = text_counts[dominant_text] - if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6: - continue - fixed = 0 - for c in col_cells: - t = (c.get("text") or "").strip() - if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2: - c["text"] = dominant_text - wbs = c.get("word_boxes") or [] - if len(wbs) == 1: - wbs[0]["text"] = dominant_text - fixed += 1 - if fixed: - logger.info( - "build-grid: normalized %d outlier cells in connector column %d " - "(dominant='%s') zone %d", - fixed, ci, dominant_text, z.get("zone_index", 0), - ) - - -def _remove_border_strips( - zones_data: List[Dict[str, Any]], - border_prefiltered: bool, -) -> bool: - """Detect and remove page-border decoration strips. - - Returns updated border_prefiltered flag. - """ - border_strip_removed = 0 - if border_prefiltered: - logger.info("Step 4e: skipped (border pre-filter already applied)") - return border_prefiltered - - for z in zones_data: - cells = z.get("cells", []) - if not cells: - continue - all_wbs_with_cell: list = [] - for cell in cells: - for wb in cell.get("word_boxes") or []: - all_wbs_with_cell.append((wb.get("left", 0), wb, cell)) - if len(all_wbs_with_cell) < 10: - continue - all_wbs_with_cell.sort(key=lambda t: t[0]) - total = len(all_wbs_with_cell) - - # -- Left-edge scan -- - left_strip_count = 0 - left_gap = 0 - running_right = 0 - for gi in range(total - 1): - running_right = max( - running_right, - all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0), - ) - gap = all_wbs_with_cell[gi + 1][0] - running_right - if gap > 30: - left_strip_count = gi + 1 - left_gap = gap - break - - # -- Right-edge scan -- - right_strip_count = 0 - right_gap = 0 - running_left = all_wbs_with_cell[-1][0] - for gi in range(total - 1, 0, -1): - running_left = min(running_left, all_wbs_with_cell[gi][0]) - prev_right = ( - all_wbs_with_cell[gi - 1][0] - + all_wbs_with_cell[gi - 1][1].get("width", 0) - ) - gap = running_left - prev_right - if gap > 30: - right_strip_count = total - gi - right_gap = gap - break - - strip_wbs: set = set() - strip_side = "" - strip_gap = 0 - strip_count = 0 - if left_strip_count > 0 and left_strip_count / total < 0.20: - strip_side = "left" - strip_count = left_strip_count - strip_gap = left_gap - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]} - elif right_strip_count > 0 and right_strip_count / total < 0.20: - strip_side = "right" - strip_count = right_strip_count - strip_gap = right_gap - strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]} - - if not strip_wbs: - continue - for cell in cells: - wbs = cell.get("word_boxes") or [] - filtered = [wb for wb in wbs if id(wb) not in strip_wbs] - if len(filtered) < len(wbs): - border_strip_removed += len(wbs) - len(filtered) - cell["word_boxes"] = filtered - cell["text"] = _words_to_reading_order_text(filtered) - z["cells"] = [c for c in cells - if (c.get("word_boxes") or c.get("text", "").strip())] - 
logger.info( - "Step 4e: removed %d border-strip word_boxes (%s) from zone %d " - "(gap=%dpx, strip=%d/%d wbs)", - border_strip_removed, strip_side, z.get("zone_index", 0), - strip_gap, strip_count, total, - ) - - return border_prefiltered - - -def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None: - """Remove decorative edge columns (alphabet sidebar safety net). - - Dictionary pages have A-Z letter sidebars that OCR reads as single- - character word_boxes. - """ - for z in zones_data: - columns = z.get("columns", []) - cells = z.get("cells", []) - if len(columns) < 3 or not cells: - continue - col_cells: Dict[str, List[Dict]] = {} - for cell in cells: - ct = cell.get("col_type", "") - if ct.startswith("column_"): - col_cells.setdefault(ct, []).append(cell) - col_types_ordered = sorted(col_cells.keys()) - if len(col_types_ordered) < 3: - continue - for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]: - edge_cells_list = col_cells.get(edge_ct, []) - if len(edge_cells_list) < 3: - continue - texts = [(c.get("text") or "").strip() for c in edge_cells_list] - avg_len = sum(len(t) for t in texts) / len(texts) - single_char = sum(1 for t in texts if len(t) <= 1) - single_ratio = single_char / len(texts) - if avg_len > 1.5: - continue - if single_ratio < 0.7: - continue - removed_count = len(edge_cells_list) - edge_ids = {id(c) for c in edge_cells_list} - z["cells"] = [c for c in cells if id(c) not in edge_ids] - z["columns"] = [col for col in columns if col.get("col_type") != edge_ct] - logger.info( - "Step 4f: removed decorative edge column '%s' from zone %d " - "(%d cells, avg_len=%.1f, single_char=%.0f%%)", - edge_ct, z.get("zone_index", 0), removed_count, - avg_len, single_ratio * 100, - ) - break # only remove one edge per zone +# Backward-compat shim -- module moved to grid/build/cleanup.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.cleanup") diff --git a/klausur-service/backend/grid_build_core.py b/klausur-service/backend/grid_build_core.py index 8fa6f72..19d1cbb 100644 --- a/klausur-service/backend/grid_build_core.py +++ b/klausur-service/backend/grid_build_core.py @@ -1,213 +1,4 @@ -""" -Grid Build Core — the main _build_grid_core() function. - -Extracted from grid_editor_api.py for maintainability. -Takes merged OCR word positions and builds a structured, zone-aware grid. - -The function delegates to phase-specific modules: -- grid_build_zones.py — image loading, graphic/box detection, zone grids -- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips -- grid_build_text_ops.py — color, headings, IPA, page refs -- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result -""" - -import logging -import time -from typing import Any, Dict, List, Optional - -from grid_editor_helpers import ( - _flatten_word_boxes, - _get_content_bounds, - _filter_decorative_margin, - _filter_footer_words, - _filter_header_junk, -) - -from grid_build_zones import _build_zones -from grid_build_cleanup import _cleanup_zones -from grid_build_text_ops import _process_text -from grid_build_finalize import _finalize_grid - -logger = logging.getLogger(__name__) - - -async def _build_grid_core( - session_id: str, - session: dict, - *, - ipa_mode: str = "auto", - syllable_mode: str = "auto", - enhance: bool = True, - max_columns: Optional[int] = None, - min_conf: Optional[int] = None, -) -> dict: - """Core grid building logic — pure computation, no HTTP or DB side effects. 
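# Aside on the backward-compat shims introduced throughout this diff (e.g.
# grid_build_cleanup just above): assigning into sys.modules during the old
# module's own import makes every later lookup of the old name resolve to
# the relocated module. Minimal demo of the pattern, aliasing the stdlib
# 'json' module instead of the real grid packages ("legacy_json" is a
# hypothetical old name):
import importlib
import sys

sys.modules["legacy_json"] = importlib.import_module("json")

import legacy_json  # resolved via sys.modules; no legacy_json.py needed
assert legacy_json.loads("[1, 2]") == [1, 2]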
- - Args: - session_id: Session identifier (for logging and image loading). - session: Full session dict from get_session_db(). - ipa_mode: "auto" (only when English headwords detected), "all" - (force IPA on all content columns), "en" (English column only), - "de" (German/definition columns only), or "none" (skip entirely). - syllable_mode: "auto" (only when original has pipe dividers), - "all" (force syllabification on all words), "en" (English only), - "de" (German only), or "none" (skip). - - Returns: - StructuredGrid result dict. - - Raises: - ValueError: If session data is incomplete. - """ - t0 = time.time() - - # ── Phase 1: Input Validation & Word Filtering ────────────────── - - # 1. Validate and load word results - word_result = session.get("word_result") - if not word_result or not word_result.get("cells"): - raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.") - - img_w = word_result.get("image_width", 0) - img_h = word_result.get("image_height", 0) - if not img_w or not img_h: - raise ValueError("Missing image dimensions in word_result") - - # 2. Flatten all word boxes from cells - all_words = _flatten_word_boxes(word_result["cells"]) - if not all_words: - raise ValueError("No word boxes found in cells") - - # 2a-pre. Apply min_conf filter if specified - if min_conf and min_conf > 0: - before = len(all_words) - all_words = [w for w in all_words if w.get('conf', 100) >= min_conf] - removed = before - len(all_words) - if removed: - logger.info("build-grid session %s: min_conf=%d removed %d/%d words", - session_id, min_conf, removed, before) - - logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)", - session_id, len(all_words), len(word_result["cells"]), - enhance, max_columns, min_conf) - - # 2b. Filter decorative margin columns (alphabet graphics) - margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id) - margin_strip_detected = margin_strip_info.get("found", False) - - # Read document_category from session - document_category = session.get("document_category") - - # 2c. Filter footer rows (page numbers at the very bottom) - page_number_info = _filter_footer_words(all_words, img_h, logger, session_id) - - # 2c2. Filter OCR junk from header illustrations - _filter_header_junk(all_words, img_h, logger, session_id) - - # 2d. Filter words inside user-defined exclude regions - structure_result = session.get("structure_result") - exclude_rects = [] - if structure_result: - for er in structure_result.get("exclude_regions", []): - exclude_rects.append({ - "x": er["x"], "y": er["y"], - "w": er["w"], "h": er["h"], - }) - if exclude_rects: - before = len(all_words) - filtered = [] - for w in all_words: - w_cx = w["left"] + w.get("width", 0) / 2 - w_cy = w["top"] + w.get("height", 0) / 2 - inside = any( - er["x"] <= w_cx <= er["x"] + er["w"] - and er["y"] <= w_cy <= er["y"] + er["h"] - for er in exclude_rects - ) - if not inside: - filtered.append(w) - removed = before - len(filtered) - if removed: - all_words = filtered - logger.info( - "build-grid session %s: removed %d words inside %d user exclude region(s)", - session_id, removed, len(exclude_rects), - ) - - # 2e. 
Hard-filter words inside graphic/image regions from structure step - graphic_rects: List[Dict[str, int]] = [] - if structure_result: - for g in structure_result.get("graphics", []): - graphic_rects.append({ - "x": g["x"], "y": g["y"], - "w": g["w"], "h": g["h"], - }) - if graphic_rects: - before = len(all_words) - all_words = [ - w for w in all_words - if not any( - gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in graphic_rects - ) - ] - removed = before - len(all_words) - if removed: - logger.info( - "build-grid session %s: hard-removed %d words inside %d structure graphic region(s)", - session_id, removed, len(graphic_rects), - ) - - content_x, content_y, content_w, content_h = _get_content_bounds(all_words) - - # ── Phase 2: Image Processing & Zone Detection ────────────────── - - zone_result = await _build_zones( - session_id, session, all_words, graphic_rects, - content_x, content_y, content_w, content_h, - img_w, img_h, - ) - zones_data = zone_result["zones_data"] - boxes_detected = zone_result["boxes_detected"] - recovered_count = zone_result["recovered_count"] - border_prefiltered = zone_result["border_prefiltered"] - img_bgr = zone_result["img_bgr"] - - # ── Phase 3: Junk Removal & Cell Cleanup ──────────────────────── - - border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id) - - # ── Phase 4+5a: Color, Headings, IPA, Page Refs ───────────────── - - text_result = _process_text( - zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info, - ) - - # ── Phase 5b+6: Finalize & Result Assembly ────────────────────── - - duration = time.time() - t0 - - result = _finalize_grid( - zones_data=zones_data, - all_words=all_words, - img_bgr=img_bgr, - img_w=img_w, - img_h=img_h, - session_id=session_id, - max_columns=max_columns, - ipa_mode=ipa_mode, - syllable_mode=syllable_mode, - en_col_type=text_result["en_col_type"], - ipa_target_cols=text_result["ipa_target_cols"], - all_content_cols=text_result["all_content_cols"], - skip_ipa=text_result["skip_ipa"], - document_category=document_category, - margin_strip_detected=margin_strip_detected, - page_number_info=text_result["page_number_info"], - boxes_detected=boxes_detected, - recovered_count=recovered_count, - duration=duration, - ) - - return result +# Backward-compat shim -- module moved to grid/build/core.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.core") diff --git a/klausur-service/backend/grid_build_finalize.py b/klausur-service/backend/grid_build_finalize.py index 16a543a..23a37ca 100644 --- a/klausur-service/backend/grid_build_finalize.py +++ b/klausur-service/backend/grid_build_finalize.py @@ -1,452 +1,4 @@ -""" -Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations, -dictionary detection, syllable dividers, spell checking, empty column -removal, and result assembly. - -Extracted from grid_build_core.py for maintainability. 
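# Aside (not part of the diff): the user-exclude and graphic-region filters
# above share one test -- a word is dropped iff its *center point* lies in
# the rectangle, so words that merely touch an edge survive. Sketch:
def center_inside(word, rect):
    cx = word["left"] + word.get("width", 0) / 2
    cy = word["top"] + word.get("height", 0) / 2
    return (rect["x"] <= cx <= rect["x"] + rect["w"]
            and rect["y"] <= cy <= rect["y"] + rect["h"])

rect = {"x": 100, "y": 100, "w": 200, "h": 50}
inside = {"left": 150, "top": 110, "width": 40, "height": 20}  # center (170, 120)
grazing = {"left": 90, "top": 95, "width": 15, "height": 8}    # center (97.5, 99)
assert center_inside(inside, rect) and not center_inside(grazing, rect)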
-""" - -import logging -import re -from typing import Any, Dict, List, Optional - -from grid_build_cell_ops import ( - _remove_bullets_and_artifacts, - _remove_garbled_cells, - _normalize_word_order, - _enforce_max_columns, -) - -logger = logging.getLogger(__name__) - - -def _finalize_grid( - zones_data: List[Dict[str, Any]], - all_words: List[Dict[str, Any]], - img_bgr: Any, - img_w: int, - img_h: int, - session_id: str, - max_columns: Optional[int], - ipa_mode: str, - syllable_mode: str, - en_col_type: Optional[str], - ipa_target_cols: set, - all_content_cols: set, - skip_ipa: bool, - document_category: Optional[str], - margin_strip_detected: bool, - page_number_info: Optional[Dict], - boxes_detected: int, - recovered_count: int, - duration: float, -) -> dict: - """Run final processing steps and assemble result dict. - - Handles: bullet removal, artifact cells, word ordering, max_columns, - dictionary detection, syllable dividers, spell check, empty columns, - internal flag cleanup, and result assembly. - """ - total_cols = sum(len(z.get("columns", [])) for z in zones_data) - - # 5i. Remove blue bullet/artifact word_boxes - _remove_bullets_and_artifacts(zones_data) - - # 5j-pre. Remove cells whose text is entirely garbled / artifact noise - _remove_garbled_cells(zones_data) - - # 5j. Normalise word_box order to reading order - _normalize_word_order(zones_data) - - # 5k. Enforce max_columns by merging narrowest columns - if max_columns and max_columns > 0: - _enforce_max_columns(zones_data, max_columns) - - # --- Dictionary detection on assembled grid --- - dict_detection = _detect_dictionary( - zones_data, img_w, img_h, document_category, margin_strip_detected - ) - - # --- Word-gap merge --- - try: - from cv_syllable_detect import merge_word_gaps_in_zones - merge_word_gaps_in_zones(zones_data, session_id) - except Exception as e: - logger.warning("Word-gap merge failed: %s", e) - - # --- Pipe auto-correction --- - try: - from cv_syllable_detect import autocorrect_pipe_artifacts - autocorrect_pipe_artifacts(zones_data, session_id) - except Exception as e: - logger.warning("Pipe autocorrect failed: %s", e) - - # --- Syllable divider insertion --- - syllable_insertions = _insert_syllable_dividers( - zones_data, img_bgr, session_id, syllable_mode, dict_detection, - en_col_type, all_content_cols, total_cols, - ) - - # --- Split merged words --- - _split_merged_words(zones_data, session_id) - - # --- Ensure space before IPA/phonetic brackets --- - _fix_ipa_spacing(zones_data) - - # --- SmartSpellChecker --- - _run_spell_checker(zones_data, session_id, en_col_type, total_cols) - - # --- Debug log cell counts per column --- - for z in zones_data: - if z.get("zone_type") == "content": - from collections import Counter as _Counter - _cc = _Counter(c.get("col_index") for c in z.get("cells", [])) - _cols = z.get("columns", []) - logger.info( - "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s", - z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())), - ) - - # --- Remove empty columns --- - _remove_empty_columns(zones_data) - - # Clean up internal flags before returning - for z in zones_data: - for cell in z.get("cells", []): - cell.pop("_ipa_corrected", None) - - # 6. 
Build result - return _assemble_result( - zones_data, all_words, img_w, img_h, session_id, - ipa_mode, syllable_mode, ipa_target_cols, skip_ipa, - dict_detection, page_number_info, boxes_detected, - recovered_count, duration, syllable_insertions, - ) - - -def _detect_dictionary( - zones_data: List[Dict[str, Any]], - img_w: int, - img_h: int, - document_category: Optional[str], - margin_strip_detected: bool, -) -> Dict[str, Any]: - """Run dictionary detection on the assembled grid.""" - from cv_layout import _score_dictionary_signals - dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0} - try: - from cv_vocab_types import ColumnGeometry - for z in zones_data: - zone_cells = z.get("cells", []) - zone_cols = z.get("columns", []) - if len(zone_cols) < 2 or len(zone_cells) < 10: - continue - pseudo_geoms = [] - for col in zone_cols: - ci = col["index"] - col_cells = [c for c in zone_cells if c.get("col_index") == ci] - col_words = [] - for cell in col_cells: - for wb in cell.get("word_boxes") or []: - col_words.append({ - "text": wb.get("text", ""), - "conf": wb.get("conf", 0), - "top": wb.get("top", 0), - "left": wb.get("left", 0), - "height": wb.get("height", 0), - "width": wb.get("width", 0), - }) - if not cell.get("word_boxes") and cell.get("text"): - col_words.append({ - "text": cell["text"], - "conf": cell.get("confidence", 50), - "top": cell.get("bbox_px", {}).get("y", 0), - "left": cell.get("bbox_px", {}).get("x", 0), - "height": cell.get("bbox_px", {}).get("h", 20), - "width": cell.get("bbox_px", {}).get("w", 50), - }) - col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0) - pseudo_geoms.append(ColumnGeometry( - index=ci, x=col.get("x_min_px", 0), y=0, - width=max(col_w, 1), height=img_h, - word_count=len(col_words), words=col_words, - width_ratio=col_w / max(img_w, 1), - )) - if len(pseudo_geoms) >= 2: - dd = _score_dictionary_signals( - pseudo_geoms, - document_category=document_category, - margin_strip_detected=margin_strip_detected, - ) - if dd["confidence"] > dict_detection["confidence"]: - dict_detection = dd - except Exception as e: - logger.warning("Dictionary detection failed: %s", e) - return dict_detection - - -def _insert_syllable_dividers( - zones_data: List[Dict[str, Any]], - img_bgr: Any, - session_id: str, - syllable_mode: str, - dict_detection: Dict[str, Any], - en_col_type: Optional[str], - all_content_cols: set, - total_cols: int, -) -> int: - """Insert syllable dividers for dictionary pages. 
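# Aside (illustration only): _detect_dictionary() above adapts the assembled
# grid back into per-column word lists because _score_dictionary_signals()
# was written against column geometries. The stand-in dataclass below only
# mimics the fields the diff passes to the real cv_vocab_types.ColumnGeometry:
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class ColumnGeometrySketch:  # hypothetical stand-in, not the real class
    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict] = field(default_factory=list)
    width_ratio: float = 0.0

cells = [{"col_index": 0, "word_boxes": [
    {"text": "tiger", "conf": 96, "top": 40, "left": 12, "height": 22, "width": 60},
]}]
col_words = [wb for c in cells if c["col_index"] == 0 for wb in c["word_boxes"]]
geom = ColumnGeometrySketch(index=0, x=10, y=0, width=180, height=1200,
                            word_count=len(col_words), words=col_words,
                            width_ratio=180 / 900)
assert geom.word_count == 1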
Returns insertion count.""" - syllable_insertions = 0 - if syllable_mode == "none" or img_bgr is None: - if syllable_mode == "none": - for z in zones_data: - for cell in z.get("cells", []): - t = cell.get("text", "") - if "|" in t: - cell["text"] = t.replace("|", "") - return syllable_insertions - - _syllable_eligible = False - if syllable_mode in ("all", "de", "en"): - _syllable_eligible = True - elif (dict_detection.get("is_dictionary") - and dict_detection.get("article_col_index") is not None): - _syllable_eligible = True - - _syllable_col_filter: Optional[set] = None - if syllable_mode == "en": - _syllable_col_filter = {en_col_type} if en_col_type else set() - elif syllable_mode == "de": - if en_col_type and total_cols >= 3: - _syllable_col_filter = all_content_cols - {en_col_type} - - if _syllable_eligible: - try: - from cv_syllable_detect import insert_syllable_dividers - force_syllables = (syllable_mode in ("all", "de", "en")) - syllable_insertions = insert_syllable_dividers( - zones_data, img_bgr, session_id, - force=force_syllables, - col_filter=_syllable_col_filter, - ) - except Exception as e: - logger.warning("Syllable insertion failed: %s", e) - - return syllable_insertions - - -def _split_merged_words( - zones_data: List[Dict[str, Any]], - session_id: str, -) -> None: - """Split merged words using dictionary lookup.""" - try: - from cv_review import _try_split_merged_word, _SPELL_AVAILABLE - if not _SPELL_AVAILABLE: - return - split_count = 0 - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if not text: - continue - parts = [] - changed = False - for token in text.split(): - clean = token - bracket_pos = clean.find('[') - suffix_ipa = "" - if bracket_pos > 0: - suffix_ipa = clean[bracket_pos:] - clean = clean[:bracket_pos] - suffix_punct = "" - stripped = clean.rstrip(".,!?;:'\")") - if stripped != clean: - suffix_punct = clean[len(stripped):] - clean = stripped - suffix = suffix_punct + suffix_ipa - contraction = "" - if "'" in clean and clean.index("'") >= 2: - apos_pos = clean.index("'") - contraction = clean[apos_pos:] - clean = clean[:apos_pos] - suffix = contraction + suffix - if len(clean) >= 4 and clean.isalpha(): - split = _try_split_merged_word(clean) - if split: - parts.append(split + suffix) - changed = True - continue - parts.append(token) - if changed: - cell["text"] = " ".join(parts) - split_count += 1 - if split_count: - logger.info("build-grid session %s: split %d merged words", session_id, split_count) - except ImportError: - pass - - -def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None: - """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'.""" - _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])') - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if text and "[" in text: - fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text) - if fixed != text: - cell["text"] = fixed - - -def _run_spell_checker( - zones_data: List[Dict[str, Any]], - session_id: str, - en_col_type: Optional[str], - total_cols: int, -) -> None: - """Run SmartSpellChecker on all cells.""" - try: - from smart_spell import SmartSpellChecker - _ssc = SmartSpellChecker() - spell_fix_count = 0 - - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if not text or not text.strip(): - continue - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - - if total_cols >= 3 and en_col_type: - lang = "en" if ct == en_col_type else 
"de" - elif total_cols <= 2: - lang = "auto" - else: - lang = "auto" - - result = _ssc.correct_text(text, lang=lang) - if result.changed: - cell["text"] = result.corrected - spell_fix_count += 1 - - if spell_fix_count: - logger.info( - "build-grid session %s: SmartSpellChecker fixed %d cells", - session_id, spell_fix_count, - ) - except ImportError: - logger.debug("SmartSpellChecker not available in build-grid") - except Exception as e: - logger.warning("SmartSpellChecker error in build-grid: %s", e) - - -def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None: - """Remove columns that have no cells assigned.""" - for z in zones_data: - cells = z.get("cells", []) - used_col_indices = {c.get("col_index") for c in cells} - old_cols = z.get("columns", []) - new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices] - if len(new_cols) < len(old_cols): - old_to_new = {} - for new_i, col in enumerate(new_cols): - old_i = col.get("col_index", col.get("index", new_i)) - old_to_new[old_i] = new_i - col["col_index"] = new_i - col["index"] = new_i - col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text" - for cell in cells: - old_ci = cell.get("col_index", 0) - cell["col_index"] = old_to_new.get(old_ci, old_ci) - cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text" - z["columns"] = new_cols - - -def _assemble_result( - zones_data: List[Dict[str, Any]], - all_words: List[Dict[str, Any]], - img_w: int, - img_h: int, - session_id: str, - ipa_mode: str, - syllable_mode: str, - ipa_target_cols: set, - skip_ipa: bool, - dict_detection: Dict[str, Any], - page_number_info: Optional[Dict], - boxes_detected: int, - recovered_count: int, - duration: float, - syllable_insertions: int, -) -> dict: - """Build the final result dict (Phase 6).""" - total_cells = sum(len(z.get("cells", [])) for z in zones_data) - total_columns = sum(len(z.get("columns", [])) for z in zones_data) - total_rows = sum(len(z.get("rows", [])) for z in zones_data) - - # Collect color statistics - color_stats: Dict[str, int] = {} - for z in zones_data: - for cell in z.get("cells", []): - for wb in cell.get("word_boxes", []): - cn = wb.get("color_name", "black") - color_stats[cn] = color_stats.get(cn, 0) + 1 - - # Compute layout metrics - all_content_row_heights: List[float] = [] - for z in zones_data: - for row in z.get("rows", []): - if not row.get("is_header", False): - h = row.get("y_max_px", 0) - row.get("y_min_px", 0) - if h > 0: - all_content_row_heights.append(h) - avg_row_height = ( - sum(all_content_row_heights) / len(all_content_row_heights) - if all_content_row_heights else 30.0 - ) - font_size_suggestion = max(10, int(avg_row_height * 0.6)) - - return { - "session_id": session_id, - "image_width": img_w, - "image_height": img_h, - "zones": zones_data, - "boxes_detected": boxes_detected, - "summary": { - "total_zones": len(zones_data), - "total_columns": total_columns, - "total_rows": total_rows, - "total_cells": total_cells, - "total_words": len(all_words), - "recovered_colored": recovered_count, - "color_stats": color_stats, - }, - "formatting": { - "bold_columns": [], - "header_rows": [], - }, - "layout_metrics": { - "page_width_px": img_w, - "page_height_px": img_h, - "avg_row_height_px": round(avg_row_height, 1), - "font_size_suggestion_px": font_size_suggestion, - }, - "dictionary_detection": { - "is_dictionary": dict_detection.get("is_dictionary", False), - "confidence": dict_detection.get("confidence", 0.0), - 
"signals": dict_detection.get("signals", {}), - "article_col_index": dict_detection.get("article_col_index"), - "headword_col_index": dict_detection.get("headword_col_index"), - }, - "processing_modes": { - "ipa_mode": ipa_mode, - "syllable_mode": syllable_mode, - "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False, - "syllables_applied": syllable_insertions > 0, - }, - "page_number": page_number_info, - "duration_seconds": round(duration, 2), - } +# Backward-compat shim -- module moved to grid/build/finalize.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.finalize") diff --git a/klausur-service/backend/grid_build_text_ops.py b/klausur-service/backend/grid_build_text_ops.py index 7c2bf66..e3b69c9 100644 --- a/klausur-service/backend/grid_build_text_ops.py +++ b/klausur-service/backend/grid_build_text_ops.py @@ -1,489 +1,4 @@ -""" -Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection, -parenthesis fix, IPA phonetic correction, page ref extraction, and -slash-IPA conversion. - -Extracted from grid_build_core.py for maintainability. -""" - -import logging -import re -from typing import Any, Dict, List, Optional, Set, Tuple - -from cv_color_detect import detect_word_colors -from cv_ocr_engines import ( - fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, - _lookup_ipa, -) -from grid_editor_helpers import ( - _detect_heading_rows_by_color, - _detect_heading_rows_by_single_cell, -) - -logger = logging.getLogger(__name__) - - -def _process_text( - zones_data: List[Dict[str, Any]], - img_bgr: Any, - img_w: int, - img_h: int, - ipa_mode: str, - page_number_info: Optional[Dict], -) -> Dict[str, Any]: - """Run color annotation, heading detection, IPA correction, and page refs. - - Args: - zones_data: List of zone dicts (modified in place). - img_bgr: BGR image array (or None). - img_w: Image width. - img_h: Image height. - ipa_mode: IPA processing mode. - page_number_info: Existing page number metadata (may be None). - - Returns: - Dict with keys: en_col_type, ipa_target_cols, all_content_cols, - skip_ipa, page_number_info. - """ - # 5. Color annotation on final word_boxes in cells - if img_bgr is not None: - all_wb: List[Dict] = [] - for z in zones_data: - for cell in z.get("cells", []): - all_wb.extend(cell.get("word_boxes", [])) - detect_word_colors(img_bgr, all_wb) - - # 5a. Heading detection by color + height - heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h) - if heading_count: - logger.info("Detected %d heading rows by color+height", heading_count) - - # 5b. Fix unmatched parentheses in cell text - for z in zones_data: - for cell in z.get("cells", []): - text = cell.get("text", "") - if ")" in text and "(" not in text: - cell["text"] = "(" + text - - # 5c. 
IPA phonetic correction - all_cells = [cell for z in zones_data for cell in z.get("cells", [])] - total_cols = sum(len(z.get("columns", [])) for z in zones_data) - en_col_type = None - ipa_target_cols: set = set() - all_content_cols: set = set() - skip_ipa = (ipa_mode == "none") - - # When ipa_mode=none, strip ALL square brackets from ALL content columns - if skip_ipa: - _SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]') - for cell in all_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - text = cell.get("text", "") - if "[" in text: - stripped = _SQUARE_BRACKET_RE_NONE.sub("", text) - if stripped != text: - cell["text"] = stripped.strip() - cell["_ipa_corrected"] = True - - if not skip_ipa and total_cols >= 3: - en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction( - all_cells, total_cols, ipa_mode, zones_data - ) - elif not skip_ipa: - # Collect all_content_cols even when <3 cols (needed by finalize) - for cell in all_cells: - ct = cell.get("col_type", "") - if ct.startswith("column_") and (cell.get("text") or "").strip(): - all_content_cols.add(ct) - - # 5e. Heading detection by single-cell rows - single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h) - if single_heading_count: - logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count) - - # 5f. Strip IPA from headings - for z in zones_data: - for cell in z.get("cells", []): - if cell.get("col_type") != "heading": - continue - text = cell.get("text", "") - stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip() - if stripped and stripped != text: - cell["text"] = stripped - - # 5g. Extract page_ref cells and footer rows - _extract_page_refs_and_footers(zones_data, page_number_info) - - # 5h. Convert slash-delimited IPA to bracket notation - _convert_slash_ipa(zones_data, skip_ipa, en_col_type) - - return { - "en_col_type": en_col_type, - "ipa_target_cols": ipa_target_cols, - "all_content_cols": all_content_cols, - "skip_ipa": skip_ipa, - "page_number_info": page_number_info, - } - - -def _run_ipa_correction( - all_cells: List[Dict], - total_cols: int, - ipa_mode: str, - zones_data: List[Dict[str, Any]], -) -> Tuple[Optional[str], set, set]: - """Run IPA correction on cells. 
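# Quick illustration (not part of the diff) of the two bracket-stripping
# regexes above: the ipa_mode="none" path removes every [...] group, while
# the heading path removes only a trailing one.
import re

text = "tiger [ˈtaɪɡə] the [ðə]"
assert re.sub(r'\s*\[[^\]]+\]', '', text).strip() == "tiger the"

heading = "Unit 3 [ˈjuːnɪt]"
assert re.sub(r'\s*\[[^\]]*\]\s*$', '', heading).strip() == "Unit 3"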
Returns (en_col_type, ipa_target_cols, all_content_cols).""" - en_col_type = None - all_content_cols: set = set() - - # Detect English headword column via IPA signals - col_ipa_count: Dict[str, int] = {} - for cell in all_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - txt = cell.get("text", "") or "" - if txt.strip(): - all_content_cols.add(ct) - if '[' in txt or _text_has_garbled_ipa(txt): - col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1 - if col_ipa_count: - en_col_type = max(col_ipa_count, key=col_ipa_count.get) - elif ipa_mode == "all": - col_cell_count: Dict[str, int] = {} - for cell in all_cells: - ct = cell.get("col_type", "") - if ct.startswith("column_") and (cell.get("text") or "").strip(): - col_cell_count[ct] = col_cell_count.get(ct, 0) + 1 - if col_cell_count: - en_col_type = max(col_cell_count, key=col_cell_count.get) - - # Decide which columns to process based on ipa_mode - en_ipa_target_cols: set = set() - de_ipa_target_cols: set = set() - if ipa_mode in ("auto", "en"): - if en_col_type: - en_ipa_target_cols.add(en_col_type) - elif ipa_mode == "de": - de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols - elif ipa_mode == "all": - if en_col_type: - en_ipa_target_cols.add(en_col_type) - de_ipa_target_cols = all_content_cols - en_ipa_target_cols - - # --- Strip IPA from columns NOT in the target set --- - _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]') - strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols - if strip_en_ipa or ipa_mode == "none": - strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols - for cell in all_cells: - ct = cell.get("col_type", "") - if ct not in strip_cols: - continue - text = cell.get("text", "") - if "[" in text: - stripped = _SQUARE_BRACKET_RE.sub("", text) - if stripped != text: - cell["text"] = stripped.strip() - cell["_ipa_corrected"] = True - - # --- English IPA (Britfone + eng_to_ipa) --- - # Snapshot cell text before the IPA passes; the change-marking loop below - # reads _pre_ipa even when only German columns are processed, so defining - # it inside the English branch would raise NameError for ipa_mode="de". - _pre_ipa = {id(c): c.get("text", "") for c in all_cells} - if en_ipa_target_cols: - for cell in all_cells: - ct = cell.get("col_type") - if ct in en_ipa_target_cols: - cell["_orig_col_type"] = ct - cell["col_type"] = "column_en" - fix_cell_phonetics(all_cells, pronunciation="british") - for cell in all_cells: - orig = cell.pop("_orig_col_type", None) - if orig: - cell["col_type"] = orig - if cell.get("text", "") != _pre_ipa.get(id(cell), ""): - cell["_ipa_corrected"] = True - - # --- German IPA (wiki-pronunciation-dict + epitran) --- - if de_ipa_target_cols: - from cv_ipa_german import insert_german_ipa - insert_german_ipa(all_cells, de_ipa_target_cols) - - ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols - - # Mark cells whose text was changed by IPA correction - for cell in all_cells: - if cell.get("text", "") != _pre_ipa.get(id(cell), ""): - cell["_ipa_corrected"] = True - - # 5d.
Fix IPA continuation cells - skip_ipa = (ipa_mode == "none") - _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") - ipa_cont_fixed = 0 - for z in ([] if skip_ipa else zones_data): - rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"]) - z_cells = z.get("cells", []) - for idx, row in enumerate(rows_sorted): - if idx == 0: - continue - ri = row["index"] - row_cells = [c for c in z_cells if c.get("row_index") == ri] - for cell in row_cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - cell_text = (cell.get("text") or "").strip() - if not cell_text: - wb_texts = [w.get("text", "") - for w in cell.get("word_boxes", [])] - cell_text = " ".join(wb_texts).strip() - if not cell_text: - continue - - is_bracketed = ( - cell_text.startswith('[') and cell_text.endswith(']') - ) - - if is_bracketed: - if not _text_has_garbled_ipa(cell_text): - continue - if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text): - continue - else: - content_cells_in_row = [ - c for c in row_cells - if c.get("col_type", "").startswith("column_") - and c.get("col_type") != "column_1" - ] - if len(content_cells_in_row) != 1: - continue - if not _text_has_garbled_ipa(cell_text): - continue - if any(c in _REAL_IPA_CHARS for c in cell_text): - continue - _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text) - if len(_words_in_text) >= 3: - continue - - # Find headword in previous row, same column - prev_ri = rows_sorted[idx - 1]["index"] - prev_same_col = [ - c for c in z_cells - if c.get("row_index") == prev_ri - and c.get("col_type") == ct - ] - if not prev_same_col: - continue - prev_text = prev_same_col[0].get("text", "") - fixed = fix_ipa_continuation_cell( - cell_text, prev_text, pronunciation="british", - ) - if fixed != cell_text: - cell["text"] = fixed - ipa_cont_fixed += 1 - logger.info( - "IPA continuation R%d %s: '%s' -> '%s'", - ri, ct, cell_text, fixed, - ) - if ipa_cont_fixed: - logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed) - - return en_col_type, ipa_target_cols, all_content_cols - - -def _extract_page_refs_and_footers( - zones_data: List[Dict[str, Any]], - page_number_info: Optional[Dict], -) -> None: - """Extract page_ref cells and footer rows from content zones. - - Modifies zones_data in place. Updates page_number_info if a page number - footer is found. 
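# Aside (not part of the diff): the ipa_mode -> target-column decision above,
# condensed into a standalone sketch with hypothetical column labels.
# "auto"/"en" target only the detected English column, "de" its complement,
# "all" both:
def ipa_targets(ipa_mode, en_col, content_cols):
    en, de = set(), set()
    if ipa_mode in ("auto", "en"):
        if en_col:
            en.add(en_col)
    elif ipa_mode == "de":
        de = content_cols - {en_col} if en_col else set(content_cols)
    elif ipa_mode == "all":
        if en_col:
            en.add(en_col)
        de = content_cols - en
    return en, de

cols = {"column_1", "column_2", "column_3"}
assert ipa_targets("de", "column_2", cols) == (set(), {"column_1", "column_3"})
assert ipa_targets("all", "column_2", cols) == ({"column_2"}, {"column_1", "column_3"})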
- """ - _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ") - _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$') - _NUMBER_WORDS = { - "one", "two", "three", "four", "five", "six", "seven", - "eight", "nine", "ten", "eleven", "twelve", "thirteen", - "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", - "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", - "seventy", "eighty", "ninety", "hundred", "thousand", "and", - "einhundert", "zweihundert", "dreihundert", "vierhundert", - "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig", - } - - for z in zones_data: - if z.get("zone_type") != "content": - continue - cells = z.get("cells", []) - rows = z.get("rows", []) - if not rows: - continue - - # Extract column_1 cells that look like page references - page_refs = [] - page_ref_cell_ids = set() - for cell in cells: - if cell.get("col_type") != "column_1": - continue - text = (cell.get("text") or "").strip() - if not text: - continue - if not _PAGE_REF_RE.match(text): - continue - page_refs.append({ - "row_index": cell.get("row_index"), - "text": text, - "bbox_pct": cell.get("bbox_pct", {}), - }) - page_ref_cell_ids.add(cell.get("cell_id")) - - # Detect footer: last non-header row if it has only 1 cell - footer_rows = [] - non_header_rows = [r for r in rows if not r.get("is_header")] - if non_header_rows: - last_row = non_header_rows[-1] - last_ri = last_row["index"] - last_cells = [c for c in z["cells"] - if c.get("row_index") == last_ri] - if len(last_cells) == 1: - text = (last_cells[0].get("text") or "").strip() - has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text) - has_commas = ',' in text - text_words = set(text.lower().split()) - is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS) - is_page_number = len(text) <= 20 or is_written_number - if (text and not has_real_ipa and not has_commas - and is_page_number - and last_cells[0].get("col_type") != "heading"): - footer_rows.append({ - "row_index": last_ri, - "text": text, - "bbox_pct": last_cells[0].get("bbox_pct", {}), - }) - - # Classify footer rows - page_number_footers = [] - other_footers = [] - for fr in footer_rows: - ft = fr["text"].strip() - digits = "".join(c for c in ft if c.isdigit()) - if digits and re.match(r'^[\d\s.]+$', ft): - page_number_footers.append(fr) - elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS): - page_number_footers.append(fr) - else: - other_footers.append(fr) - - # Remove page-number footer rows from grid entirely - if page_number_footers: - pn_ris = {fr["row_index"] for fr in page_number_footers} - z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris] - z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris] - pn_text = page_number_footers[0]["text"].strip() - pn_digits = "".join(c for c in pn_text if c.isdigit()) - if not page_number_info: - page_number_info = { - "text": pn_text, - "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95), - } - if pn_digits: - page_number_info["number"] = int(pn_digits) - - # Mark remaining footer rows - if other_footers: - footer_ris = {fr["row_index"] for fr in other_footers} - for r in z["rows"]: - if r["index"] in footer_ris: - r["is_footer"] = True - for c in z["cells"]: - if c.get("row_index") in footer_ris: - c["col_type"] = "footer" - - if page_refs or footer_rows: - logger.info( - "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d", - len(page_refs), len(footer_rows), len(page_number_footers), - z.get("zone_index", 0), - ) - - if 
page_refs: - z["page_refs"] = page_refs - if other_footers: - z["footer"] = other_footers - - -def _convert_slash_ipa( - zones_data: List[Dict[str, Any]], - skip_ipa: bool, - en_col_type: Optional[str], -) -> None: - """Convert slash-delimited IPA to bracket notation. - - Dictionary-style pages print IPA between slashes: "tiger /'taiga/" - """ - _SLASH_IPA_RE = re.compile( - r'(\b[a-zA-Z]+[²³¹]?)\s*' # headword (capture group 1) - r"(/[^/]{2,}/)" # /ipa/ (capture group 2), min 2 chars - ) - _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/') - _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]') - slash_ipa_fixed = 0 - - for z in ([] if skip_ipa else zones_data): - for cell in z.get("cells", []): - if en_col_type and cell.get("col_type") != en_col_type: - continue - text = cell.get("text", "") - if "/" not in text: - continue - - def _replace_slash_ipa(m: re.Match) -> str: - nonlocal slash_ipa_fixed - headword = m.group(1) - ocr_ipa = m.group(2) - inner_raw = ocr_ipa.strip("/").strip() - if _SLASH_IPA_REJECT_RE.search(inner_raw): - return m.group(0) - clean_hw = re.sub(r'[²³¹\d]', '', headword).strip() - ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None - if ipa: - slash_ipa_fixed += 1 - return f"{headword} [{ipa}]" - inner = inner_raw.lstrip("'").strip() - if inner: - slash_ipa_fixed += 1 - return f"{headword} [{inner}]" - return m.group(0) - - new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text) - - _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)') - - def _replace_trailing_slash(m: re.Match) -> str: - nonlocal slash_ipa_fixed - inner = m.group(1).strip("/").strip().lstrip("'").strip() - if _SLASH_IPA_REJECT_RE.search(inner): - return m.group(0) - if inner: - slash_ipa_fixed += 1 - return f" [{inner}]" - return m.group(0) - new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text) - - if new_text == text: - m = _STANDALONE_SLASH_IPA_RE.match(text) - if m: - inner = m.group(1).strip() - if not _SLASH_IPA_REJECT_RE.search(inner): - inner = inner.lstrip("'").strip() - if inner: - new_text = "[" + inner + "]" + text[m.end():] - slash_ipa_fixed += 1 - - if new_text != text: - cell["text"] = new_text - - if slash_ipa_fixed: - logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed) +# Backward-compat shim -- module moved to grid/build/text_ops.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.text_ops") diff --git a/klausur-service/backend/grid_build_zones.py b/klausur-service/backend/grid_build_zones.py index 36b22d5..3a51a62 100644 --- a/klausur-service/backend/grid_build_zones.py +++ b/klausur-service/backend/grid_build_zones.py @@ -1,462 +1,4 @@ -""" -Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone -detection and zone-aware grid building. - -Extracted from grid_build_core.py for maintainability. 
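# Illustration (not part of the diff) of the slash-IPA rewrite in
# _convert_slash_ipa() above, reduced to the fallback path that keeps the
# OCR'd transcription (minus a leading apostrophe) when no dictionary entry
# is found; the real code tries _lookup_ipa() first and applies extra
# reject rules:
import re

SLASH_IPA = re.compile(r"(\b[a-zA-Z]+[²³¹]?)\s*(/[^/]{2,}/)")

def _demo_replace(m):
    inner = m.group(2).strip("/").strip().lstrip("'").strip()
    return f"{m.group(1)} [{inner}]" if inner else m.group(0)

assert SLASH_IPA.sub(_demo_replace, "tiger /'taiga/") == "tiger [taiga]"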
-""" - -import logging -from typing import Any, Dict, List, Optional - -import cv2 -import numpy as np - -from cv_box_detect import detect_boxes, split_page_into_zones -from cv_graphic_detect import detect_graphic_elements -from cv_color_detect import recover_colored_text -from cv_vocab_types import PageZone -from ocr_pipeline_session_store import get_session_image - -from grid_editor_helpers import ( - _filter_border_strip_words, - _filter_border_ghosts, - _words_in_zone, - _PIPE_RE_VSPLIT, - _detect_vertical_dividers, - _split_zone_at_vertical_dividers, - _merge_content_zones_across_boxes, - _build_zone_grid, -) - -logger = logging.getLogger(__name__) - - -async def _build_zones( - session_id: str, - session: dict, - all_words: List[Dict[str, Any]], - graphic_rects: List[Dict[str, int]], - content_x: int, - content_y: int, - content_w: int, - content_h: int, - img_w: int, - img_h: int, -) -> Dict[str, Any]: - """Load image, detect graphics/boxes, build zone-aware grids. - - Returns a dict with keys: - zones_data, boxes_detected, recovered_count, border_prefiltered, - img_bgr, all_words (modified in-place but returned for clarity). - """ - zones_data: List[Dict[str, Any]] = [] - boxes_detected = 0 - recovered_count = 0 - border_prefiltered = False - img_bgr = None - - # 3. Load image for box detection - img_png = await get_session_image(session_id, "cropped") - if not img_png: - img_png = await get_session_image(session_id, "dewarped") - if not img_png: - img_png = await get_session_image(session_id, "original") - - if img_png: - # Decode image for color detection + box detection - arr = np.frombuffer(img_png, dtype=np.uint8) - img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR) - - if img_bgr is not None: - # --- 3a. Detect graphic/image regions via CV and hard-filter --- - sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3] - fresh_graphics = detect_graphic_elements(img_bgr, sig_words) - if fresh_graphics: - fresh_rects = [ - {"x": g.x, "y": g.y, "w": g.width, "h": g.height} - for g in fresh_graphics - ] - graphic_rects.extend(fresh_rects) - logger.info( - "build-grid session %s: detected %d graphic region(s) via CV", - session_id, len(fresh_graphics), - ) - # Hard-filter words inside newly detected graphic regions - before = len(all_words) - all_words[:] = [ - w for w in all_words - if not any( - gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in fresh_rects - ) - ] - removed = before - len(all_words) - if removed: - logger.info( - "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)", - session_id, removed, len(fresh_rects), - ) - - # --- Recover colored text that OCR missed (before grid building) --- - recovered = recover_colored_text(img_bgr, all_words) - if recovered and graphic_rects: - # Filter recovered chars inside graphic regions - recovered = [ - r for r in recovered - if not any( - gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"] - and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"] - for gr in graphic_rects - ) - ] - if recovered: - recovered_count = len(recovered) - all_words.extend(recovered) - logger.info( - "build-grid session %s: +%d recovered colored words", - session_id, recovered_count, - ) - - # Detect bordered boxes - boxes = detect_boxes( - img_bgr, - content_x=content_x, - content_w=content_w, - content_y=content_y, - content_h=content_h, - ) - boxes_detected = len(boxes) - - if 
boxes: - # Filter border ghost words before grid building - all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes) - if ghost_count: - all_words[:] = all_words_new - logger.info( - "build-grid session %s: removed %d border ghost words", - session_id, ghost_count, - ) - - # Split page into zones - page_zones = split_page_into_zones( - content_x, content_y, content_w, content_h, boxes - ) - - # Merge content zones separated by box zones - page_zones = _merge_content_zones_across_boxes( - page_zones, content_x, content_w - ) - - # 3b. Detect vertical dividers and split content zones - page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers( - page_zones, all_words - ) - - # --- First pass: build grids per zone independently --- - zone_grids = _build_grids_per_zone( - page_zones, all_words, img_w, img_h - ) - border_prefiltered = border_prefiltered or any( - zg.get("_border_prefiltered") for zg in zone_grids - ) - - # --- Second pass: merge column boundaries from all content zones --- - _merge_content_zone_columns( - zone_grids, all_words, content_w, img_w, img_h, session_id - ) - - # --- Build zones_data from zone_grids --- - for zg in zone_grids: - pz = zg["pz"] - grid = zg["grid"] - grid.pop("_raw_columns", None) - - zone_entry: Dict[str, Any] = { - "zone_index": pz.index, - "zone_type": pz.zone_type, - "bbox_px": { - "x": pz.x, "y": pz.y, - "w": pz.width, "h": pz.height, - }, - "bbox_pct": { - "x": round(pz.x / img_w * 100, 2) if img_w else 0, - "y": round(pz.y / img_h * 100, 2) if img_h else 0, - "w": round(pz.width / img_w * 100, 2) if img_w else 0, - "h": round(pz.height / img_h * 100, 2) if img_h else 0, - }, - "border": None, - "word_count": len(zg["words"]), - **grid, - } - - if pz.box: - zone_entry["border"] = { - "thickness": pz.box.border_thickness, - "confidence": pz.box.confidence, - } - - if pz.image_overlays: - zone_entry["image_overlays"] = pz.image_overlays - - if pz.layout_hint: - zone_entry["layout_hint"] = pz.layout_hint - if pz.vsplit_group is not None: - zone_entry["vsplit_group"] = pz.vsplit_group - - zones_data.append(zone_entry) - - # 4. 
Fallback: no boxes detected -> single zone with all words - if not zones_data: - before = len(all_words) - filtered_words = [ - w for w in all_words - if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2) - ] - removed = before - len(filtered_words) - if removed: - logger.info( - "build-grid session %s: filtered %d recovered artifacts (fallback zone)", - session_id, removed, - ) - filtered_words, bs_removed = _filter_border_strip_words(filtered_words) - if bs_removed: - border_prefiltered = True - logger.info( - "build-grid session %s: pre-filtered %d border-strip words", - session_id, bs_removed, - ) - grid = _build_zone_grid( - filtered_words, content_x, content_y, content_w, content_h, - 0, img_w, img_h, - ) - grid.pop("_raw_columns", None) - zones_data.append({ - "zone_index": 0, - "zone_type": "content", - "bbox_px": { - "x": content_x, "y": content_y, - "w": content_w, "h": content_h, - }, - "bbox_pct": { - "x": round(content_x / img_w * 100, 2) if img_w else 0, - "y": round(content_y / img_h * 100, 2) if img_h else 0, - "w": round(content_w / img_w * 100, 2) if img_w else 0, - "h": round(content_h / img_h * 100, 2) if img_h else 0, - }, - "border": None, - "word_count": len(all_words), - **grid, - }) - - return { - "zones_data": zones_data, - "boxes_detected": boxes_detected, - "recovered_count": recovered_count, - "border_prefiltered": border_prefiltered, - "img_bgr": img_bgr, - } - - -def _detect_and_split_vertical_dividers( - page_zones: List[PageZone], - all_words: List[Dict[str, Any]], -) -> tuple: - """Detect vertical dividers and split content zones. - - Returns (expanded_zones, border_prefiltered_from_vsplit). - """ - vsplit_group_counter = 0 - expanded_zones: List = [] - for pz in page_zones: - if pz.zone_type != "content": - expanded_zones.append(pz) - continue - zone_words = _words_in_zone( - all_words, pz.y, pz.height, pz.x, pz.width - ) - divider_xs = _detect_vertical_dividers( - zone_words, pz.x, pz.width, pz.y, pz.height - ) - if divider_xs: - sub_zones = _split_zone_at_vertical_dividers( - pz, divider_xs, vsplit_group_counter - ) - expanded_zones.extend(sub_zones) - vsplit_group_counter += 1 - # Remove pipe words so they don't appear in sub-zones - pipe_ids = set( - id(w) for w in zone_words - if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) - ) - all_words[:] = [w for w in all_words if id(w) not in pipe_ids] - logger.info( - "build-grid: vertical split zone %d at x=%s -> %d sub-zones", - pz.index, [int(x) for x in divider_xs], len(sub_zones), - ) - else: - expanded_zones.append(pz) - # Re-index zones - for i, pz in enumerate(expanded_zones): - pz.index = i - return expanded_zones, False - - -def _build_grids_per_zone( - page_zones: List[PageZone], - all_words: List[Dict[str, Any]], - img_w: int, - img_h: int, -) -> List[Dict[str, Any]]: - """Build grids for each zone independently (first pass).""" - zone_grids: List[Dict] = [] - - for pz in page_zones: - zone_words = _words_in_zone( - all_words, pz.y, pz.height, pz.x, pz.width - ) - if pz.zone_type == "content": - logger.info( - "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words", - pz.index, pz.zone_type, - pz.x, pz.x + pz.width, pz.y, pz.y + pz.height, - len(zone_words), len(all_words), - ) - # Filter recovered single-char artifacts in ALL zones - before = len(zone_words) - zone_words = [ - w for w in zone_words - if not ( - w.get("recovered") - and len(w.get("text", "").strip()) <= 2 - ) - ] - removed = before - len(zone_words) - if removed: - logger.info( - "build-grid: 
filtered %d recovered artifacts from %s zone %d", - removed, pz.zone_type, pz.index, - ) - # Filter words inside image overlay regions (merged box zones) - if pz.image_overlays: - before_ov = len(zone_words) - zone_words = [ - w for w in zone_words - if not any( - ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"] - and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"] - for ov in pz.image_overlays - ) - ] - ov_removed = before_ov - len(zone_words) - if ov_removed: - logger.info( - "build-grid: filtered %d words inside image overlays from zone %d", - ov_removed, pz.index, - ) - zone_words, bs_removed = _filter_border_strip_words(zone_words) - bp = False - if bs_removed: - bp = True - logger.info( - "build-grid: pre-filtered %d border-strip words from zone %d", - bs_removed, pz.index, - ) - grid = _build_zone_grid( - zone_words, pz.x, pz.y, pz.width, pz.height, - pz.index, img_w, img_h, - skip_first_row_header=bool(pz.image_overlays), - ) - zone_grids.append({ - "pz": pz, "words": zone_words, "grid": grid, - "_border_prefiltered": bp, - }) - - return zone_grids - - -def _merge_content_zone_columns( - zone_grids: List[Dict[str, Any]], - all_words: List[Dict[str, Any]], - content_w: int, - img_w: int, - img_h: int, - session_id: str, -) -> None: - """Second pass: merge column boundaries from all content zones. - - Modifies zone_grids in place. - """ - content_zones = [ - zg for zg in zone_grids - if zg["pz"].zone_type == "content" - and zg["pz"].vsplit_group is None - ] - if len(content_zones) <= 1: - return - - # Collect column split points (x_min of non-first columns) - all_split_xs: List[float] = [] - for zg in content_zones: - raw_cols = zg["grid"].get("_raw_columns", []) - for col in raw_cols[1:]: - all_split_xs.append(col["x_min"]) - - if not all_split_xs: - return - - all_split_xs.sort() - merge_distance = max(25, int(content_w * 0.03)) - merged_xs = [all_split_xs[0]] - for x in all_split_xs[1:]: - if x - merged_xs[-1] < merge_distance: - merged_xs[-1] = (merged_xs[-1] + x) / 2 - else: - merged_xs.append(x) - - total_cols = len(merged_xs) + 1 - max_zone_cols = max( - len(zg["grid"].get("_raw_columns", [])) - for zg in content_zones - ) - - if total_cols < max_zone_cols: - return - - cx_min = min(w["left"] for w in all_words) - cx_max = max(w["left"] + w["width"] for w in all_words) - merged_columns: List[Dict[str, Any]] = [] - prev_x = cx_min - for i, sx in enumerate(merged_xs): - merged_columns.append({ - "index": i, - "type": f"column_{i + 1}", - "x_min": prev_x, - "x_max": sx, - }) - prev_x = sx - merged_columns.append({ - "index": len(merged_xs), - "type": f"column_{len(merged_xs) + 1}", - "x_min": prev_x, - "x_max": cx_max, - }) - - # Re-build ALL content zones with merged columns - for zg in zone_grids: - pz = zg["pz"] - if pz.zone_type == "content": - grid = _build_zone_grid( - zg["words"], pz.x, pz.y, - pz.width, pz.height, - pz.index, img_w, img_h, - global_columns=merged_columns, - skip_first_row_header=bool(pz.image_overlays), - ) - zg["grid"] = grid - logger.info( - "build-grid session %s: union of %d content " - "zones -> %d merged columns (max single zone: %d)", - session_id, len(content_zones), - total_cols, max_zone_cols, - ) +# Backward-compat shim -- module moved to grid/build/zones.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.build.zones") diff --git a/klausur-service/backend/grid_editor_api.py b/klausur-service/backend/grid_editor_api.py index 62ecaec..822f504 100644 --- 
a/klausur-service/backend/grid_editor_api.py +++ b/klausur-service/backend/grid_editor_api.py @@ -1,31 +1,4 @@ -""" -Grid Editor API — barrel re-export. - -The actual endpoints live in: - - grid_editor_api_grid.py (build-grid, rerun-ocr, save-grid, get-grid) - - grid_editor_api_gutter.py (gutter-repair, gutter-repair/apply) - - grid_editor_api_box.py (build-box-grids) - - grid_editor_api_unified.py (build-unified-grid, unified-grid) - -This module re-exports the combined router and key symbols so that -existing `from grid_editor_api import router` / `from grid_editor_api import _build_grid_core` -continue to work unchanged. -""" - -from fastapi import APIRouter - -from grid_editor_api_grid import router as _grid_router -from grid_editor_api_gutter import router as _gutter_router -from grid_editor_api_box import router as _box_router -from grid_editor_api_unified import router as _unified_router - -# Re-export _build_grid_core so callers that do -# `from grid_editor_api import _build_grid_core` keep working. -from grid_build_core import _build_grid_core # noqa: F401 - -# Merge all sub-routers into one combined router -router = APIRouter() -router.include_router(_grid_router) -router.include_router(_gutter_router) -router.include_router(_box_router) -router.include_router(_unified_router) +# Backward-compat shim -- module moved to grid/editor/api.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.api") diff --git a/klausur-service/backend/grid_editor_api_box.py b/klausur-service/backend/grid_editor_api_box.py index cbe3c75..3fbd259 100644 --- a/klausur-service/backend/grid_editor_api_box.py +++ b/klausur-service/backend/grid_editor_api_box.py @@ -1,177 +1,4 @@ -""" -Grid Editor API — box-grid-review endpoints. -""" - -import logging - -from fastapi import APIRouter, HTTPException, Request - -from grid_editor_helpers import _words_in_zone -from ocr_pipeline_session_store import ( - get_session_db, - update_session_db, -) - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) - - -@router.post("/sessions/{session_id}/build-box-grids") -async def build_box_grids(session_id: str, request: Request): - """Rebuild grid structure for all detected boxes with layout-aware detection. - - Uses structure_result.boxes (from Step 7) as the source of box coordinates, - and raw_paddle_words as OCR word source. Creates or updates box zones in - the grid_editor_result. - - Optional body: { "overrides": { "0": "bullet_list" } } - Maps box_index -> forced layout_type. - """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - grid_data = session.get("grid_editor_result") - if not grid_data: - raise HTTPException(status_code=400, detail="No grid data. 
Run build-grid first.") - - # Get raw OCR words (with top/left/width/height keys) - word_result = session.get("word_result") or {} - all_words = word_result.get("raw_paddle_words") or word_result.get("raw_tesseract_words") or [] - if not all_words: - raise HTTPException(status_code=400, detail="No raw OCR words available.") - - # Get detected boxes from structure_result - structure_result = session.get("structure_result") or {} - gt = session.get("ground_truth") or {} - if not structure_result: - structure_result = gt.get("structure_result") or {} - detected_boxes = structure_result.get("boxes") or [] - if not detected_boxes: - return {"session_id": session_id, "box_zones_rebuilt": 0, "spell_fixes": 0, "message": "No boxes detected"} - - # Filter out false-positive boxes in header/footer margins. - img_h_for_filter = grid_data.get("image_height", 0) or word_result.get("image_height", 0) - if img_h_for_filter > 0: - margin_frac = 0.07 # 7% of image height - margin_top = img_h_for_filter * margin_frac - margin_bottom = img_h_for_filter * (1 - margin_frac) - filtered = [] - for box in detected_boxes: - by = box.get("y", 0) - bh = box.get("h", 0) - box_center_y = by + bh / 2 - if box_center_y < margin_top or box_center_y > margin_bottom: - logger.info("build-box-grids: skipping header/footer box at y=%d h=%d (center=%.0f, margins=%.0f/%.0f)", - by, bh, box_center_y, margin_top, margin_bottom) - continue - filtered.append(box) - detected_boxes = filtered - - body = {} - try: - body = await request.json() - except Exception: - pass - layout_overrides = body.get("overrides", {}) - - from cv_box_layout import build_box_zone_grid - - img_w = grid_data.get("image_width", 0) or word_result.get("image_width", 0) - img_h = grid_data.get("image_height", 0) or word_result.get("image_height", 0) - - zones = grid_data.get("zones", []) - - # Find highest existing zone_index - max_zone_idx = max((z.get("zone_index", 0) for z in zones), default=-1) - - # Remove old box zones (we'll rebuild them) - zones = [z for z in zones if z.get("zone_type") != "box"] - - box_count = 0 - spell_fixes = 0 - - for box_idx, box in enumerate(detected_boxes): - bx = box.get("x", 0) - by = box.get("y", 0) - bw = box.get("w", 0) - bh = box.get("h", 0) - - if bw <= 0 or bh <= 0: - continue - - # Filter raw OCR words inside this box - zone_words = _words_in_zone(all_words, by, bh, bx, bw) - if not zone_words: - logger.info("Box %d: no words found in bbox (%d,%d,%d,%d)", box_idx, bx, by, bw, bh) - continue - - zone_idx = max_zone_idx + 1 + box_idx - forced_layout = layout_overrides.get(str(box_idx)) - - # Build box grid - box_grid = build_box_zone_grid( - zone_words, bx, by, bw, bh, - zone_idx, img_w, img_h, - layout_type=forced_layout, - ) - - # Apply SmartSpellChecker to all box cells - try: - from smart_spell import SmartSpellChecker - ssc = SmartSpellChecker() - for cell in box_grid.get("cells", []): - text = cell.get("text", "") - if not text: - continue - result = ssc.correct_text(text, lang="auto") - if result.changed: - cell["text"] = result.corrected - spell_fixes += 1 - except ImportError: - pass - - # Build zone entry - zone_entry = { - "zone_index": zone_idx, - "zone_type": "box", - "bbox_px": {"x": bx, "y": by, "w": bw, "h": bh}, - "bbox_pct": { - "x": round(bx / img_w * 100, 2) if img_w else 0, - "y": round(by / img_h * 100, 2) if img_h else 0, - "w": round(bw / img_w * 100, 2) if img_w else 0, - "h": round(bh / img_h * 100, 2) if img_h else 0, - }, - "border": None, - "word_count": len(zone_words), - "columns": 
box_grid["columns"], - "rows": box_grid["rows"], - "cells": box_grid["cells"], - "header_rows": box_grid.get("header_rows", []), - "box_layout_type": box_grid.get("box_layout_type", "flowing"), - "box_grid_reviewed": False, - "box_bg_color": box.get("bg_color_name", ""), - "box_bg_hex": box.get("bg_color_hex", ""), - } - zones.append(zone_entry) - box_count += 1 - - # Sort zones by y-position for correct reading order - zones.sort(key=lambda z: z.get("bbox_px", {}).get("y", 0)) - - grid_data["zones"] = zones - await update_session_db(session_id, grid_editor_result=grid_data) - - logger.info( - "build-box-grids session %s: %d boxes processed (%d words spell-fixed) from %d detected", - session_id, box_count, spell_fixes, len(detected_boxes), - ) - - return { - "session_id": session_id, - "box_zones_rebuilt": box_count, - "total_detected_boxes": len(detected_boxes), - "spell_fixes": spell_fixes, - "zones": zones, - } +# Backward-compat shim -- module moved to grid/editor/api_box.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.api_box") diff --git a/klausur-service/backend/grid_editor_api_grid.py b/klausur-service/backend/grid_editor_api_grid.py index b75f5ac..39ba7bc 100644 --- a/klausur-service/backend/grid_editor_api_grid.py +++ b/klausur-service/backend/grid_editor_api_grid.py @@ -1,337 +1,4 @@ -""" -Grid Editor API — grid build, save, and retrieve endpoints. -""" - -import logging -import time -from typing import Any, Dict - -from fastapi import APIRouter, HTTPException, Query, Request - -from grid_build_core import _build_grid_core -from ocr_pipeline_session_store import ( - get_session_db, - update_session_db, -) -from ocr_pipeline_common import ( - _cache, - _load_session_to_cache, - _get_cached, -) - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) - - -@router.post("/sessions/{session_id}/build-grid") -async def build_grid( - session_id: str, - ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), - syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), - enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"), - max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"), - min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"), -): - """Build a structured, zone-aware grid from existing Kombi word results. - - Requires that paddle-kombi or rapid-kombi has already been run on the session. - Uses the image for box detection and the word positions for grid structuring. - - Query params: - ipa_mode: "auto" (only when English IPA detected), "all" (force), "none" (skip) - syllable_mode: "auto" (only when original has dividers), "all" (force), "none" (skip) - - Returns a StructuredGrid with zones, each containing their own - columns, rows, and cells — ready for the frontend Excel-like editor. 
- """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - try: - result = await _build_grid_core( - session_id, session, - ipa_mode=ipa_mode, syllable_mode=syllable_mode, - enhance=enhance, - max_columns=max_cols if max_cols > 0 else None, - min_conf=min_conf if min_conf > 0 else None, - ) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - - # Save automatic grid snapshot for later comparison with manual corrections - # Lazy import to avoid circular dependency with ocr_pipeline_regression - from ocr_pipeline_regression import _build_reference_snapshot - - wr = session.get("word_result") or {} - engine = wr.get("ocr_engine", "") - if engine in ("kombi", "rapid_kombi"): - auto_pipeline = "kombi" - elif engine == "paddle_direct": - auto_pipeline = "paddle-direct" - else: - auto_pipeline = "pipeline" - auto_snapshot = _build_reference_snapshot(result, pipeline=auto_pipeline) - - gt = session.get("ground_truth") or {} - gt["auto_grid_snapshot"] = auto_snapshot - - # Persist to DB and advance current_step to 11 (reconstruction complete) - await update_session_db(session_id, grid_editor_result=result, ground_truth=gt, current_step=11) - - logger.info( - "build-grid session %s: %d zones, %d cols, %d rows, %d cells, " - "%d boxes in %.2fs", - session_id, - len(result.get("zones", [])), - result.get("summary", {}).get("total_columns", 0), - result.get("summary", {}).get("total_rows", 0), - result.get("summary", {}).get("total_cells", 0), - result.get("boxes_detected", 0), - result.get("duration_seconds", 0), - ) - - return result - - -@router.post("/sessions/{session_id}/rerun-ocr-and-build-grid") -async def rerun_ocr_and_build_grid( - session_id: str, - ipa_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), - syllable_mode: str = Query("auto", pattern="^(auto|all|de|en|none)$"), - enhance: bool = Query(True, description="Step 3: CLAHE + denoise for degraded scans"), - max_cols: int = Query(0, description="Step 2: Max column count (0=unlimited)"), - min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto)"), - vision_fusion: bool = Query(False, description="Step 4: Vision-LLM fusion for degraded scans"), - doc_category: str = Query("", description="Document type for Vision-LLM prompt context"), -): - """Re-run OCR with quality settings, then rebuild the grid. - - Unlike build-grid (which only rebuilds from existing words), - this endpoint re-runs the full OCR pipeline on the cropped image - with optional CLAHE enhancement, then builds the grid. - - Steps executed: Image Enhancement -> OCR -> Grid Build - """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - import time as _time - t0 = _time.time() - - # 1. Load the cropped/dewarped image from cache or session - if session_id not in _cache: - await _load_session_to_cache(session_id) - cached = _get_cached(session_id) - - dewarped_bgr = cached.get("cropped_bgr") if cached.get("cropped_bgr") is not None else cached.get("dewarped_bgr") - if dewarped_bgr is None: - raise HTTPException(status_code=400, detail="No cropped/dewarped image available. Run preprocessing steps first.") - - import numpy as np - img_h, img_w = dewarped_bgr.shape[:2] - ocr_input = dewarped_bgr.copy() - - # 2. 
Scan quality assessment - scan_quality_info = {} - try: - from scan_quality import score_scan_quality - quality_report = score_scan_quality(ocr_input) - scan_quality_info = quality_report.to_dict() - actual_min_conf = min_conf if min_conf > 0 else quality_report.recommended_min_conf - except Exception as e: - logger.warning(f"rerun-ocr: scan quality failed: {e}") - actual_min_conf = min_conf if min_conf > 0 else 40 - - # 3. Image enhancement (Step 3) - is_degraded = scan_quality_info.get("is_degraded", False) - if enhance and is_degraded: - try: - from ocr_image_enhance import enhance_for_ocr - ocr_input = enhance_for_ocr(ocr_input, is_degraded=True) - logger.info("rerun-ocr: CLAHE enhancement applied") - except Exception as e: - logger.warning(f"rerun-ocr: enhancement failed: {e}") - - # 4. Run dual-engine OCR - from PIL import Image - import pytesseract - - # RapidOCR - rapid_words = [] - try: - from cv_ocr_engines import ocr_region_rapid - from cv_vocab_types import PageRegion - full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h) - rapid_words = ocr_region_rapid(ocr_input, full_region) or [] - except Exception as e: - logger.warning(f"rerun-ocr: RapidOCR failed: {e}") - - # Tesseract - pil_img = Image.fromarray(ocr_input[:, :, ::-1]) - data = pytesseract.image_to_data(pil_img, lang='eng+deu', config='--psm 6 --oem 3', output_type=pytesseract.Output.DICT) - tess_words = [] - for i in range(len(data["text"])): - text = (data["text"][i] or "").strip() - conf_raw = str(data["conf"][i]) - conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1 - if not text or conf < actual_min_conf: - continue - tess_words.append({ - "text": text, "left": data["left"][i], "top": data["top"][i], - "width": data["width"][i], "height": data["height"][i], "conf": conf, - }) - - # 5. Merge OCR results - from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words - rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else [] - if rapid_split or tess_words: - merged_words = _merge_paddle_tesseract(rapid_split, tess_words) - merged_words = _deduplicate_words(merged_words) - else: - merged_words = tess_words - - # 6. Store updated word_result in session - cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"], - "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)} - for w in merged_words] - word_result = { - "cells": [{"text": " ".join(w["text"] for w in merged_words), - "word_boxes": cells_for_storage}], - "image_width": img_w, - "image_height": img_h, - "ocr_engine": "rapid_kombi", - "word_count": len(merged_words), - "raw_paddle_words": rapid_words, - } - # 6b. 
Vision-LLM Fusion (Step 4) — correct OCR using Vision model - vision_applied = False - if vision_fusion: - try: - from vision_ocr_fusion import vision_fuse_ocr - category = doc_category or session.get("document_category") or "vokabelseite" - logger.info(f"rerun-ocr: running Vision-LLM fusion (category={category})") - merged_words = await vision_fuse_ocr(ocr_input, merged_words, category) - vision_applied = True - # Rebuild storage from fused words - cells_for_storage = [{"text": w["text"], "left": w["left"], "top": w["top"], - "width": w["width"], "height": w["height"], "conf": w.get("conf", 0)} - for w in merged_words] - word_result["cells"] = [{"text": " ".join(w["text"] for w in merged_words), - "word_boxes": cells_for_storage}] - word_result["word_count"] = len(merged_words) - word_result["ocr_engine"] = "vision_fusion" - except Exception as e: - logger.warning(f"rerun-ocr: Vision-LLM fusion failed: {e}") - - await update_session_db(session_id, word_result=word_result) - - # Reload session with updated word_result - session = await get_session_db(session_id) - - ocr_duration = _time.time() - t0 - logger.info( - "rerun-ocr session %s: %d words (rapid=%d, tess=%d, merged=%d) in %.1fs " - "(enhance=%s, min_conf=%d, quality=%s)", - session_id, len(merged_words), len(rapid_words), len(tess_words), - len(merged_words), ocr_duration, enhance, actual_min_conf, - scan_quality_info.get("quality_pct", "?"), - ) - - # 7. Build grid from new words - try: - result = await _build_grid_core( - session_id, session, - ipa_mode=ipa_mode, syllable_mode=syllable_mode, - enhance=enhance, - max_columns=max_cols if max_cols > 0 else None, - min_conf=min_conf if min_conf > 0 else None, - ) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - - # Persist grid - await update_session_db(session_id, grid_editor_result=result, current_step=11) - - # Add quality info to response - result["scan_quality"] = scan_quality_info - result["ocr_stats"] = { - "rapid_words": len(rapid_words), - "tess_words": len(tess_words), - "merged_words": len(merged_words), - "min_conf_used": actual_min_conf, - "enhance_applied": enhance and is_degraded, - "vision_fusion_applied": vision_applied, - "document_category": doc_category or session.get("document_category", ""), - "ocr_duration_seconds": round(ocr_duration, 1), - } - - total_duration = _time.time() - t0 - logger.info( - "rerun-ocr+build-grid session %s: %d zones, %d cols, %d cells in %.1fs", - session_id, - len(result.get("zones", [])), - result.get("summary", {}).get("total_columns", 0), - result.get("summary", {}).get("total_cells", 0), - total_duration, - ) - - return result - - -@router.post("/sessions/{session_id}/save-grid") -async def save_grid(session_id: str, request: Request): - """Save edited grid data from the frontend Excel-like editor. - - Receives the full StructuredGrid with user edits (text changes, - formatting changes like bold columns, header rows, etc.) and - persists it to the session's grid_editor_result. 
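-
-    Example body (illustrative values; only "zones" is required — the
-    other keys fall back to values from the original build):
-
-        {
-            "zones": [...],
-            "image_width": 2480,
-            "image_height": 3508,
-            "summary": {"total_cells": 118},
-            "formatting": {}
-        }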
- """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - body = await request.json() - - # Validate basic structure - if "zones" not in body: - raise HTTPException(status_code=400, detail="Missing 'zones' in request body") - - # Preserve metadata from the original build - existing = session.get("grid_editor_result") or {} - result = { - "session_id": session_id, - "image_width": body.get("image_width", existing.get("image_width", 0)), - "image_height": body.get("image_height", existing.get("image_height", 0)), - "zones": body["zones"], - "boxes_detected": body.get("boxes_detected", existing.get("boxes_detected", 0)), - "summary": body.get("summary", existing.get("summary", {})), - "formatting": body.get("formatting", existing.get("formatting", {})), - "duration_seconds": existing.get("duration_seconds", 0), - "edited": True, - } - - await update_session_db(session_id, grid_editor_result=result, current_step=11) - - logger.info("save-grid session %s: %d zones saved", session_id, len(body["zones"])) - - return {"session_id": session_id, "saved": True} - - -@router.get("/sessions/{session_id}/grid-editor") -async def get_grid(session_id: str): - """Retrieve the current grid editor state for a session.""" - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - result = session.get("grid_editor_result") - if not result: - raise HTTPException( - status_code=404, - detail="No grid editor data. Run build-grid first.", - ) - - return result +# Backward-compat shim -- module moved to grid/editor/api_grid.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.api_grid") diff --git a/klausur-service/backend/grid_editor_api_gutter.py b/klausur-service/backend/grid_editor_api_gutter.py index 7dfbd9f..9b84c2c 100644 --- a/klausur-service/backend/grid_editor_api_gutter.py +++ b/klausur-service/backend/grid_editor_api_gutter.py @@ -1,110 +1,4 @@ -""" -Grid Editor API — gutter repair endpoints. -""" - -import logging - -from fastapi import APIRouter, HTTPException, Request - -from ocr_pipeline_session_store import ( - get_session_db, - update_session_db, -) - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) - - -@router.post("/sessions/{session_id}/gutter-repair") -async def gutter_repair(session_id: str): - """Analyse grid for gutter-edge OCR errors and return repair suggestions. - - Detects: - - Words truncated/blurred at the book binding (spell_fix) - - Words split across rows with missing hyphen chars (hyphen_join) - """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - grid_data = session.get("grid_editor_result") - if not grid_data: - raise HTTPException( - status_code=400, - detail="No grid data. 
Run build-grid first.", - ) - - from cv_gutter_repair import analyse_grid_for_gutter_repair - - image_width = grid_data.get("image_width", 0) - result = analyse_grid_for_gutter_repair(grid_data, image_width=image_width) - - # Persist suggestions in ground_truth.gutter_repair (avoids DB migration) - gt = session.get("ground_truth") or {} - gt["gutter_repair"] = result - await update_session_db(session_id, ground_truth=gt) - - logger.info( - "gutter-repair session %s: %d suggestions in %.2fs", - session_id, - result.get("stats", {}).get("suggestions_found", 0), - result.get("duration_seconds", 0), - ) - - return result - - -@router.post("/sessions/{session_id}/gutter-repair/apply") -async def gutter_repair_apply(session_id: str, request: Request): - """Apply accepted gutter repair suggestions to the grid. - - Body: { "accepted": ["suggestion_id_1", "suggestion_id_2", ...] } - """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - grid_data = session.get("grid_editor_result") - if not grid_data: - raise HTTPException(status_code=400, detail="No grid data.") - - gt = session.get("ground_truth") or {} - gutter_result = gt.get("gutter_repair") - if not gutter_result: - raise HTTPException( - status_code=400, - detail="No gutter repair data. Run gutter-repair first.", - ) - - body = await request.json() - accepted_ids = body.get("accepted", []) - if not accepted_ids: - return {"applied_count": 0, "changes": []} - - # text_overrides: { suggestion_id: "alternative_text" } - # Allows the user to pick a different correction from the alternatives list - text_overrides = body.get("text_overrides", {}) - - from cv_gutter_repair import apply_gutter_suggestions - - suggestions = gutter_result.get("suggestions", []) - - # Apply user-selected alternatives before passing to apply - for s in suggestions: - sid = s.get("id", "") - if sid in text_overrides and text_overrides[sid]: - s["suggested_text"] = text_overrides[sid] - - result = apply_gutter_suggestions(grid_data, accepted_ids, suggestions) - - # Save updated grid back to session - await update_session_db(session_id, grid_editor_result=grid_data) - - logger.info( - "gutter-repair/apply session %s: %d changes applied", - session_id, - result.get("applied_count", 0), - ) - - return result +# Backward-compat shim -- module moved to grid/editor/api_gutter.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.api_gutter") diff --git a/klausur-service/backend/grid_editor_api_unified.py b/klausur-service/backend/grid_editor_api_unified.py index 9ee83b8..6c6c25f 100644 --- a/klausur-service/backend/grid_editor_api_unified.py +++ b/klausur-service/backend/grid_editor_api_unified.py @@ -1,71 +1,4 @@ -""" -Grid Editor API — unified grid endpoints. -""" - -import logging - -from fastapi import APIRouter, HTTPException - -from ocr_pipeline_session_store import ( - get_session_db, - update_session_db, -) - -logger = logging.getLogger(__name__) - -router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"]) - - -@router.post("/sessions/{session_id}/build-unified-grid") -async def build_unified_grid_endpoint(session_id: str): - """Build a single-zone unified grid merging content + box zones. - - Takes the existing multi-zone grid_editor_result and produces a - unified grid where boxes are integrated into the main row sequence. - Persists as unified_grid_result (preserves original multi-zone data). 
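-
-    Equivalent direct call (illustrative — same arguments the endpoint
-    forwards from the stored grid_editor_result):
-
-        from unified_grid import build_unified_grid
-
-        unified = build_unified_grid(
-            zones=grid_data.get("zones", []),
-            image_width=grid_data.get("image_width", 0),
-            image_height=grid_data.get("image_height", 0),
-            layout_metrics=grid_data.get("layout_metrics", {}),
-        )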
- """ - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - grid_data = session.get("grid_editor_result") - if not grid_data: - raise HTTPException(status_code=400, detail="No grid data. Run build-grid first.") - - from unified_grid import build_unified_grid - - result = build_unified_grid( - zones=grid_data.get("zones", []), - image_width=grid_data.get("image_width", 0), - image_height=grid_data.get("image_height", 0), - layout_metrics=grid_data.get("layout_metrics", {}), - ) - - # Persist as separate field (don't overwrite original multi-zone grid) - await update_session_db(session_id, unified_grid_result=result) - - logger.info( - "build-unified-grid session %s: %d rows, %d cells", - session_id, - result.get("summary", {}).get("total_rows", 0), - result.get("summary", {}).get("total_cells", 0), - ) - - return result - - -@router.get("/sessions/{session_id}/unified-grid") -async def get_unified_grid(session_id: str): - """Retrieve the unified grid for a session.""" - session = await get_session_db(session_id) - if not session: - raise HTTPException(status_code=404, detail=f"Session {session_id} not found") - - result = session.get("unified_grid_result") - if not result: - raise HTTPException( - status_code=404, - detail="No unified grid. Run build-unified-grid first.", - ) - - return result +# Backward-compat shim -- module moved to grid/editor/api_unified.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.api_unified") diff --git a/klausur-service/backend/grid_editor_columns.py b/klausur-service/backend/grid_editor_columns.py index 6731798..f8b8099 100644 --- a/klausur-service/backend/grid_editor_columns.py +++ b/klausur-service/backend/grid_editor_columns.py @@ -1,492 +1,4 @@ -""" -Grid Editor — column detection, cross-column splitting, marker merging. - -Split from grid_editor_helpers.py for maintainability. -All functions are pure computation — no HTTP, DB, or session side effects. - -Lizenz: Apache 2.0 (kommerziell nutzbar) -DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. -""" - -import logging -import re -from typing import Any, Dict, List, Optional - -logger = logging.getLogger(__name__) - - -# --------------------------------------------------------------------------- -# Cross-column word splitting -# --------------------------------------------------------------------------- - -_spell_cache: Optional[Any] = None -_spell_loaded = False - - -def _is_recognized_word(text: str) -> bool: - """Check if *text* is a recognized German or English word. - - Uses the spellchecker library (same as cv_syllable_detect.py). - Returns True for real words like "oder", "Kabel", "Zeitung". - Returns False for OCR merge artifacts like "sichzie", "dasZimmer". - """ - global _spell_cache, _spell_loaded - if not text or len(text) < 2: - return False - - if not _spell_loaded: - _spell_loaded = True - try: - from spellchecker import SpellChecker - _spell_cache = SpellChecker(language="de") - except Exception: - pass - - if _spell_cache is None: - return False - - return text.lower() in _spell_cache - - -def _split_cross_column_words( - words: List[Dict], - columns: List[Dict], -) -> List[Dict]: - """Split word boxes that span across column boundaries. - - When OCR merges adjacent words from different columns (e.g. 
"sichzie" - spanning Col 1 and Col 2, or "dasZimmer" crossing the boundary), - split the word box at the column boundary so each piece is assigned - to the correct column. - - Only splits when: - - The word has significant overlap (>15% of its width) on both sides - - AND the word is not a recognized real word (OCR merge artifact), OR - the word contains a case transition (lowercase->uppercase) near the - boundary indicating two merged words like "dasZimmer". - """ - if len(columns) < 2: - return words - - # Column boundaries = midpoints between adjacent column edges - boundaries = [] - for i in range(len(columns) - 1): - boundary = (columns[i]["x_max"] + columns[i + 1]["x_min"]) / 2 - boundaries.append(boundary) - - new_words: List[Dict] = [] - split_count = 0 - for w in words: - w_left = w["left"] - w_width = w["width"] - w_right = w_left + w_width - text = (w.get("text") or "").strip() - - if not text or len(text) < 4 or w_width < 10: - new_words.append(w) - continue - - # Find the first boundary this word straddles significantly - split_boundary = None - for b in boundaries: - if w_left < b < w_right: - left_part = b - w_left - right_part = w_right - b - # Both sides must have at least 15% of the word width - if left_part > w_width * 0.15 and right_part > w_width * 0.15: - split_boundary = b - break - - if split_boundary is None: - new_words.append(w) - continue - - # Compute approximate split position in the text. - left_width = split_boundary - w_left - split_ratio = left_width / w_width - approx_pos = len(text) * split_ratio - - # Strategy 1: look for a case transition (lowercase->uppercase) near - # the approximate split point — e.g. "dasZimmer" splits at 'Z'. - split_char = None - search_lo = max(1, int(approx_pos) - 3) - search_hi = min(len(text), int(approx_pos) + 2) - for i in range(search_lo, search_hi): - if text[i - 1].islower() and text[i].isupper(): - split_char = i - break - - # Strategy 2: if no case transition, only split if the whole word - # is NOT a real word (i.e. it's an OCR merge artifact like "sichzie"). - # Real words like "oder", "Kabel", "Zeitung" must not be split. - if split_char is None: - clean = re.sub(r"[,;:.!?]+$", "", text) # strip trailing punct - if _is_recognized_word(clean): - new_words.append(w) - continue - # Not a real word — use floor of proportional position - split_char = max(1, min(len(text) - 1, int(approx_pos))) - - left_text = text[:split_char].rstrip() - right_text = text[split_char:].lstrip() - - if len(left_text) < 2 or len(right_text) < 2: - new_words.append(w) - continue - - right_width = w_width - round(left_width) - new_words.append({ - **w, - "text": left_text, - "width": round(left_width), - }) - new_words.append({ - **w, - "text": right_text, - "left": round(split_boundary), - "width": right_width, - }) - split_count += 1 - logger.info( - "split cross-column word %r -> %r + %r at boundary %.0f", - text, left_text, right_text, split_boundary, - ) - - if split_count: - logger.info("split %d cross-column word(s)", split_count) - return new_words - - -def _cluster_columns_by_alignment( - words: List[Dict], - zone_w: int, - rows: List[Dict], -) -> List[Dict[str, Any]]: - """Detect columns by clustering left-edge alignment across rows. - - Hybrid approach: - 1. Group words by row, find "group start" positions within each row - (words preceded by a large gap or first word in row) - 2. Cluster group-start left-edges by X-proximity across rows - 3. Filter by row coverage (how many rows have a group start here) - 4. 
Merge nearby clusters - 5. Build column boundaries - - This filters out mid-phrase word positions (e.g. IPA transcriptions, - second words in multi-word entries) by only considering positions - where a new word group begins within a row. - """ - if not words or not rows: - return [] - - total_rows = len(rows) - if total_rows == 0: - return [] - - # --- Group words by row --- - row_words: Dict[int, List[Dict]] = {} - for w in words: - y_center = w["top"] + w["height"] / 2 - best = min(rows, key=lambda r: abs(r["y_center"] - y_center)) - row_words.setdefault(best["index"], []).append(w) - - # --- Compute adaptive gap threshold for group-start detection --- - all_gaps: List[float] = [] - for ri, rw_list in row_words.items(): - sorted_rw = sorted(rw_list, key=lambda w: w["left"]) - for i in range(len(sorted_rw) - 1): - right = sorted_rw[i]["left"] + sorted_rw[i]["width"] - gap = sorted_rw[i + 1]["left"] - right - if gap > 0: - all_gaps.append(gap) - - if all_gaps: - sorted_gaps = sorted(all_gaps) - median_gap = sorted_gaps[len(sorted_gaps) // 2] - heights = [w["height"] for w in words if w.get("height", 0) > 0] - median_h = sorted(heights)[len(heights) // 2] if heights else 25 - - # For small word counts (boxes, sub-zones): PaddleOCR returns - # multi-word blocks, so ALL inter-word gaps are potential column - # boundaries. Use a low threshold based on word height — any gap - # wider than ~1x median word height is a column separator. - if len(words) <= 60: - gap_threshold = max(median_h * 1.0, 25) - logger.info( - "alignment columns (small zone): gap_threshold=%.0f " - "(median_h=%.0f, %d words, %d gaps: %s)", - gap_threshold, median_h, len(words), len(sorted_gaps), - [int(g) for g in sorted_gaps[:10]], - ) - else: - # Standard approach for large zones (full pages) - gap_threshold = max(median_gap * 3, median_h * 1.5, 30) - # Cap at 25% of zone width - max_gap = zone_w * 0.25 - if gap_threshold > max_gap > 30: - logger.info("alignment columns: capping gap_threshold %.0f -> %.0f (25%% of zone_w=%d)", gap_threshold, max_gap, zone_w) - gap_threshold = max_gap - else: - gap_threshold = 50 - - # --- Find group-start positions (left-edges that begin a new column) --- - start_positions: List[tuple] = [] # (left_edge, row_index) - for ri, rw_list in row_words.items(): - sorted_rw = sorted(rw_list, key=lambda w: w["left"]) - # First word in row is always a group start - start_positions.append((sorted_rw[0]["left"], ri)) - for i in range(1, len(sorted_rw)): - right_prev = sorted_rw[i - 1]["left"] + sorted_rw[i - 1]["width"] - gap = sorted_rw[i]["left"] - right_prev - if gap >= gap_threshold: - start_positions.append((sorted_rw[i]["left"], ri)) - - start_positions.sort(key=lambda x: x[0]) - - logger.info( - "alignment columns: %d group-start positions from %d words " - "(gap_threshold=%.0f, %d rows)", - len(start_positions), len(words), gap_threshold, total_rows, - ) - - if not start_positions: - x_min = min(w["left"] for w in words) - x_max = max(w["left"] + w["width"] for w in words) - return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}] - - # --- Cluster group-start positions by X-proximity --- - tolerance = max(10, int(zone_w * 0.01)) - clusters: List[Dict[str, Any]] = [] - cur_edges = [start_positions[0][0]] - cur_rows = {start_positions[0][1]} - - for left, row_idx in start_positions[1:]: - if left - cur_edges[-1] <= tolerance: - cur_edges.append(left) - cur_rows.add(row_idx) - else: - clusters.append({ - "mean_x": int(sum(cur_edges) / len(cur_edges)), - "min_edge": 
min(cur_edges), - "max_edge": max(cur_edges), - "count": len(cur_edges), - "distinct_rows": len(cur_rows), - "row_coverage": len(cur_rows) / total_rows, - }) - cur_edges = [left] - cur_rows = {row_idx} - clusters.append({ - "mean_x": int(sum(cur_edges) / len(cur_edges)), - "min_edge": min(cur_edges), - "max_edge": max(cur_edges), - "count": len(cur_edges), - "distinct_rows": len(cur_rows), - "row_coverage": len(cur_rows) / total_rows, - }) - - # --- Filter by row coverage --- - # These thresholds must be high enough to avoid false columns in flowing - # text (random inter-word gaps) while still detecting real columns in - # vocabulary worksheets (which typically have >80% row coverage). - MIN_COVERAGE_PRIMARY = 0.35 - MIN_COVERAGE_SECONDARY = 0.12 - MIN_WORDS_SECONDARY = 4 - MIN_DISTINCT_ROWS = 3 - - # Content boundary for left-margin detection - content_x_min = min(w["left"] for w in words) - content_x_max = max(w["left"] + w["width"] for w in words) - content_span = content_x_max - content_x_min - - primary = [ - c for c in clusters - if c["row_coverage"] >= MIN_COVERAGE_PRIMARY - and c["distinct_rows"] >= MIN_DISTINCT_ROWS - ] - primary_ids = {id(c) for c in primary} - secondary = [ - c for c in clusters - if id(c) not in primary_ids - and c["row_coverage"] >= MIN_COVERAGE_SECONDARY - and c["count"] >= MIN_WORDS_SECONDARY - and c["distinct_rows"] >= MIN_DISTINCT_ROWS - ] - - # Tertiary: narrow left-margin columns (page refs, markers) that have - # too few rows for secondary but are clearly left-aligned and separated - # from the main content. These appear at the far left or far right and - # have a large gap to the nearest significant cluster. - used_ids = {id(c) for c in primary} | {id(c) for c in secondary} - sig_xs = [c["mean_x"] for c in primary + secondary] - - # Tertiary: clusters that are clearly to the LEFT of the first - # significant column (or RIGHT of the last). If words consistently - # start at a position left of the established first column boundary, - # they MUST be a separate column — regardless of how few rows they - # cover. The only requirement is a clear spatial gap. 
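-    # Illustrative example (values assumed): with content spanning
-    # x=120..2300 (span 2180), a page-ref cluster at mean_x=140 has
-    # rel_pos ~0.01 (< 0.15) and must lie >= max(30, 2180*0.02) ~= 44px
-    # from the nearest primary/secondary cluster to qualify as tertiary.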
- MIN_COVERAGE_TERTIARY = 0.02 # at least 1 row effectively - tertiary = [] - for c in clusters: - if id(c) in used_ids: - continue - if c["distinct_rows"] < 1: - continue - if c["row_coverage"] < MIN_COVERAGE_TERTIARY: - continue - # Must be near left or right content margin (within 15%) - rel_pos = (c["mean_x"] - content_x_min) / content_span if content_span else 0.5 - if not (rel_pos < 0.15 or rel_pos > 0.85): - continue - # Must have significant gap to nearest significant cluster - if sig_xs: - min_dist = min(abs(c["mean_x"] - sx) for sx in sig_xs) - if min_dist < max(30, content_span * 0.02): - continue - tertiary.append(c) - - if tertiary: - for c in tertiary: - logger.info( - " tertiary (margin) cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)", - c["mean_x"], c["min_edge"], c["max_edge"], - c["count"], c["distinct_rows"], c["row_coverage"] * 100, - ) - - significant = sorted(primary + secondary + tertiary, key=lambda c: c["mean_x"]) - - for c in significant: - logger.info( - " significant cluster: x=%d (range %d-%d), %d words, %d rows (%.0f%%)", - c["mean_x"], c["min_edge"], c["max_edge"], - c["count"], c["distinct_rows"], c["row_coverage"] * 100, - ) - logger.info( - "alignment columns: %d clusters, %d primary, %d secondary -> %d significant", - len(clusters), len(primary), len(secondary), len(significant), - ) - - if not significant: - # Fallback: single column covering all content - x_min = min(w["left"] for w in words) - x_max = max(w["left"] + w["width"] for w in words) - return [{"index": 0, "type": "column_text", "x_min": x_min, "x_max": x_max}] - - # --- Merge nearby clusters --- - merge_distance = max(25, int(zone_w * 0.03)) - merged = [significant[0].copy()] - for s in significant[1:]: - if s["mean_x"] - merged[-1]["mean_x"] < merge_distance: - prev = merged[-1] - total = prev["count"] + s["count"] - prev["mean_x"] = ( - prev["mean_x"] * prev["count"] + s["mean_x"] * s["count"] - ) // total - prev["count"] = total - prev["min_edge"] = min(prev["min_edge"], s["min_edge"]) - prev["max_edge"] = max(prev["max_edge"], s["max_edge"]) - prev["distinct_rows"] = max(prev["distinct_rows"], s["distinct_rows"]) - else: - merged.append(s.copy()) - - logger.info( - "alignment columns: %d after merge (distance=%d)", - len(merged), merge_distance, - ) - - # --- Build column boundaries --- - margin = max(5, int(zone_w * 0.005)) - content_x_min = min(w["left"] for w in words) - content_x_max = max(w["left"] + w["width"] for w in words) - - columns: List[Dict[str, Any]] = [] - for i, cluster in enumerate(merged): - x_min = max(content_x_min, cluster["min_edge"] - margin) - if i + 1 < len(merged): - x_max = merged[i + 1]["min_edge"] - margin - else: - x_max = content_x_max - - columns.append({ - "index": i, - "type": f"column_{i + 1}" if len(merged) > 1 else "column_text", - "x_min": x_min, - "x_max": x_max, - }) - - return columns - - -_MARKER_CHARS = set("*-+#>") - - -def _merge_inline_marker_columns( - columns: List[Dict], - words: List[Dict], -) -> List[Dict]: - """Merge narrow marker columns (bullets, numbering) into adjacent text. - - Bullet points (*, -) and numbering (1., 2.) create narrow columns - at the left edge of a zone. These are inline markers that indent text, - not real separate columns. Merge them with their right neighbour. - - Does NOT merge columns containing alphabetic words like "to", "in", - "der", "die", "das" — those are legitimate content columns. 
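-
-    Example (illustrative values):
-
-        bullets = [{"text": "*", "left": 60, "top": y, "width": 10,
-                    "height": 20} for y in (100, 140, 180)]
-        cols = [{"index": 0, "type": "column_1", "x_min": 50, "x_max": 90},
-                {"index": 1, "type": "column_2", "x_min": 95, "x_max": 400}]
-        _merge_inline_marker_columns(cols, bullets)
-        # -> [{"index": 0, "type": "column_text", "x_min": 50, "x_max": 400}]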
- """ - if len(columns) < 2: - return columns - - merged: List[Dict] = [] - skip: set = set() - - for i, col in enumerate(columns): - if i in skip: - continue - - # Find words in this column - col_words = [ - w for w in words - if col["x_min"] <= w["left"] + w["width"] / 2 < col["x_max"] - ] - col_width = col["x_max"] - col["x_min"] - - # Narrow column with mostly short words -> MIGHT be inline markers - if col_words and col_width < 80: - avg_len = sum(len(w.get("text", "")) for w in col_words) / len(col_words) - if avg_len <= 2 and i + 1 < len(columns): - # Check if words are actual markers (symbols/numbers) vs - # real alphabetic words like "to", "in", "der", "die" - texts = [(w.get("text") or "").strip() for w in col_words] - alpha_count = sum( - 1 for t in texts - if t and t[0].isalpha() and t not in _MARKER_CHARS - ) - alpha_ratio = alpha_count / len(texts) if texts else 0 - - # If >=50% of words are alphabetic, this is a real column - if alpha_ratio >= 0.5: - logger.info( - " kept narrow column %d (w=%d, avg_len=%.1f, " - "alpha=%.0f%%) -- contains real words", - i, col_width, avg_len, alpha_ratio * 100, - ) - else: - # Merge into next column - next_col = columns[i + 1].copy() - next_col["x_min"] = col["x_min"] - merged.append(next_col) - skip.add(i + 1) - logger.info( - " merged inline marker column %d (w=%d, avg_len=%.1f) " - "into column %d", - i, col_width, avg_len, i + 1, - ) - continue - - merged.append(col) - - # Re-index - for i, col in enumerate(merged): - col["index"] = i - col["type"] = f"column_{i + 1}" if len(merged) > 1 else "column_text" - - return merged +# Backward-compat shim -- module moved to grid/editor/columns.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.columns") diff --git a/klausur-service/backend/grid_editor_filters.py b/klausur-service/backend/grid_editor_filters.py index c938569..7f4063d 100644 --- a/klausur-service/backend/grid_editor_filters.py +++ b/klausur-service/backend/grid_editor_filters.py @@ -1,402 +1,4 @@ -""" -Grid Editor — word/zone filtering, border ghosts, decorative margins, footers. - -Split from grid_editor_helpers.py for maintainability. -All functions are pure computation — no HTTP, DB, or session side effects. - -Lizenz: Apache 2.0 (kommerziell nutzbar) -DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. -""" - -import logging -from typing import Any, Dict, List, Optional, Tuple - -logger = logging.getLogger(__name__) - - -def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]: - """Remove page-border decoration strip words BEFORE column detection. - - Scans from each page edge inward to find the first significant x-gap - (>30 px). If the edge cluster contains <15 % of total words, those - words are removed as border-strip artifacts (alphabet letters, - illustration fragments). - - Must run BEFORE ``_build_zone_grid`` so that column detection only - sees real content words and doesn't produce inflated row counts. 
- """ - if len(words) < 10: - return words, 0 - - sorted_words = sorted(words, key=lambda w: w.get("left", 0)) - total = len(sorted_words) - - # -- Left-edge scan (running max right-edge) -- - left_count = 0 - running_right = 0 - for gi in range(total - 1): - running_right = max( - running_right, - sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0), - ) - if sorted_words[gi + 1].get("left", 0) - running_right > 30: - left_count = gi + 1 - break - - # -- Right-edge scan (running min left) -- - right_count = 0 - running_left = sorted_words[-1].get("left", 0) - for gi in range(total - 1, 0, -1): - running_left = min(running_left, sorted_words[gi].get("left", 0)) - prev_right = ( - sorted_words[gi - 1].get("left", 0) - + sorted_words[gi - 1].get("width", 0) - ) - if running_left - prev_right > 30: - right_count = total - gi - break - - # Validate candidate strip: real border decorations are mostly short - # words (alphabet letters like "A", "Bb", stray marks). Multi-word - # content like "der Ranzen" or "die Schals" (continuation of German - # translations) must NOT be removed. - def _is_decorative_strip(candidates: List[Dict]) -> bool: - if not candidates: - return False - short = sum(1 for w in candidates if len((w.get("text") or "").strip()) <= 2) - return short / len(candidates) >= 0.45 - - strip_ids: set = set() - if left_count > 0 and left_count / total < 0.20: - candidates = sorted_words[:left_count] - if _is_decorative_strip(candidates): - strip_ids = {id(w) for w in candidates} - elif right_count > 0 and right_count / total < 0.20: - candidates = sorted_words[total - right_count:] - if _is_decorative_strip(candidates): - strip_ids = {id(w) for w in candidates} - - if not strip_ids: - return words, 0 - - return [w for w in words if id(w) not in strip_ids], len(strip_ids) - - -# Characters that are typically OCR artefacts from box border lines. -# Intentionally excludes ! (red markers) and . , ; (real punctuation). -_GRID_GHOST_CHARS = set("|1lI[](){}/\\-\u2014\u2013_~=+") - - -def _filter_border_ghosts( - words: List[Dict], - boxes: List, -) -> tuple: - """Remove words sitting on box borders that are OCR artefacts. - - Returns (filtered_words, removed_count). 
- """ - if not boxes or not words: - return words, 0 - - # Build border bands from detected boxes - x_bands: List[tuple] = [] - y_bands: List[tuple] = [] - for b in boxes: - bt = ( - b.border_thickness - if hasattr(b, "border_thickness") - else b.get("border_thickness", 3) - ) - # Skip borderless boxes (images/graphics) -- no border line to produce ghosts - if bt == 0: - continue - bx = b.x if hasattr(b, "x") else b.get("x", 0) - by = b.y if hasattr(b, "y") else b.get("y", 0) - bw = b.width if hasattr(b, "width") else b.get("w", b.get("width", 0)) - bh = b.height if hasattr(b, "height") else b.get("h", b.get("height", 0)) - margin = max(bt * 2, 10) + 6 - x_bands.append((bx - margin, bx + margin)) - x_bands.append((bx + bw - margin, bx + bw + margin)) - y_bands.append((by - margin, by + margin)) - y_bands.append((by + bh - margin, by + bh + margin)) - - def _is_ghost(w: Dict) -> bool: - text = (w.get("text") or "").strip() - if not text: - return False - # Check if any word edge (not just center) touches a border band - w_left = w["left"] - w_right = w["left"] + w["width"] - w_top = w["top"] - w_bottom = w["top"] + w["height"] - on_border = ( - any(lo <= w_left <= hi or lo <= w_right <= hi for lo, hi in x_bands) - or any(lo <= w_top <= hi or lo <= w_bottom <= hi for lo, hi in y_bands) - ) - if not on_border: - return False - if len(text) == 1 and text in _GRID_GHOST_CHARS: - return True - return False - - filtered = [w for w in words if not _is_ghost(w)] - return filtered, len(words) - len(filtered) - - -def _flatten_word_boxes(cells: List[Dict]) -> List[Dict]: - """Extract all word_boxes from cells into a flat list of word dicts.""" - words: List[Dict] = [] - for cell in cells: - for wb in cell.get("word_boxes") or []: - if wb.get("text", "").strip(): - words.append({ - "text": wb["text"], - "left": wb["left"], - "top": wb["top"], - "width": wb["width"], - "height": wb["height"], - "conf": wb.get("conf", 0), - }) - return words - - -def _words_in_zone( - words: List[Dict], - zone_y: int, - zone_h: int, - zone_x: int, - zone_w: int, -) -> List[Dict]: - """Filter words whose Y-center falls within a zone's bounds.""" - zone_y_end = zone_y + zone_h - zone_x_end = zone_x + zone_w - result = [] - for w in words: - cy = w["top"] + w["height"] / 2 - cx = w["left"] + w["width"] / 2 - if zone_y <= cy <= zone_y_end and zone_x <= cx <= zone_x_end: - result.append(w) - return result - - -def _get_content_bounds(words: List[Dict]) -> tuple: - """Get content bounds from word positions.""" - if not words: - return 0, 0, 0, 0 - x_min = min(w["left"] for w in words) - y_min = min(w["top"] for w in words) - x_max = max(w["left"] + w["width"] for w in words) - y_max = max(w["top"] + w["height"] for w in words) - return x_min, y_min, x_max - x_min, y_max - y_min - - -def _filter_decorative_margin( - words: List[Dict], - img_w: int, - log: Any, - session_id: str, -) -> Dict[str, Any]: - """Remove words that belong to a decorative alphabet strip on a margin. - - Some vocabulary worksheets have a vertical A-Z alphabet graphic along - the left or right edge. OCR reads each letter as an isolated single- - character word. These decorative elements are not content and confuse - column/row detection. 
- - Detection criteria (phase 1 -- find the strip using single-char words): - - Words are in the outer 30% of the page (left or right) - - Nearly all words are single characters (letters or digits) - - At least 8 such words form a vertical strip (>=8 unique Y positions) - - Average horizontal spread of the strip is small (< 80px) - - Phase 2 -- once a strip is confirmed, also remove any short word (<=3 - chars) in the same narrow x-range. This catches multi-char OCR - artifacts like "Vv" that belong to the same decorative element. - - Modifies *words* in place. - - Returns: - Dict with 'found' (bool), 'side' (str), 'letters_detected' (int). - """ - no_strip: Dict[str, Any] = {"found": False, "side": "", "letters_detected": 0} - if not words or img_w <= 0: - return no_strip - - margin_cutoff = img_w * 0.30 - # Phase 1: find candidate strips using short words (1-2 chars). - # OCR often reads alphabet sidebar letters as pairs ("Aa", "Bb") - # rather than singles, so accept <=2-char words as strip candidates. - left_strip = [ - w for w in words - if len((w.get("text") or "").strip()) <= 2 - and w["left"] + w.get("width", 0) / 2 < margin_cutoff - ] - right_strip = [ - w for w in words - if len((w.get("text") or "").strip()) <= 2 - and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff - ] - - for strip, side in [(left_strip, "left"), (right_strip, "right")]: - if len(strip) < 6: - continue - # Check vertical distribution: should have many distinct Y positions - y_centers = sorted(set( - int(w["top"] + w.get("height", 0) / 2) // 20 * 20 # bucket - for w in strip - )) - if len(y_centers) < 6: - continue - # Check horizontal compactness - x_positions = [w["left"] for w in strip] - x_min = min(x_positions) - x_max = max(x_positions) - x_spread = x_max - x_min - if x_spread > 80: - continue - - # Phase 2: strip confirmed -- also collect short words in same x-range - # Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U") - strip_x_lo = x_min - 20 - strip_x_hi = x_max + 60 # word width + tolerance - all_strip_words = [ - w for w in words - if len((w.get("text") or "").strip()) <= 3 - and strip_x_lo <= w["left"] <= strip_x_hi - and (w["left"] + w.get("width", 0) / 2 < margin_cutoff - if side == "left" - else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff) - ] - - strip_set = set(id(w) for w in all_strip_words) - before = len(words) - words[:] = [w for w in words if id(w) not in strip_set] - removed = before - len(words) - if removed: - log.info( - "build-grid session %s: removed %d decorative %s-margin words " - "(strip x=%d-%d)", - session_id, removed, side, strip_x_lo, strip_x_hi, - ) - return {"found": True, "side": side, "letters_detected": len(strip)} - - return no_strip - - -def _filter_footer_words( - words: List[Dict], - img_h: int, - log: Any, - session_id: str, -) -> Optional[Dict]: - """Remove isolated words in the bottom 5% of the page (page numbers). - - Modifies *words* in place and returns a page_number metadata dict - if a page number was extracted, or None. 
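-
-    Example (illustrative): a lone "23" at ~97% of a 3000px-tall page
-    is extracted and removed, yielding e.g.
-
-        {"text": "23", "y_pct": 97.0, "number": 23}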
- """ - if not words or img_h <= 0: - return None - footer_y = img_h * 0.95 - footer_words = [ - w for w in words - if w["top"] + w.get("height", 0) / 2 > footer_y - ] - if not footer_words: - return None - # Only remove if footer has very few words (<= 3) with short text - total_text = "".join((w.get("text") or "").strip() for w in footer_words) - if len(footer_words) <= 3 and len(total_text) <= 10: - # Extract page number metadata before removing - page_number_info = { - "text": total_text.strip(), - "y_pct": round(footer_words[0]["top"] / img_h * 100, 1), - } - # Try to parse as integer - digits = "".join(c for c in total_text if c.isdigit()) - if digits: - page_number_info["number"] = int(digits) - - footer_set = set(id(w) for w in footer_words) - words[:] = [w for w in words if id(w) not in footer_set] - log.info( - "build-grid session %s: extracted page number '%s' and removed %d footer words", - session_id, total_text, len(footer_words), - ) - return page_number_info - return None - - -def _filter_header_junk( - words: List[Dict], - img_h: int, - log: Any, - session_id: str, -) -> None: - """Remove OCR junk from header illustrations above the real content. - - Textbook pages often have decorative header graphics (illustrations, - icons) that OCR reads as low-confidence junk characters. Real content - typically starts further down the page. - - Algorithm: - 1. Find the "content start" -- the first Y position where a dense - horizontal row of 3+ high-confidence words begins. - 2. Above that line, remove words with conf < 75 and text <= 3 chars. - These are almost certainly OCR artifacts from illustrations. - - Modifies *words* in place. - """ - if not words or img_h <= 0: - return - - # --- Find content start: first horizontal row with >=3 high-conf words --- - # Sort words by Y - sorted_by_y = sorted(words, key=lambda w: w["top"]) - content_start_y = 0 - _ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row - _MIN_ROW_WORDS = 3 - _MIN_CONF = 80 - - i = 0 - while i < len(sorted_by_y): - row_y = sorted_by_y[i]["top"] - # Collect words in this row band - row_words = [] - j = i - while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE: - row_words.append(sorted_by_y[j]) - j += 1 - # Count high-confidence words with real text (> 1 char) - high_conf = [ - w for w in row_words - if w.get("conf", 0) >= _MIN_CONF - and len((w.get("text") or "").strip()) > 1 - ] - if len(high_conf) >= _MIN_ROW_WORDS: - content_start_y = row_y - break - i = j if j > i else i + 1 - - if content_start_y <= 0: - return # no clear content start found - - # --- Remove low-conf short junk above content start --- - junk = [ - w for w in words - if w["top"] + w.get("height", 0) < content_start_y - and w.get("conf", 0) < 75 - and len((w.get("text") or "").strip()) <= 3 - ] - if not junk: - return - - junk_set = set(id(w) for w in junk) - before = len(words) - words[:] = [w for w in words if id(w) not in junk_set] - removed = before - len(words) - if removed: - log.info( - "build-grid session %s: removed %d header junk words above y=%d " - "(content start)", - session_id, removed, content_start_y, - ) +# Backward-compat shim -- module moved to grid/editor/filters.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.filters") diff --git a/klausur-service/backend/grid_editor_headers.py b/klausur-service/backend/grid_editor_headers.py index 3096e59..05563a3 100644 --- a/klausur-service/backend/grid_editor_headers.py 
+++ b/klausur-service/backend/grid_editor_headers.py @@ -1,499 +1,4 @@ -""" -Grid Editor — header/heading detection and colspan (merged cell) detection. -Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects. -Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. -""" - -import logging -import re -from typing import Any, Dict, List, Optional - -from cv_ocr_engines import _text_has_garbled_ipa - -logger = logging.getLogger(__name__) - - -def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int: - """Detect heading rows by color + height after color annotation. - - A row is a heading if: - 1. ALL word_boxes have color_name != 'black' (typically 'blue') - 2. Mean word height > 1.2x median height of all words in the zone - - Detected heading rows are merged into a single spanning cell. - Returns count of headings detected. - """ - heading_count = 0 - - for z in zones_data: - cells = z.get("cells", []) - rows = z.get("rows", []) - columns = z.get("columns", []) - if not cells or not rows or len(columns) < 2: - continue - - # Compute median word height across the zone - all_heights = [] - for cell in cells: - for wb in cell.get("word_boxes") or []: - h = wb.get("height", 0) - if h > 0: - all_heights.append(h) - if not all_heights: - continue - all_heights_sorted = sorted(all_heights) - median_h = all_heights_sorted[len(all_heights_sorted) // 2] - - heading_row_indices = [] - for row in rows: - if row.get("is_header"): - continue # already detected as header - ri = row["index"] - row_cells = [c for c in cells if c.get("row_index") == ri] - row_wbs = [ - wb for cell in row_cells - for wb in cell.get("word_boxes") or [] - ] - if not row_wbs: - continue - - # Condition 1: ALL words are non-black - all_colored = all( - wb.get("color_name", "black") != "black" - for wb in row_wbs - ) - if not all_colored: - continue - - # Condition 2: mean height > 1.2x median - mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs) - if mean_h <= median_h * 1.2: - continue - - heading_row_indices.append(ri) - - # Merge heading cells into spanning cells - for hri in heading_row_indices: - header_cells = [c for c in cells if c.get("row_index") == hri] - if len(header_cells) <= 1: - # Single cell -- just mark it as heading - if header_cells: - header_cells[0]["col_type"] = "heading" - heading_count += 1 - # Mark row as header - for row in rows: - if row["index"] == hri: - row["is_header"] = True - continue - - # Collect all word_boxes and text from all columns - all_wb = [] - all_text_parts = [] - for hc in sorted(header_cells, key=lambda c: c["col_index"]): - all_wb.extend(hc.get("word_boxes", [])) - if hc.get("text", "").strip(): - all_text_parts.append(hc["text"].strip()) - - # Remove all cells for this row, replace with one spanning cell - z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] - - if all_wb: - x_min = min(wb["left"] for wb in all_wb) - y_min = min(wb["top"] for wb in all_wb) - x_max = max(wb["left"] + wb["width"] for wb in all_wb) - y_max = max(wb["top"] + wb["height"] for wb in all_wb) - - # Use the actual starting col_index from the first cell - first_col = min(hc["col_index"] for hc in header_cells) - zone_idx = z.get("zone_index", 0) - z["cells"].append({ - "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}", - "zone_index": zone_idx, - "row_index": hri, - "col_index": first_col, - "col_type": "heading", - "text": " ".join(all_text_parts), - "confidence": 0.0, - "bbox_px": {"x": x_min, "y": y_min, - "w": 
x_max - x_min, "h": y_max - y_min}, - "bbox_pct": { - "x": round(x_min / img_w * 100, 2) if img_w else 0, - "y": round(y_min / img_h * 100, 2) if img_h else 0, - "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, - "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, - }, - "word_boxes": all_wb, - "ocr_engine": "words_first", - "is_bold": True, - }) - - # Mark row as header - for row in rows: - if row["index"] == hri: - row["is_header"] = True - heading_count += 1 - - return heading_count - - -def _detect_heading_rows_by_single_cell( - zones_data: List[Dict], img_w: int, img_h: int, -) -> int: - """Detect heading rows that have only a single content cell. - - Black headings like "Theme" have normal color and height, so they are - missed by ``_detect_heading_rows_by_color``. The distinguishing signal - is that they occupy only one column while normal vocabulary rows fill - at least 2-3 columns. - - A row qualifies as a heading if: - 1. It is not already marked as a header/heading. - 2. It has exactly ONE cell whose col_type starts with ``column_`` - (excluding column_1 / page_ref which only carries page numbers). - 3. That single cell is NOT in the last column (continuation/example - lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4). - 4. The text does not start with ``[`` (IPA continuation). - 5. The zone has >=3 columns and >=5 rows (avoids false positives in - tiny zones). - 6. The majority of rows in the zone have >=2 content cells (ensures - we are in a multi-column vocab layout). - """ - heading_count = 0 - - for z in zones_data: - cells = z.get("cells", []) - rows = z.get("rows", []) - columns = z.get("columns", []) - if len(columns) < 3 or len(rows) < 5: - continue - - # Determine the last col_index (example/sentence column) - col_indices = sorted(set(c.get("col_index", 0) for c in cells)) - if not col_indices: - continue - last_col = col_indices[-1] - - # Count content cells per row (column_* but not column_1/page_ref). - # Exception: column_1 cells that contain a dictionary article word - # (die/der/das etc.) ARE content -- they appear in dictionary layouts - # where the leftmost column holds grammatical articles. - _ARTICLE_WORDS = { - "die", "der", "das", "dem", "den", "des", "ein", "eine", - "the", "a", "an", - } - row_content_counts: Dict[int, int] = {} - for cell in cells: - ct = cell.get("col_type", "") - if not ct.startswith("column_"): - continue - if ct == "column_1": - ctext = (cell.get("text") or "").strip().lower() - if ctext not in _ARTICLE_WORDS: - continue - ri = cell.get("row_index", -1) - row_content_counts[ri] = row_content_counts.get(ri, 0) + 1 - - # Majority of rows must have >=2 content cells - multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2) - if multi_col_rows < len(rows) * 0.4: - continue - - # Exclude first and last non-header rows -- these are typically - # page numbers or footer text, not headings. 
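A worked instance of the multi-column majority gate above: with the 0.4 threshold, a ten-row zone stays eligible only if at least four rows carry two or more content cells. A minimal sketch mirroring that arithmetic (the function name is hypothetical, not part of the module):

```python
# Hypothetical standalone form of the majority gate described above:
# a zone qualifies only when >= 40% of its rows have 2+ content cells.
from typing import Dict

def majority_multi_col(row_content_counts: Dict[int, int], n_rows: int) -> bool:
    multi = sum(1 for c in row_content_counts.values() if c >= 2)
    return multi >= n_rows * 0.4

# 3 of 10 rows multi-column -> not a vocab grid, detection is skipped
assert majority_multi_col({0: 2, 1: 3, 2: 2, 3: 1}, 10) is False
assert majority_multi_col({0: 2, 1: 3, 2: 2, 3: 2}, 10) is True
```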
- non_header_rows = [r for r in rows if not r.get("is_header")] - if len(non_header_rows) < 3: - continue - first_ri = non_header_rows[0]["index"] - last_ri = non_header_rows[-1]["index"] - - heading_row_indices = [] - for row in rows: - if row.get("is_header"): - continue - ri = row["index"] - if ri == first_ri or ri == last_ri: - continue - row_cells = [c for c in cells if c.get("row_index") == ri] - content_cells = [ - c for c in row_cells - if c.get("col_type", "").startswith("column_") - and (c.get("col_type") != "column_1" - or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS) - ] - if len(content_cells) != 1: - continue - cell = content_cells[0] - # Not in the last column (continuation/example lines) - if cell.get("col_index") == last_col: - continue - text = (cell.get("text") or "").strip() - if not text or text.startswith("["): - continue - # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)" - if text.startswith("("): - continue - # Single cell NOT in the first content column is likely a - # continuation/overflow line, not a heading. Real headings - # ("Theme 1", "Unit 3: ...") appear in the first or second - # content column. - first_content_col = col_indices[0] if col_indices else 0 - if cell.get("col_index", 0) > first_content_col + 1: - continue - # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz") - # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]") - _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b") - if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text): - continue - # Guard: dictionary section headings are short (1-4 alpha chars - # like "A", "Ab", "Zi", "Sch"). Longer text that starts - # lowercase is a regular vocabulary word (e.g. "zentral") that - # happens to appear alone in its row. - alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text) - if len(alpha_only) > 4 and text[0].islower(): - continue - heading_row_indices.append(ri) - - # Guard: if >25% of eligible rows would become headings, the - # heuristic is misfiring (e.g. sparse single-column layout where - # most rows naturally have only 1 content cell). 
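The 25% misfire guard announced in the comment above can be read as a standalone predicate; a sketch under the same arithmetic (name hypothetical, the real implementation follows in the diff):

```python
# True = abort heading detection for this zone: too many candidate
# headings means the layout is probably sparse/single-column.
def heading_guard(non_header_rows: int, heading_candidates: int) -> bool:
    eligible = non_header_rows - 2  # first and last rows are excluded
    return eligible > 0 and heading_candidates > eligible * 0.25
```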
- eligible_rows = len(non_header_rows) - 2 # minus first/last excluded - if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25: - logger.debug( - "Skipping single-cell heading detection for zone %s: " - "%d/%d rows would be headings (>25%%)", - z.get("zone_index"), len(heading_row_indices), eligible_rows, - ) - continue - - for hri in heading_row_indices: - header_cells = [c for c in cells if c.get("row_index") == hri] - if not header_cells: - continue - - # Collect all word_boxes and text - all_wb = [] - all_text_parts = [] - for hc in sorted(header_cells, key=lambda c: c["col_index"]): - all_wb.extend(hc.get("word_boxes", [])) - if hc.get("text", "").strip(): - all_text_parts.append(hc["text"].strip()) - - first_col_idx = min(hc["col_index"] for hc in header_cells) - - # Remove old cells for this row, add spanning heading cell - z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] - - if all_wb: - x_min = min(wb["left"] for wb in all_wb) - y_min = min(wb["top"] for wb in all_wb) - x_max = max(wb["left"] + wb["width"] for wb in all_wb) - y_max = max(wb["top"] + wb["height"] for wb in all_wb) - else: - # Fallback to first cell bbox - bp = header_cells[0].get("bbox_px", {}) - x_min = bp.get("x", 0) - y_min = bp.get("y", 0) - x_max = x_min + bp.get("w", 0) - y_max = y_min + bp.get("h", 0) - - zone_idx = z.get("zone_index", 0) - z["cells"].append({ - "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}", - "zone_index": zone_idx, - "row_index": hri, - "col_index": first_col_idx, - "col_type": "heading", - "text": " ".join(all_text_parts), - "confidence": 0.0, - "bbox_px": {"x": x_min, "y": y_min, - "w": x_max - x_min, "h": y_max - y_min}, - "bbox_pct": { - "x": round(x_min / img_w * 100, 2) if img_w else 0, - "y": round(y_min / img_h * 100, 2) if img_h else 0, - "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, - "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, - }, - "word_boxes": all_wb, - "ocr_engine": "words_first", - "is_bold": False, - }) - - for row in rows: - if row["index"] == hri: - row["is_header"] = True - heading_count += 1 - - return heading_count - - -def _detect_header_rows( - rows: List[Dict], - zone_words: List[Dict], - zone_y: int, - columns: Optional[List[Dict]] = None, - skip_first_row_header: bool = False, -) -> List[int]: - """Detect header rows: first-row heuristic + spanning header detection. - - A "spanning header" is a row whose words stretch across multiple column - boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns). 
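Both heading detectors above assemble the spanning cell's bbox as the union of its word boxes. The recurring computation, as a self-contained sketch (field names match the module's word_box dicts; assumes a non-empty list):

```python
# Union of word-box rectangles -> one spanning bbox in pixel space.
def union_bbox(word_boxes):
    x_min = min(wb["left"] for wb in word_boxes)
    y_min = min(wb["top"] for wb in word_boxes)
    x_max = max(wb["left"] + wb["width"] for wb in word_boxes)
    y_max = max(wb["top"] + wb["height"] for wb in word_boxes)
    return {"x": x_min, "y": y_min, "w": x_max - x_min, "h": y_max - y_min}
```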
- """ - if len(rows) < 2: - return [] - - headers = [] - - if not skip_first_row_header: - first_row = rows[0] - second_row = rows[1] - - # Gap between first and second row > 0.5x average row height - avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) - gap = second_row["y_min"] - first_row["y_max"] - if gap > avg_h * 0.5: - headers.append(0) - - # Also check if first row words are taller than average (bold/header text) - all_heights = [w["height"] for w in zone_words] - median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 - first_row_words = [ - w for w in zone_words - if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"] - ] - if first_row_words: - first_h = max(w["height"] for w in first_row_words) - if first_h > median_h * 1.3: - if 0 not in headers: - headers.append(0) - - # Note: Spanning-header detection (rows spanning all columns) has been - # disabled because it produces too many false positives on vocabulary - # worksheets where IPA transcriptions or short entries naturally span - # multiple columns with few words. The first-row heuristic above is - # sufficient for detecting real headers. - - return headers - - -def _detect_colspan_cells( - zone_words: List[Dict], - columns: List[Dict], - rows: List[Dict], - cells: List[Dict], - img_w: int, - img_h: int, -) -> List[Dict]: - """Detect and merge cells that span multiple columns (colspan). - - A word-block (PaddleOCR phrase) that extends significantly past a column - boundary into the next column indicates a merged cell. This replaces - the incorrectly split cells with a single cell spanning multiple columns. - - Works for both full-page scans and box zones. - """ - if len(columns) < 2 or not zone_words or not rows: - return cells - - from cv_words_first import _assign_word_to_row - - # Column boundaries (midpoints between adjacent columns) - col_boundaries = [] - for ci in range(len(columns) - 1): - col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2) - - def _cols_covered(w_left: float, w_right: float) -> List[int]: - """Return list of column indices that a word-block covers.""" - covered = [] - for col in columns: - col_mid = (col["x_min"] + col["x_max"]) / 2 - # Word covers a column if it extends past the column's midpoint - if w_left < col_mid < w_right: - covered.append(col["index"]) - # Also include column if word starts within it - elif col["x_min"] <= w_left < col["x_max"]: - covered.append(col["index"]) - return sorted(set(covered)) - - # Group original word-blocks by row - row_word_blocks: Dict[int, List[Dict]] = {} - for w in zone_words: - ri = _assign_word_to_row(w, rows) - row_word_blocks.setdefault(ri, []).append(w) - - # For each row, check if any word-block spans multiple columns - rows_to_merge: Dict[int, List[Dict]] = {} # row_index -> list of spanning word-blocks - - for ri, wblocks in row_word_blocks.items(): - spanning = [] - for w in wblocks: - w_left = w["left"] - w_right = w_left + w["width"] - covered = _cols_covered(w_left, w_right) - if len(covered) >= 2: - spanning.append({"word": w, "cols": covered}) - if spanning: - rows_to_merge[ri] = spanning - - if not rows_to_merge: - return cells - - # Merge cells for spanning rows - new_cells = [] - for cell in cells: - ri = cell.get("row_index", -1) - if ri not in rows_to_merge: - new_cells.append(cell) - continue - - # Check if this cell's column is part of a spanning block - ci = cell.get("col_index", -1) - is_part_of_span = False - for span in rows_to_merge[ri]: - if ci in 
span["cols"]: - is_part_of_span = True - # Only emit the merged cell for the FIRST column in the span - if ci == span["cols"][0]: - # Use the ORIGINAL word-block text (not the split cell texts - # which may have broken words like "euros a" + "nd cents") - orig_word = span["word"] - merged_text = orig_word.get("text", "").strip() - all_wb = [orig_word] - - # Compute merged bbox - if all_wb: - x_min = min(wb["left"] for wb in all_wb) - y_min = min(wb["top"] for wb in all_wb) - x_max = max(wb["left"] + wb["width"] for wb in all_wb) - y_max = max(wb["top"] + wb["height"] for wb in all_wb) - else: - x_min = y_min = x_max = y_max = 0 - - new_cells.append({ - "cell_id": cell["cell_id"], - "row_index": ri, - "col_index": span["cols"][0], - "col_type": "spanning_header", - "colspan": len(span["cols"]), - "text": merged_text, - "confidence": cell.get("confidence", 0), - "bbox_px": {"x": x_min, "y": y_min, - "w": x_max - x_min, "h": y_max - y_min}, - "bbox_pct": { - "x": round(x_min / img_w * 100, 2) if img_w else 0, - "y": round(y_min / img_h * 100, 2) if img_h else 0, - "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, - "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, - }, - "word_boxes": all_wb, - "ocr_engine": cell.get("ocr_engine", ""), - "is_bold": cell.get("is_bold", False), - }) - logger.info( - "colspan detected: row %d, cols %s -> merged %d cells (%r)", - ri, span["cols"], len(span["cols"]), merged_text[:50], - ) - break - if not is_part_of_span: - new_cells.append(cell) - - return new_cells +# Backward-compat shim -- module moved to grid/editor/headers.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.headers") diff --git a/klausur-service/backend/grid_editor_helpers.py b/klausur-service/backend/grid_editor_helpers.py index c75e161..3b17c78 100644 --- a/klausur-service/backend/grid_editor_helpers.py +++ b/klausur-service/backend/grid_editor_helpers.py @@ -1,58 +1,4 @@ -""" -Grid Editor helper functions — barrel re-export module. - -This file re-exports all public symbols from the split sub-modules -so that existing ``from grid_editor_helpers import ...`` statements -continue to work without changes. - -Sub-modules: - - grid_editor_columns — column detection, cross-column splitting, marker merging - - grid_editor_filters — word/zone filtering, border ghosts, decorative margins - - grid_editor_headers — header/heading detection, colspan detection - - grid_editor_zones — vertical dividers, zone splitting/merging, zone grid building - -Lizenz: Apache 2.0 (kommerziell nutzbar) -DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. 
-""" - -# --- Re-export: columns --------------------------------------------------- -from grid_editor_columns import ( # noqa: F401 - _is_recognized_word, - _split_cross_column_words, - _cluster_columns_by_alignment, - _MARKER_CHARS, - _merge_inline_marker_columns, -) - -# --- Re-export: filters ---------------------------------------------------- -from grid_editor_filters import ( # noqa: F401 - _filter_border_strip_words, - _GRID_GHOST_CHARS, - _filter_border_ghosts, - _flatten_word_boxes, - _words_in_zone, - _get_content_bounds, - _filter_decorative_margin, - _filter_footer_words, - _filter_header_junk, -) - -# --- Re-export: headers ---------------------------------------------------- -from grid_editor_headers import ( # noqa: F401 - _detect_heading_rows_by_color, - _detect_heading_rows_by_single_cell, - _detect_header_rows, - _detect_colspan_cells, -) - -# --- Re-export: zones ------------------------------------------------------- -from grid_editor_zones import ( # noqa: F401 - _PIPE_RE_VSPLIT, - _detect_vertical_dividers, - _split_zone_at_vertical_dividers, - _merge_content_zones_across_boxes, - _build_zone_grid, -) - -# --- Re-export from cv_words_first (used by cv_box_layout.py) --------------- -from cv_words_first import _cluster_rows # noqa: F401 +# Backward-compat shim -- module moved to grid/editor/helpers.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = _importlib.import_module("grid.editor.helpers") diff --git a/klausur-service/backend/grid_editor_zones.py b/klausur-service/backend/grid_editor_zones.py index 2640c09..2e055ff 100644 --- a/klausur-service/backend/grid_editor_zones.py +++ b/klausur-service/backend/grid_editor_zones.py @@ -1,389 +1,4 @@ -""" -Grid Editor — vertical divider detection, zone splitting/merging, zone grid building. - -Split from grid_editor_helpers.py for maintainability. -All functions are pure computation — no HTTP, DB, or session side effects. - -Lizenz: Apache 2.0 (kommerziell nutzbar) -DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. -""" - -import logging -import re -from typing import Any, Dict, List, Optional - -from cv_vocab_types import PageZone -from cv_words_first import _cluster_rows, _build_cells - -from grid_editor_columns import ( - _cluster_columns_by_alignment, - _merge_inline_marker_columns, - _split_cross_column_words, -) -from grid_editor_headers import ( - _detect_header_rows, - _detect_colspan_cells, -) - -logger = logging.getLogger(__name__) - - -# --------------------------------------------------------------------------- -# Vertical divider detection and zone splitting -# --------------------------------------------------------------------------- - -_PIPE_RE_VSPLIT = re.compile(r"^\|+$") - - -def _detect_vertical_dividers( - words: List[Dict], - zone_x: int, - zone_w: int, - zone_y: int, - zone_h: int, -) -> List[float]: - """Detect vertical divider lines from pipe word_boxes at consistent x. - - Returns list of divider x-positions (empty if no dividers found). 
- """ - if not words or zone_w <= 0 or zone_h <= 0: - return [] - - # Collect pipe word_boxes - pipes = [ - w for w in words - if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip()) - ] - if len(pipes) < 5: - return [] - - # Cluster pipe x-centers by proximity - tolerance = max(15, int(zone_w * 0.02)) - pipe_xs = sorted(w["left"] + w["width"] / 2 for w in pipes) - - clusters: List[List[float]] = [[pipe_xs[0]]] - for x in pipe_xs[1:]: - if x - clusters[-1][-1] <= tolerance: - clusters[-1].append(x) - else: - clusters.append([x]) - - dividers: List[float] = [] - for cluster in clusters: - if len(cluster) < 5: - continue - mean_x = sum(cluster) / len(cluster) - # Must be between 15% and 85% of zone width - rel_pos = (mean_x - zone_x) / zone_w - if rel_pos < 0.15 or rel_pos > 0.85: - continue - # Check vertical coverage: pipes must span >= 50% of zone height - cluster_pipes = [ - w for w in pipes - if abs(w["left"] + w["width"] / 2 - mean_x) <= tolerance - ] - ys = [w["top"] for w in cluster_pipes] + [w["top"] + w["height"] for w in cluster_pipes] - y_span = max(ys) - min(ys) if ys else 0 - if y_span < zone_h * 0.5: - continue - dividers.append(mean_x) - - return sorted(dividers) - - -def _split_zone_at_vertical_dividers( - zone: "PageZone", - divider_xs: List[float], - vsplit_group_id: int, -) -> List["PageZone"]: - """Split a PageZone at vertical divider positions into sub-zones.""" - boundaries = [zone.x] + divider_xs + [zone.x + zone.width] - hints = [] - for i in range(len(boundaries) - 1): - if i == 0: - hints.append("left_of_vsplit") - elif i == len(boundaries) - 2: - hints.append("right_of_vsplit") - else: - hints.append("middle_of_vsplit") - - sub_zones = [] - for i in range(len(boundaries) - 1): - x_start = int(boundaries[i]) - x_end = int(boundaries[i + 1]) - sub = PageZone( - index=0, # re-indexed later - zone_type=zone.zone_type, - y=zone.y, - height=zone.height, - x=x_start, - width=x_end - x_start, - box=zone.box, - image_overlays=zone.image_overlays, - layout_hint=hints[i], - vsplit_group=vsplit_group_id, - ) - sub_zones.append(sub) - - return sub_zones - - -def _merge_content_zones_across_boxes( - zones: List, - content_x: int, - content_w: int, -) -> List: - """Merge content zones separated by box zones into single zones. - - Box zones become image_overlays on the merged content zone. - Pattern: [content, box*, content] -> [merged_content with overlay] - Box zones NOT between two content zones stay as standalone zones. - """ - if len(zones) < 3: - return zones - - # Group consecutive runs of [content, box+, content] - result: List = [] - i = 0 - while i < len(zones): - z = zones[i] - if z.zone_type != "content": - result.append(z) - i += 1 - continue - - # Start of a potential merge group: content zone - group_contents = [z] - group_boxes = [] - j = i + 1 - # Absorb [box, content] pairs -- only absorb a box if it's - # confirmed to be followed by another content zone. 
- while j < len(zones): - if (zones[j].zone_type == "box" - and j + 1 < len(zones) - and zones[j + 1].zone_type == "content"): - group_boxes.append(zones[j]) - group_contents.append(zones[j + 1]) - j += 2 - else: - break - - if len(group_contents) >= 2 and group_boxes: - # Merge: create one large content zone spanning all - y_min = min(c.y for c in group_contents) - y_max = max(c.y + c.height for c in group_contents) - overlays = [] - for bz in group_boxes: - overlay = { - "y": bz.y, - "height": bz.height, - "x": bz.x, - "width": bz.width, - } - if bz.box: - overlay["box"] = { - "x": bz.box.x, - "y": bz.box.y, - "width": bz.box.width, - "height": bz.box.height, - "confidence": bz.box.confidence, - "border_thickness": bz.box.border_thickness, - } - overlays.append(overlay) - - merged = PageZone( - index=0, # re-indexed below - zone_type="content", - y=y_min, - height=y_max - y_min, - x=content_x, - width=content_w, - image_overlays=overlays, - ) - result.append(merged) - i = j - else: - # No merge possible -- emit just the content zone - result.append(z) - i += 1 - - # Re-index zones - for idx, z in enumerate(result): - z.index = idx - - logger.info( - "zone-merge: %d zones -> %d zones after merging across boxes", - len(zones), len(result), - ) - return result - - -def _build_zone_grid( - zone_words: List[Dict], - zone_x: int, - zone_y: int, - zone_w: int, - zone_h: int, - zone_index: int, - img_w: int, - img_h: int, - global_columns: Optional[List[Dict]] = None, - skip_first_row_header: bool = False, -) -> Dict[str, Any]: - """Build columns, rows, cells for a single zone from its words. - - Args: - global_columns: If provided, use these pre-computed column boundaries - instead of detecting columns per zone. Used for content zones so - that all content zones (above/between/below boxes) share the same - column structure. Box zones always detect columns independently. - """ - if not zone_words: - return { - "columns": [], - "rows": [], - "cells": [], - "header_rows": [], - } - - # Cluster rows first (needed for column alignment analysis) - rows = _cluster_rows(zone_words) - - # Diagnostic logging for small/medium zones (box zones typically have 40-60 words) - if len(zone_words) <= 60: - import statistics as _st - _heights = [w['height'] for w in zone_words if w.get('height', 0) > 0] - _med_h = _st.median(_heights) if _heights else 20 - _y_tol = max(_med_h * 0.5, 5) - logger.info( - "zone %d row-clustering: %d words, median_h=%.0f, y_tol=%.1f -> %d rows", - zone_index, len(zone_words), _med_h, _y_tol, len(rows), - ) - for w in sorted(zone_words, key=lambda ww: (ww['top'], ww['left'])): - logger.info( - " zone %d word: y=%d x=%d h=%d w=%d '%s'", - zone_index, w['top'], w['left'], w['height'], w['width'], - w.get('text', '')[:40], - ) - for r in rows: - logger.info( - " zone %d row %d: y_min=%d y_max=%d y_center=%.0f", - zone_index, r['index'], r['y_min'], r['y_max'], r['y_center'], - ) - - # Use global columns if provided, otherwise detect per zone - columns = global_columns if global_columns else _cluster_columns_by_alignment(zone_words, zone_w, rows) - - # Merge inline marker columns (bullets, numbering) into adjacent text - if not global_columns: - columns = _merge_inline_marker_columns(columns, zone_words) - - if not columns or not rows: - return { - "columns": [], - "rows": [], - "cells": [], - "header_rows": [], - } - - # Split word boxes that straddle column boundaries (e.g. "sichzie" - # spanning Col 1 + Col 2). Must happen after column detection and - # before cell assignment. 
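The diagnostic block in _build_zone_grid recomputes the row-clustering tolerance; isolated, the same arithmetic looks like this (assumed standalone form of the logged values):

```python
import statistics

def row_tolerance(word_heights):
    # Words whose centers fall within half the median word height are
    # treated as one row; the 5 px floor guards against tiny glyphs.
    med = statistics.median(word_heights) if word_heights else 20
    return max(med * 0.5, 5)
```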
- # Keep original words for colspan detection (split destroys span info). - original_zone_words = zone_words - if len(columns) >= 2: - zone_words = _split_cross_column_words(zone_words, columns) - - # Build cells - cells = _build_cells(zone_words, columns, rows, img_w, img_h) - - # --- Detect colspan (merged cells spanning multiple columns) --- - # Uses the ORIGINAL (pre-split) words to detect word-blocks that span - # multiple columns. _split_cross_column_words would have destroyed - # this information by cutting words at column boundaries. - if len(columns) >= 2: - cells = _detect_colspan_cells(original_zone_words, columns, rows, cells, img_w, img_h) - - # Prefix cell IDs with zone index - for cell in cells: - cell["cell_id"] = f"Z{zone_index}_{cell['cell_id']}" - cell["zone_index"] = zone_index - - # Detect header rows (pass columns for spanning header detection) - header_rows = _detect_header_rows(rows, zone_words, zone_y, columns, - skip_first_row_header=skip_first_row_header) - - # Merge cells in spanning header rows into a single col-0 cell - if header_rows and len(columns) >= 2: - for hri in header_rows: - header_cells = [c for c in cells if c["row_index"] == hri] - if len(header_cells) <= 1: - continue - # Collect all word_boxes and text from all columns - all_wb = [] - all_text_parts = [] - for hc in sorted(header_cells, key=lambda c: c["col_index"]): - all_wb.extend(hc.get("word_boxes", [])) - if hc.get("text", "").strip(): - all_text_parts.append(hc["text"].strip()) - # Remove all header cells, replace with one spanning cell - cells = [c for c in cells if c["row_index"] != hri] - if all_wb: - x_min = min(wb["left"] for wb in all_wb) - y_min = min(wb["top"] for wb in all_wb) - x_max = max(wb["left"] + wb["width"] for wb in all_wb) - y_max = max(wb["top"] + wb["height"] for wb in all_wb) - cells.append({ - "cell_id": f"R{hri:02d}_C0", - "row_index": hri, - "col_index": 0, - "col_type": "spanning_header", - "text": " ".join(all_text_parts), - "confidence": 0.0, - "bbox_px": {"x": x_min, "y": y_min, - "w": x_max - x_min, "h": y_max - y_min}, - "bbox_pct": { - "x": round(x_min / img_w * 100, 2) if img_w else 0, - "y": round(y_min / img_h * 100, 2) if img_h else 0, - "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, - "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, - }, - "word_boxes": all_wb, - "ocr_engine": "words_first", - "is_bold": True, - }) - - # Convert columns to output format with percentages - out_columns = [] - for col in columns: - x_min = col["x_min"] - x_max = col["x_max"] - out_columns.append({ - "index": col["index"], - "label": col["type"], - "x_min_px": round(x_min), - "x_max_px": round(x_max), - "x_min_pct": round(x_min / img_w * 100, 2) if img_w else 0, - "x_max_pct": round(x_max / img_w * 100, 2) if img_w else 0, - "bold": False, - }) - - # Convert rows to output format with percentages - out_rows = [] - for row in rows: - out_rows.append({ - "index": row["index"], - "y_min_px": round(row["y_min"]), - "y_max_px": round(row["y_max"]), - "y_min_pct": round(row["y_min"] / img_h * 100, 2) if img_h else 0, - "y_max_pct": round(row["y_max"] / img_h * 100, 2) if img_h else 0, - "is_header": row["index"] in header_rows, - }) - - return { - "columns": out_columns, - "rows": out_rows, - "cells": cells, - "header_rows": header_rows, - "_raw_columns": columns, # internal: for propagation to other zones - } +# Backward-compat shim -- module moved to grid/editor/zones.py +import importlib as _importlib +import sys as _sys +_sys.modules[__name__] = 
_importlib.import_module("grid.editor.zones") diff --git a/klausur-service/backend/vocab/__init__.py b/klausur-service/backend/vocab/__init__.py new file mode 100644 index 0000000..eabdd83 --- /dev/null +++ b/klausur-service/backend/vocab/__init__.py @@ -0,0 +1,6 @@ +""" +Vocab package — restructured from vocab_* flat modules. + +Backward-compatible re-exports: consumers can still use +``from vocab_worksheet_api import ...`` etc. via the shim files in backend/. +""" diff --git a/klausur-service/backend/vocab/learn_bridge.py b/klausur-service/backend/vocab/learn_bridge.py new file mode 100644 index 0000000..786c552 --- /dev/null +++ b/klausur-service/backend/vocab/learn_bridge.py @@ -0,0 +1,196 @@ +""" +Vocab Learn Bridge — Converts vocabulary session data into Learning Units. + +Bridges klausur-service (vocab extraction) with backend-lehrer (learning units + generators). +Creates a Learning Unit in backend-lehrer, then triggers MC/Cloze/QA generation. + +DATENSCHUTZ: All communication stays within Docker network (breakpilot-network). +""" + +import os +import json +import logging +import httpx +from typing import List, Dict, Any, Optional + +logger = logging.getLogger(__name__) + +BACKEND_LEHRER_URL = os.getenv("BACKEND_LEHRER_URL", "http://backend-lehrer:8001") + + +def vocab_to_analysis_data(session_name: str, vocabulary: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Convert vocabulary entries from a vocab session into the analysis_data format + expected by backend-lehrer generators (MC, Cloze, QA). + + The generators consume: + - title: Display name + - subject: Subject area + - grade_level: Target grade + - canonical_text: Full text representation + - printed_blocks: Individual text blocks + - vocabulary: Original vocab data (for vocab-specific modules) + """ + canonical_lines = [] + printed_blocks = [] + + for v in vocabulary: + en = v.get("english", "").strip() + de = v.get("german", "").strip() + example = v.get("example_sentence", "").strip() + + if not en and not de: + continue + + line = f"{en} = {de}" + if example: + line += f" ({example})" + canonical_lines.append(line) + + block_text = f"{en} — {de}" + if example: + block_text += f" | {example}" + printed_blocks.append({"text": block_text}) + + return { + "title": session_name, + "subject": "English Vocabulary", + "grade_level": "5-8", + "canonical_text": "\n".join(canonical_lines), + "printed_blocks": printed_blocks, + "vocabulary": vocabulary, + } + + +async def create_learning_unit( + session_name: str, + vocabulary: List[Dict[str, Any]], + grade: Optional[str] = None, +) -> Dict[str, Any]: + """ + Create a Learning Unit in backend-lehrer from vocabulary data. + + Steps: + 1. Create unit via POST /api/learning-units/ + 2. Return the created unit info + + Returns dict with unit_id, status, vocabulary_count. + """ + if not vocabulary: + raise ValueError("No vocabulary entries provided") + + analysis_data = vocab_to_analysis_data(session_name, vocabulary) + + async with httpx.AsyncClient(timeout=30.0) as client: + # 1. 
Create Learning Unit + create_payload = { + "title": session_name, + "subject": "Englisch", + "grade": grade or "5-8", + } + + try: + resp = await client.post( + f"{BACKEND_LEHRER_URL}/api/learning-units/", + json=create_payload, + ) + resp.raise_for_status() + unit = resp.json() + except httpx.HTTPError as e: + logger.error(f"Failed to create learning unit: {e}") + raise RuntimeError(f"Backend-Lehrer nicht erreichbar: {e}") + + unit_id = unit.get("id") + if not unit_id: + raise RuntimeError("Learning Unit created but no ID returned") + + logger.info(f"Created learning unit {unit_id} with {len(vocabulary)} vocabulary entries") + + # 2. Save analysis_data as JSON file for generators + analysis_dir = os.path.expanduser("~/Arbeitsblaetter/Lerneinheiten") + os.makedirs(analysis_dir, exist_ok=True) + analysis_path = os.path.join(analysis_dir, f"{unit_id}_analyse.json") + + with open(analysis_path, "w", encoding="utf-8") as f: + json.dump(analysis_data, f, ensure_ascii=False, indent=2) + + logger.info(f"Saved analysis data to {analysis_path}") + + return { + "unit_id": unit_id, + "unit": unit, + "analysis_path": analysis_path, + "vocabulary_count": len(vocabulary), + "status": "created", + } + + +async def generate_learning_modules( + unit_id: str, + analysis_path: str, +) -> Dict[str, Any]: + """ + Trigger MC, Cloze, and QA generation from analysis data. + + Imports generators directly (they run in-process for klausur-service) + or calls backend-lehrer API if generators aren't available locally. + + Returns dict with generation results. + """ + results = { + "unit_id": unit_id, + "mc": {"status": "pending"}, + "cloze": {"status": "pending"}, + "qa": {"status": "pending"}, + } + + # Load analysis data + with open(analysis_path, "r", encoding="utf-8") as f: + analysis_data = json.load(f) + + # Try to generate via backend-lehrer API + async with httpx.AsyncClient(timeout=120.0) as client: + # Generate QA (includes Leitner fields) + try: + resp = await client.post( + f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-qa", + json={"analysis_data": analysis_data, "num_questions": min(len(analysis_data.get("vocabulary", [])), 20)}, + ) + if resp.status_code == 200: + results["qa"] = {"status": "generated", "data": resp.json()} + else: + logger.warning(f"QA generation returned {resp.status_code}") + results["qa"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"} + except Exception as e: + logger.warning(f"QA generation failed: {e}") + results["qa"] = {"status": "error", "reason": str(e)} + + # Generate MC + try: + resp = await client.post( + f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-mc", + json={"analysis_data": analysis_data, "num_questions": min(len(analysis_data.get("vocabulary", [])), 10)}, + ) + if resp.status_code == 200: + results["mc"] = {"status": "generated", "data": resp.json()} + else: + results["mc"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"} + except Exception as e: + logger.warning(f"MC generation failed: {e}") + results["mc"] = {"status": "error", "reason": str(e)} + + # Generate Cloze + try: + resp = await client.post( + f"{BACKEND_LEHRER_URL}/api/learning-units/{unit_id}/generate-cloze", + json={"analysis_data": analysis_data}, + ) + if resp.status_code == 200: + results["cloze"] = {"status": "generated", "data": resp.json()} + else: + results["cloze"] = {"status": "skipped", "reason": f"HTTP {resp.status_code}"} + except Exception as e: + logger.warning(f"Cloze generation failed: {e}") + results["cloze"] = {"status": 
"error", "reason": str(e)} + + return results diff --git a/klausur-service/backend/vocab/session_store.py b/klausur-service/backend/vocab/session_store.py new file mode 100644 index 0000000..d63ca8b --- /dev/null +++ b/klausur-service/backend/vocab/session_store.py @@ -0,0 +1,427 @@ +""" +Vocabulary Session Store - PostgreSQL persistence for vocab extraction sessions. + +Replaces in-memory storage with database persistence. +See migrations/001_vocab_sessions.sql for schema. +""" + +import os +import uuid +import logging +import json +from typing import Optional, List, Dict, Any + +import asyncpg + +logger = logging.getLogger(__name__) + +# Database configuration +DATABASE_URL = os.getenv( + "DATABASE_URL", + "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db" +) + +# Connection pool (initialized lazily) +_pool: Optional[asyncpg.Pool] = None + + +async def get_pool() -> asyncpg.Pool: + """Get or create the database connection pool.""" + global _pool + if _pool is None: + _pool = await asyncpg.create_pool(DATABASE_URL, min_size=2, max_size=10) + return _pool + + +async def init_vocab_tables(): + """ + Initialize vocab tables if they don't exist. + This is called at startup. + """ + pool = await get_pool() + async with pool.acquire() as conn: + # Check if tables exist + tables_exist = await conn.fetchval(""" + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'vocab_sessions' + ) + """) + + if not tables_exist: + logger.info("Creating vocab tables...") + # Read and execute migration + migration_path = os.path.join( + os.path.dirname(__file__), + "migrations/001_vocab_sessions.sql" + ) + if os.path.exists(migration_path): + with open(migration_path, "r") as f: + sql = f.read() + await conn.execute(sql) + logger.info("Vocab tables created successfully") + else: + logger.warning(f"Migration file not found: {migration_path}") + else: + logger.debug("Vocab tables already exist") + + +# ============================================================================= +# SESSION OPERATIONS +# ============================================================================= + +async def create_session_db( + session_id: str, + name: str, + description: str = "", + source_language: str = "en", + target_language: str = "de" +) -> Dict[str, Any]: + """Create a new vocabulary session in the database.""" + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow(""" + INSERT INTO vocab_sessions ( + id, name, description, source_language, target_language, + status, vocabulary_count + ) VALUES ($1, $2, $3, $4, $5, 'pending', 0) + RETURNING * + """, uuid.UUID(session_id), name, description, source_language, target_language) + + return _row_to_dict(row) + + +async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]: + """Get a session by ID.""" + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow(""" + SELECT * FROM vocab_sessions WHERE id = $1 + """, uuid.UUID(session_id)) + + if row: + return _row_to_dict(row) + return None + + +async def list_sessions_db( + limit: int = 50, + offset: int = 0, + status: Optional[str] = None +) -> List[Dict[str, Any]]: + """List all sessions with optional filtering.""" + pool = await get_pool() + async with pool.acquire() as conn: + if status: + rows = await conn.fetch(""" + SELECT * FROM vocab_sessions + WHERE status = $1 + ORDER BY created_at DESC + LIMIT $2 OFFSET $3 + """, status, limit, offset) + else: + rows = await conn.fetch(""" + SELECT * FROM 
vocab_sessions + ORDER BY created_at DESC + LIMIT $1 OFFSET $2 + """, limit, offset) + + return [_row_to_dict(row) for row in rows] + + +async def update_session_db( + session_id: str, + **kwargs +) -> Optional[Dict[str, Any]]: + """Update a session with given fields.""" + pool = await get_pool() + + # Build dynamic UPDATE query + fields = [] + values = [] + param_idx = 1 + + allowed_fields = [ + 'name', 'description', 'status', 'vocabulary_count', + 'extraction_confidence', 'image_path', 'pdf_path', 'pdf_page_count', + 'ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages' + ] + + for key, value in kwargs.items(): + if key in allowed_fields: + fields.append(f"{key} = ${param_idx}") + # Convert dicts/lists to JSON for JSONB columns + if key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages']: + value = json.dumps(value) if value else None + values.append(value) + param_idx += 1 + + if not fields: + return await get_session_db(session_id) + + values.append(uuid.UUID(session_id)) + + async with pool.acquire() as conn: + row = await conn.fetchrow(f""" + UPDATE vocab_sessions + SET {', '.join(fields)} + WHERE id = ${param_idx} + RETURNING * + """, *values) + + if row: + return _row_to_dict(row) + return None + + +async def delete_session_db(session_id: str) -> bool: + """Delete a session and all related data (cascades).""" + pool = await get_pool() + async with pool.acquire() as conn: + result = await conn.execute(""" + DELETE FROM vocab_sessions WHERE id = $1 + """, uuid.UUID(session_id)) + return result == "DELETE 1" + + +# ============================================================================= +# VOCABULARY OPERATIONS +# ============================================================================= + +async def add_vocabulary_db( + session_id: str, + vocab_list: List[Dict[str, Any]] +) -> List[Dict[str, Any]]: + """Add vocabulary entries to a session.""" + if not vocab_list: + return [] + + pool = await get_pool() + results = [] + + async with pool.acquire() as conn: + for vocab in vocab_list: + vocab_id = str(uuid.uuid4()) + row = await conn.fetchrow(""" + INSERT INTO vocab_entries ( + id, session_id, english, german, example_sentence, + example_sentence_gap, word_type, source_page + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + RETURNING * + """, + uuid.UUID(vocab_id), + uuid.UUID(session_id), + vocab.get('english', ''), + vocab.get('german', ''), + vocab.get('example_sentence'), + vocab.get('example_sentence_gap'), + vocab.get('word_type'), + vocab.get('source_page') + ) + results.append(_row_to_dict(row)) + + # Update vocabulary count + await conn.execute(""" + UPDATE vocab_sessions + SET vocabulary_count = ( + SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1 + ) + WHERE id = $1 + """, uuid.UUID(session_id)) + + return results + + +async def get_vocabulary_db( + session_id: str, + source_page: Optional[int] = None +) -> List[Dict[str, Any]]: + """Get vocabulary entries for a session.""" + pool = await get_pool() + async with pool.acquire() as conn: + if source_page is not None: + rows = await conn.fetch(""" + SELECT * FROM vocab_entries + WHERE session_id = $1 AND source_page = $2 + ORDER BY created_at + """, uuid.UUID(session_id), source_page) + else: + rows = await conn.fetch(""" + SELECT * FROM vocab_entries + WHERE session_id = $1 + ORDER BY source_page NULLS LAST, created_at + """, uuid.UUID(session_id)) + + return [_row_to_dict(row) for row in rows] + + +async def update_vocabulary_db( + entry_id: str, + **kwargs +) -> 
Optional[Dict[str, Any]]: + """Update a single vocabulary entry.""" + pool = await get_pool() + + fields = [] + values = [] + param_idx = 1 + + allowed_fields = [ + 'english', 'german', 'example_sentence', 'example_sentence_gap', + 'word_type', 'source_page' + ] + + for key, value in kwargs.items(): + if key in allowed_fields: + fields.append(f"{key} = ${param_idx}") + values.append(value) + param_idx += 1 + + if not fields: + return None + + values.append(uuid.UUID(entry_id)) + + async with pool.acquire() as conn: + row = await conn.fetchrow(f""" + UPDATE vocab_entries + SET {', '.join(fields)} + WHERE id = ${param_idx} + RETURNING * + """, *values) + + if row: + return _row_to_dict(row) + return None + + +async def clear_page_vocabulary_db(session_id: str, page: int) -> int: + """Clear all vocabulary for a specific page.""" + pool = await get_pool() + async with pool.acquire() as conn: + result = await conn.execute(""" + DELETE FROM vocab_entries + WHERE session_id = $1 AND source_page = $2 + """, uuid.UUID(session_id), page) + + # Update vocabulary count + await conn.execute(""" + UPDATE vocab_sessions + SET vocabulary_count = ( + SELECT COUNT(*) FROM vocab_entries WHERE session_id = $1 + ) + WHERE id = $1 + """, uuid.UUID(session_id)) + + # Return count of deleted rows + count = int(result.split()[-1]) if result else 0 + return count + + +# ============================================================================= +# WORKSHEET OPERATIONS +# ============================================================================= + +async def create_worksheet_db( + session_id: str, + worksheet_types: List[str], + pdf_path: Optional[str] = None, + solution_path: Optional[str] = None +) -> Dict[str, Any]: + """Create a worksheet record.""" + pool = await get_pool() + worksheet_id = str(uuid.uuid4()) + + async with pool.acquire() as conn: + row = await conn.fetchrow(""" + INSERT INTO vocab_worksheets ( + id, session_id, worksheet_types, pdf_path, solution_path + ) VALUES ($1, $2, $3, $4, $5) + RETURNING * + """, + uuid.UUID(worksheet_id), + uuid.UUID(session_id), + json.dumps(worksheet_types), + pdf_path, + solution_path + ) + + return _row_to_dict(row) + + +async def get_worksheet_db(worksheet_id: str) -> Optional[Dict[str, Any]]: + """Get a worksheet by ID.""" + pool = await get_pool() + async with pool.acquire() as conn: + row = await conn.fetchrow(""" + SELECT * FROM vocab_worksheets WHERE id = $1 + """, uuid.UUID(worksheet_id)) + + if row: + return _row_to_dict(row) + return None + + +async def delete_worksheets_for_session_db(session_id: str) -> int: + """Delete all worksheets for a session.""" + pool = await get_pool() + async with pool.acquire() as conn: + result = await conn.execute(""" + DELETE FROM vocab_worksheets WHERE session_id = $1 + """, uuid.UUID(session_id)) + + count = int(result.split()[-1]) if result else 0 + return count + + +# ============================================================================= +# PDF CACHE OPERATIONS +# ============================================================================= + +# Simple in-memory cache for PDF data (temporary until served) +_pdf_cache: Dict[str, bytes] = {} + + +def cache_pdf_data(worksheet_id: str, pdf_data: bytes) -> None: + """Cache PDF data temporarily for download.""" + _pdf_cache[worksheet_id] = pdf_data + + +def get_cached_pdf_data(worksheet_id: str) -> Optional[bytes]: + """Get cached PDF data.""" + return _pdf_cache.get(worksheet_id) + + +def clear_cached_pdf_data(worksheet_id: str) -> None: + """Clear cached PDF 
data.""" + _pdf_cache.pop(worksheet_id, None) + + +# ============================================================================= +# HELPER FUNCTIONS +# ============================================================================= + +def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]: + """Convert asyncpg Record to dict with proper type handling.""" + if row is None: + return {} + + result = dict(row) + + # Convert UUIDs to strings + for key in ['id', 'session_id']: + if key in result and result[key] is not None: + result[key] = str(result[key]) + + # Convert datetimes to ISO strings + for key in ['created_at', 'updated_at', 'generated_at']: + if key in result and result[key] is not None: + result[key] = result[key].isoformat() + + # Parse JSONB fields back to dicts/lists + for key in ['ocr_prompts', 'processed_pages', 'successful_pages', 'failed_pages', 'worksheet_types']: + if key in result and result[key] is not None: + if isinstance(result[key], str): + result[key] = json.loads(result[key]) + + return result diff --git a/klausur-service/backend/vocab/worksheet/__init__.py b/klausur-service/backend/vocab/worksheet/__init__.py new file mode 100644 index 0000000..93c3bae --- /dev/null +++ b/klausur-service/backend/vocab/worksheet/__init__.py @@ -0,0 +1,5 @@ +""" +Vocab worksheet sub-package. + +Main entry point: ``from vocab.worksheet.api import router`` +""" diff --git a/klausur-service/backend/vocab/worksheet/analysis_api.py b/klausur-service/backend/vocab/worksheet/analysis_api.py new file mode 100644 index 0000000..dd74c5b --- /dev/null +++ b/klausur-service/backend/vocab/worksheet/analysis_api.py @@ -0,0 +1,472 @@ +""" +Vocabulary Worksheet Analysis API - OCR export, ground truth labeling, +extract-with-boxes, deskewed images, and learning unit generation. + +The two large handlers (compare_ocr_methods, analyze_grid) live in +vocab_worksheet_compare_api.py and are included via compare_router. +""" + +from fastapi import APIRouter, Body, HTTPException +from fastapi.responses import StreamingResponse +from pydantic import BaseModel +from typing import Optional, Dict, Any +from datetime import datetime +import os +import io +import json +import logging + +def _get_sessions(): + from .api import _sessions + return _sessions + +def _get_local_storage_path(): + from .api import LOCAL_STORAGE_PATH + return LOCAL_STORAGE_PATH +from .generation import convert_pdf_page_to_image + +# Try to import Tesseract extractor +try: + from tesseract_vocab_extractor import ( + extract_bounding_boxes, TESSERACT_AVAILABLE, + ) +except ImportError: + TESSERACT_AVAILABLE = False + +# Try to import Grid Detection Service +try: + from services.grid_detection_service import GridDetectionService + GRID_SERVICE_AVAILABLE = True +except ImportError: + GRID_SERVICE_AVAILABLE = False + +logger = logging.getLogger(__name__) + +analysis_router = APIRouter() + +def _ocr_export_dir(): + return os.path.join(_get_local_storage_path(), "ocr-exports") + +def _ground_truth_dir(): + return os.path.join(_get_local_storage_path(), "ground-truth") + + +# ============================================================================= +# OCR Export Endpoints (for cross-app OCR data sharing) +# ============================================================================= + + +@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}") +async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)): + """ + Save OCR export data for cross-app sharing (admin-v2 -> studio-v2). 
+ + Both apps proxy to klausur-service via /klausur-api/, so this endpoint + serves as shared storage accessible from both ports. + """ + + logger.info(f"Saving OCR export for session {session_id}, page {page_number}") + + os.makedirs(_ocr_export_dir(), exist_ok=True) + + # Save the export data + export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") + with open(export_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + # Update latest pointer + latest_path = os.path.join(_ocr_export_dir(), "latest.json") + with open(latest_path, 'w', encoding='utf-8') as f: + json.dump({ + "session_id": session_id, + "page_number": page_number, + "saved_at": datetime.utcnow().isoformat(), + }, f, ensure_ascii=False, indent=2) + + return { + "success": True, + "session_id": session_id, + "page_number": page_number, + "message": "OCR export saved successfully", + } + + +@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}") +async def load_ocr_export(session_id: str, page_number: int): + """Load a specific OCR export by session and page number.""" + + export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") + + if not os.path.exists(export_path): + raise HTTPException(status_code=404, detail="OCR export not found") + + with open(export_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + return data + + +@analysis_router.get("/ocr-export/latest") +async def load_latest_ocr_export(): + """Load the most recently saved OCR export data.""" + + latest_path = os.path.join(_ocr_export_dir(), "latest.json") + + if not os.path.exists(latest_path): + raise HTTPException(status_code=404, detail="No OCR exports found") + + with open(latest_path, 'r', encoding='utf-8') as f: + pointer = json.load(f) + + session_id = pointer.get("session_id") + page_number = pointer.get("page_number") + + export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json") + + if not os.path.exists(export_path): + raise HTTPException(status_code=404, detail="Latest OCR export file not found") + + with open(export_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + return data + + +# ============================================================================= +# Extract with Boxes & Deskewed Image +# ============================================================================= + + +async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict: + """Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService. + + Returns dict with 'entries' list and 'image_width'/'image_height'. + Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex. + All bbox coordinates are in percent (0-100). 
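A hypothetical client for the latest-pointer endpoint above (base URL and any router prefix are assumptions; httpx is already a dependency of this service):

```python
import httpx

def fetch_latest_ocr_export(base: str = "http://klausur-service:8000") -> dict:
    # The server follows its latest.json pointer and returns the most
    # recently saved OCR export, whichever session/page produced it.
    resp = httpx.get(f"{base}/ocr-export/latest")
    resp.raise_for_status()  # 404 when no exports have been saved yet
    return resp.json()
```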
+ """ + if not TESSERACT_AVAILABLE: + raise HTTPException(status_code=500, detail="Tesseract not available") + if not GRID_SERVICE_AVAILABLE: + raise HTTPException(status_code=500, detail="GridDetectionService not available") + + # Step 1: Tesseract word-level bounding boxes + tess_result = await extract_bounding_boxes(image_bytes, lang=lang) + words = tess_result.get("words", []) + img_w = tess_result.get("image_width", 0) + img_h = tess_result.get("image_height", 0) + + if not words or img_w == 0 or img_h == 0: + return {"entries": [], "image_width": img_w, "image_height": img_h} + + # Step 2: Convert to OCR regions (percentage-based) + service = GridDetectionService() + regions = service.convert_tesseract_regions(words, img_w, img_h) + + if not regions: + return {"entries": [], "image_width": img_w, "image_height": img_h} + + # Step 3: Detect grid + grid_result = service.detect_grid(regions) + + if not grid_result.cells: + return {"entries": [], "image_width": img_w, "image_height": img_h} + + # Step 4: Group cells by logical_row and column_type + from services.grid_detection_service import ColumnType + + entries = [] + for row_idx, row_cells in enumerate(grid_result.cells): + en_text = "" + de_text = "" + ex_text = "" + en_bbox = None + de_bbox = None + ex_bbox = None + row_conf_sum = 0.0 + row_conf_count = 0 + + for cell in row_cells: + cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2), + "w": round(cell.width, 2), "h": round(cell.height, 2)} + + if cell.column_type == ColumnType.ENGLISH: + en_text = cell.text.strip() + en_bbox = cell_bbox + elif cell.column_type == ColumnType.GERMAN: + de_text = cell.text.strip() + de_bbox = cell_bbox + elif cell.column_type == ColumnType.EXAMPLE: + ex_text = cell.text.strip() + ex_bbox = cell_bbox + + if cell.text.strip(): + row_conf_sum += cell.confidence + row_conf_count += 1 + + # Skip completely empty rows + if not en_text and not de_text and not ex_text: + continue + + # Calculate whole-row bounding box + all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None] + if all_bboxes: + row_x = min(b["x"] for b in all_bboxes) + row_y = min(b["y"] for b in all_bboxes) + row_right = max(b["x"] + b["w"] for b in all_bboxes) + row_bottom = max(b["y"] + b["h"] for b in all_bboxes) + row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2), + "w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)} + else: + row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3} + + avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1) + + entries.append({ + "row_index": row_idx, + "english": en_text, + "german": de_text, + "example": ex_text, + "confidence": avg_conf, + "bbox": row_bbox, + "bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0}, + "bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0}, + "bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0}, + }) + + return {"entries": entries, "image_width": img_w, "image_height": img_h} + + +@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}") +async def extract_with_boxes(session_id: str, page_number: int): + """Extract vocabulary entries with bounding boxes for ground truth labeling. + + Uses Tesseract + GridDetectionService for spatial positioning. + page_number is 0-indexed. 
+ """ + logger.info(f"Extract with boxes for session {session_id}, page {page_number}") + + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + session = _get_sessions()[session_id] + pdf_data = session.get("pdf_data") + + if not pdf_data: + raise HTTPException(status_code=400, detail="No PDF uploaded for this session") + + page_count = session.get("pdf_page_count", 1) + if page_number < 0 or page_number >= page_count: + raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") + + # Convert page to hires image + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) + + # Deskew image before OCR + deskew_angle = 0.0 + try: + from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE + if CV2_AVAILABLE: + image_data, deskew_angle = deskew_image_by_word_alignment(image_data) + logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}") + except Exception as e: + logger.warning(f"Deskew failed for page {page_number}: {e}") + + # Cache deskewed image in session for later serving + if "deskewed_images" not in session: + session["deskewed_images"] = {} + session["deskewed_images"][str(page_number)] = image_data + + # Extract entries with boxes (now on deskewed image) + result = await extract_entries_with_boxes(image_data) + + # Cache in session + if "gt_entries" not in session: + session["gt_entries"] = {} + session["gt_entries"][str(page_number)] = result["entries"] + + return { + "success": True, + "entries": result["entries"], + "entry_count": len(result["entries"]), + "image_width": result["image_width"], + "image_height": result["image_height"], + "deskew_angle": round(deskew_angle, 2), + "deskewed": abs(deskew_angle) > 0.05, + } + + +@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}") +async def get_deskewed_image(session_id: str, page_number: int): + """Return the deskewed page image as PNG. + + Falls back to the original hires image if no deskewed version is cached. + """ + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + session = _get_sessions()[session_id] + deskewed = session.get("deskewed_images", {}).get(str(page_number)) + + if deskewed: + return StreamingResponse(io.BytesIO(deskewed), media_type="image/png") + + # Fallback: render original hires image + pdf_data = session.get("pdf_data") + if not pdf_data: + raise HTTPException(status_code=400, detail="No PDF uploaded for this session") + + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) + return StreamingResponse(io.BytesIO(image_data), media_type="image/png") + + +# ============================================================================= +# Ground Truth Labeling +# ============================================================================= + + +@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}") +async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)): + """Save ground truth labels for a page. + + Expects body with 'entries' list - each entry has english, german, example, + status ('confirmed' | 'edited' | 'skipped'), and bbox fields. 
+ """ + logger.info(f"Save ground truth for session {session_id}, page {page_number}") + + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + entries = data.get("entries", []) + if not entries: + raise HTTPException(status_code=400, detail="No entries provided") + + # Save in session + session = _get_sessions()[session_id] + if "ground_truth" not in session: + session["ground_truth"] = {} + session["ground_truth"][str(page_number)] = entries + + # Also save to disk + os.makedirs(_ground_truth_dir(), exist_ok=True) + gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json") + gt_data = { + "session_id": session_id, + "page_number": page_number, + "saved_at": datetime.now().isoformat(), + "entry_count": len(entries), + "entries": entries, + } + with open(gt_path, 'w', encoding='utf-8') as f: + json.dump(gt_data, f, ensure_ascii=False, indent=2) + + logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}") + + confirmed = sum(1 for e in entries if e.get("status") == "confirmed") + edited = sum(1 for e in entries if e.get("status") == "edited") + skipped = sum(1 for e in entries if e.get("status") == "skipped") + + return { + "success": True, + "saved_count": len(entries), + "confirmed": confirmed, + "edited": edited, + "skipped": skipped, + "file_path": gt_path, + } + + +@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}") +async def load_ground_truth(session_id: str, page_number: int): + """Load saved ground truth for a page.""" + logger.info(f"Load ground truth for session {session_id}, page {page_number}") + + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + # Try session cache first + session = _get_sessions()[session_id] + cached = session.get("ground_truth", {}).get(str(page_number)) + if cached: + return {"success": True, "entries": cached, "source": "cache"} + + # Try disk + gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json") + if not os.path.exists(gt_path): + raise HTTPException(status_code=404, detail="No ground truth found for this page") + + with open(gt_path, 'r', encoding='utf-8') as f: + gt_data = json.load(f) + + return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"} + + +# ─── Learning Module Generation ───────────────────────────────────────────── + + +class GenerateLearningUnitRequest(BaseModel): + grade: Optional[str] = None + generate_modules: bool = True + + +@analysis_router.post("/sessions/{session_id}/generate-learning-unit") +async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None): + """ + Create a Learning Unit from the vocabulary in this session. + + 1. Takes vocabulary from the session + 2. Creates a Learning Unit in backend-lehrer + 3. Optionally triggers MC/Cloze/QA generation + + Returns the created unit info and generation status. 
+ """ + if request is None: + request = GenerateLearningUnitRequest() + + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + session = _get_sessions()[session_id] + vocabulary = session.get("vocabulary", []) + + if not vocabulary: + raise HTTPException(status_code=400, detail="No vocabulary in this session") + + try: + from vocab.learn_bridge import create_learning_unit, generate_learning_modules + + # Step 1: Create Learning Unit + result = await create_learning_unit( + session_name=session["name"], + vocabulary=vocabulary, + grade=request.grade, + ) + + # Step 2: Generate modules if requested + if request.generate_modules: + try: + gen_result = await generate_learning_modules( + unit_id=result["unit_id"], + analysis_path=result["analysis_path"], + ) + result["generation"] = gen_result + except Exception as e: + logger.warning(f"Module generation failed (unit created): {e}") + result["generation"] = {"status": "error", "reason": str(e)} + + return result + + except ImportError: + raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available") + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + raise HTTPException(status_code=502, detail=str(e)) + + +# ============================================================================= +# Include compare_ocr_methods & analyze_grid from companion module +# ============================================================================= + +from .compare_api import compare_router # noqa: E402 + +analysis_router.include_router(compare_router) diff --git a/klausur-service/backend/vocab/worksheet/api.py b/klausur-service/backend/vocab/worksheet/api.py new file mode 100644 index 0000000..cb2832c --- /dev/null +++ b/klausur-service/backend/vocab/worksheet/api.py @@ -0,0 +1,498 @@ +""" +Vocabulary Worksheet API — core CRUD routes for sessions, uploads, +vocabulary editing, worksheet generation, and PDF downloads. 
+
+Sub-routers (included at bottom):
+- upload_api: PDF upload, thumbnails, page processing
+- analysis_api: OCR compare, grid analysis, ground truth
+"""
+
+from fastapi import APIRouter, HTTPException, UploadFile, File, Query
+from fastapi.responses import StreamingResponse
+from typing import List, Dict, Any
+from datetime import datetime
+import uuid
+import os
+import io
+import logging
+
+logger = logging.getLogger(__name__)
+
+# --- Imports from extracted sub-modules ---
+from .models import (
+    SessionStatus,
+    VocabularyEntry,
+    SessionCreate,
+    SessionResponse,
+    VocabularyResponse,
+    VocabularyUpdate,
+    WorksheetGenerateRequest,
+    WorksheetResponse,
+)
+from .extraction import extract_vocabulary_from_image
+from .generation import (
+    generate_worksheet_html, generate_worksheet_pdf,
+    convert_pdf_page_to_image,
+)
+
+# --- Database integration (used by main.py lifespan) ---
+try:
+    from vocab.session_store import (
+        DATABASE_URL, get_pool, init_vocab_tables,
+        list_sessions_db, get_session_db,
+    )
+except ImportError:
+    DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@postgres:5432/breakpilot_db")
+    get_pool = None
+    init_vocab_tables = None
+    list_sessions_db = None
+    get_session_db = None
+
+_db_pool = None
+
+
+def set_db_pool(pool):
+    """Set the database connection pool (called from main.py lifespan)."""
+    global _db_pool
+    _db_pool = pool
+
+
+async def _init_vocab_table():
+    """Initialize vocab tables in database."""
+    if init_vocab_tables:
+        try:
+            await init_vocab_tables()
+            logger.info("vocab_session_cache table ready")
+        except Exception as e:
+            logger.warning(f"Failed to init vocab tables: {e}")
+    else:
+        logger.info("vocab_session_cache table ready")
+
+
+async def _load_all_sessions():
+    """Load all vocab sessions from database into memory cache."""
+    if not list_sessions_db:
+        logger.info("Loaded 0 vocab sessions from database")
+        return
+
+    try:
+        sessions = await list_sessions_db(limit=500)
+        count = 0
+        for s in sessions:
+            sid = s.get("id") or s.get("session_id")
+            if sid and sid not in _sessions:
+                _sessions[sid] = {
+                    "id": sid,
+                    "name": s.get("name", ""),
+                    "description": s.get("description", ""),
+                    "status": s.get("status", "created"),
+                    "vocabulary_count": s.get("vocabulary_count", 0),
+                    "source_language": s.get("source_language", "en"),
+                    "target_language": s.get("target_language", "de"),
+                    "created_at": str(s.get("created_at", "")),
+                }
+                count += 1
+        logger.info(f"Loaded {count} vocab sessions from database")
+    except Exception as e:
+        logger.warning(f"Failed to load sessions from database: {e}")
+
+
+# --- Router & module-level state ---
+router = APIRouter(prefix="/api/v1/vocab", tags=["Vocabulary Worksheets"])
+LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
+_sessions: Dict[str, Dict[str, Any]] = {}
+_worksheets: Dict[str, Dict[str, Any]] = {}
+
+
+@router.post("/sessions", response_model=SessionResponse)
+async def create_session(session: SessionCreate):
+    """Create a new vocabulary extraction session."""
+    session_id = str(uuid.uuid4())
+
+    session_data = {
+        "id": session_id,
+        "name": session.name,
+        "description": session.description,
+        "source_language": session.source_language,
+        "target_language": session.target_language,
+        "status": SessionStatus.PENDING.value,
+        "vocabulary": [],
+        "vocabulary_count": 0,
+        "image_path": None,
+        "extraction_confidence": None,
+        "created_at": datetime.utcnow(),
+    }
+
+    _sessions[session_id] = session_data
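+    # The in-memory dict is the runtime source of truth; _load_all_sessions
+    # re-hydrates it from the database on startup (see the lifespan helpers above).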
+
+    # Create storage directory
+    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
+    os.makedirs(session_dir, exist_ok=True)
+
+    return SessionResponse(
+        id=session_id,
+        name=session.name,
+        description=session.description,
+        source_language=session.source_language,
+        target_language=session.target_language,
+        status=SessionStatus.PENDING.value,
+        vocabulary_count=0,
+        image_path=None,
+        created_at=session_data["created_at"],
+    )
+
+
+@router.get("/sessions", response_model=List[SessionResponse])
+async def list_sessions(limit: int = Query(50, ge=1, le=100)):
+    """List all vocabulary sessions."""
+    sessions = sorted(
+        _sessions.values(),
+        key=lambda x: x["created_at"],
+        reverse=True
+    )[:limit]
+
+    return [
+        SessionResponse(
+            id=s["id"],
+            name=s["name"],
+            description=s.get("description"),
+            source_language=s["source_language"],
+            target_language=s["target_language"],
+            status=s["status"],
+            vocabulary_count=s.get("vocabulary_count", 0),
+            image_path=s.get("image_path"),
+            created_at=s["created_at"],
+        )
+        for s in sessions
+    ]
+
+
+@router.get("/sessions/{session_id}", response_model=SessionResponse)
+async def get_session(session_id: str):
+    """Get a specific session."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    s = _sessions[session_id]
+    return SessionResponse(
+        id=s["id"],
+        name=s["name"],
+        description=s.get("description"),
+        source_language=s["source_language"],
+        target_language=s["target_language"],
+        status=s["status"],
+        vocabulary_count=s.get("vocabulary_count", 0),
+        image_path=s.get("image_path"),
+        created_at=s["created_at"],
+    )
+
+
+@router.post("/sessions/{session_id}/upload")
+async def upload_image(
+    session_id: str,
+    file: UploadFile = File(...),
+):
+    """
+    Upload a textbook page image or PDF and extract vocabulary.
+
+    Supported formats: PNG, JPG, JPEG, PDF
+    """
+    logger.info(f"Upload request for session {session_id}")
+    logger.info(f"File: filename={file.filename}, content_type={file.content_type}")
+
+    if session_id not in _sessions:
+        logger.error(f"Session {session_id} not found")
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    session = _sessions[session_id]
+
+    # Validate file type - check both extension and content type
+    extension = file.filename.split('.')[-1].lower() if file.filename else ''
+    content_type = file.content_type or ''
+
+    # Accept images and PDFs
+    valid_image_extensions = ['png', 'jpg', 'jpeg']
+    valid_image_content_types = ['image/png', 'image/jpeg', 'image/jpg']
+    is_pdf = extension == 'pdf' or content_type == 'application/pdf'
+    is_image = extension in valid_image_extensions or content_type in valid_image_content_types
+
+    if not is_pdf and not is_image:
+        logger.error(f"Invalid file type: extension={extension}, content_type={content_type}")
+        raise HTTPException(
+            status_code=400,
+            detail=f"Only PNG, JPG, JPEG, PDF files are supported. Got: extension={extension}, content_type={content_type}"
+        )
+
+    # Determine final extension for saving
+    if is_pdf:
+        save_extension = 'png'  # PDFs will be converted to PNG
+    elif extension in valid_image_extensions:
+        save_extension = extension
+    elif content_type == 'image/png':
+        save_extension = 'png'
+    else:
+        save_extension = 'jpg'
+
+    # Read file content
+    content = await file.read()
+    logger.info(f"Read {len(content)} bytes from uploaded file")
+
+    # Convert PDF to image if needed
+    if is_pdf:
+        logger.info("Converting PDF to image...")
+        content = await convert_pdf_page_to_image(content, page_number=0)
+        logger.info(f"PDF converted, image size: {len(content)} bytes")
+
+    # Save image
+    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
+    os.makedirs(session_dir, exist_ok=True)
+    image_path = os.path.join(session_dir, f"source.{save_extension}")
+
+    with open(image_path, 'wb') as f:
+        f.write(content)
+
+    # Update session status
+    session["status"] = SessionStatus.PROCESSING.value
+    session["image_path"] = image_path
+
+    # Extract vocabulary using Vision LLM
+    vocabulary, confidence, error = await extract_vocabulary_from_image(content, file.filename or "image.png", page_number=0)
+
+    # Update session with extracted vocabulary
+    session["vocabulary"] = [v.dict() for v in vocabulary]
+    session["vocabulary_count"] = len(vocabulary)
+    session["extraction_confidence"] = confidence
+    session["status"] = SessionStatus.EXTRACTED.value
+
+    result = {
+        "session_id": session_id,
+        "filename": file.filename,
+        "image_path": image_path,
+        "vocabulary_count": len(vocabulary),
+        "extraction_confidence": confidence,
+        "status": SessionStatus.EXTRACTED.value,
+    }
+
+    if error:
+        result["error"] = error
+
+    return result
+
+
+@router.get("/sessions/{session_id}/vocabulary", response_model=VocabularyResponse)
+async def get_vocabulary(session_id: str):
+    """Get extracted vocabulary for a session."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+    session = _sessions[session_id]
+    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
+    return VocabularyResponse(
+        session_id=session_id,
+        vocabulary=vocabulary,
+        extraction_confidence=session.get("extraction_confidence"),
+    )
+
+
+@router.put("/sessions/{session_id}/vocabulary")
+async def update_vocabulary(session_id: str, update: VocabularyUpdate):
+    """Update vocabulary entries (for manual corrections)."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    session = _sessions[session_id]
+    session["vocabulary"] = [v.dict() for v in update.vocabulary]
+    session["vocabulary_count"] = len(update.vocabulary)
+
+    return {
+        "session_id": session_id,
+        "vocabulary_count": len(update.vocabulary),
+        "message": "Vocabulary updated successfully",
+    }
+
+
+@router.post("/sessions/{session_id}/generate", response_model=WorksheetResponse)
+async def generate_worksheet(session_id: str, request: WorksheetGenerateRequest):
+    """Generate worksheet PDF(s) from extracted vocabulary."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    session = _sessions[session_id]
+    vocabulary = [VocabularyEntry(**v) for v in session.get("vocabulary", [])]
+
+    if not vocabulary:
+        raise HTTPException(status_code=400, detail="No vocabulary to generate worksheet from")
+
+    worksheet_id = str(uuid.uuid4())
+    title = request.title or session["name"]
+
+    # Generate HTML for each worksheet type
+    combined_html = ""
+    for wtype in request.worksheet_types:
+        html = generate_worksheet_html(
+            vocabulary=vocabulary,
+            worksheet_type=wtype,
+            title=f"{title} - {wtype.value}",
+            show_solutions=False,
+            repetitions=request.repetitions,
+            line_height=request.line_height,
+        )
+        combined_html += html + '<div style="page-break-after: always;"></div>'
+
+    # Generate PDF
+    try:
+        pdf_bytes = await generate_worksheet_pdf(combined_html)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"PDF generation failed: {e}")
+
+    # Save PDF
+    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
+    pdf_path = os.path.join(session_dir, f"worksheet_{worksheet_id}.pdf")
+    with open(pdf_path, 'wb') as f:
+        f.write(pdf_bytes)
+
+    # Generate solution PDF if requested
+    solution_path = None
+    if request.include_solutions:
+        solution_html = ""
+        for wtype in request.worksheet_types:
+            html = generate_worksheet_html(
+                vocabulary=vocabulary,
+                worksheet_type=wtype,
+                title=f"{title} - {wtype.value} (Loesung)",
+                show_solutions=True,
+                repetitions=request.repetitions,
+                line_height=request.line_height,
+            )
+            solution_html += html + '<div style="page-break-after: always;"></div>'
+
+        solution_bytes = await generate_worksheet_pdf(solution_html)
+        solution_path = os.path.join(session_dir, f"solution_{worksheet_id}.pdf")
+        with open(solution_path, 'wb') as f:
+            f.write(solution_bytes)
+
+    # Store worksheet info
+    worksheet_data = {
+        "id": worksheet_id,
+        "session_id": session_id,
+        "worksheet_types": [wt.value for wt in request.worksheet_types],
+        "pdf_path": pdf_path,
+        "solution_path": solution_path,
+        "generated_at": datetime.utcnow(),
+    }
+    _worksheets[worksheet_id] = worksheet_data
+
+    # Update session status
+    session["status"] = SessionStatus.COMPLETED.value
+
+    return WorksheetResponse(
+        id=worksheet_id,
+        session_id=session_id,
+        worksheet_types=worksheet_data["worksheet_types"],
+        pdf_path=pdf_path,
+        solution_path=solution_path,
+        generated_at=worksheet_data["generated_at"],
+    )
+
+
+@router.get("/worksheets/{worksheet_id}/pdf")
+async def download_worksheet_pdf(worksheet_id: str):
+    """Download the generated worksheet PDF."""
+    if worksheet_id not in _worksheets:
+        raise HTTPException(status_code=404, detail="Worksheet not found")
+
+    worksheet = _worksheets[worksheet_id]
+    pdf_path = worksheet["pdf_path"]
+
+    if not os.path.exists(pdf_path):
+        raise HTTPException(status_code=404, detail="PDF file not found")
+
+    with open(pdf_path, 'rb') as f:
+        pdf_bytes = f.read()
+
+    return StreamingResponse(
+        io.BytesIO(pdf_bytes),
+        media_type="application/pdf",
+        headers={"Content-Disposition": f"attachment; filename=worksheet_{worksheet_id}.pdf"}
+    )
+
+
+@router.get("/worksheets/{worksheet_id}/solution")
+async def download_solution_pdf(worksheet_id: str):
+    """Download the solution PDF."""
+    if worksheet_id not in _worksheets:
+        raise HTTPException(status_code=404, detail="Worksheet not found")
+
+    worksheet = _worksheets[worksheet_id]
+    solution_path = worksheet.get("solution_path")
+
+    if not solution_path or not os.path.exists(solution_path):
+        raise HTTPException(status_code=404, detail="Solution PDF not found")
+
+    with open(solution_path, 'rb') as f:
+        pdf_bytes = f.read()
+
+    return StreamingResponse(
+        io.BytesIO(pdf_bytes),
+        media_type="application/pdf",
+        headers={"Content-Disposition": f"attachment; filename=solution_{worksheet_id}.pdf"}
+    )
+
+
+@router.get("/sessions/{session_id}/image")
+async def get_session_image(session_id: str):
+    """Get the uploaded source image for a session."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    session = _sessions[session_id]
+    image_path = session.get("image_path")
+
+    if not image_path or not os.path.exists(image_path):
+        raise HTTPException(status_code=404, detail="Image not found")
+
+    # Determine content type
+    extension = image_path.split('.')[-1].lower()
+    content_type = {
+        'png': 'image/png',
+        'jpg': 'image/jpeg',
+        'jpeg': 'image/jpeg',
+    }.get(extension, 'application/octet-stream')
+
+    with open(image_path, 'rb') as f:
+        image_bytes = f.read()
+
+    return StreamingResponse(
+        io.BytesIO(image_bytes),
+        media_type=content_type,
+    )
+
+
+@router.delete("/sessions/{session_id}")
+async def delete_session(session_id: str):
+    """Delete a vocabulary session and all associated files."""
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    # Delete session directory
+    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
+    if os.path.exists(session_dir):
+        import shutil
+        shutil.rmtree(session_dir)
+
+    # Remove from storage
+    del _sessions[session_id]
+
+    # Remove associated worksheets
+    for wid, ws in list(_worksheets.items()):
+        if ws["session_id"] == session_id:
+            del _worksheets[wid]
+
+    return {"message": "Session deleted successfully", "session_id": session_id}
+
+
+# --- Include sub-routers ---
+from .upload_api import upload_router
+from .analysis_api import analysis_router
+
+router.include_router(upload_router)
+router.include_router(analysis_router)
diff --git a/klausur-service/backend/vocab/worksheet/compare_api.py b/klausur-service/backend/vocab/worksheet/compare_api.py
new file mode 100644
index 0000000..88d1df1
--- /dev/null
+++ b/klausur-service/backend/vocab/worksheet/compare_api.py
@@ -0,0 +1,542 @@
+"""
+Vocabulary Worksheet Compare & Grid Analysis API.
+
+Split from vocab_worksheet_analysis_api.py — contains the two largest
+route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
+"""
+
+from fastapi import APIRouter, HTTPException, Query
+import base64
+import json
+import logging
+import os
+
+from .extraction import extract_vocabulary_from_image
+from .generation import convert_pdf_page_to_image
+
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
+
+
+def _get_sessions():
+    # Lazy accessor: avoids a circular import with .api at module load time.
+    from .api import _sessions
+    return _sessions
+
+
+# Try to import Tesseract extractor
+try:
+    from tesseract_vocab_extractor import (
+        run_tesseract_pipeline,
+        match_positions_to_vocab, TESSERACT_AVAILABLE,
+    )
+except ImportError:
+    TESSERACT_AVAILABLE = False
+
+# Try to import CV Pipeline
+try:
+    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
+except ImportError:
+    CV_PIPELINE_AVAILABLE = False
+
+# Try to import Grid Detection Service
+try:
+    from services.grid_detection_service import GridDetectionService
+    GRID_SERVICE_AVAILABLE = True
+except ImportError:
+    GRID_SERVICE_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+compare_router = APIRouter()
+
+
+# =============================================================================
+# OCR Compare & Grid Analysis Endpoints
+# =============================================================================
+
+
+@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
+async def compare_ocr_methods(session_id: str, page_number: int):
+    """
+    Run multiple OCR methods on a page and compare results.
+
+    This endpoint:
+    1. Gets the page image from the session's uploaded PDF
+    2. Runs Vision LLM extraction (primary method)
+    3. Optionally runs Tesseract extraction
+    4. Compares found vocabulary across methods
+    5. Returns structured comparison results
+
+    page_number is 0-indexed.
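+
+    Illustrative response skeleton (key names from the return value below;
+    figures invented):
+
+        {"methods": {"vision_llm": {...}, "tesseract": {...}, "cv_pipeline": {...}},
+         "comparison": {"total_unique_vocabulary": 42, "agreement_rate": 0.8},
+         "recommendation": {"best_method": "vision_llm", "reason": "..."}}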
+ """ + import time + + logger.info(f"Compare OCR for session {session_id}, page {page_number}") + + if session_id not in _get_sessions(): + raise HTTPException(status_code=404, detail="Session not found") + + session = _get_sessions()[session_id] + pdf_data = session.get("pdf_data") + + if not pdf_data: + raise HTTPException(status_code=400, detail="No PDF uploaded for this session") + + page_count = session.get("pdf_page_count", 1) + if page_number < 0 or page_number >= page_count: + raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).") + + # Convert page to image + image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False) + + methods_results = {} + all_vocab_sets = {} + + # --- Method: Vision LLM --- + try: + start = time.time() + vocab, confidence, error = await extract_vocabulary_from_image( + image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False + ) + duration = time.time() - start + + vocab_list = [] + for v in vocab: + entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v)) + vocab_list.append({ + "english": entry.get("english", ""), + "german": entry.get("german", ""), + "example": entry.get("example_sentence", ""), + }) + + methods_results["vision_llm"] = { + "name": "Vision LLM", + "model": VISION_MODEL, + "duration_seconds": round(duration, 1), + "vocabulary_count": len(vocab_list), + "vocabulary": vocab_list, + "confidence": confidence, + "success": len(vocab_list) > 0 and not error, + "error": error if error else None, + } + all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]} + except Exception as e: + logger.error(f"Vision LLM failed: {e}") + methods_results["vision_llm"] = { + "name": "Vision LLM", + "model": VISION_MODEL, + "duration_seconds": 0, + "vocabulary_count": 0, + "vocabulary": [], + "confidence": 0, + "success": False, + "error": str(e), + } + all_vocab_sets["vision_llm"] = set() + + # --- Method: Tesseract OCR (bounding boxes + vocab extraction) --- + if TESSERACT_AVAILABLE: + try: + start = time.time() + tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu") + duration = time.time() - start + + tess_vocab = tess_result.get("vocabulary", []) + tess_words = tess_result.get("words", []) + + # Store Tesseract words in session for later use (grid analysis, position matching) + session["tesseract_words"] = tess_words + session["tesseract_image_width"] = tess_result.get("image_width", 0) + session["tesseract_image_height"] = tess_result.get("image_height", 0) + session[f"tesseract_page_{page_number}"] = tess_result + + vocab_list_tess = [] + for v in tess_vocab: + vocab_list_tess.append({ + "english": v.get("english", ""), + "german": v.get("german", ""), + "example": v.get("example", ""), + }) + + methods_results["tesseract"] = { + "name": "Tesseract OCR", + "model": "tesseract-ocr (eng+deu)", + "duration_seconds": round(duration, 1), + "vocabulary_count": len(vocab_list_tess), + "vocabulary": vocab_list_tess, + "confidence": 0.7 if tess_vocab else 0, + "success": len(vocab_list_tess) > 0, + "error": tess_result.get("error"), + "word_count": tess_result.get("word_count", 0), + "columns_detected": len(tess_result.get("columns", [])), + } + all_vocab_sets["tesseract"] = { + (v["english"].lower().strip(), v["german"].lower().strip()) + for v in vocab_list_tess if v["english"] and v["german"] + } + + # 
+            # Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
+            if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
+                llm_vocab_with_bbox = match_positions_to_vocab(
+                    tess_words,
+                    methods_results["vision_llm"]["vocabulary"],
+                    tess_result.get("image_width", 1),
+                    tess_result.get("image_height", 1),
+                )
+                methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
+
+        except Exception as e:
+            logger.error(f"Tesseract failed: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+            methods_results["tesseract"] = {
+                "name": "Tesseract OCR",
+                "model": "tesseract-ocr",
+                "duration_seconds": 0,
+                "vocabulary_count": 0,
+                "vocabulary": [],
+                "confidence": 0,
+                "success": False,
+                "error": str(e),
+            }
+            all_vocab_sets["tesseract"] = set()
+
+    # --- Method: CV Pipeline (Document Reconstruction) ---
+    if CV_PIPELINE_AVAILABLE:
+        try:
+            start = time.time()
+            cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
+            duration = time.time() - start
+
+            cv_vocab = cv_result.vocabulary if not cv_result.error else []
+            vocab_list_cv = []
+            for v in cv_vocab:
+                vocab_list_cv.append({
+                    "english": v.get("english", ""),
+                    "german": v.get("german", ""),
+                    "example": v.get("example", ""),
+                })
+
+            methods_results["cv_pipeline"] = {
+                "name": "CV Pipeline (Document Reconstruction)",
+                "model": "opencv + tesseract (multi-pass)",
+                "duration_seconds": round(duration, 1),
+                "vocabulary_count": len(vocab_list_cv),
+                "vocabulary": vocab_list_cv,
+                "confidence": 0.8 if cv_vocab else 0,
+                "success": len(vocab_list_cv) > 0,
+                "error": cv_result.error,
+                "word_count": cv_result.word_count,
+                "columns_detected": cv_result.columns_detected,
+                "stages": cv_result.stages,
+            }
+            all_vocab_sets["cv_pipeline"] = {
+                (v["english"].lower().strip(), v["german"].lower().strip())
+                for v in vocab_list_cv if v["english"] and v["german"]
+            }
+
+        except Exception as e:
+            logger.error(f"CV Pipeline failed: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+            methods_results["cv_pipeline"] = {
+                "name": "CV Pipeline (Document Reconstruction)",
+                "model": "opencv + tesseract (multi-pass)",
+                "duration_seconds": 0,
+                "vocabulary_count": 0,
+                "vocabulary": [],
+                "confidence": 0,
+                "success": False,
+                "error": str(e),
+            }
+            all_vocab_sets["cv_pipeline"] = set()
+
+    # --- Build comparison ---
+    all_unique = set()
+    for vs in all_vocab_sets.values():
+        all_unique |= vs
+
+    found_by_all = []
+    found_by_some = []
+    for english, german in sorted(all_unique):
+        found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
+        entry = {"english": english, "german": german, "methods": found_in}
+        if len(found_in) == len(all_vocab_sets):
+            found_by_all.append(entry)
+        else:
+            found_by_some.append(entry)
+
+    agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
+
+    # Find the best method (most vocabulary pairs found)
+    best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
+
+    return {
+        "session_id": session_id,
+        "page_number": page_number,
+        "methods": methods_results,
+        "comparison": {
+            "found_by_all_methods": found_by_all,
+            "found_by_some_methods": found_by_some,
+            "total_unique_vocabulary": len(all_unique),
+            "agreement_rate": agreement_rate,
+        },
+        "recommendation": {
+            "best_method": best_method,
+            "reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
+        },
+    }
+
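+
+# Illustrative call sequence (paths relative to the vocab router prefix):
+#   POST /sessions/{id}/compare-ocr/0   -> runs all engines, caches word boxes
+#   POST /sessions/{id}/analyze-grid/0  -> reuses the cached boxes if present
+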
+@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
+async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
+    """
+    Analyze the grid/table structure of a vocabulary page.
+
+    Hybrid approach:
+    1. If Tesseract bounding boxes are available (from compare-ocr), use them for
+       real spatial positions via GridDetectionService.
+    2. Otherwise fall back to Vision LLM for grid structure detection.
+
+    page_number is 0-indexed.
+    Returns the GridData structure expected by the frontend GridOverlay component.
+    """
+    import httpx
+
+    logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
+
+    if session_id not in _get_sessions():
+        raise HTTPException(status_code=404, detail="Session not found")
+
+    session = _get_sessions()[session_id]
+    pdf_data = session.get("pdf_data")
+
+    if not pdf_data:
+        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
+
+    page_count = session.get("pdf_page_count", 1)
+    if page_number < 0 or page_number >= page_count:
+        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
+
+    # Convert page to image
+    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
+
+    # --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
+    tess_page_data = session.get(f"tesseract_page_{page_number}")
+
+    if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
+        try:
+            # Run Tesseract if not already cached
+            if not tess_page_data:
+                logger.info("Running Tesseract for grid analysis (not cached)")
+                from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
+                tess_page_data = await _run_tess(image_data, lang="eng+deu")
+                session[f"tesseract_page_{page_number}"] = tess_page_data
+                session["tesseract_words"] = tess_page_data.get("words", [])
+                session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
+                session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
+
+            tess_words = tess_page_data.get("words", [])
+            img_w = tess_page_data.get("image_width", 0)
+            img_h = tess_page_data.get("image_height", 0)
+
+            if tess_words and img_w > 0 and img_h > 0:
+                service = GridDetectionService()
+                regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
+
+                if regions:
+                    grid_result = service.detect_grid(regions)
+                    grid_dict = grid_result.to_dict()
+
+                    # Merge LLM text if available (better quality than Tesseract text)
+                    # The LLM vocab was stored during compare-ocr
+                    grid_dict["source"] = "tesseract+grid_service"
+                    grid_dict["word_count"] = len(tess_words)
+
+                    logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
+                                f"{grid_result.stats.get('recognized', 0)} recognized")
+
+                    return {"success": True, "grid": grid_dict}
+
+            logger.info("Tesseract data insufficient, falling back to LLM")
+
+        except Exception as e:
+            logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+
+    # --- Strategy 2: Fall back to Vision LLM ---
+    image_base64 = base64.b64encode(image_data).decode("utf-8")
+
+    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
+
+Your task: Identify the TABLE STRUCTURE and extract each cell's content.
+
+Return a JSON object with this EXACT structure:
+{
+  "rows":

+            html += f'<tr><td>{entry.english}</td><td>{entry.german}</td></tr>'
+            html += f'<tr><td>{entry.english}</td><td></td></tr>'
+            html += f'<tr><td>{entry.german}</td><td>{entry.english}</td></tr>'
+            html += f'<tr><td>{entry.german}</td><td></td></tr>'
+            html += f'<tr><td>{entry.english}</td><td>'
+            html += ''
+            if show_solutions:
+                html += f' {entry.english} ' * repetitions
+            html += '</td></tr>'
+            html += f'<p>{i}. {gap_sentence}</p>'
+            if show_solutions:
+                html += f'<p>Loesung: {entry.english}</p>'
+            else:
+                html += f'<p>({entry.german})</p>'
+            html += ''

-            html += f'<tr><td>{entry.english}</td><td>{entry.german}</td></tr>'
-            html += f'<tr><td>{entry.english}</td><td></td></tr>'
-            html += f'<tr><td>{entry.german}</td><td>{entry.english}</td></tr>'
-            html += f'<tr><td>{entry.german}</td><td></td></tr>'
-            html += f'<tr><td>{entry.english}</td><td>'
-            html += ''
-            if show_solutions:
-                html += f' {entry.english} ' * repetitions
-            html += '</td></tr>'
-            html += f'<p>{i}. {gap_sentence}</p>'
-            if show_solutions:
-                html += f'<p>Loesung: {entry.english}</p>'
-            else:
-                html += f'<p>({entry.german})</p>'
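
# A minimal, self-contained illustration of the row-rendering pattern in the
# truncated hunks above (entry fields, worksheet variants, and tags are
# assumptions, not the shipped generator):

from dataclasses import dataclass
from typing import List

@dataclass
class Entry:
    english: str
    german: str

def render_table(entries: List[Entry], reverse: bool = False,
                 blank_right: bool = False) -> str:
    # Build <tr><td>...</td></tr> rows like the generator; with
    # blank_right=True the answer cell is left empty for the learner.
    html = '<table>'
    for e in entries:
        left, right = (e.german, e.english) if reverse else (e.english, e.german)
        if blank_right:
            right = ''
        html += f'<tr><td>{left}</td><td>{right}</td></tr>'
    return html + '</table>'

print(render_table([Entry("house", "Haus")], blank_right=True))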