""" Grid Editor — header/heading detection and colspan (merged cell) detection. Split from grid_editor_helpers.py. Pure computation, no HTTP/DB side effects. Lizenz: Apache 2.0 | DATENSCHUTZ: Alle Verarbeitung erfolgt lokal. """ import logging import re from typing import Any, Dict, List, Optional from cv_ocr_engines import _text_has_garbled_ipa logger = logging.getLogger(__name__) def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int) -> int: """Detect heading rows by color + height after color annotation. A row is a heading if: 1. ALL word_boxes have color_name != 'black' (typically 'blue') 2. Mean word height > 1.2x median height of all words in the zone Detected heading rows are merged into a single spanning cell. Returns count of headings detected. """ heading_count = 0 for z in zones_data: cells = z.get("cells", []) rows = z.get("rows", []) columns = z.get("columns", []) if not cells or not rows or len(columns) < 2: continue # Compute median word height across the zone all_heights = [] for cell in cells: for wb in cell.get("word_boxes") or []: h = wb.get("height", 0) if h > 0: all_heights.append(h) if not all_heights: continue all_heights_sorted = sorted(all_heights) median_h = all_heights_sorted[len(all_heights_sorted) // 2] heading_row_indices = [] for row in rows: if row.get("is_header"): continue # already detected as header ri = row["index"] row_cells = [c for c in cells if c.get("row_index") == ri] row_wbs = [ wb for cell in row_cells for wb in cell.get("word_boxes") or [] ] if not row_wbs: continue # Condition 1: ALL words are non-black all_colored = all( wb.get("color_name", "black") != "black" for wb in row_wbs ) if not all_colored: continue # Condition 2: mean height > 1.2x median mean_h = sum(wb.get("height", 0) for wb in row_wbs) / len(row_wbs) if mean_h <= median_h * 1.2: continue heading_row_indices.append(ri) # Merge heading cells into spanning cells for hri in heading_row_indices: header_cells = [c for c in cells if c.get("row_index") == hri] if len(header_cells) <= 1: # Single cell -- just mark it as heading if header_cells: header_cells[0]["col_type"] = "heading" heading_count += 1 # Mark row as header for row in rows: if row["index"] == hri: row["is_header"] = True continue # Collect all word_boxes and text from all columns all_wb = [] all_text_parts = [] for hc in sorted(header_cells, key=lambda c: c["col_index"]): all_wb.extend(hc.get("word_boxes", [])) if hc.get("text", "").strip(): all_text_parts.append(hc["text"].strip()) # Remove all cells for this row, replace with one spanning cell z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] if all_wb: x_min = min(wb["left"] for wb in all_wb) y_min = min(wb["top"] for wb in all_wb) x_max = max(wb["left"] + wb["width"] for wb in all_wb) y_max = max(wb["top"] + wb["height"] for wb in all_wb) # Use the actual starting col_index from the first cell first_col = min(hc["col_index"] for hc in header_cells) zone_idx = z.get("zone_index", 0) z["cells"].append({ "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}", "zone_index": zone_idx, "row_index": hri, "col_index": first_col, "col_type": "heading", "text": " ".join(all_text_parts), "confidence": 0.0, "bbox_px": {"x": x_min, "y": y_min, "w": x_max - x_min, "h": y_max - y_min}, "bbox_pct": { "x": round(x_min / img_w * 100, 2) if img_w else 0, "y": round(y_min / img_h * 100, 2) if img_h else 0, "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, }, "word_boxes": all_wb, "ocr_engine": "words_first", "is_bold": True, }) # Mark row as header for row in rows: if row["index"] == hri: row["is_header"] = True heading_count += 1 return heading_count def _detect_heading_rows_by_single_cell( zones_data: List[Dict], img_w: int, img_h: int, ) -> int: """Detect heading rows that have only a single content cell. Black headings like "Theme" have normal color and height, so they are missed by ``_detect_heading_rows_by_color``. The distinguishing signal is that they occupy only one column while normal vocabulary rows fill at least 2-3 columns. A row qualifies as a heading if: 1. It is not already marked as a header/heading. 2. It has exactly ONE cell whose col_type starts with ``column_`` (excluding column_1 / page_ref which only carries page numbers). 3. That single cell is NOT in the last column (continuation/example lines like "2. Ver\u00e4nderung, Wechsel" often sit alone in column_4). 4. The text does not start with ``[`` (IPA continuation). 5. The zone has >=3 columns and >=5 rows (avoids false positives in tiny zones). 6. The majority of rows in the zone have >=2 content cells (ensures we are in a multi-column vocab layout). """ heading_count = 0 for z in zones_data: cells = z.get("cells", []) rows = z.get("rows", []) columns = z.get("columns", []) if len(columns) < 3 or len(rows) < 5: continue # Determine the last col_index (example/sentence column) col_indices = sorted(set(c.get("col_index", 0) for c in cells)) if not col_indices: continue last_col = col_indices[-1] # Count content cells per row (column_* but not column_1/page_ref). # Exception: column_1 cells that contain a dictionary article word # (die/der/das etc.) ARE content -- they appear in dictionary layouts # where the leftmost column holds grammatical articles. _ARTICLE_WORDS = { "die", "der", "das", "dem", "den", "des", "ein", "eine", "the", "a", "an", } row_content_counts: Dict[int, int] = {} for cell in cells: ct = cell.get("col_type", "") if not ct.startswith("column_"): continue if ct == "column_1": ctext = (cell.get("text") or "").strip().lower() if ctext not in _ARTICLE_WORDS: continue ri = cell.get("row_index", -1) row_content_counts[ri] = row_content_counts.get(ri, 0) + 1 # Majority of rows must have >=2 content cells multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2) if multi_col_rows < len(rows) * 0.4: continue # Exclude first and last non-header rows -- these are typically # page numbers or footer text, not headings. non_header_rows = [r for r in rows if not r.get("is_header")] if len(non_header_rows) < 3: continue first_ri = non_header_rows[0]["index"] last_ri = non_header_rows[-1]["index"] heading_row_indices = [] for row in rows: if row.get("is_header"): continue ri = row["index"] if ri == first_ri or ri == last_ri: continue row_cells = [c for c in cells if c.get("row_index") == ri] content_cells = [ c for c in row_cells if c.get("col_type", "").startswith("column_") and (c.get("col_type") != "column_1" or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS) ] if len(content_cells) != 1: continue cell = content_cells[0] # Not in the last column (continuation/example lines) if cell.get("col_index") == last_col: continue text = (cell.get("text") or "").strip() if not text or text.startswith("["): continue # Continuation lines start with "(" -- e.g. "(usw.)", "(TV-Serie)" if text.startswith("("): continue # Single cell NOT in the first content column is likely a # continuation/overflow line, not a heading. Real headings # ("Theme 1", "Unit 3: ...") appear in the first or second # content column. first_content_col = col_indices[0] if col_indices else 0 if cell.get("col_index", 0) > first_content_col + 1: continue # Skip garbled IPA without brackets (e.g. "ska:f -- ska:vz") # but NOT text with real IPA symbols (e.g. "Theme [\u03b8\u02c8i\u02d0m]") _REAL_IPA_CHARS = set("\u02c8\u02cc\u0259\u026a\u025b\u0252\u028a\u028c\u00e6\u0251\u0254\u0283\u0292\u03b8\u00f0\u014b") if _text_has_garbled_ipa(text) and not any(c in _REAL_IPA_CHARS for c in text): continue # Guard: dictionary section headings are short (1-4 alpha chars # like "A", "Ab", "Zi", "Sch"). Longer text that starts # lowercase is a regular vocabulary word (e.g. "zentral") that # happens to appear alone in its row. alpha_only = re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', text) if len(alpha_only) > 4 and text[0].islower(): continue heading_row_indices.append(ri) # Guard: if >25% of eligible rows would become headings, the # heuristic is misfiring (e.g. sparse single-column layout where # most rows naturally have only 1 content cell). eligible_rows = len(non_header_rows) - 2 # minus first/last excluded if eligible_rows > 0 and len(heading_row_indices) > eligible_rows * 0.25: logger.debug( "Skipping single-cell heading detection for zone %s: " "%d/%d rows would be headings (>25%%)", z.get("zone_index"), len(heading_row_indices), eligible_rows, ) continue for hri in heading_row_indices: header_cells = [c for c in cells if c.get("row_index") == hri] if not header_cells: continue # Collect all word_boxes and text all_wb = [] all_text_parts = [] for hc in sorted(header_cells, key=lambda c: c["col_index"]): all_wb.extend(hc.get("word_boxes", [])) if hc.get("text", "").strip(): all_text_parts.append(hc["text"].strip()) first_col_idx = min(hc["col_index"] for hc in header_cells) # Remove old cells for this row, add spanning heading cell z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri] if all_wb: x_min = min(wb["left"] for wb in all_wb) y_min = min(wb["top"] for wb in all_wb) x_max = max(wb["left"] + wb["width"] for wb in all_wb) y_max = max(wb["top"] + wb["height"] for wb in all_wb) else: # Fallback to first cell bbox bp = header_cells[0].get("bbox_px", {}) x_min = bp.get("x", 0) y_min = bp.get("y", 0) x_max = x_min + bp.get("w", 0) y_max = y_min + bp.get("h", 0) zone_idx = z.get("zone_index", 0) z["cells"].append({ "cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}", "zone_index": zone_idx, "row_index": hri, "col_index": first_col_idx, "col_type": "heading", "text": " ".join(all_text_parts), "confidence": 0.0, "bbox_px": {"x": x_min, "y": y_min, "w": x_max - x_min, "h": y_max - y_min}, "bbox_pct": { "x": round(x_min / img_w * 100, 2) if img_w else 0, "y": round(y_min / img_h * 100, 2) if img_h else 0, "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, }, "word_boxes": all_wb, "ocr_engine": "words_first", "is_bold": False, }) for row in rows: if row["index"] == hri: row["is_header"] = True heading_count += 1 return heading_count def _detect_header_rows( rows: List[Dict], zone_words: List[Dict], zone_y: int, columns: Optional[List[Dict]] = None, skip_first_row_header: bool = False, ) -> List[int]: """Detect header rows: first-row heuristic + spanning header detection. A "spanning header" is a row whose words stretch across multiple column boundaries (e.g. "Unit4: Bonnie Scotland" centred across 4 columns). """ if len(rows) < 2: return [] headers = [] if not skip_first_row_header: first_row = rows[0] second_row = rows[1] # Gap between first and second row > 0.5x average row height avg_h = sum(r["y_max"] - r["y_min"] for r in rows) / len(rows) gap = second_row["y_min"] - first_row["y_max"] if gap > avg_h * 0.5: headers.append(0) # Also check if first row words are taller than average (bold/header text) all_heights = [w["height"] for w in zone_words] median_h = sorted(all_heights)[len(all_heights) // 2] if all_heights else 20 first_row_words = [ w for w in zone_words if first_row["y_min"] <= w["top"] + w["height"] / 2 <= first_row["y_max"] ] if first_row_words: first_h = max(w["height"] for w in first_row_words) if first_h > median_h * 1.3: if 0 not in headers: headers.append(0) # Note: Spanning-header detection (rows spanning all columns) has been # disabled because it produces too many false positives on vocabulary # worksheets where IPA transcriptions or short entries naturally span # multiple columns with few words. The first-row heuristic above is # sufficient for detecting real headers. return headers def _detect_colspan_cells( zone_words: List[Dict], columns: List[Dict], rows: List[Dict], cells: List[Dict], img_w: int, img_h: int, ) -> List[Dict]: """Detect and merge cells that span multiple columns (colspan). A word-block (PaddleOCR phrase) that extends significantly past a column boundary into the next column indicates a merged cell. This replaces the incorrectly split cells with a single cell spanning multiple columns. Works for both full-page scans and box zones. """ if len(columns) < 2 or not zone_words or not rows: return cells from cv_words_first import _assign_word_to_row # Column boundaries (midpoints between adjacent columns) col_boundaries = [] for ci in range(len(columns) - 1): col_boundaries.append((columns[ci]["x_max"] + columns[ci + 1]["x_min"]) / 2) def _cols_covered(w_left: float, w_right: float) -> List[int]: """Return list of column indices that a word-block covers.""" covered = [] for col in columns: col_mid = (col["x_min"] + col["x_max"]) / 2 # Word covers a column if it extends past the column's midpoint if w_left < col_mid < w_right: covered.append(col["index"]) # Also include column if word starts within it elif col["x_min"] <= w_left < col["x_max"]: covered.append(col["index"]) return sorted(set(covered)) # Group original word-blocks by row row_word_blocks: Dict[int, List[Dict]] = {} for w in zone_words: ri = _assign_word_to_row(w, rows) row_word_blocks.setdefault(ri, []).append(w) # For each row, check if any word-block spans multiple columns rows_to_merge: Dict[int, List[Dict]] = {} # row_index -> list of spanning word-blocks for ri, wblocks in row_word_blocks.items(): spanning = [] for w in wblocks: w_left = w["left"] w_right = w_left + w["width"] covered = _cols_covered(w_left, w_right) if len(covered) >= 2: spanning.append({"word": w, "cols": covered}) if spanning: rows_to_merge[ri] = spanning if not rows_to_merge: return cells # Merge cells for spanning rows new_cells = [] for cell in cells: ri = cell.get("row_index", -1) if ri not in rows_to_merge: new_cells.append(cell) continue # Check if this cell's column is part of a spanning block ci = cell.get("col_index", -1) is_part_of_span = False for span in rows_to_merge[ri]: if ci in span["cols"]: is_part_of_span = True # Only emit the merged cell for the FIRST column in the span if ci == span["cols"][0]: # Use the ORIGINAL word-block text (not the split cell texts # which may have broken words like "euros a" + "nd cents") orig_word = span["word"] merged_text = orig_word.get("text", "").strip() all_wb = [orig_word] # Compute merged bbox if all_wb: x_min = min(wb["left"] for wb in all_wb) y_min = min(wb["top"] for wb in all_wb) x_max = max(wb["left"] + wb["width"] for wb in all_wb) y_max = max(wb["top"] + wb["height"] for wb in all_wb) else: x_min = y_min = x_max = y_max = 0 new_cells.append({ "cell_id": cell["cell_id"], "row_index": ri, "col_index": span["cols"][0], "col_type": "spanning_header", "colspan": len(span["cols"]), "text": merged_text, "confidence": cell.get("confidence", 0), "bbox_px": {"x": x_min, "y": y_min, "w": x_max - x_min, "h": y_max - y_min}, "bbox_pct": { "x": round(x_min / img_w * 100, 2) if img_w else 0, "y": round(y_min / img_h * 100, 2) if img_h else 0, "w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0, "h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0, }, "word_boxes": all_wb, "ocr_engine": cell.get("ocr_engine", ""), "is_bold": cell.get("is_bold", False), }) logger.info( "colspan detected: row %d, cols %s -> merged %d cells (%r)", ri, span["cols"], len(span["cols"]), merged_text[:50], ) break if not is_part_of_span: new_cells.append(cell) return new_cells