Fix heading col_index + detect black single-cell headings like "Theme"
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m53s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
- Color headings now preserve actual starting col_index instead of hardcoded 0 - New _detect_heading_rows_by_single_cell: detects rows with only 1 content cell (excl. page_ref) as headings — catches black headings like "Theme" that have normal color/height but are alone in their row - Runs after Step 5d (IPA continuation) to avoid false positives - 5 new tests (32 total) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -623,12 +623,14 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int
|
||||
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||
|
||||
# Use the actual starting col_index from the first cell
|
||||
first_col = min(hc["col_index"] for hc in header_cells)
|
||||
zone_idx = z.get("zone_index", 0)
|
||||
z["cells"].append({
|
||||
"cell_id": f"Z{zone_idx}_R{hri:02d}_C0",
|
||||
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col}",
|
||||
"zone_index": zone_idx,
|
||||
"row_index": hri,
|
||||
"col_index": 0,
|
||||
"col_index": first_col,
|
||||
"col_type": "heading",
|
||||
"text": " ".join(all_text_parts),
|
||||
"confidence": 0.0,
|
||||
@@ -654,6 +656,139 @@ def _detect_heading_rows_by_color(zones_data: List[Dict], img_w: int, img_h: int
|
||||
return heading_count
|
||||
|
||||
|
||||
def _detect_heading_rows_by_single_cell(
|
||||
zones_data: List[Dict], img_w: int, img_h: int,
|
||||
) -> int:
|
||||
"""Detect heading rows that have only a single content cell.
|
||||
|
||||
Black headings like "Theme" have normal color and height, so they are
|
||||
missed by ``_detect_heading_rows_by_color``. The distinguishing signal
|
||||
is that they occupy only one column while normal vocabulary rows fill
|
||||
at least 2-3 columns.
|
||||
|
||||
A row qualifies as a heading if:
|
||||
1. It is not already marked as a header/heading.
|
||||
2. It has exactly ONE cell whose col_type starts with ``column_``
|
||||
(excluding column_1 / page_ref which only carries page numbers).
|
||||
3. That single cell is NOT in the last column (continuation/example
|
||||
lines like "2. Veränderung, Wechsel" often sit alone in column_4).
|
||||
4. The text does not start with ``[`` (IPA continuation).
|
||||
5. The zone has ≥3 columns and ≥5 rows (avoids false positives in
|
||||
tiny zones).
|
||||
6. The majority of rows in the zone have ≥2 content cells (ensures
|
||||
we are in a multi-column vocab layout).
|
||||
"""
|
||||
heading_count = 0
|
||||
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
rows = z.get("rows", [])
|
||||
columns = z.get("columns", [])
|
||||
if len(columns) < 3 or len(rows) < 5:
|
||||
continue
|
||||
|
||||
# Determine the last col_index (example/sentence column)
|
||||
col_indices = sorted(set(c.get("col_index", 0) for c in cells))
|
||||
if not col_indices:
|
||||
continue
|
||||
last_col = col_indices[-1]
|
||||
|
||||
# Count content cells per row (column_* but not column_1/page_ref)
|
||||
row_content_counts: Dict[int, int] = {}
|
||||
for cell in cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if ct.startswith("column_") and ct != "column_1":
|
||||
ri = cell.get("row_index", -1)
|
||||
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
|
||||
|
||||
# Majority of rows must have ≥2 content cells
|
||||
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
|
||||
if multi_col_rows < len(rows) * 0.4:
|
||||
continue
|
||||
|
||||
heading_row_indices = []
|
||||
for row in rows:
|
||||
if row.get("is_header"):
|
||||
continue
|
||||
ri = row["index"]
|
||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||
content_cells = [
|
||||
c for c in row_cells
|
||||
if c.get("col_type", "").startswith("column_")
|
||||
and c.get("col_type") != "column_1"
|
||||
]
|
||||
if len(content_cells) != 1:
|
||||
continue
|
||||
cell = content_cells[0]
|
||||
# Not in the last column (continuation/example lines)
|
||||
if cell.get("col_index") == last_col:
|
||||
continue
|
||||
text = (cell.get("text") or "").strip()
|
||||
if not text or text.startswith("["):
|
||||
continue
|
||||
heading_row_indices.append(ri)
|
||||
|
||||
for hri in heading_row_indices:
|
||||
header_cells = [c for c in cells if c.get("row_index") == hri]
|
||||
if not header_cells:
|
||||
continue
|
||||
|
||||
# Collect all word_boxes and text
|
||||
all_wb = []
|
||||
all_text_parts = []
|
||||
for hc in sorted(header_cells, key=lambda c: c["col_index"]):
|
||||
all_wb.extend(hc.get("word_boxes", []))
|
||||
if hc.get("text", "").strip():
|
||||
all_text_parts.append(hc["text"].strip())
|
||||
|
||||
first_col_idx = min(hc["col_index"] for hc in header_cells)
|
||||
|
||||
# Remove old cells for this row, add spanning heading cell
|
||||
z["cells"] = [c for c in z["cells"] if c.get("row_index") != hri]
|
||||
|
||||
if all_wb:
|
||||
x_min = min(wb["left"] for wb in all_wb)
|
||||
y_min = min(wb["top"] for wb in all_wb)
|
||||
x_max = max(wb["left"] + wb["width"] for wb in all_wb)
|
||||
y_max = max(wb["top"] + wb["height"] for wb in all_wb)
|
||||
else:
|
||||
# Fallback to first cell bbox
|
||||
bp = header_cells[0].get("bbox_px", {})
|
||||
x_min = bp.get("x", 0)
|
||||
y_min = bp.get("y", 0)
|
||||
x_max = x_min + bp.get("w", 0)
|
||||
y_max = y_min + bp.get("h", 0)
|
||||
|
||||
zone_idx = z.get("zone_index", 0)
|
||||
z["cells"].append({
|
||||
"cell_id": f"Z{zone_idx}_R{hri:02d}_C{first_col_idx}",
|
||||
"zone_index": zone_idx,
|
||||
"row_index": hri,
|
||||
"col_index": first_col_idx,
|
||||
"col_type": "heading",
|
||||
"text": " ".join(all_text_parts),
|
||||
"confidence": 0.0,
|
||||
"bbox_px": {"x": x_min, "y": y_min,
|
||||
"w": x_max - x_min, "h": y_max - y_min},
|
||||
"bbox_pct": {
|
||||
"x": round(x_min / img_w * 100, 2) if img_w else 0,
|
||||
"y": round(y_min / img_h * 100, 2) if img_h else 0,
|
||||
"w": round((x_max - x_min) / img_w * 100, 2) if img_w else 0,
|
||||
"h": round((y_max - y_min) / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
"word_boxes": all_wb,
|
||||
"ocr_engine": "words_first",
|
||||
"is_bold": False,
|
||||
})
|
||||
|
||||
for row in rows:
|
||||
if row["index"] == hri:
|
||||
row["is_header"] = True
|
||||
heading_count += 1
|
||||
|
||||
return heading_count
|
||||
|
||||
|
||||
def _detect_header_rows(
|
||||
rows: List[Dict],
|
||||
zone_words: List[Dict],
|
||||
@@ -1680,6 +1815,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
if ipa_cont_fixed:
|
||||
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
|
||||
|
||||
# 5e. Heading detection by single-cell rows — black headings like
|
||||
# "Theme" that have normal color and height but are the ONLY cell
|
||||
# in their row (excluding page_ref column_1). Must run AFTER 5d
|
||||
# so IPA continuation cells are already processed.
|
||||
single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
|
||||
if single_heading_count:
|
||||
logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
# 6. Build result
|
||||
|
||||
@@ -18,6 +18,7 @@ from grid_editor_api import (
|
||||
_filter_border_ghosts,
|
||||
_detect_header_rows,
|
||||
_detect_heading_rows_by_color,
|
||||
_detect_heading_rows_by_single_cell,
|
||||
)
|
||||
from cv_ocr_engines import _text_has_garbled_ipa, fix_ipa_continuation_cell
|
||||
|
||||
@@ -520,3 +521,157 @@ class TestGarbledIpaDetection:
|
||||
# But text does NOT start with '[' — Step 5d bracket guard blocks it
|
||||
text = "employee [im'ploi:]"
|
||||
assert not (text.strip().startswith('[') and text.strip().endswith(']'))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _detect_heading_rows_by_single_cell
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectHeadingRowsBySingleCell:
    """Heading detection for rows holding one lone content cell (e.g. 'Theme')."""

    def _make_word_box(self, text, left, top, width, height, color="black"):
        """Return a minimal OCR word-box dict for the given geometry."""
        return dict(
            text=text, left=left, top=top,
            width=width, height=height, color_name=color, conf=90,
        )

    def _make_vocab_zone(self):
        """Build a typical 4-column vocab zone with 8 rows.

        Columns: column_1 (page_ref), column_2 (EN), column_3 (DE),
        column_4 (example).  Every row is fully populated except row 4,
        which carries a single column_2 cell ("Theme") and is therefore
        the heading candidate.
        """

        def build_cell(row, col, text, box_text, left, width):
            # Rows are 30px apart starting at y=100; boxes are 20px tall.
            top = 100 + row * 30
            return {
                "cell_id": f"Z0_R{row:02d}_C{col}",
                "zone_index": 0, "row_index": row, "col_index": col,
                "col_type": f"column_{col + 1}", "text": text,
                "word_boxes": [self._make_word_box(box_text, left, top, width, 20)],
            }

        cells = []
        for row in range(8):
            if row == 4:
                # Single-cell row: "Theme" in column_2 only
                cells.append(build_cell(row, 1, "Theme", "Theme", 130, 70))
            else:
                # Normal vocab row: page_ref + EN + DE + example
                page = f"p.{70 + row}"
                cells.extend([
                    build_cell(row, 0, page, page, 10, 30),
                    build_cell(row, 1, f"word_{row}", f"word_{row}", 130, 80),
                    build_cell(row, 2, f"Wort_{row}", f"Wort_{row}", 400, 80),
                    build_cell(row, 3, f"Example sentence {row}.", "Example", 600, 120),
                ])

        rows = []
        for row in range(8):
            y_top = 100 + row * 30
            rows.append({
                "index": row, "y_min_px": y_top, "y_max_px": y_top + 20,
                "is_header": False,
            })

        columns = [
            {"col_index": col, "col_type": f"column_{col + 1}"}
            for col in range(4)
        ]
        return {
            "zone_index": 0, "zone_type": "content",
            "bbox_px": {"x": 0, "y": 0, "w": 800, "h": 1000},
            "cells": cells, "rows": rows, "columns": columns,
        }

    def test_single_cell_heading_detected(self):
        """Row with only 1 content cell in column_2 → heading."""
        zone = self._make_vocab_zone()
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert detected == 1

        row4 = [c for c in zone["cells"] if c["row_index"] == 4]
        assert len(row4) == 1
        (heading,) = row4
        assert heading["col_type"] == "heading"
        assert heading["text"] == "Theme"
        # The heading keeps its real starting column (column_2), not 0.
        assert heading["col_index"] == 1

    def test_single_cell_in_last_column_not_heading(self):
        """Row with only 1 cell in column_4 (last) → NOT heading (continuation)."""
        zone = self._make_vocab_zone()
        continuation = {
            "cell_id": "Z0_R04_C3",
            "zone_index": 0, "row_index": 4, "col_index": 3,
            "col_type": "column_4", "text": "2. Veränderung",
            "word_boxes": [self._make_word_box("2.", 600, 220, 100, 20)],
        }
        # Swap the "Theme" cell of row 4 for a lone last-column cell.
        kept = [
            c for c in zone["cells"]
            if c["row_index"] != 4 or c["col_index"] != 1
        ]
        zone["cells"] = kept + [continuation]

        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Row 4 now holds only a column_4 cell, which must not count as a
        # heading; with "Theme" gone there is no other candidate either.
        assert detected == 0

    def test_ipa_bracket_text_not_heading(self):
        """Row with single cell starting with '[' → IPA continuation, not heading."""
        zone = self._make_vocab_zone()
        # Turn the "Theme" cell into an IPA continuation.
        theme_cell = next(
            c for c in zone["cells"]
            if c["row_index"] == 4 and c["col_index"] == 1
        )
        theme_cell["text"] = "[θˈiːm]"

        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        assert detected == 0

    def test_multi_cell_row_not_heading(self):
        """Normal vocab row with multiple cells → NOT heading."""
        zone = self._make_vocab_zone()
        detected = _detect_heading_rows_by_single_cell([zone], 800, 1000)
        # Only row 4 (Theme) qualifies; every other row has 3-4 cells.
        assert detected == 1

        normal_rows = {0, 1, 2, 3, 5, 6, 7}
        assert all(
            c["col_type"] != "heading"
            for c in zone["cells"]
            if c["row_index"] in normal_rows
        )

    def test_color_heading_preserves_correct_col_index(self):
        """Color heading starting in column_2 → col_index should be 1, not 0."""
        zone = self._make_vocab_zone()
        # Rebuild row 3 as a color heading: blue words in column_2 and
        # column_3 only, with no column_1 page_ref cell.
        blue_row = [
            {
                "cell_id": "Z0_R03_C1", "zone_index": 0, "row_index": 3,
                "col_index": 1, "col_type": "column_2", "text": "Unit 4:",
                "word_boxes": [
                    self._make_word_box("Unit", 130, 190, 50, 26, "blue"),
                    self._make_word_box("4:", 185, 190, 20, 26, "blue"),
                ],
            },
            {
                "cell_id": "Z0_R03_C2", "zone_index": 0, "row_index": 3,
                "col_index": 2, "col_type": "column_3", "text": "Scotland",
                "word_boxes": [
                    self._make_word_box("Scotland", 400, 190, 100, 26, "blue"),
                ],
            },
        ]
        zone["cells"] = [c for c in zone["cells"] if c["row_index"] != 3] + blue_row

        detected = _detect_heading_rows_by_color([zone], 800, 1000)
        assert detected == 1

        row3 = [c for c in zone["cells"] if c["row_index"] == 3]
        assert len(row3) == 1
        assert row3[0]["col_type"] == "heading"
        # Should start at column_2, not 0.
        assert row3[0]["col_index"] == 1
|
||||
|
||||
Reference in New Issue
Block a user