fix: remove image-area artifacts + fix heading false positive for dictionary entries

Three fixes for dictionary page session 5997:

1. Heading detection: column_1 cells with article words (die/der/das) now count as content cells, preventing "die Zuschrift, die Zuschriften" from being falsely merged into a spanning heading cell.
2. Step 5j-pre: new artifact cell filter removes short garbled text produced by OCR on image areas (e.g. "7 EN", "Tr", "\\", "PEE", "a="). These cells survive earlier filters because their rows have real content in other columns. Rows left empty after the removal are also cleaned up.
3. Footer "PEE" auto-fixed: the artifact filter removes the noise cell, the empty row gets cleaned up, and footer detection no longer sees it.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 07:59:24 +01:00
parent 1fae39dbb8
commit 882b177fc3
1 changed files with 67 additions and 5 deletions
@@ -855,13 +855,25 @@ def _detect_heading_rows_by_single_cell(
            continue
        last_col = col_indices[-1]
-        # Count content cells per row (column_* but not column_1/page_ref)
+        # Count content cells per row (column_* but not column_1/page_ref).
        # Exception: column_1 cells that contain a dictionary article word
        # (die/der/das etc.) ARE content — they appear in dictionary layouts
        # where the leftmost column holds grammatical articles.
        _ARTICLE_WORDS = {
            "die", "der", "das", "dem", "den", "des", "ein", "eine",
            "the", "a", "an",
        }
        row_content_counts: Dict[int, int] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
-            if ct.startswith("column_") and ct != "column_1":
+            if not ct.startswith("column_"):
-                ri = cell.get("row_index", -1)
+                continue
-                row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
+            if ct == "column_1":
                ctext = (cell.get("text") or "").strip().lower()
                if ctext not in _ARTICLE_WORDS:
                    continue
            ri = cell.get("row_index", -1)
            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
        # Majority of rows must have ≥2 content cells
        multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
@@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell(
            content_cells = [
                c for c in row_cells
                if c.get("col_type", "").startswith("column_")
-                and c.get("col_type") != "column_1"
+                and (c.get("col_type") != "column_1"
                     or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
            ]
            if len(content_cells) != 1:
                continue
@@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
    # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
    # "\\", "PEE", "a=") that survive earlier filters because their rows also
    # contain real content in other columns.  Remove them here.
    _COMMON_SHORT_WORDS = {
        # German
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        # English
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
    artifact_cells_removed = 0
    for z in zones_data:
        before = len(z.get("cells", []))
        kept = []
        for cell in z.get("cells", []):
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
                # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
                is_artifact = True
            if is_artifact:
                kept.append(None)  # placeholder
            else:
                kept.append(cell)
        z["cells"] = [c for c in kept if c is not None]
        artifact_cells_removed += before - len(z["cells"])
    if artifact_cells_removed:
        # Also remove rows that became completely empty
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
    # 5j. Normalise word_box order to reading order (group by Y, sort by X).
    # The frontend renders colored cells from word_boxes array order
    # (GridTable.tsx), so they MUST be in left-to-right reading order.