fix: remove image-area artifacts + fix heading false positive for dictionary entries
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s

Three fixes for dictionary page session 5997:

1. Heading detection: column_1 cells containing a dictionary article word
   (die/der/das) now count as content cells, which prevents entries such as
   "die Zuschrift, die Zuschriften" from being falsely merged into a
   spanning heading cell.

2. Step 5j-pre: a new artifact-cell filter removes short garbled text
   produced by OCR on image areas (e.g. "7 EN", "Tr", "\\", "PEE", "a=").
   These cells survived the earlier filters because their rows contain real
   content in other columns. Rows left empty by the removal are also
   cleaned up.

3. Footer "PEE" auto-fixed: the artifact filter removes the noise cell,
   the now-empty row is cleaned up, and footer detection no longer sees it.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-22 07:59:24 +01:00
parent 1fae39dbb8
commit 882b177fc3

View File

@@ -855,13 +855,25 @@ def _detect_heading_rows_by_single_cell(
continue
last_col = col_indices[-1]
# Count content cells per row (column_* but not column_1/page_ref)
# Count content cells per row (column_* but not column_1/page_ref).
# Exception: column_1 cells that contain a dictionary article word
# (die/der/das etc.) ARE content — they appear in dictionary layouts
# where the leftmost column holds grammatical articles.
_ARTICLE_WORDS = {
"die", "der", "das", "dem", "den", "des", "ein", "eine",
"the", "a", "an",
}
row_content_counts: Dict[int, int] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and ct != "column_1":
ri = cell.get("row_index", -1)
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
if not ct.startswith("column_"):
continue
if ct == "column_1":
ctext = (cell.get("text") or "").strip().lower()
if ctext not in _ARTICLE_WORDS:
continue
ri = cell.get("row_index", -1)
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
# Majority of rows must have ≥2 content cells
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
@@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell(
content_cells = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
and (c.get("col_type") != "column_1"
or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
]
if len(content_cells) != 1:
continue
@@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
# 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
# OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
# "\\", "PEE", "a=") that survive earlier filters because their rows also
# contain real content in other columns. Remove them here.
_COMMON_SHORT_WORDS = {
# German
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
"die", "der", "das", "dem", "den", "des", "ein", "und",
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
# English
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
"on", "or", "so", "to", "up", "us", "we",
"the", "and", "but", "for", "not",
}
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
artifact_cells_removed = 0
for z in zones_data:
before = len(z.get("cells", []))
kept = []
for cell in z.get("cells", []):
text = (cell.get("text") or "").strip()
core = text.rstrip(".,;:!?'\"")
is_artifact = False
if not core:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
# Mixed digits + letters in short text (e.g. "7 EN", "a=3")
is_artifact = True
if is_artifact:
kept.append(None) # placeholder
else:
kept.append(cell)
z["cells"] = [c for c in kept if c is not None]
artifact_cells_removed += before - len(z["cells"])
if artifact_cells_removed:
# Also remove rows that became completely empty
for z in zones_data:
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
# 5j. Normalise word_box order to reading order (group by Y, sort by X).
# The frontend renders colored cells from word_boxes array order
# (GridTable.tsx), so they MUST be in left-to-right reading order.