fix: remove image-area artifacts + fix heading false positive for dictionary entries
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s

Three fixes for dictionary page session 5997:

1. Heading detection: column_1 cells containing a dictionary article word
   (die/der/das) now count as content cells, which prevents entries such as
   "die Zuschrift, die Zuschriften" from being falsely merged into a
   spanning heading cell.

2. Step 5j-pre: a new artifact-cell filter removes short garbled text
   produced by OCR on image areas (e.g. "7 EN", "Tr", "\\", "PEE", "a=").
   These cells survived the earlier filters because their rows contain real
   content in other columns. Rows left empty by the removal are also
   cleaned up.

3. Footer "PEE" auto-fixed: the artifact filter removes the noise cell,
   the now-empty row is cleaned up, and footer detection no longer sees it.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-22 07:59:24 +01:00
parent 1fae39dbb8
commit 882b177fc3

View File

@@ -855,13 +855,25 @@ def _detect_heading_rows_by_single_cell(
continue
last_col = col_indices[-1]
# Count content cells per row (column_* but not column_1/page_ref)
# Count content cells per row (column_* but not column_1/page_ref).
# Exception: column_1 cells that contain a dictionary article word
# (die/der/das etc.) ARE content — they appear in dictionary layouts
# where the leftmost column holds grammatical articles.
_ARTICLE_WORDS = {
"die", "der", "das", "dem", "den", "des", "ein", "eine",
"the", "a", "an",
}
row_content_counts: Dict[int, int] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and ct != "column_1":
ri = cell.get("row_index", -1)
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
if not ct.startswith("column_"):
continue
if ct == "column_1":
ctext = (cell.get("text") or "").strip().lower()
if ctext not in _ARTICLE_WORDS:
continue
ri = cell.get("row_index", -1)
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
# Majority of rows must have ≥2 content cells
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
@@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell(
content_cells = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
and (c.get("col_type") != "column_1"
or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
]
if len(content_cells) != 1:
continue
@@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
# 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
# OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
# "\\", "PEE", "a=") that survive earlier filters because their rows also
# contain real content in other columns. Remove them here.
_COMMON_SHORT_WORDS = {
# German
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
"die", "der", "das", "dem", "den", "des", "ein", "und",
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
# English
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
"on", "or", "so", "to", "up", "us", "we",
"the", "and", "but", "for", "not",
}
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
artifact_cells_removed = 0
for z in zones_data:
before = len(z.get("cells", []))
kept = []
for cell in z.get("cells", []):
text = (cell.get("text") or "").strip()
core = text.rstrip(".,;:!?'\"")
is_artifact = False
if not core:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
# Mixed digits + letters in short text (e.g. "7 EN", "a=3")
is_artifact = True
if is_artifact:
kept.append(None) # placeholder
else:
kept.append(cell)
z["cells"] = [c for c in kept if c is not None]
artifact_cells_removed += before - len(z["cells"])
if artifact_cells_removed:
# Also remove rows that became completely empty
for z in zones_data:
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
# 5j. Normalise word_box order to reading order (group by Y, sort by X).
# The frontend renders colored cells from word_boxes array order
# (GridTable.tsx), so they MUST be in left-to-right reading order.