fix: remove image-area artifacts + fix heading false positive for dictionary entries
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Three fixes for dictionary page session 5997:

1. Heading detection: column_1 cells with article words (die/der/das) now count as content cells, preventing "die Zuschrift, die Zuschriften" from being falsely merged into a spanning heading cell.
2. Step 5j-pre: new artifact cell filter removes short garbled text from OCR on image areas (e.g. "7 EN", "Tr", "\\", "PEE", "a="). Cells survive earlier filters because their rows have real content in other columns. Also cleans up empty rows after removal.
3. Footer "PEE" auto-fixed: artifact filter removes the noise cell, empty row gets cleaned up, footer detection no longer sees it.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -855,13 +855,25 @@ def _detect_heading_rows_by_single_cell(
|
|||||||
continue
|
continue
|
||||||
last_col = col_indices[-1]
|
last_col = col_indices[-1]
|
||||||
|
|
||||||
# Count content cells per row (column_* but not column_1/page_ref)
|
# Count content cells per row (column_* but not column_1/page_ref).
|
||||||
|
# Exception: column_1 cells whose entire text is a dictionary article word
|
||||||
|
# (die/der/das etc.) ARE content — they appear in dictionary layouts
|
||||||
|
# where the leftmost column holds grammatical articles.
|
||||||
|
_ARTICLE_WORDS = {
|
||||||
|
"die", "der", "das", "dem", "den", "des", "ein", "eine",
|
||||||
|
"the", "a", "an",
|
||||||
|
}
|
||||||
row_content_counts: Dict[int, int] = {}
|
row_content_counts: Dict[int, int] = {}
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
ct = cell.get("col_type", "")
|
ct = cell.get("col_type", "")
|
||||||
if ct.startswith("column_") and ct != "column_1":
|
if not ct.startswith("column_"):
|
||||||
ri = cell.get("row_index", -1)
|
continue
|
||||||
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
|
if ct == "column_1":
|
||||||
|
ctext = (cell.get("text") or "").strip().lower()
|
||||||
|
if ctext not in _ARTICLE_WORDS:
|
||||||
|
continue
|
||||||
|
ri = cell.get("row_index", -1)
|
||||||
|
row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
|
||||||
|
|
||||||
# Majority of rows must have ≥2 content cells
|
# Majority of rows must have ≥2 content cells
|
||||||
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
|
multi_col_rows = sum(1 for cnt in row_content_counts.values() if cnt >= 2)
|
||||||
@@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell(
|
|||||||
content_cells = [
|
content_cells = [
|
||||||
c for c in row_cells
|
c for c in row_cells
|
||||||
if c.get("col_type", "").startswith("column_")
|
if c.get("col_type", "").startswith("column_")
|
||||||
and c.get("col_type") != "column_1"
|
and (c.get("col_type") != "column_1"
|
||||||
|
or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
|
||||||
]
|
]
|
||||||
if len(content_cells) != 1:
|
if len(content_cells) != 1:
|
||||||
continue
|
continue
|
||||||
@@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||||
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||||
|
|
||||||
|
# 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
|
||||||
|
# OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
|
||||||
|
# "\\", "PEE", "a=") that survive earlier filters because their rows also
|
||||||
|
# contain real content in other columns. Remove them here.
|
||||||
|
_COMMON_SHORT_WORDS = {
|
||||||
|
# German
|
||||||
|
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
|
||||||
|
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
|
||||||
|
"die", "der", "das", "dem", "den", "des", "ein", "und",
|
||||||
|
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
|
||||||
|
# English
|
||||||
|
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
|
||||||
|
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
|
||||||
|
"on", "or", "so", "to", "up", "us", "we",
|
||||||
|
"the", "and", "but", "for", "not",
|
||||||
|
}
|
||||||
|
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
|
||||||
|
artifact_cells_removed = 0
|
||||||
|
for z in zones_data:
|
||||||
|
before = len(z.get("cells", []))
|
||||||
|
kept = []
|
||||||
|
for cell in z.get("cells", []):
|
||||||
|
text = (cell.get("text") or "").strip()
|
||||||
|
core = text.rstrip(".,;:!?'\"")
|
||||||
|
is_artifact = False
|
||||||
|
if not core:
|
||||||
|
is_artifact = True
|
||||||
|
elif _PURE_JUNK_RE.match(core):
|
||||||
|
is_artifact = True
|
||||||
|
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
|
||||||
|
is_artifact = True
|
||||||
|
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||||
|
is_artifact = True
|
||||||
|
elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
|
||||||
|
# Mixed digits + letters in short text (e.g. "7 EN", "a=3")
|
||||||
|
is_artifact = True
|
||||||
|
if is_artifact:
|
||||||
|
kept.append(None) # placeholder
|
||||||
|
else:
|
||||||
|
kept.append(cell)
|
||||||
|
z["cells"] = [c for c in kept if c is not None]
|
||||||
|
artifact_cells_removed += before - len(z["cells"])
|
||||||
|
if artifact_cells_removed:
|
||||||
|
# Also remove rows that became completely empty
|
||||||
|
for z in zones_data:
|
||||||
|
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
|
||||||
|
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
|
||||||
|
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
|
||||||
|
|
||||||
# 5j. Normalise word_box order to reading order (group by Y, sort by X).
|
# 5j. Normalise word_box order to reading order (group by Y, sort by X).
|
||||||
# The frontend renders colored cells from word_boxes array order
|
# The frontend renders colored cells from word_boxes array order
|
||||||
# (GridTable.tsx), so they MUST be in left-to-right reading order.
|
# (GridTable.tsx), so they MUST be in left-to-right reading order.
|
||||||
|
|||||||
Reference in New Issue
Block a user