fix: remove image-area artifacts + fix heading false positive for dictionary entries
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m55s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 16s
Three fixes for dictionary page session 5997: 1. Heading detection: column_1 cells with article words (die/der/das) now count as content cells, preventing "die Zuschrift, die Zuschriften" from being falsely merged into a spanning heading cell. 2. Step 5j-pre: new artifact cell filter removes short garbled text from OCR on image areas (e.g. "7 EN", "Tr", "\\", "PEE", "a="). Cells survive earlier filters because their rows have real content in other columns. Also cleans up empty rows after removal. 3. Footer "PEE" auto-fixed: artifact filter removes the noise cell, empty row gets cleaned up, footer detection no longer sees it. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -855,11 +855,23 @@ def _detect_heading_rows_by_single_cell(
             continue

        last_col = col_indices[-1]

-    # Count content cells per row (column_* but not column_1/page_ref)
+    # Count content cells per row (column_* but not column_1/page_ref).
+    # Exception: column_1 cells that contain a dictionary article word
+    # (die/der/das etc.) ARE content — they appear in dictionary layouts
+    # where the leftmost column holds grammatical articles.
+    _ARTICLE_WORDS = {
+        "die", "der", "das", "dem", "den", "des", "ein", "eine",
+        "the", "a", "an",
+    }
    row_content_counts: Dict[int, int] = {}
    for cell in cells:
        ct = cell.get("col_type", "")
-        if ct.startswith("column_") and ct != "column_1":
-            ri = cell.get("row_index", -1)
-            row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
+        if not ct.startswith("column_"):
+            continue
+        if ct == "column_1":
+            ctext = (cell.get("text") or "").strip().lower()
+            if ctext not in _ARTICLE_WORDS:
+                continue
+        ri = cell.get("row_index", -1)
+        row_content_counts[ri] = row_content_counts.get(ri, 0) + 1
@@ -887,7 +899,8 @@ def _detect_heading_rows_by_single_cell(
        content_cells = [
            c for c in row_cells
            if c.get("col_type", "").startswith("column_")
-            and c.get("col_type") != "column_1"
+            and (c.get("col_type") != "column_1"
+                 or (c.get("text") or "").strip().lower() in _ARTICLE_WORDS)
        ]
        if len(content_cells) != 1:
            continue
@@ -2483,6 +2496,55 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
                 if (c.get("word_boxes") or c.get("text", "").strip())]
    logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)

+    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise.
+    # OCR on image areas produces short nonsensical fragments ("7 EN", "Tr",
+    # "\\", "PEE", "a=") that survive earlier filters because their rows also
+    # contain real content in other columns. Remove them here.
+    _COMMON_SHORT_WORDS = {
+        # German
+        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
+        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
+        "die", "der", "das", "dem", "den", "des", "ein", "und",
+        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
+        # English
+        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
+        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
+        "on", "or", "so", "to", "up", "us", "we",
+        "the", "and", "but", "for", "not",
+    }
+    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
+    artifact_cells_removed = 0
+    for z in zones_data:
+        before = len(z.get("cells", []))
+        kept = []
+        for cell in z.get("cells", []):
+            text = (cell.get("text") or "").strip()
+            core = text.rstrip(".,;:!?'\"")
+            is_artifact = False
+            if not core:
+                is_artifact = True
+            elif _PURE_JUNK_RE.match(core):
+                is_artifact = True
+            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
+                is_artifact = True
+            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
+                is_artifact = True
+            elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
+                # Mixed digits + letters in short text (e.g. "7 EN", "a=3")
+                is_artifact = True
+            if is_artifact:
+                kept.append(None)  # placeholder
+            else:
+                kept.append(cell)
+        z["cells"] = [c for c in kept if c is not None]
+        artifact_cells_removed += before - len(z["cells"])
+    if artifact_cells_removed:
+        # Also remove rows that became completely empty
+        for z in zones_data:
+            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
+            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
+        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
+
    # 5j. Normalise word_box order to reading order (group by Y, sort by X).
    # The frontend renders colored cells from word_boxes array order
    # (GridTable.tsx), so they MUST be in left-to-right reading order.
Reference in New Issue
Block a user
Reference in New Issue
Block a user