Fix 3 grid issues: artifact cells, connector col noise, footer false positive
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

1. Add per-cell artifact filter (4b2): removes single-word cells with
   ≤2 chars and confidence <65 (e.g. "as" from stray OCR marks)

2. Add narrow connector column normalization (4d2): when ≥60% of cells
   in a column share the same short text (e.g. "oder"), normalize
   near-match outliers like "oderb" → "oder"

3. Fix footer detection: require short text (≤20 chars) and no commas.
   Comma-separated lists like "Uhrzeit, Vergangenheit, Zukunft" are
   content continuations, not page numbers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 08:18:55 +01:00
parent 525de55791
commit 5af5d821a5

View File

@@ -631,6 +631,34 @@ async def _build_grid_core(
sorted(junk_row_indices),
)
# 4b2. Drop individual artifact cells: a cell whose only content is one
# very short, low-confidence word (stray OCR marks such as "as" or "b").
# Row-level junk removal keeps such cells alive whenever the same row
# holds valid cells in other columns, so they must be filtered per cell.
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
    zone_cells = z.get("cells", [])
    if not zone_cells:
        continue
    # Collect ids of cells made up of exactly one tiny, low-confidence word.
    artifact_ids = set()
    for cell in zone_cells:
        word_boxes = cell.get("word_boxes") or []
        if len(word_boxes) != 1:
            continue
        only_word = word_boxes[0]
        word_text = (only_word.get("text") or "").strip()
        # Missing confidence defaults to 100, i.e. "trusted, keep".
        word_conf = only_word.get("conf", 100)
        if len(word_text) <= _ARTIFACT_MAX_LEN and word_conf < _ARTIFACT_CONF_THRESHOLD:
            artifact_ids.add(cell.get("cell_id"))
    if artifact_ids:
        z["cells"] = [c for c in zone_cells if c.get("cell_id") not in artifact_ids]
        # zone_cells still references the pre-filter list, so the removed
        # texts can be reported here.
        logger.info(
            "build-grid: removed %d artifact cells from zone %d: %s",
            len(artifact_ids), z.get("zone_index", 0),
            [c.get("text") for c in zone_cells if c.get("cell_id") in artifact_ids],
        )
# 4c. Remove oversized word_boxes from individual cells.
# OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
# have word heights 3-5x the median. Remove them per-word so they don't
@@ -707,6 +735,52 @@ async def _build_grid_core(
if cleaned != text.strip():
cell["text"] = cleaned
# 4d2. Normalize narrow connector columns.
# Synonym dictionaries often repeat one short connector word (e.g.
# "oder") down a narrow column; OCR occasionally appends stray chars
# ("oderb"). When one short text dominates ≥60% of a column's cells,
# snap near-match outliers back to that dominant text.
for zone in zones_data:
    zone_cols = zone.get("columns", [])
    zone_cells = zone.get("cells", [])
    if not zone_cols or not zone_cells:
        continue
    for column in zone_cols:
        col_idx = column.get("index")
        column_cells = [c for c in zone_cells if c.get("col_index") == col_idx]
        if len(column_cells) < 3:
            continue
        # Tally how often each non-empty stripped text occurs in the column.
        occurrences: Dict[str, int] = {}
        for c in column_cells:
            stripped = (c.get("text") or "").strip()
            if stripped:
                occurrences[stripped] = occurrences.get(stripped, 0) + 1
        if not occurrences:
            continue
        # First maximal entry in insertion order wins, matching dict-key max.
        dominant_text, dominant_count = max(
            occurrences.items(), key=lambda kv: kv[1]
        )
        # Only a short word that covers ≥60% of the cells counts as a connector.
        if len(dominant_text) > 10 or dominant_count < len(column_cells) * 0.6:
            continue
        # Snap outliers that merely extend the dominant text by ≤2 chars.
        fixed = 0
        for c in column_cells:
            stripped = (c.get("text") or "").strip()
            is_near_match = (
                stripped != dominant_text
                and stripped.startswith(dominant_text)
                and len(stripped) <= len(dominant_text) + 2
            )
            if is_near_match:
                c["text"] = dominant_text
                # Keep the single word_box (if any) in sync with the cell text.
                boxes = c.get("word_boxes") or []
                if len(boxes) == 1:
                    boxes[0]["text"] = dominant_text
                fixed += 1
        if fixed:
            logger.info(
                "build-grid: normalized %d outlier cells in connector column %d "
                "(dominant='%s') zone %d",
                fixed, col_idx, dominant_text, zone.get("zone_index", 0),
            )
# 4e. Detect and remove page-border decoration strips.
# Skipped when the pre-filter already removed border words BEFORE
# column detection — re-running would incorrectly detect the
@@ -1095,8 +1169,9 @@ async def _build_grid_core(
if c.get("cell_id") not in page_ref_cell_ids]
# Detect footer: last non-header row if it has only 1 cell
# and the text is NOT IPA (no real IPA Unicode symbols),
# with short, non-content text (page numbers like "233" or
# "two hundred and twelve"). Comma-separated lists and long
# text are content continuations, not page numbers.
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
@@ -1108,7 +1183,13 @@ async def _build_grid_core(
text = (last_cells[0].get("text") or "").strip()
# Not IPA (no real IPA symbols) and not a heading
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
# Comma-separated text is a content continuation, not a footer
has_commas = ',' in text
# Long text (>20 chars) is unlikely a page number
is_short = len(text) <= 20
if (text and not has_real_ipa and not has_commas
and is_short
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,