Fix 3 grid issues: artifact cells, connector col noise, footer false positive
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s
1. Add per-cell artifact filter (4b2): removes single-word cells with ≤2 chars and confidence <65 (e.g. "as" from stray OCR marks).
2. Add narrow connector column normalization (4d2): when ≥60% of cells in a column share the same short text (e.g. "oder"), normalize near-match outliers like "oderb" → "oder".
3. Fix footer detection: require short text (≤20 chars) and no commas. Comma-separated lists like "Uhrzeit, Vergangenheit, Zukunft" are content continuations, not page numbers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -631,6 +631,34 @@ async def _build_grid_core(
|
||||
sorted(junk_row_indices),
|
||||
)
|
||||
|
||||
# 4b2. Remove individual cells that consist of a single very-short,
|
||||
# low-confidence word (OCR artifacts like "as", "b" from stray marks).
|
||||
# These survive row-level junk removal when the row has valid cells
|
||||
# in other columns.
|
||||
_ARTIFACT_MAX_LEN = 2
|
||||
_ARTIFACT_CONF_THRESHOLD = 65
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
artifact_ids = set()
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) != 1:
|
||||
continue
|
||||
wb = wbs[0]
|
||||
text = (wb.get("text") or "").strip()
|
||||
conf = wb.get("conf", 100)
|
||||
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
|
||||
artifact_ids.add(cell.get("cell_id"))
|
||||
if artifact_ids:
|
||||
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
|
||||
logger.info(
|
||||
"build-grid: removed %d artifact cells from zone %d: %s",
|
||||
len(artifact_ids), z.get("zone_index", 0),
|
||||
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
|
||||
)
|
||||
|
||||
# 4c. Remove oversized word_boxes from individual cells.
|
||||
# OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
|
||||
# have word heights 3-5x the median. Remove them per-word so they don't
|
||||
@@ -707,6 +735,52 @@ async def _build_grid_core(
|
||||
if cleaned != text.strip():
|
||||
cell["text"] = cleaned
|
||||
|
||||
# 4d2. Normalize narrow connector columns.
|
||||
# In synonym dictionaries a narrow column repeats the same word
|
||||
# (e.g. "oder") in every row. OCR sometimes appends noise chars
|
||||
# (e.g. "oderb" instead of "oder"). If ≥60% of cells in a column
|
||||
# share the same short text, normalize near-match outliers.
|
||||
for z in zones_data:
|
||||
cols = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if not cols or not cells:
|
||||
continue
|
||||
for col in cols:
|
||||
ci = col.get("index")
|
||||
col_cells = [c for c in cells if c.get("col_index") == ci]
|
||||
if len(col_cells) < 3:
|
||||
continue
|
||||
# Count text occurrences
|
||||
text_counts: Dict[str, int] = {}
|
||||
for c in col_cells:
|
||||
t = (c.get("text") or "").strip()
|
||||
if t:
|
||||
text_counts[t] = text_counts.get(t, 0) + 1
|
||||
if not text_counts:
|
||||
continue
|
||||
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
|
||||
dominant_count = text_counts[dominant_text]
|
||||
# Only normalize if dominant word is short and appears in ≥60%
|
||||
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
|
||||
continue
|
||||
# Fix outliers that start with the dominant text
|
||||
fixed = 0
|
||||
for c in col_cells:
|
||||
t = (c.get("text") or "").strip()
|
||||
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
|
||||
c["text"] = dominant_text
|
||||
# Also fix word_boxes
|
||||
wbs = c.get("word_boxes") or []
|
||||
if len(wbs) == 1:
|
||||
wbs[0]["text"] = dominant_text
|
||||
fixed += 1
|
||||
if fixed:
|
||||
logger.info(
|
||||
"build-grid: normalized %d outlier cells in connector column %d "
|
||||
"(dominant='%s') zone %d",
|
||||
fixed, ci, dominant_text, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# 4e. Detect and remove page-border decoration strips.
|
||||
# Skipped when the pre-filter already removed border words BEFORE
|
||||
# column detection — re-running would incorrectly detect the
|
||||
@@ -1095,8 +1169,9 @@ async def _build_grid_core(
|
||||
if c.get("cell_id") not in page_ref_cell_ids]
|
||||
|
||||
# Detect footer: last non-header row if it has only 1 cell
|
||||
- # and the text is NOT IPA (no real IPA Unicode symbols).
|
||||
- # This catches page numbers like "two hundred and twelve".
|
||||
+ # with short, non-content text (page numbers like "233" or
|
||||
+ # "two hundred and twelve"). Comma-separated lists and long
|
||||
+ # text are content continuations, not page numbers.
|
||||
footer_rows = []
|
||||
non_header_rows = [r for r in rows if not r.get("is_header")]
|
||||
if non_header_rows:
|
||||
@@ -1108,7 +1183,13 @@ async def _build_grid_core(
|
||||
text = (last_cells[0].get("text") or "").strip()
|
||||
# Not IPA (no real IPA symbols) and not a heading
|
||||
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
|
||||
if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
|
||||
# Comma-separated text is a content continuation, not a footer
|
||||
has_commas = ',' in text
|
||||
# Long text (>20 chars) is unlikely a page number
|
||||
is_short = len(text) <= 20
|
||||
if (text and not has_real_ipa and not has_commas
|
||||
and is_short
|
||||
and last_cells[0].get("col_type") != "heading"):
|
||||
footer_rows.append({
|
||||
"row_index": last_ri,
|
||||
"text": text,
|
||||
|
||||
Reference in New Issue
Block a user