# NOTE: pasted CI-status / file-viewer metadata removed during cleanup.
# Relevant commit message preserved: "sed replacement left orphaned hostname
# references in story page and empty lines in getApiBase functions."
"""
|
|
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
|
|
divider removal, connector normalization, border strip detection, and
|
|
alphabet sidebar removal.
|
|
|
|
Extracted from grid_build_core.py for maintainability.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List
|
|
|
|
from cv_ocr_engines import _words_to_reading_order_text
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_PIPE_RE = re.compile(r"^\|+$")
|
|
|
|
|
|
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Clean up zone data: remove junk rows, artifacts, pipes, border strips.

    Runs the Phase-3 cleanup passes in a fixed order; every helper mutates
    ``zones_data`` in place.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging.

    Returns:
        Updated border_prefiltered flag.
    """
    # Row- and cell-level noise first ...
    _remove_junk_rows(zones_data)
    _remove_artifact_cells(zones_data)
    _remove_oversized_word_boxes(zones_data)
    # ... then divider / connector artifacts ...
    _remove_pipe_dividers(zones_data)
    _normalize_connector_columns(zones_data)
    # ... and finally page-edge decoration.
    updated_flag = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return updated_flag
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove rows where ALL cells contain only short, low-confidence text.
|
|
|
|
Also removes 'oversized stub' rows and 'scattered debris' rows.
|
|
"""
|
|
_JUNK_CONF_THRESHOLD = 50
|
|
_JUNK_MAX_TEXT_LEN = 3
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
rows = z.get("rows", [])
|
|
if not cells or not rows:
|
|
continue
|
|
|
|
# Compute median word height across the zone for oversized detection
|
|
all_wb_heights = [
|
|
wb["height"]
|
|
for cell in cells
|
|
for wb in cell.get("word_boxes") or []
|
|
if wb.get("height", 0) > 0
|
|
]
|
|
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
|
|
|
|
junk_row_indices = set()
|
|
for row in rows:
|
|
ri = row["index"]
|
|
row_cells = [c for c in cells if c.get("row_index") == ri]
|
|
if not row_cells:
|
|
continue
|
|
|
|
row_wbs = [
|
|
wb for cell in row_cells
|
|
for wb in cell.get("word_boxes") or []
|
|
]
|
|
|
|
# Rule 1: ALL word_boxes are low-conf AND short text
|
|
all_junk = True
|
|
for wb in row_wbs:
|
|
text = (wb.get("text") or "").strip()
|
|
conf = wb.get("conf", 0)
|
|
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
|
all_junk = False
|
|
break
|
|
if all_junk and row_wbs:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
# Rule 2: oversized stub -- <=3 words, short total text,
|
|
# and word height > 1.8x median
|
|
if len(row_wbs) <= 3:
|
|
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
|
|
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
|
|
has_page_ref = any(
|
|
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
|
|
for wb in row_wbs
|
|
)
|
|
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
# Rule 3: scattered debris -- rows with only tiny fragments
|
|
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
|
|
if longest <= 2:
|
|
junk_row_indices.add(ri)
|
|
continue
|
|
|
|
if junk_row_indices:
|
|
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
|
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
|
logger.info(
|
|
"build-grid: removed %d junk rows from zone %d: %s",
|
|
len(junk_row_indices), z["zone_index"],
|
|
sorted(junk_row_indices),
|
|
)
|
|
|
|
|
|
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove individual cells with a single very-short, low-conf word."""
|
|
_ARTIFACT_MAX_LEN = 2
|
|
_ARTIFACT_CONF_THRESHOLD = 65
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
artifact_ids = set()
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
if len(wbs) != 1:
|
|
continue
|
|
wb = wbs[0]
|
|
text = (wb.get("text") or "").strip()
|
|
conf = wb.get("conf", 100)
|
|
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
|
|
artifact_ids.add(cell.get("cell_id"))
|
|
if artifact_ids:
|
|
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
|
|
logger.info(
|
|
"build-grid: removed %d artifact cells from zone %d: %s",
|
|
len(artifact_ids), z.get("zone_index", 0),
|
|
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
|
|
)
|
|
|
|
|
|
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
all_wh = [
|
|
wb["height"]
|
|
for cell in cells
|
|
for wb in cell.get("word_boxes") or []
|
|
if wb.get("height", 0) > 0
|
|
]
|
|
if not all_wh:
|
|
continue
|
|
med_h = sorted(all_wh)[len(all_wh) // 2]
|
|
oversized_threshold = med_h * 3
|
|
removed_oversized = 0
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
|
|
if len(filtered) < len(wbs):
|
|
removed_oversized += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
if removed_oversized:
|
|
z["cells"] = [c for c in cells if c.get("word_boxes")]
|
|
logger.info(
|
|
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
|
|
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
|
)
|
|
|
|
|
|
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove pipe-character word_boxes (column divider artifacts)."""
|
|
for z in zones_data:
|
|
if z.get("vsplit_group") is not None:
|
|
continue # pipes already removed before split
|
|
removed_pipes = 0
|
|
for cell in z.get("cells", []):
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
|
|
if len(filtered) < len(wbs):
|
|
removed_pipes += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
if removed_pipes:
|
|
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
|
logger.info(
|
|
"build-grid: removed %d pipe-divider word_boxes from zone %d",
|
|
removed_pipes, z.get("zone_index", 0),
|
|
)
|
|
|
|
# Strip pipe chars ONLY from cell edges (OCR artifacts).
|
|
# Preserve pipes embedded in words as syllable separators.
|
|
for z in zones_data:
|
|
for cell in z.get("cells", []):
|
|
text = cell.get("text", "")
|
|
if "|" in text:
|
|
cleaned = text.strip("|").strip()
|
|
if cleaned != text.strip():
|
|
cell["text"] = cleaned
|
|
|
|
|
|
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Normalize narrow connector columns where OCR appends noise chars.
|
|
|
|
In synonym dictionaries a narrow column repeats the same word
|
|
(e.g. "oder") in every row. OCR sometimes appends noise chars.
|
|
"""
|
|
for z in zones_data:
|
|
cols = z.get("columns", [])
|
|
cells = z.get("cells", [])
|
|
if not cols or not cells:
|
|
continue
|
|
for col in cols:
|
|
ci = col.get("index")
|
|
col_cells = [c for c in cells if c.get("col_index") == ci]
|
|
if len(col_cells) < 3:
|
|
continue
|
|
text_counts: Dict[str, int] = {}
|
|
for c in col_cells:
|
|
t = (c.get("text") or "").strip()
|
|
if t:
|
|
text_counts[t] = text_counts.get(t, 0) + 1
|
|
if not text_counts:
|
|
continue
|
|
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
|
|
dominant_count = text_counts[dominant_text]
|
|
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
|
|
continue
|
|
fixed = 0
|
|
for c in col_cells:
|
|
t = (c.get("text") or "").strip()
|
|
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
|
|
c["text"] = dominant_text
|
|
wbs = c.get("word_boxes") or []
|
|
if len(wbs) == 1:
|
|
wbs[0]["text"] = dominant_text
|
|
fixed += 1
|
|
if fixed:
|
|
logger.info(
|
|
"build-grid: normalized %d outlier cells in connector column %d "
|
|
"(dominant='%s') zone %d",
|
|
fixed, ci, dominant_text, z.get("zone_index", 0),
|
|
)
|
|
|
|
|
|
def _remove_border_strips(
|
|
zones_data: List[Dict[str, Any]],
|
|
border_prefiltered: bool,
|
|
) -> bool:
|
|
"""Detect and remove page-border decoration strips.
|
|
|
|
Returns updated border_prefiltered flag.
|
|
"""
|
|
border_strip_removed = 0
|
|
if border_prefiltered:
|
|
logger.info("Step 4e: skipped (border pre-filter already applied)")
|
|
return border_prefiltered
|
|
|
|
for z in zones_data:
|
|
cells = z.get("cells", [])
|
|
if not cells:
|
|
continue
|
|
all_wbs_with_cell: list = []
|
|
for cell in cells:
|
|
for wb in cell.get("word_boxes") or []:
|
|
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
|
if len(all_wbs_with_cell) < 10:
|
|
continue
|
|
all_wbs_with_cell.sort(key=lambda t: t[0])
|
|
total = len(all_wbs_with_cell)
|
|
|
|
# -- Left-edge scan --
|
|
left_strip_count = 0
|
|
left_gap = 0
|
|
running_right = 0
|
|
for gi in range(total - 1):
|
|
running_right = max(
|
|
running_right,
|
|
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
|
|
)
|
|
gap = all_wbs_with_cell[gi + 1][0] - running_right
|
|
if gap > 30:
|
|
left_strip_count = gi + 1
|
|
left_gap = gap
|
|
break
|
|
|
|
# -- Right-edge scan --
|
|
right_strip_count = 0
|
|
right_gap = 0
|
|
running_left = all_wbs_with_cell[-1][0]
|
|
for gi in range(total - 1, 0, -1):
|
|
running_left = min(running_left, all_wbs_with_cell[gi][0])
|
|
prev_right = (
|
|
all_wbs_with_cell[gi - 1][0]
|
|
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
|
|
)
|
|
gap = running_left - prev_right
|
|
if gap > 30:
|
|
right_strip_count = total - gi
|
|
right_gap = gap
|
|
break
|
|
|
|
strip_wbs: set = set()
|
|
strip_side = ""
|
|
strip_gap = 0
|
|
strip_count = 0
|
|
if left_strip_count > 0 and left_strip_count / total < 0.20:
|
|
strip_side = "left"
|
|
strip_count = left_strip_count
|
|
strip_gap = left_gap
|
|
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
|
|
elif right_strip_count > 0 and right_strip_count / total < 0.20:
|
|
strip_side = "right"
|
|
strip_count = right_strip_count
|
|
strip_gap = right_gap
|
|
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
|
|
|
|
if not strip_wbs:
|
|
continue
|
|
for cell in cells:
|
|
wbs = cell.get("word_boxes") or []
|
|
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
|
if len(filtered) < len(wbs):
|
|
border_strip_removed += len(wbs) - len(filtered)
|
|
cell["word_boxes"] = filtered
|
|
cell["text"] = _words_to_reading_order_text(filtered)
|
|
z["cells"] = [c for c in cells
|
|
if (c.get("word_boxes") or c.get("text", "").strip())]
|
|
logger.info(
|
|
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
|
"(gap=%dpx, strip=%d/%d wbs)",
|
|
border_strip_removed, strip_side, z.get("zone_index", 0),
|
|
strip_gap, strip_count, total,
|
|
)
|
|
|
|
return border_prefiltered
|
|
|
|
|
|
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
|
|
"""Remove decorative edge columns (alphabet sidebar safety net).
|
|
|
|
Dictionary pages have A-Z letter sidebars that OCR reads as single-
|
|
character word_boxes.
|
|
"""
|
|
for z in zones_data:
|
|
columns = z.get("columns", [])
|
|
cells = z.get("cells", [])
|
|
if len(columns) < 3 or not cells:
|
|
continue
|
|
col_cells: Dict[str, List[Dict]] = {}
|
|
for cell in cells:
|
|
ct = cell.get("col_type", "")
|
|
if ct.startswith("column_"):
|
|
col_cells.setdefault(ct, []).append(cell)
|
|
col_types_ordered = sorted(col_cells.keys())
|
|
if len(col_types_ordered) < 3:
|
|
continue
|
|
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
|
edge_cells_list = col_cells.get(edge_ct, [])
|
|
if len(edge_cells_list) < 3:
|
|
continue
|
|
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
|
|
avg_len = sum(len(t) for t in texts) / len(texts)
|
|
single_char = sum(1 for t in texts if len(t) <= 1)
|
|
single_ratio = single_char / len(texts)
|
|
if avg_len > 1.5:
|
|
continue
|
|
if single_ratio < 0.7:
|
|
continue
|
|
removed_count = len(edge_cells_list)
|
|
edge_ids = {id(c) for c in edge_cells_list}
|
|
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
|
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
|
logger.info(
|
|
"Step 4f: removed decorative edge column '%s' from zone %d "
|
|
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
|
|
edge_ct, z.get("zone_index", 0), removed_count,
|
|
avg_len, single_ratio * 100,
|
|
)
|
|
break # only remove one edge per zone
|