Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

The sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,390 @@
"""
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
divider removal, connector normalization, border strip detection, and
alphabet sidebar removal.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _words_to_reading_order_text
logger = logging.getLogger(__name__)
_PIPE_RE = re.compile(r"^\|+$")
def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Run all Phase-3 cleanup passes over *zones_data* in place.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging.

    Returns:
        Updated border_prefiltered flag.
    """
    # Pass order matters: noise removal first, so the geometric passes
    # (border strips, alphabet sidebars) operate on denoised zones.
    in_place_passes = (
        _remove_junk_rows,
        _remove_artifact_cells,
        _remove_oversized_word_boxes,
        _remove_pipe_dividers,
        _normalize_connector_columns,
    )
    for cleanup_pass in in_place_passes:
        cleanup_pass(zones_data)
    # Border-strip pass owns the pre-filtered flag and may skip itself.
    border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return border_prefiltered
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
"""Remove rows where ALL cells contain only short, low-confidence text.
Also removes 'oversized stub' rows and 'scattered debris' rows.
"""
_JUNK_CONF_THRESHOLD = 50
_JUNK_MAX_TEXT_LEN = 3
for z in zones_data:
cells = z.get("cells", [])
rows = z.get("rows", [])
if not cells or not rows:
continue
# Compute median word height across the zone for oversized detection
all_wb_heights = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
junk_row_indices = set()
for row in rows:
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if not row_cells:
continue
row_wbs = [
wb for cell in row_cells
for wb in cell.get("word_boxes") or []
]
# Rule 1: ALL word_boxes are low-conf AND short text
all_junk = True
for wb in row_wbs:
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 0)
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
all_junk = False
break
if all_junk and row_wbs:
junk_row_indices.add(ri)
continue
# Rule 2: oversized stub -- <=3 words, short total text,
# and word height > 1.8x median
if len(row_wbs) <= 3:
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
has_page_ref = any(
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
for wb in row_wbs
)
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
junk_row_indices.add(ri)
continue
# Rule 3: scattered debris -- rows with only tiny fragments
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
if longest <= 2:
junk_row_indices.add(ri)
continue
if junk_row_indices:
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
logger.info(
"build-grid: removed %d junk rows from zone %d: %s",
len(junk_row_indices), z["zone_index"],
sorted(junk_row_indices),
)
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove individual cells with a single very-short, low-conf word."""
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
artifact_ids = set()
for cell in cells:
wbs = cell.get("word_boxes") or []
if len(wbs) != 1:
continue
wb = wbs[0]
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 100)
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
artifact_ids.add(cell.get("cell_id"))
if artifact_ids:
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
logger.info(
"build-grid: removed %d artifact cells from zone %d: %s",
len(artifact_ids), z.get("zone_index", 0),
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
)
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wh = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
if not all_wh:
continue
med_h = sorted(all_wh)[len(all_wh) // 2]
oversized_threshold = med_h * 3
removed_oversized = 0
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
if len(filtered) < len(wbs):
removed_oversized += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_oversized:
z["cells"] = [c for c in cells if c.get("word_boxes")]
logger.info(
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
removed_oversized, oversized_threshold, z.get("zone_index", 0),
)
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
"""Remove pipe-character word_boxes (column divider artifacts)."""
for z in zones_data:
if z.get("vsplit_group") is not None:
continue # pipes already removed before split
removed_pipes = 0
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"build-grid: removed %d pipe-divider word_boxes from zone %d",
removed_pipes, z.get("zone_index", 0),
)
# Strip pipe chars ONLY from cell edges (OCR artifacts).
# Preserve pipes embedded in words as syllable separators.
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if "|" in text:
cleaned = text.strip("|").strip()
if cleaned != text.strip():
cell["text"] = cleaned
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Normalize narrow connector columns where OCR appends noise chars.
In synonym dictionaries a narrow column repeats the same word
(e.g. "oder") in every row. OCR sometimes appends noise chars.
"""
for z in zones_data:
cols = z.get("columns", [])
cells = z.get("cells", [])
if not cols or not cells:
continue
for col in cols:
ci = col.get("index")
col_cells = [c for c in cells if c.get("col_index") == ci]
if len(col_cells) < 3:
continue
text_counts: Dict[str, int] = {}
for c in col_cells:
t = (c.get("text") or "").strip()
if t:
text_counts[t] = text_counts.get(t, 0) + 1
if not text_counts:
continue
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
dominant_count = text_counts[dominant_text]
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
continue
fixed = 0
for c in col_cells:
t = (c.get("text") or "").strip()
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
c["text"] = dominant_text
wbs = c.get("word_boxes") or []
if len(wbs) == 1:
wbs[0]["text"] = dominant_text
fixed += 1
if fixed:
logger.info(
"build-grid: normalized %d outlier cells in connector column %d "
"(dominant='%s') zone %d",
fixed, ci, dominant_text, z.get("zone_index", 0),
)
def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
) -> bool:
    """Detect and remove page-border decoration strips.

    A "strip" is a small cluster of word_boxes (<20% of the zone's boxes)
    separated from the main text mass by a horizontal gap of more than
    30px, on either the left or the right edge. Zones are modified in
    place; emptied cells are discarded.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: If True, an earlier step already filtered
            border words and this pass is skipped entirely.

    Returns:
        Updated border_prefiltered flag.
    """
    border_strip_removed = 0
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        # Collect (left_x, word_box, cell) triples so matched boxes can be
        # traced back to their owning cell for removal.
        all_wbs_with_cell: list = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
        # Too few boxes to tell a strip apart from the text body.
        if len(all_wbs_with_cell) < 10:
            continue
        all_wbs_with_cell.sort(key=lambda t: t[0])
        total = len(all_wbs_with_cell)
        # -- Left-edge scan --
        # Walk left-to-right, tracking the running right edge of everything
        # seen so far; the first gap > 30px to the next box marks the end
        # of a potential left-side strip.
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
            )
            gap = all_wbs_with_cell[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break
        # -- Right-edge scan --
        # Mirror image: walk right-to-left, tracking the running left edge,
        # and stop at the first > 30px gap to the preceding box's right edge.
        right_strip_count = 0
        right_gap = 0
        running_left = all_wbs_with_cell[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, all_wbs_with_cell[gi][0])
            prev_right = (
                all_wbs_with_cell[gi - 1][0]
                + all_wbs_with_cell[gi - 1][1].get("width", 0)
            )
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break
        # Accept a candidate strip only if it is small relative to the zone
        # (<20% of all boxes); the left side takes precedence over the right.
        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            # Identify strip boxes by object identity (dicts are unhashable).
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
        if not strip_wbs:
            continue
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                border_strip_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        # Drop cells left with neither word boxes nor text.
        z["cells"] = [c for c in cells
                      if (c.get("word_boxes") or c.get("text", "").strip())]
        # NOTE(review): border_strip_removed accumulates across zones, so
        # this per-zone log line reports a cumulative count — confirm intended.
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            border_strip_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )
    # NOTE(review): the flag is returned unchanged even when strips were
    # removed here — confirm callers expect that.
    return border_prefiltered
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
"""Remove decorative edge columns (alphabet sidebar safety net).
Dictionary pages have A-Z letter sidebars that OCR reads as single-
character word_boxes.
"""
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell)
col_types_ordered = sorted(col_cells.keys())
if len(col_types_ordered) < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if len(edge_cells_list) < 3:
continue
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
avg_len = sum(len(t) for t in texts) / len(texts)
single_char = sum(1 for t in texts if len(t) <= 1)
single_ratio = single_char / len(texts)
if avg_len > 1.5:
continue
if single_ratio < 0.7:
continue
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
avg_len, single_ratio * 100,
)
break # only remove one edge per zone