Restructure: Move grid_* + vocab_* into packages (klausur-service)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 28s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 2m31s
CI / test-python-agent-core (push) Successful in 20s
CI / test-nodejs-website (push) Successful in 23s
grid/ package (16 files): grid/build/ — core, zones, cleanup, text_ops, cell_ops, finalize grid/editor/ — api, helpers, columns, filters, headers, zones vocab/ package (10 files): vocab/worksheet/ — api, models, extraction, generation, ocr, upload, analysis, compare vocab/ — session_store, learn_bridge 26 backward-compat shims. Internal imports relative. RAG untouched. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
11
klausur-service/backend/grid/build/__init__.py
Normal file
11
klausur-service/backend/grid/build/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""
|
||||
Grid Build sub-package — grid construction pipeline.
|
||||
|
||||
Modules:
|
||||
- core — _build_grid_core() main entry point
|
||||
- zones — image loading, graphic/box detection, zone-aware grid building
|
||||
- cleanup — junk row removal, artifact cleanup, pipe dividers
|
||||
- text_ops — color annotation, heading detection, IPA correction
|
||||
- cell_ops — bullet removal, garbled cells, word-box reordering
|
||||
- finalize — dictionary detection, spell checking, result assembly
|
||||
"""
|
||||
305
klausur-service/backend/grid/build/cell_ops.py
Normal file
305
klausur-service/backend/grid/build/cell_ops.py
Normal file
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
|
||||
garbled cell cleanup, word-box reordering, and max_columns enforcement.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove blue bullet/artifact word_boxes (Step 5i).
|
||||
|
||||
Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
|
||||
and syllable-split word merging.
|
||||
"""
|
||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||
_REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
|
||||
|
||||
bullet_removed = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) < 2:
|
||||
continue
|
||||
to_remove: set = set()
|
||||
|
||||
# Rule (a): tiny coloured symbols
|
||||
for i, wb in enumerate(wbs):
|
||||
cn = wb.get("color_name", "black")
|
||||
if (cn != "black"
|
||||
and wb.get("width", 0) * wb.get("height", 0) < 200
|
||||
and wb.get("conf", 100) < 85):
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (a2): isolated non-alphanumeric symbols
|
||||
for i, wb in enumerate(wbs):
|
||||
t = (wb.get("text") or "").strip()
|
||||
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
|
||||
if t in _REMOVE_SYMBOLS:
|
||||
to_remove.add(i)
|
||||
|
||||
# Rule (b) + (c): overlap and duplicate detection
|
||||
to_merge: List[Tuple[int, int]] = []
|
||||
indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
|
||||
for p in range(len(indexed) - 1):
|
||||
i1, w1 = indexed[p]
|
||||
i2, w2 = indexed[p + 1]
|
||||
x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
|
||||
x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
|
||||
overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
|
||||
min_w = min(w1.get("width", 1), w2.get("width", 1))
|
||||
gap = x2s - x1e
|
||||
overlap_pct = overlap / min_w if min_w > 0 else 0
|
||||
|
||||
if overlap_pct > 0.20:
|
||||
t1 = (w1.get("text") or "").strip()
|
||||
t2 = (w2.get("text") or "").strip()
|
||||
|
||||
# Syllable-split words
|
||||
if (overlap_pct <= 0.75
|
||||
and _ALPHA_WORD_RE.match(t1)
|
||||
and _ALPHA_WORD_RE.match(t2)):
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
# High overlap with short prefix
|
||||
if (overlap_pct > 0.75
|
||||
and _ALPHA_WORD_RE.match(t1)
|
||||
and _ALPHA_WORD_RE.match(t2)
|
||||
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
|
||||
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
if overlap_pct <= 0.40:
|
||||
continue
|
||||
|
||||
c1 = w1.get("conf", 50)
|
||||
c2 = w2.get("conf", 50)
|
||||
|
||||
# Very high overlap: prefer IPA-dictionary word
|
||||
if overlap_pct > 0.90 and t1.lower() != t2.lower():
|
||||
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
|
||||
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
|
||||
if in_dict_1 and not in_dict_2:
|
||||
to_remove.add(i2)
|
||||
continue
|
||||
elif in_dict_2 and not in_dict_1:
|
||||
to_remove.add(i1)
|
||||
continue
|
||||
|
||||
if c1 < c2:
|
||||
to_remove.add(i1)
|
||||
elif c2 < c1:
|
||||
to_remove.add(i2)
|
||||
else:
|
||||
if w1.get("height", 0) > w2.get("height", 0):
|
||||
to_remove.add(i1)
|
||||
else:
|
||||
to_remove.add(i2)
|
||||
|
||||
elif (gap < 6
|
||||
and w1.get("color_name") == "blue"
|
||||
and w2.get("color_name") == "blue"
|
||||
and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
|
||||
c1 = w1.get("conf", 50)
|
||||
c2 = w2.get("conf", 50)
|
||||
to_remove.add(i1 if c1 <= c2 else i2)
|
||||
|
||||
# Execute merges first (syllable-split words)
|
||||
if to_merge:
|
||||
merge_parent: Dict[int, int] = {}
|
||||
for mi1, mi2 in to_merge:
|
||||
actual_mi1 = mi1
|
||||
while actual_mi1 in merge_parent:
|
||||
actual_mi1 = merge_parent[actual_mi1]
|
||||
if actual_mi1 in to_remove or mi2 in to_remove:
|
||||
continue
|
||||
if mi2 in merge_parent:
|
||||
continue
|
||||
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
|
||||
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
|
||||
mt2 = (mw2.get("text") or "").strip()
|
||||
merged_text = mt1 + mt2
|
||||
mx = min(mw1["left"], mw2["left"])
|
||||
my = min(mw1["top"], mw2["top"])
|
||||
mr = max(mw1["left"] + mw1["width"],
|
||||
mw2["left"] + mw2["width"])
|
||||
mb = max(mw1["top"] + mw1["height"],
|
||||
mw2["top"] + mw2["height"])
|
||||
mw1["text"] = merged_text
|
||||
mw1["left"] = mx
|
||||
mw1["top"] = my
|
||||
mw1["width"] = mr - mx
|
||||
mw1["height"] = mb - my
|
||||
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
|
||||
to_remove.add(mi2)
|
||||
merge_parent[mi2] = actual_mi1
|
||||
bullet_removed -= 1
|
||||
|
||||
if to_remove:
|
||||
bullet_removed += len(to_remove)
|
||||
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
||||
cell["word_boxes"] = filtered
|
||||
if not cell.get("_ipa_corrected"):
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
|
||||
if bullet_removed:
|
||||
for z in zones_data:
|
||||
z["cells"] = [c for c in z.get("cells", [])
|
||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
|
||||
|
||||
|
||||
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
|
||||
_COMMON_SHORT_WORDS = {
|
||||
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
|
||||
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
|
||||
"die", "der", "das", "dem", "den", "des", "ein", "und",
|
||||
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
|
||||
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
|
||||
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
|
||||
"on", "or", "so", "to", "up", "us", "we",
|
||||
"the", "and", "but", "for", "not",
|
||||
}
|
||||
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
|
||||
artifact_cells_removed = 0
|
||||
|
||||
for z in zones_data:
|
||||
before = len(z.get("cells", []))
|
||||
kept = []
|
||||
for cell in z.get("cells", []):
|
||||
text = (cell.get("text") or "").strip()
|
||||
core = text.rstrip(".,;:!?'\"")
|
||||
is_artifact = False
|
||||
if not core:
|
||||
is_artifact = True
|
||||
elif _PURE_JUNK_RE.match(core):
|
||||
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
|
||||
is_artifact = True
|
||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
||||
is_artifact = True
|
||||
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||
is_artifact = True
|
||||
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
|
||||
and not re.match(r'^[pPsS]\.?\d+$', core)):
|
||||
is_artifact = True
|
||||
if is_artifact:
|
||||
kept.append(None)
|
||||
else:
|
||||
kept.append(cell)
|
||||
z["cells"] = [c for c in kept if c is not None]
|
||||
artifact_cells_removed += before - len(z["cells"])
|
||||
|
||||
if artifact_cells_removed:
|
||||
for z in zones_data:
|
||||
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
|
||||
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
|
||||
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
|
||||
|
||||
|
||||
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Normalise word_box order to reading order (Step 5j)."""
|
||||
wb_reordered = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) < 2:
|
||||
continue
|
||||
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
|
||||
sorted_wbs = [w for line in lines for w in line]
|
||||
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
|
||||
cell["word_boxes"] = sorted_wbs
|
||||
wb_reordered += 1
|
||||
if wb_reordered:
|
||||
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
|
||||
|
||||
|
||||
def _enforce_max_columns(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
max_columns: int,
|
||||
) -> None:
|
||||
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") != "content":
|
||||
continue
|
||||
cols = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if len(cols) <= max_columns:
|
||||
continue
|
||||
|
||||
logger.info(
|
||||
"max_columns=%d: zone %s has %d columns -> merging",
|
||||
max_columns, z.get("zone_index"), len(cols),
|
||||
)
|
||||
|
||||
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
|
||||
|
||||
while len(cols) > max_columns:
|
||||
narrowest = cols_by_width.pop(0)
|
||||
ni = narrowest["index"]
|
||||
|
||||
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
|
||||
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
|
||||
if pos + 1 < len(sorted_by_x):
|
||||
merge_target = sorted_by_x[pos + 1]
|
||||
elif pos > 0:
|
||||
merge_target = sorted_by_x[pos - 1]
|
||||
else:
|
||||
break
|
||||
|
||||
ti = merge_target["index"]
|
||||
|
||||
merge_target["x_min_px"] = min(
|
||||
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
|
||||
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
|
||||
)
|
||||
merge_target["x_max_px"] = max(
|
||||
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
|
||||
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
|
||||
)
|
||||
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
|
||||
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
|
||||
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
|
||||
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == ni:
|
||||
cell["col_index"] = ti
|
||||
existing = next(
|
||||
(c for c in cells if c["col_index"] == ti
|
||||
and c["row_index"] == cell["row_index"]
|
||||
and c is not cell),
|
||||
None,
|
||||
)
|
||||
if existing:
|
||||
existing["text"] = (
|
||||
(existing.get("text", "") + " " + cell.get("text", "")).strip()
|
||||
)
|
||||
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
|
||||
cell["_merged"] = True
|
||||
|
||||
z["cells"] = [c for c in cells if not c.get("_merged")]
|
||||
cells = z["cells"]
|
||||
cols.remove(narrowest)
|
||||
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
|
||||
|
||||
# Re-index columns 0..N-1
|
||||
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
|
||||
old_idx = col["index"]
|
||||
col["index"] = new_idx
|
||||
for cell in cells:
|
||||
if cell.get("col_index") == old_idx:
|
||||
cell["col_index"] = new_idx
|
||||
|
||||
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
|
||||
390
klausur-service/backend/grid/build/cleanup.py
Normal file
390
klausur-service/backend/grid/build/cleanup.py
Normal file
@@ -0,0 +1,390 @@
|
||||
"""
|
||||
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
|
||||
divider removal, connector normalization, border strip detection, and
|
||||
alphabet sidebar removal.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_ocr_engines import _words_to_reading_order_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_PIPE_RE = re.compile(r"^\|+$")
|
||||
|
||||
|
||||
def _cleanup_zones(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
border_prefiltered: bool,
|
||||
session_id: str,
|
||||
) -> bool:
|
||||
"""Clean up zone data: remove junk rows, artifacts, pipes, border strips.
|
||||
|
||||
Args:
|
||||
zones_data: List of zone dicts (modified in place).
|
||||
border_prefiltered: Whether border words were already pre-filtered.
|
||||
session_id: For logging.
|
||||
|
||||
Returns:
|
||||
Updated border_prefiltered flag.
|
||||
"""
|
||||
_remove_junk_rows(zones_data)
|
||||
_remove_artifact_cells(zones_data)
|
||||
_remove_oversized_word_boxes(zones_data)
|
||||
_remove_pipe_dividers(zones_data)
|
||||
_normalize_connector_columns(zones_data)
|
||||
border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
|
||||
_remove_alphabet_sidebars(zones_data)
|
||||
return border_prefiltered
|
||||
|
||||
|
||||
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove rows where ALL cells contain only short, low-confidence text.
|
||||
|
||||
Also removes 'oversized stub' rows and 'scattered debris' rows.
|
||||
"""
|
||||
_JUNK_CONF_THRESHOLD = 50
|
||||
_JUNK_MAX_TEXT_LEN = 3
|
||||
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
rows = z.get("rows", [])
|
||||
if not cells or not rows:
|
||||
continue
|
||||
|
||||
# Compute median word height across the zone for oversized detection
|
||||
all_wb_heights = [
|
||||
wb["height"]
|
||||
for cell in cells
|
||||
for wb in cell.get("word_boxes") or []
|
||||
if wb.get("height", 0) > 0
|
||||
]
|
||||
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
|
||||
|
||||
junk_row_indices = set()
|
||||
for row in rows:
|
||||
ri = row["index"]
|
||||
row_cells = [c for c in cells if c.get("row_index") == ri]
|
||||
if not row_cells:
|
||||
continue
|
||||
|
||||
row_wbs = [
|
||||
wb for cell in row_cells
|
||||
for wb in cell.get("word_boxes") or []
|
||||
]
|
||||
|
||||
# Rule 1: ALL word_boxes are low-conf AND short text
|
||||
all_junk = True
|
||||
for wb in row_wbs:
|
||||
text = (wb.get("text") or "").strip()
|
||||
conf = wb.get("conf", 0)
|
||||
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
|
||||
all_junk = False
|
||||
break
|
||||
if all_junk and row_wbs:
|
||||
junk_row_indices.add(ri)
|
||||
continue
|
||||
|
||||
# Rule 2: oversized stub -- <=3 words, short total text,
|
||||
# and word height > 1.8x median
|
||||
if len(row_wbs) <= 3:
|
||||
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
|
||||
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
|
||||
has_page_ref = any(
|
||||
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
|
||||
for wb in row_wbs
|
||||
)
|
||||
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
|
||||
junk_row_indices.add(ri)
|
||||
continue
|
||||
|
||||
# Rule 3: scattered debris -- rows with only tiny fragments
|
||||
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
|
||||
if longest <= 2:
|
||||
junk_row_indices.add(ri)
|
||||
continue
|
||||
|
||||
if junk_row_indices:
|
||||
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
||||
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
||||
logger.info(
|
||||
"build-grid: removed %d junk rows from zone %d: %s",
|
||||
len(junk_row_indices), z["zone_index"],
|
||||
sorted(junk_row_indices),
|
||||
)
|
||||
|
||||
|
||||
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove individual cells with a single very-short, low-conf word."""
|
||||
_ARTIFACT_MAX_LEN = 2
|
||||
_ARTIFACT_CONF_THRESHOLD = 65
|
||||
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
artifact_ids = set()
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
if len(wbs) != 1:
|
||||
continue
|
||||
wb = wbs[0]
|
||||
text = (wb.get("text") or "").strip()
|
||||
conf = wb.get("conf", 100)
|
||||
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
|
||||
artifact_ids.add(cell.get("cell_id"))
|
||||
if artifact_ids:
|
||||
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
|
||||
logger.info(
|
||||
"build-grid: removed %d artifact cells from zone %d: %s",
|
||||
len(artifact_ids), z.get("zone_index", 0),
|
||||
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
|
||||
)
|
||||
|
||||
|
||||
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
all_wh = [
|
||||
wb["height"]
|
||||
for cell in cells
|
||||
for wb in cell.get("word_boxes") or []
|
||||
if wb.get("height", 0) > 0
|
||||
]
|
||||
if not all_wh:
|
||||
continue
|
||||
med_h = sorted(all_wh)[len(all_wh) // 2]
|
||||
oversized_threshold = med_h * 3
|
||||
removed_oversized = 0
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
|
||||
if len(filtered) < len(wbs):
|
||||
removed_oversized += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
if removed_oversized:
|
||||
z["cells"] = [c for c in cells if c.get("word_boxes")]
|
||||
logger.info(
|
||||
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
|
||||
removed_oversized, oversized_threshold, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
|
||||
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove pipe-character word_boxes (column divider artifacts)."""
|
||||
for z in zones_data:
|
||||
if z.get("vsplit_group") is not None:
|
||||
continue # pipes already removed before split
|
||||
removed_pipes = 0
|
||||
for cell in z.get("cells", []):
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
|
||||
if len(filtered) < len(wbs):
|
||||
removed_pipes += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
if removed_pipes:
|
||||
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info(
|
||||
"build-grid: removed %d pipe-divider word_boxes from zone %d",
|
||||
removed_pipes, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
# Strip pipe chars ONLY from cell edges (OCR artifacts).
|
||||
# Preserve pipes embedded in words as syllable separators.
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if "|" in text:
|
||||
cleaned = text.strip("|").strip()
|
||||
if cleaned != text.strip():
|
||||
cell["text"] = cleaned
|
||||
|
||||
|
||||
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Normalize narrow connector columns where OCR appends noise chars.
|
||||
|
||||
In synonym dictionaries a narrow column repeats the same word
|
||||
(e.g. "oder") in every row. OCR sometimes appends noise chars.
|
||||
"""
|
||||
for z in zones_data:
|
||||
cols = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if not cols or not cells:
|
||||
continue
|
||||
for col in cols:
|
||||
ci = col.get("index")
|
||||
col_cells = [c for c in cells if c.get("col_index") == ci]
|
||||
if len(col_cells) < 3:
|
||||
continue
|
||||
text_counts: Dict[str, int] = {}
|
||||
for c in col_cells:
|
||||
t = (c.get("text") or "").strip()
|
||||
if t:
|
||||
text_counts[t] = text_counts.get(t, 0) + 1
|
||||
if not text_counts:
|
||||
continue
|
||||
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
|
||||
dominant_count = text_counts[dominant_text]
|
||||
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
|
||||
continue
|
||||
fixed = 0
|
||||
for c in col_cells:
|
||||
t = (c.get("text") or "").strip()
|
||||
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
|
||||
c["text"] = dominant_text
|
||||
wbs = c.get("word_boxes") or []
|
||||
if len(wbs) == 1:
|
||||
wbs[0]["text"] = dominant_text
|
||||
fixed += 1
|
||||
if fixed:
|
||||
logger.info(
|
||||
"build-grid: normalized %d outlier cells in connector column %d "
|
||||
"(dominant='%s') zone %d",
|
||||
fixed, ci, dominant_text, z.get("zone_index", 0),
|
||||
)
|
||||
|
||||
|
||||
def _remove_border_strips(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
border_prefiltered: bool,
|
||||
) -> bool:
|
||||
"""Detect and remove page-border decoration strips.
|
||||
|
||||
Returns updated border_prefiltered flag.
|
||||
"""
|
||||
border_strip_removed = 0
|
||||
if border_prefiltered:
|
||||
logger.info("Step 4e: skipped (border pre-filter already applied)")
|
||||
return border_prefiltered
|
||||
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
all_wbs_with_cell: list = []
|
||||
for cell in cells:
|
||||
for wb in cell.get("word_boxes") or []:
|
||||
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
||||
if len(all_wbs_with_cell) < 10:
|
||||
continue
|
||||
all_wbs_with_cell.sort(key=lambda t: t[0])
|
||||
total = len(all_wbs_with_cell)
|
||||
|
||||
# -- Left-edge scan --
|
||||
left_strip_count = 0
|
||||
left_gap = 0
|
||||
running_right = 0
|
||||
for gi in range(total - 1):
|
||||
running_right = max(
|
||||
running_right,
|
||||
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
|
||||
)
|
||||
gap = all_wbs_with_cell[gi + 1][0] - running_right
|
||||
if gap > 30:
|
||||
left_strip_count = gi + 1
|
||||
left_gap = gap
|
||||
break
|
||||
|
||||
# -- Right-edge scan --
|
||||
right_strip_count = 0
|
||||
right_gap = 0
|
||||
running_left = all_wbs_with_cell[-1][0]
|
||||
for gi in range(total - 1, 0, -1):
|
||||
running_left = min(running_left, all_wbs_with_cell[gi][0])
|
||||
prev_right = (
|
||||
all_wbs_with_cell[gi - 1][0]
|
||||
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
|
||||
)
|
||||
gap = running_left - prev_right
|
||||
if gap > 30:
|
||||
right_strip_count = total - gi
|
||||
right_gap = gap
|
||||
break
|
||||
|
||||
strip_wbs: set = set()
|
||||
strip_side = ""
|
||||
strip_gap = 0
|
||||
strip_count = 0
|
||||
if left_strip_count > 0 and left_strip_count / total < 0.20:
|
||||
strip_side = "left"
|
||||
strip_count = left_strip_count
|
||||
strip_gap = left_gap
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
|
||||
elif right_strip_count > 0 and right_strip_count / total < 0.20:
|
||||
strip_side = "right"
|
||||
strip_count = right_strip_count
|
||||
strip_gap = right_gap
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
|
||||
|
||||
if not strip_wbs:
|
||||
continue
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
||||
if len(filtered) < len(wbs):
|
||||
border_strip_removed += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
z["cells"] = [c for c in cells
|
||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info(
|
||||
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
||||
"(gap=%dpx, strip=%d/%d wbs)",
|
||||
border_strip_removed, strip_side, z.get("zone_index", 0),
|
||||
strip_gap, strip_count, total,
|
||||
)
|
||||
|
||||
return border_prefiltered
|
||||
|
||||
|
||||
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove decorative edge columns (alphabet sidebar safety net).
|
||||
|
||||
Dictionary pages have A-Z letter sidebars that OCR reads as single-
|
||||
character word_boxes.
|
||||
"""
|
||||
for z in zones_data:
|
||||
columns = z.get("columns", [])
|
||||
cells = z.get("cells", [])
|
||||
if len(columns) < 3 or not cells:
|
||||
continue
|
||||
col_cells: Dict[str, List[Dict]] = {}
|
||||
for cell in cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if ct.startswith("column_"):
|
||||
col_cells.setdefault(ct, []).append(cell)
|
||||
col_types_ordered = sorted(col_cells.keys())
|
||||
if len(col_types_ordered) < 3:
|
||||
continue
|
||||
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
|
||||
edge_cells_list = col_cells.get(edge_ct, [])
|
||||
if len(edge_cells_list) < 3:
|
||||
continue
|
||||
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
|
||||
avg_len = sum(len(t) for t in texts) / len(texts)
|
||||
single_char = sum(1 for t in texts if len(t) <= 1)
|
||||
single_ratio = single_char / len(texts)
|
||||
if avg_len > 1.5:
|
||||
continue
|
||||
if single_ratio < 0.7:
|
||||
continue
|
||||
removed_count = len(edge_cells_list)
|
||||
edge_ids = {id(c) for c in edge_cells_list}
|
||||
z["cells"] = [c for c in cells if id(c) not in edge_ids]
|
||||
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
|
||||
logger.info(
|
||||
"Step 4f: removed decorative edge column '%s' from zone %d "
|
||||
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
|
||||
edge_ct, z.get("zone_index", 0), removed_count,
|
||||
avg_len, single_ratio * 100,
|
||||
)
|
||||
break # only remove one edge per zone
|
||||
213
klausur-service/backend/grid/build/core.py
Normal file
213
klausur-service/backend/grid/build/core.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Grid Build Core — the main _build_grid_core() function.
|
||||
|
||||
Extracted from grid_editor_api.py for maintainability.
|
||||
Takes merged OCR word positions and builds a structured, zone-aware grid.
|
||||
|
||||
The function delegates to phase-specific modules:
|
||||
- grid_build_zones.py — image loading, graphic/box detection, zone grids
|
||||
- grid_build_cleanup.py — junk rows, artifacts, pipes, border strips
|
||||
- grid_build_text_ops.py — color, headings, IPA, page refs
|
||||
- grid_build_finalize.py — bullets, max_columns, dictionary, spelling, result
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from grid.editor.filters import (
|
||||
_flatten_word_boxes,
|
||||
_get_content_bounds,
|
||||
_filter_decorative_margin,
|
||||
_filter_footer_words,
|
||||
_filter_header_junk,
|
||||
)
|
||||
|
||||
from .zones import _build_zones
|
||||
from .cleanup import _cleanup_zones
|
||||
from .text_ops import _process_text
|
||||
from .finalize import _finalize_grid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _build_grid_core(
|
||||
session_id: str,
|
||||
session: dict,
|
||||
*,
|
||||
ipa_mode: str = "auto",
|
||||
syllable_mode: str = "auto",
|
||||
enhance: bool = True,
|
||||
max_columns: Optional[int] = None,
|
||||
min_conf: Optional[int] = None,
|
||||
) -> dict:
|
||||
"""Core grid building logic — pure computation, no HTTP or DB side effects.
|
||||
|
||||
Args:
|
||||
session_id: Session identifier (for logging and image loading).
|
||||
session: Full session dict from get_session_db().
|
||||
ipa_mode: "auto" (only when English headwords detected), "all"
|
||||
(force IPA on all content columns), "en" (English column only),
|
||||
"de" (German/definition columns only), or "none" (skip entirely).
|
||||
syllable_mode: "auto" (only when original has pipe dividers),
|
||||
"all" (force syllabification on all words), "en" (English only),
|
||||
"de" (German only), or "none" (skip).
|
||||
|
||||
Returns:
|
||||
StructuredGrid result dict.
|
||||
|
||||
Raises:
|
||||
ValueError: If session data is incomplete.
|
||||
"""
|
||||
t0 = time.time()
|
||||
|
||||
# ── Phase 1: Input Validation & Word Filtering ──────────────────
|
||||
|
||||
# 1. Validate and load word results
|
||||
word_result = session.get("word_result")
|
||||
if not word_result or not word_result.get("cells"):
|
||||
raise ValueError("No word results found. Run paddle-kombi or rapid-kombi first.")
|
||||
|
||||
img_w = word_result.get("image_width", 0)
|
||||
img_h = word_result.get("image_height", 0)
|
||||
if not img_w or not img_h:
|
||||
raise ValueError("Missing image dimensions in word_result")
|
||||
|
||||
# 2. Flatten all word boxes from cells
|
||||
all_words = _flatten_word_boxes(word_result["cells"])
|
||||
if not all_words:
|
||||
raise ValueError("No word boxes found in cells")
|
||||
|
||||
# 2a-pre. Apply min_conf filter if specified
|
||||
if min_conf and min_conf > 0:
|
||||
before = len(all_words)
|
||||
all_words = [w for w in all_words if w.get('conf', 100) >= min_conf]
|
||||
removed = before - len(all_words)
|
||||
if removed:
|
||||
logger.info("build-grid session %s: min_conf=%d removed %d/%d words",
|
||||
session_id, min_conf, removed, before)
|
||||
|
||||
logger.info("build-grid session %s: %d words from %d cells (enhance=%s, max_cols=%s, min_conf=%s)",
|
||||
session_id, len(all_words), len(word_result["cells"]),
|
||||
enhance, max_columns, min_conf)
|
||||
|
||||
# 2b. Filter decorative margin columns (alphabet graphics)
|
||||
margin_strip_info = _filter_decorative_margin(all_words, img_w, logger, session_id)
|
||||
margin_strip_detected = margin_strip_info.get("found", False)
|
||||
|
||||
# Read document_category from session
|
||||
document_category = session.get("document_category")
|
||||
|
||||
# 2c. Filter footer rows (page numbers at the very bottom)
|
||||
page_number_info = _filter_footer_words(all_words, img_h, logger, session_id)
|
||||
|
||||
# 2c2. Filter OCR junk from header illustrations
|
||||
_filter_header_junk(all_words, img_h, logger, session_id)
|
||||
|
||||
# 2d. Filter words inside user-defined exclude regions
|
||||
structure_result = session.get("structure_result")
|
||||
exclude_rects = []
|
||||
if structure_result:
|
||||
for er in structure_result.get("exclude_regions", []):
|
||||
exclude_rects.append({
|
||||
"x": er["x"], "y": er["y"],
|
||||
"w": er["w"], "h": er["h"],
|
||||
})
|
||||
if exclude_rects:
|
||||
before = len(all_words)
|
||||
filtered = []
|
||||
for w in all_words:
|
||||
w_cx = w["left"] + w.get("width", 0) / 2
|
||||
w_cy = w["top"] + w.get("height", 0) / 2
|
||||
inside = any(
|
||||
er["x"] <= w_cx <= er["x"] + er["w"]
|
||||
and er["y"] <= w_cy <= er["y"] + er["h"]
|
||||
for er in exclude_rects
|
||||
)
|
||||
if not inside:
|
||||
filtered.append(w)
|
||||
removed = before - len(filtered)
|
||||
if removed:
|
||||
all_words = filtered
|
||||
logger.info(
|
||||
"build-grid session %s: removed %d words inside %d user exclude region(s)",
|
||||
session_id, removed, len(exclude_rects),
|
||||
)
|
||||
|
||||
# 2e. Hard-filter words inside graphic/image regions from structure step
|
||||
graphic_rects: List[Dict[str, int]] = []
|
||||
if structure_result:
|
||||
for g in structure_result.get("graphics", []):
|
||||
graphic_rects.append({
|
||||
"x": g["x"], "y": g["y"],
|
||||
"w": g["w"], "h": g["h"],
|
||||
})
|
||||
if graphic_rects:
|
||||
before = len(all_words)
|
||||
all_words = [
|
||||
w for w in all_words
|
||||
if not any(
|
||||
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
|
||||
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
|
||||
for gr in graphic_rects
|
||||
)
|
||||
]
|
||||
removed = before - len(all_words)
|
||||
if removed:
|
||||
logger.info(
|
||||
"build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
|
||||
session_id, removed, len(graphic_rects),
|
||||
)
|
||||
|
||||
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
|
||||
|
||||
# ── Phase 2: Image Processing & Zone Detection ──────────────────
|
||||
|
||||
zone_result = await _build_zones(
|
||||
session_id, session, all_words, graphic_rects,
|
||||
content_x, content_y, content_w, content_h,
|
||||
img_w, img_h,
|
||||
)
|
||||
zones_data = zone_result["zones_data"]
|
||||
boxes_detected = zone_result["boxes_detected"]
|
||||
recovered_count = zone_result["recovered_count"]
|
||||
border_prefiltered = zone_result["border_prefiltered"]
|
||||
img_bgr = zone_result["img_bgr"]
|
||||
|
||||
# ── Phase 3: Junk Removal & Cell Cleanup ────────────────────────
|
||||
|
||||
border_prefiltered = _cleanup_zones(zones_data, border_prefiltered, session_id)
|
||||
|
||||
# ── Phase 4+5a: Color, Headings, IPA, Page Refs ─────────────────
|
||||
|
||||
text_result = _process_text(
|
||||
zones_data, img_bgr, img_w, img_h, ipa_mode, page_number_info,
|
||||
)
|
||||
|
||||
# ── Phase 5b+6: Finalize & Result Assembly ──────────────────────
|
||||
|
||||
duration = time.time() - t0
|
||||
|
||||
result = _finalize_grid(
|
||||
zones_data=zones_data,
|
||||
all_words=all_words,
|
||||
img_bgr=img_bgr,
|
||||
img_w=img_w,
|
||||
img_h=img_h,
|
||||
session_id=session_id,
|
||||
max_columns=max_columns,
|
||||
ipa_mode=ipa_mode,
|
||||
syllable_mode=syllable_mode,
|
||||
en_col_type=text_result["en_col_type"],
|
||||
ipa_target_cols=text_result["ipa_target_cols"],
|
||||
all_content_cols=text_result["all_content_cols"],
|
||||
skip_ipa=text_result["skip_ipa"],
|
||||
document_category=document_category,
|
||||
margin_strip_detected=margin_strip_detected,
|
||||
page_number_info=text_result["page_number_info"],
|
||||
boxes_detected=boxes_detected,
|
||||
recovered_count=recovered_count,
|
||||
duration=duration,
|
||||
)
|
||||
|
||||
return result
|
||||
452
klausur-service/backend/grid/build/finalize.py
Normal file
452
klausur-service/backend/grid/build/finalize.py
Normal file
@@ -0,0 +1,452 @@
|
||||
"""
|
||||
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
|
||||
dictionary detection, syllable dividers, spell checking, empty column
|
||||
removal, and result assembly.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from .cell_ops import (
|
||||
_remove_bullets_and_artifacts,
|
||||
_remove_garbled_cells,
|
||||
_normalize_word_order,
|
||||
_enforce_max_columns,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _finalize_grid(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
all_words: List[Dict[str, Any]],
|
||||
img_bgr: Any,
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
session_id: str,
|
||||
max_columns: Optional[int],
|
||||
ipa_mode: str,
|
||||
syllable_mode: str,
|
||||
en_col_type: Optional[str],
|
||||
ipa_target_cols: set,
|
||||
all_content_cols: set,
|
||||
skip_ipa: bool,
|
||||
document_category: Optional[str],
|
||||
margin_strip_detected: bool,
|
||||
page_number_info: Optional[Dict],
|
||||
boxes_detected: int,
|
||||
recovered_count: int,
|
||||
duration: float,
|
||||
) -> dict:
|
||||
"""Run final processing steps and assemble result dict.
|
||||
|
||||
Handles: bullet removal, artifact cells, word ordering, max_columns,
|
||||
dictionary detection, syllable dividers, spell check, empty columns,
|
||||
internal flag cleanup, and result assembly.
|
||||
"""
|
||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
|
||||
# 5i. Remove blue bullet/artifact word_boxes
|
||||
_remove_bullets_and_artifacts(zones_data)
|
||||
|
||||
# 5j-pre. Remove cells whose text is entirely garbled / artifact noise
|
||||
_remove_garbled_cells(zones_data)
|
||||
|
||||
# 5j. Normalise word_box order to reading order
|
||||
_normalize_word_order(zones_data)
|
||||
|
||||
# 5k. Enforce max_columns by merging narrowest columns
|
||||
if max_columns and max_columns > 0:
|
||||
_enforce_max_columns(zones_data, max_columns)
|
||||
|
||||
# --- Dictionary detection on assembled grid ---
|
||||
dict_detection = _detect_dictionary(
|
||||
zones_data, img_w, img_h, document_category, margin_strip_detected
|
||||
)
|
||||
|
||||
# --- Word-gap merge ---
|
||||
try:
|
||||
from cv_syllable_detect import merge_word_gaps_in_zones
|
||||
merge_word_gaps_in_zones(zones_data, session_id)
|
||||
except Exception as e:
|
||||
logger.warning("Word-gap merge failed: %s", e)
|
||||
|
||||
# --- Pipe auto-correction ---
|
||||
try:
|
||||
from cv_syllable_detect import autocorrect_pipe_artifacts
|
||||
autocorrect_pipe_artifacts(zones_data, session_id)
|
||||
except Exception as e:
|
||||
logger.warning("Pipe autocorrect failed: %s", e)
|
||||
|
||||
# --- Syllable divider insertion ---
|
||||
syllable_insertions = _insert_syllable_dividers(
|
||||
zones_data, img_bgr, session_id, syllable_mode, dict_detection,
|
||||
en_col_type, all_content_cols, total_cols,
|
||||
)
|
||||
|
||||
# --- Split merged words ---
|
||||
_split_merged_words(zones_data, session_id)
|
||||
|
||||
# --- Ensure space before IPA/phonetic brackets ---
|
||||
_fix_ipa_spacing(zones_data)
|
||||
|
||||
# --- SmartSpellChecker ---
|
||||
_run_spell_checker(zones_data, session_id, en_col_type, total_cols)
|
||||
|
||||
# --- Debug log cell counts per column ---
|
||||
for z in zones_data:
|
||||
if z.get("zone_type") == "content":
|
||||
from collections import Counter as _Counter
|
||||
_cc = _Counter(c.get("col_index") for c in z.get("cells", []))
|
||||
_cols = z.get("columns", [])
|
||||
logger.info(
|
||||
"pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
|
||||
z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
|
||||
)
|
||||
|
||||
# --- Remove empty columns ---
|
||||
_remove_empty_columns(zones_data)
|
||||
|
||||
# Clean up internal flags before returning
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
cell.pop("_ipa_corrected", None)
|
||||
|
||||
# 6. Build result
|
||||
return _assemble_result(
|
||||
zones_data, all_words, img_w, img_h, session_id,
|
||||
ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
|
||||
dict_detection, page_number_info, boxes_detected,
|
||||
recovered_count, duration, syllable_insertions,
|
||||
)
|
||||
|
||||
|
||||
def _detect_dictionary(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
document_category: Optional[str],
|
||||
margin_strip_detected: bool,
|
||||
) -> Dict[str, Any]:
|
||||
"""Run dictionary detection on the assembled grid."""
|
||||
from cv_layout import _score_dictionary_signals
|
||||
dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
|
||||
try:
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
for z in zones_data:
|
||||
zone_cells = z.get("cells", [])
|
||||
zone_cols = z.get("columns", [])
|
||||
if len(zone_cols) < 2 or len(zone_cells) < 10:
|
||||
continue
|
||||
pseudo_geoms = []
|
||||
for col in zone_cols:
|
||||
ci = col["index"]
|
||||
col_cells = [c for c in zone_cells if c.get("col_index") == ci]
|
||||
col_words = []
|
||||
for cell in col_cells:
|
||||
for wb in cell.get("word_boxes") or []:
|
||||
col_words.append({
|
||||
"text": wb.get("text", ""),
|
||||
"conf": wb.get("conf", 0),
|
||||
"top": wb.get("top", 0),
|
||||
"left": wb.get("left", 0),
|
||||
"height": wb.get("height", 0),
|
||||
"width": wb.get("width", 0),
|
||||
})
|
||||
if not cell.get("word_boxes") and cell.get("text"):
|
||||
col_words.append({
|
||||
"text": cell["text"],
|
||||
"conf": cell.get("confidence", 50),
|
||||
"top": cell.get("bbox_px", {}).get("y", 0),
|
||||
"left": cell.get("bbox_px", {}).get("x", 0),
|
||||
"height": cell.get("bbox_px", {}).get("h", 20),
|
||||
"width": cell.get("bbox_px", {}).get("w", 50),
|
||||
})
|
||||
col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
|
||||
pseudo_geoms.append(ColumnGeometry(
|
||||
index=ci, x=col.get("x_min_px", 0), y=0,
|
||||
width=max(col_w, 1), height=img_h,
|
||||
word_count=len(col_words), words=col_words,
|
||||
width_ratio=col_w / max(img_w, 1),
|
||||
))
|
||||
if len(pseudo_geoms) >= 2:
|
||||
dd = _score_dictionary_signals(
|
||||
pseudo_geoms,
|
||||
document_category=document_category,
|
||||
margin_strip_detected=margin_strip_detected,
|
||||
)
|
||||
if dd["confidence"] > dict_detection["confidence"]:
|
||||
dict_detection = dd
|
||||
except Exception as e:
|
||||
logger.warning("Dictionary detection failed: %s", e)
|
||||
return dict_detection
|
||||
|
||||
|
||||
def _insert_syllable_dividers(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
img_bgr: Any,
|
||||
session_id: str,
|
||||
syllable_mode: str,
|
||||
dict_detection: Dict[str, Any],
|
||||
en_col_type: Optional[str],
|
||||
all_content_cols: set,
|
||||
total_cols: int,
|
||||
) -> int:
|
||||
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
|
||||
syllable_insertions = 0
|
||||
if syllable_mode == "none" or img_bgr is None:
|
||||
if syllable_mode == "none":
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
t = cell.get("text", "")
|
||||
if "|" in t:
|
||||
cell["text"] = t.replace("|", "")
|
||||
return syllable_insertions
|
||||
|
||||
_syllable_eligible = False
|
||||
if syllable_mode in ("all", "de", "en"):
|
||||
_syllable_eligible = True
|
||||
elif (dict_detection.get("is_dictionary")
|
||||
and dict_detection.get("article_col_index") is not None):
|
||||
_syllable_eligible = True
|
||||
|
||||
_syllable_col_filter: Optional[set] = None
|
||||
if syllable_mode == "en":
|
||||
_syllable_col_filter = {en_col_type} if en_col_type else set()
|
||||
elif syllable_mode == "de":
|
||||
if en_col_type and total_cols >= 3:
|
||||
_syllable_col_filter = all_content_cols - {en_col_type}
|
||||
|
||||
if _syllable_eligible:
|
||||
try:
|
||||
from cv_syllable_detect import insert_syllable_dividers
|
||||
force_syllables = (syllable_mode in ("all", "de", "en"))
|
||||
syllable_insertions = insert_syllable_dividers(
|
||||
zones_data, img_bgr, session_id,
|
||||
force=force_syllables,
|
||||
col_filter=_syllable_col_filter,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Syllable insertion failed: %s", e)
|
||||
|
||||
return syllable_insertions
|
||||
|
||||
|
||||
def _split_merged_words(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
session_id: str,
|
||||
) -> None:
|
||||
"""Split merged words using dictionary lookup."""
|
||||
try:
|
||||
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
|
||||
if not _SPELL_AVAILABLE:
|
||||
return
|
||||
split_count = 0
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if not text:
|
||||
continue
|
||||
parts = []
|
||||
changed = False
|
||||
for token in text.split():
|
||||
clean = token
|
||||
bracket_pos = clean.find('[')
|
||||
suffix_ipa = ""
|
||||
if bracket_pos > 0:
|
||||
suffix_ipa = clean[bracket_pos:]
|
||||
clean = clean[:bracket_pos]
|
||||
suffix_punct = ""
|
||||
stripped = clean.rstrip(".,!?;:'\")")
|
||||
if stripped != clean:
|
||||
suffix_punct = clean[len(stripped):]
|
||||
clean = stripped
|
||||
suffix = suffix_punct + suffix_ipa
|
||||
contraction = ""
|
||||
if "'" in clean and clean.index("'") >= 2:
|
||||
apos_pos = clean.index("'")
|
||||
contraction = clean[apos_pos:]
|
||||
clean = clean[:apos_pos]
|
||||
suffix = contraction + suffix
|
||||
if len(clean) >= 4 and clean.isalpha():
|
||||
split = _try_split_merged_word(clean)
|
||||
if split:
|
||||
parts.append(split + suffix)
|
||||
changed = True
|
||||
continue
|
||||
parts.append(token)
|
||||
if changed:
|
||||
cell["text"] = " ".join(parts)
|
||||
split_count += 1
|
||||
if split_count:
|
||||
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
|
||||
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if text and "[" in text:
|
||||
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
|
||||
if fixed != text:
|
||||
cell["text"] = fixed
|
||||
|
||||
|
||||
def _run_spell_checker(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
session_id: str,
|
||||
en_col_type: Optional[str],
|
||||
total_cols: int,
|
||||
) -> None:
|
||||
"""Run SmartSpellChecker on all cells."""
|
||||
try:
|
||||
from smart_spell import SmartSpellChecker
|
||||
_ssc = SmartSpellChecker()
|
||||
spell_fix_count = 0
|
||||
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if not text or not text.strip():
|
||||
continue
|
||||
ct = cell.get("col_type", "")
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
|
||||
if total_cols >= 3 and en_col_type:
|
||||
lang = "en" if ct == en_col_type else "de"
|
||||
elif total_cols <= 2:
|
||||
lang = "auto"
|
||||
else:
|
||||
lang = "auto"
|
||||
|
||||
result = _ssc.correct_text(text, lang=lang)
|
||||
if result.changed:
|
||||
cell["text"] = result.corrected
|
||||
spell_fix_count += 1
|
||||
|
||||
if spell_fix_count:
|
||||
logger.info(
|
||||
"build-grid session %s: SmartSpellChecker fixed %d cells",
|
||||
session_id, spell_fix_count,
|
||||
)
|
||||
except ImportError:
|
||||
logger.debug("SmartSpellChecker not available in build-grid")
|
||||
except Exception as e:
|
||||
logger.warning("SmartSpellChecker error in build-grid: %s", e)
|
||||
|
||||
|
||||
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
|
||||
"""Remove columns that have no cells assigned."""
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
used_col_indices = {c.get("col_index") for c in cells}
|
||||
old_cols = z.get("columns", [])
|
||||
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
|
||||
if len(new_cols) < len(old_cols):
|
||||
old_to_new = {}
|
||||
for new_i, col in enumerate(new_cols):
|
||||
old_i = col.get("col_index", col.get("index", new_i))
|
||||
old_to_new[old_i] = new_i
|
||||
col["col_index"] = new_i
|
||||
col["index"] = new_i
|
||||
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
|
||||
for cell in cells:
|
||||
old_ci = cell.get("col_index", 0)
|
||||
cell["col_index"] = old_to_new.get(old_ci, old_ci)
|
||||
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
|
||||
z["columns"] = new_cols
|
||||
|
||||
|
||||
def _assemble_result(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
all_words: List[Dict[str, Any]],
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
session_id: str,
|
||||
ipa_mode: str,
|
||||
syllable_mode: str,
|
||||
ipa_target_cols: set,
|
||||
skip_ipa: bool,
|
||||
dict_detection: Dict[str, Any],
|
||||
page_number_info: Optional[Dict],
|
||||
boxes_detected: int,
|
||||
recovered_count: int,
|
||||
duration: float,
|
||||
syllable_insertions: int,
|
||||
) -> dict:
|
||||
"""Build the final result dict (Phase 6)."""
|
||||
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
|
||||
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
|
||||
|
||||
# Collect color statistics
|
||||
color_stats: Dict[str, int] = {}
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
for wb in cell.get("word_boxes", []):
|
||||
cn = wb.get("color_name", "black")
|
||||
color_stats[cn] = color_stats.get(cn, 0) + 1
|
||||
|
||||
# Compute layout metrics
|
||||
all_content_row_heights: List[float] = []
|
||||
for z in zones_data:
|
||||
for row in z.get("rows", []):
|
||||
if not row.get("is_header", False):
|
||||
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
|
||||
if h > 0:
|
||||
all_content_row_heights.append(h)
|
||||
avg_row_height = (
|
||||
sum(all_content_row_heights) / len(all_content_row_heights)
|
||||
if all_content_row_heights else 30.0
|
||||
)
|
||||
font_size_suggestion = max(10, int(avg_row_height * 0.6))
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"image_width": img_w,
|
||||
"image_height": img_h,
|
||||
"zones": zones_data,
|
||||
"boxes_detected": boxes_detected,
|
||||
"summary": {
|
||||
"total_zones": len(zones_data),
|
||||
"total_columns": total_columns,
|
||||
"total_rows": total_rows,
|
||||
"total_cells": total_cells,
|
||||
"total_words": len(all_words),
|
||||
"recovered_colored": recovered_count,
|
||||
"color_stats": color_stats,
|
||||
},
|
||||
"formatting": {
|
||||
"bold_columns": [],
|
||||
"header_rows": [],
|
||||
},
|
||||
"layout_metrics": {
|
||||
"page_width_px": img_w,
|
||||
"page_height_px": img_h,
|
||||
"avg_row_height_px": round(avg_row_height, 1),
|
||||
"font_size_suggestion_px": font_size_suggestion,
|
||||
},
|
||||
"dictionary_detection": {
|
||||
"is_dictionary": dict_detection.get("is_dictionary", False),
|
||||
"confidence": dict_detection.get("confidence", 0.0),
|
||||
"signals": dict_detection.get("signals", {}),
|
||||
"article_col_index": dict_detection.get("article_col_index"),
|
||||
"headword_col_index": dict_detection.get("headword_col_index"),
|
||||
},
|
||||
"processing_modes": {
|
||||
"ipa_mode": ipa_mode,
|
||||
"syllable_mode": syllable_mode,
|
||||
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
|
||||
"syllables_applied": syllable_insertions > 0,
|
||||
},
|
||||
"page_number": page_number_info,
|
||||
"duration_seconds": round(duration, 2),
|
||||
}
|
||||
489
klausur-service/backend/grid/build/text_ops.py
Normal file
489
klausur-service/backend/grid/build/text_ops.py
Normal file
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
|
||||
parenthesis fix, IPA phonetic correction, page ref extraction, and
|
||||
slash-IPA conversion.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
from cv_color_detect import detect_word_colors
|
||||
from cv_ocr_engines import (
|
||||
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
|
||||
_lookup_ipa,
|
||||
)
|
||||
from grid.editor.headers import (
|
||||
_detect_heading_rows_by_color,
|
||||
_detect_heading_rows_by_single_cell,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _process_text(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
img_bgr: Any,
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
ipa_mode: str,
|
||||
page_number_info: Optional[Dict],
|
||||
) -> Dict[str, Any]:
|
||||
"""Run color annotation, heading detection, IPA correction, and page refs.
|
||||
|
||||
Args:
|
||||
zones_data: List of zone dicts (modified in place).
|
||||
img_bgr: BGR image array (or None).
|
||||
img_w: Image width.
|
||||
img_h: Image height.
|
||||
ipa_mode: IPA processing mode.
|
||||
page_number_info: Existing page number metadata (may be None).
|
||||
|
||||
Returns:
|
||||
Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
|
||||
skip_ipa, page_number_info.
|
||||
"""
|
||||
# 5. Color annotation on final word_boxes in cells
|
||||
if img_bgr is not None:
|
||||
all_wb: List[Dict] = []
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
all_wb.extend(cell.get("word_boxes", []))
|
||||
detect_word_colors(img_bgr, all_wb)
|
||||
|
||||
# 5a. Heading detection by color + height
|
||||
heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
|
||||
if heading_count:
|
||||
logger.info("Detected %d heading rows by color+height", heading_count)
|
||||
|
||||
# 5b. Fix unmatched parentheses in cell text
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if ")" in text and "(" not in text:
|
||||
cell["text"] = "(" + text
|
||||
|
||||
# 5c. IPA phonetic correction
|
||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
en_col_type = None
|
||||
ipa_target_cols: set = set()
|
||||
all_content_cols: set = set()
|
||||
skip_ipa = (ipa_mode == "none")
|
||||
|
||||
# When ipa_mode=none, strip ALL square brackets from ALL content columns
|
||||
if skip_ipa:
|
||||
_SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
if "[" in text:
|
||||
stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
|
||||
if stripped != text:
|
||||
cell["text"] = stripped.strip()
|
||||
cell["_ipa_corrected"] = True
|
||||
|
||||
if not skip_ipa and total_cols >= 3:
|
||||
en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
|
||||
all_cells, total_cols, ipa_mode, zones_data
|
||||
)
|
||||
elif not skip_ipa:
|
||||
# Collect all_content_cols even when <3 cols (needed by finalize)
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
||||
all_content_cols.add(ct)
|
||||
|
||||
# 5e. Heading detection by single-cell rows
|
||||
single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
|
||||
if single_heading_count:
|
||||
logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
|
||||
|
||||
# 5f. Strip IPA from headings
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
if cell.get("col_type") != "heading":
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
|
||||
if stripped and stripped != text:
|
||||
cell["text"] = stripped
|
||||
|
||||
# 5g. Extract page_ref cells and footer rows
|
||||
_extract_page_refs_and_footers(zones_data, page_number_info)
|
||||
|
||||
# 5h. Convert slash-delimited IPA to bracket notation
|
||||
_convert_slash_ipa(zones_data, skip_ipa, en_col_type)
|
||||
|
||||
return {
|
||||
"en_col_type": en_col_type,
|
||||
"ipa_target_cols": ipa_target_cols,
|
||||
"all_content_cols": all_content_cols,
|
||||
"skip_ipa": skip_ipa,
|
||||
"page_number_info": page_number_info,
|
||||
}
|
||||
|
||||
|
||||
def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
    en_col_type = None
    all_content_cols: set = set()

    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)

    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols

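    # Illustrative mapping (a sketch, not exhaustive): assuming the IPA signals
    # put the English headwords in "column_2" and the sheet also has text in
    # "column_1" and "column_3", the sets above come out as:
    #   ipa_mode="auto"/"en" -> en targets {"column_2"}, de targets set()
    #   ipa_mode="de"        -> en targets set(),        de targets {"column_1", "column_3"}
    #   ipa_mode="all"       -> en targets {"column_2"}, de targets {"column_1", "column_3"}
    # ipa_mode="none" does not reach this helper in the current call path; the
    # caller short-circuits via skip_ipa.
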
    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    # --- English IPA (Britfone + eng_to_ipa) ---
    # Snapshot cell texts up front so the change detection below also works
    # when only the German pass runs.
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True

    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)

    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols

    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True

    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                if not cell_text:
                    continue

                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )

                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                else:
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
                    if len(_words_in_text) >= 3:
                        continue

                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)

    return en_col_type, ipa_target_cols, all_content_cols


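# Worked example for step 5d above (illustrative only; the exact output depends
# on fix_ipa_continuation_cell and the Britfone dictionary): a row holds the
# headword "comfortable" and the OCR put its pronunciation alone on the next
# row as the garbled cell "'kAmftabl". The loop above looks up the headword in
# the previous row of the same column and replaces the garbled text with a
# clean bracketed transcription such as "[ˈkʌmftəbl]".

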
def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
) -> None:
    """Extract page_ref cells and footer rows from content zones.

    Modifies zones_data in place. Updates page_number_info if a page number
    footer is found.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }

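    # _PAGE_REF_RE, illustrated (sketch): it accepts short page markers such as
    # "123", "p.12", "P. 34" or ", 7" (a comma OCR'd in place of "p"), and
    # rejects anything with extra letters or other prefixes, e.g. "S. 34",
    # "page 12" or "12a".
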
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue

        # Extract column_1 cells that look like page references
        page_refs = []
        page_ref_cell_ids = set()
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })
            page_ref_cell_ids.add(cell.get("cell_id"))

        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
            last_cells = [c for c in z["cells"]
                          if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })

        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)

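        # Classification sketch (assuming these texts survive the single-cell
        # footer check above): "124" and "one hundred twenty" are treated as
        # page-number footers and removed below, while "Unit 5 Wordbank"
        # contains words that are not number words and stays as an ordinary
        # footer row.
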
        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                page_number_info = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
            if pn_digits:
                page_number_info["number"] = int(pn_digits)

        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in z["cells"]:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"

        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )

        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers


def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
) -> None:
    """Convert slash-delimited IPA to bracket notation.

    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
    """
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'   # headword (capture group 1)
        r"(/[^/]{2,}/)"             # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
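    # Conversion sketch: for a cell like "tiger /'taige/" the first regex
    # captures the headword and the slash block; if the Britfone lookup knows
    # "tiger" the cell becomes "tiger [<dictionary IPA>]", otherwise the OCR'd
    # transcription is kept as "tiger [taige]" (leading apostrophe stripped).
    # Anything with spaces, commas or parentheses between the slashes is left
    # untouched because of _SLASH_IPA_REJECT_RE.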
    slash_ipa_fixed = 0

    for z in ([] if skip_ipa else zones_data):
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue

            def _replace_slash_ipa(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)
                inner_raw = ocr_ipa.strip("/").strip()
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
                if ipa:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                inner = inner_raw.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
                return m.group(0)

            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)

            _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')

            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)

            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)

            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1

            if new_text != text:
                cell["text"] = new_text

    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
464
klausur-service/backend/grid/build/zones.py
Normal file
@@ -0,0 +1,464 @@
"""
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
detection and zone-aware grid building.

Extracted from grid_build_core.py for maintainability.
"""

import logging
from typing import Any, Dict, List, Optional

import cv2
import numpy as np

from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_color_detect import recover_colored_text
from cv_vocab_types import PageZone
from ocr_pipeline_session_store import get_session_image

from grid.editor.filters import (
    _filter_border_strip_words,
    _filter_border_ghosts,
    _words_in_zone,
)
from grid.editor.zones import (
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
)

logger = logging.getLogger(__name__)


async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    ``all_words`` is filtered and extended in place. Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr.
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None

    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")

    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)

    if img_bgr is not None:
        # --- 3a. Detect graphic/image regions via CV and hard-filter ---
        sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
        fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
        if fresh_graphics:
            fresh_rects = [
                {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                for g in fresh_graphics
            ]
            graphic_rects.extend(fresh_rects)
            logger.info(
                "build-grid session %s: detected %d graphic region(s) via CV",
                session_id, len(fresh_graphics),
            )
            # Hard-filter words inside newly detected graphic regions
            before = len(all_words)
            all_words[:] = [
                w for w in all_words
                if not any(
                    gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                    and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                    for gr in fresh_rects
                )
            ]
            removed = before - len(all_words)
            if removed:
                logger.info(
                    "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                    session_id, removed, len(fresh_rects),
                )

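        # Containment sketch for the hard-filter above: a word is dropped when
        # its centre point falls inside any fresh graphic rectangle, e.g. a
        # word at left=400, top=300, width=60, height=20 has its centre at
        # (430, 310) and is removed by a region {"x": 380, "y": 280, "w": 120, "h": 80}.
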
        # --- Recover colored text that OCR missed (before grid building) ---
        recovered = recover_colored_text(img_bgr, all_words)
        if recovered and graphic_rects:
            # Filter recovered chars inside graphic regions
            recovered = [
                r for r in recovered
                if not any(
                    gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                    and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                    for gr in graphic_rects
                )
            ]
        if recovered:
            recovered_count = len(recovered)
            all_words.extend(recovered)
            logger.info(
                "build-grid session %s: +%d recovered colored words",
                session_id, recovered_count,
            )

        # Detect bordered boxes
        boxes = detect_boxes(
            img_bgr,
            content_x=content_x,
            content_w=content_w,
            content_y=content_y,
            content_h=content_h,
        )
        boxes_detected = len(boxes)

        if boxes:
            # Filter border ghost words before grid building
            all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
            if ghost_count:
                all_words[:] = all_words_new
                logger.info(
                    "build-grid session %s: removed %d border ghost words",
                    session_id, ghost_count,
                )

            # Split page into zones
            page_zones = split_page_into_zones(
                content_x, content_y, content_w, content_h, boxes
            )

            # Merge content zones separated by box zones
            page_zones = _merge_content_zones_across_boxes(
                page_zones, content_x, content_w
            )

            # 3b. Detect vertical dividers and split content zones
            page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                page_zones, all_words
            )

            # --- First pass: build grids per zone independently ---
            zone_grids = _build_grids_per_zone(
                page_zones, all_words, img_w, img_h
            )
            border_prefiltered = border_prefiltered or any(
                zg.get("_border_prefiltered") for zg in zone_grids
            )

            # --- Second pass: merge column boundaries from all content zones ---
            _merge_content_zone_columns(
                zone_grids, all_words, content_w, img_w, img_h, session_id
            )

            # --- Build zones_data from zone_grids ---
            for zg in zone_grids:
                pz = zg["pz"]
                grid = zg["grid"]
                grid.pop("_raw_columns", None)

                zone_entry: Dict[str, Any] = {
                    "zone_index": pz.index,
                    "zone_type": pz.zone_type,
                    "bbox_px": {
                        "x": pz.x, "y": pz.y,
                        "w": pz.width, "h": pz.height,
                    },
                    "bbox_pct": {
                        "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                        "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                        "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                        "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                    },
                    "border": None,
                    "word_count": len(zg["words"]),
                    **grid,
                }

                if pz.box:
                    zone_entry["border"] = {
                        "thickness": pz.box.border_thickness,
                        "confidence": pz.box.confidence,
                    }

                if pz.image_overlays:
                    zone_entry["image_overlays"] = pz.image_overlays

                if pz.layout_hint:
                    zone_entry["layout_hint"] = pz.layout_hint
                if pz.vsplit_group is not None:
                    zone_entry["vsplit_group"] = pz.vsplit_group

                zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected -> single zone with all words
    if not zones_data:
        before = len(all_words)
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(all_words),
            **grid,
        })

    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }


def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
) -> tuple:
    """Detect vertical dividers and split content zones.

    Returns (expanded_zones, border_prefiltered_from_vsplit).
    """
    vsplit_group_counter = 0
    expanded_zones: List = []
    for pz in page_zones:
        if pz.zone_type != "content":
            expanded_zones.append(pz)
            continue
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        divider_xs = _detect_vertical_dividers(
            zone_words, pz.x, pz.width, pz.y, pz.height
        )
        if divider_xs:
            sub_zones = _split_zone_at_vertical_dividers(
                pz, divider_xs, vsplit_group_counter
            )
            expanded_zones.extend(sub_zones)
            vsplit_group_counter += 1
            # Remove pipe words so they don't appear in sub-zones
            pipe_ids = set(
                id(w) for w in zone_words
                if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
            )
            all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
            logger.info(
                "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
                pz.index, [int(x) for x in divider_xs], len(sub_zones),
            )
        else:
            expanded_zones.append(pz)
    # Re-index zones
    for i, pz in enumerate(expanded_zones):
        pz.index = i
    return expanded_zones, False


def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build grids for each zone independently (first pass)."""
    zone_grids: List[Dict] = []

    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )
        # Filter recovered single-char artifacts in ALL zones
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len(w.get("text", "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )
        # Filter words inside image overlay regions (merged box zones)
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )
        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        bp = False
        if bs_removed:
            bp = True
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )
        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": bp,
        })

    return zone_grids


def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
) -> None:
    """Second pass: merge column boundaries from all content zones.

    Modifies zone_grids in place.
    """
    content_zones = [
        zg for zg in zone_grids
        if zg["pz"].zone_type == "content"
        and zg["pz"].vsplit_group is None
    ]
    if len(content_zones) <= 1:
        return

    # Collect column split points (x_min of non-first columns)
    all_split_xs: List[float] = []
    for zg in content_zones:
        raw_cols = zg["grid"].get("_raw_columns", [])
        for col in raw_cols[1:]:
            all_split_xs.append(col["x_min"])

    if not all_split_xs:
        return

    all_split_xs.sort()
    merge_distance = max(25, int(content_w * 0.03))
    merged_xs = [all_split_xs[0]]
    for x in all_split_xs[1:]:
        if x - merged_xs[-1] < merge_distance:
            merged_xs[-1] = (merged_xs[-1] + x) / 2
        else:
            merged_xs.append(x)

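    # Worked example (sketch): with content_w=1200 the merge distance is
    # max(25, 36) = 36 px. Split points [400, 412, 830] collapse to
    # [406.0, 830] because 412 lies within 36 px of 400 (the two are averaged),
    # while 830 starts a new boundary -> 2 boundaries = 3 merged columns.
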
    total_cols = len(merged_xs) + 1
    max_zone_cols = max(
        len(zg["grid"].get("_raw_columns", []))
        for zg in content_zones
    )

    if total_cols < max_zone_cols:
        return

    cx_min = min(w["left"] for w in all_words)
    cx_max = max(w["left"] + w["width"] for w in all_words)
    merged_columns: List[Dict[str, Any]] = []
    prev_x = cx_min
    for i, sx in enumerate(merged_xs):
        merged_columns.append({
            "index": i,
            "type": f"column_{i + 1}",
            "x_min": prev_x,
            "x_max": sx,
        })
        prev_x = sx
    merged_columns.append({
        "index": len(merged_xs),
        "type": f"column_{len(merged_xs) + 1}",
        "x_min": prev_x,
        "x_max": cx_max,
    })

    # Re-build ALL content zones with merged columns
    for zg in zone_grids:
        pz = zg["pz"]
        if pz.zone_type == "content":
            grid = _build_zone_grid(
                zg["words"], pz.x, pz.y,
                pz.width, pz.height,
                pz.index, img_w, img_h,
                global_columns=merged_columns,
                skip_first_row_header=bool(pz.image_overlays),
            )
            zg["grid"] = grid
    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(content_zones),
        total_cols, max_zone_cols,
    )